Comparing version 7.1.3 to 8.0.0
@@ -101,3 +101,3 @@ /// <reference types="node" /> | ||
crawlPage?: { | ||
launchBrowser?: PuppeteerLaunchOptions | ||
puppeteerLaunch?: PuppeteerLaunchOptions | ||
} | ||
@@ -104,0 +104,0 @@ } |
{ | ||
"name": "x-crawl", | ||
"version": "7.1.3", | ||
"version": "8.0.0", | ||
"author": "coderHXL", | ||
@@ -34,6 +34,6 @@ "description": "x-crawl is a flexible Node.js multifunctional crawler library.", | ||
"chalk": "4.1.2", | ||
"https-proxy-agent": "^5.0.1", | ||
"puppeteer": "19.10.0" | ||
"https-proxy-agent": "^7.0.1", | ||
"puppeteer": "21.1.0" | ||
}, | ||
"devDependencies": {} | ||
} |
@@ -138,9 +138,9 @@ # x-crawl · [![npm](https://img.shields.io/npm/v/x-crawl.svg)](https://www.npmjs.com/package/x-crawl) [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/coder-hxl/x-crawl/blob/main/LICENSE) | ||
```js | ||
// 1.Import module ES/CJS | ||
// 1. Import module ES/CJS | ||
import xCrawl from 'x-crawl' | ||
// 2.Create a crawler instance | ||
const myXCrawl = xCrawl({ maxRetry: 3, intervalTime: { max: 3000, min: 2000 } }) | ||
// 2. Create a crawler instance | ||
const myXCrawl = xCrawl({ maxRetry: 3, intervalTime: { max: 2000, min: 1000 } }) | ||
// 3.Set the crawling task | ||
// 3. Set the crawling task | ||
/* | ||
@@ -151,6 +151,6 @@ Call the startPolling API to start the polling function, | ||
myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => { | ||
// Call crawlPage API to crawl Page | ||
const res = await myXCrawl.crawlPage({ | ||
// Call the crawlPage API to crawl the page | ||
const pageResults = await myXCrawl.crawlPage({ | ||
targets: [ | ||
'https://www.airbnb.cn/s/experiences', | ||
'https://www.airbnb.cn/s/*/experiences', | ||
'https://www.airbnb.cn/s/plus_homes' | ||
@@ -161,24 +161,24 @@ ], | ||
// Store the image URL to targets | ||
const targets = [] | ||
const elSelectorMap = ['._fig15y', '._aov0j6'] | ||
for (const item of res) { | ||
// Obtain the image URL by traversing the crawled page results | ||
const imgUrls = [] | ||
for (const item of pageResults) { | ||
const { id } = item | ||
const { page } = item.data | ||
const elSelector = id === 1 ? '.i9cqrtb' : '.c4mnd7m' | ||
// Wait for the page to load | ||
await new Promise((r) => setTimeout(r, 300)) | ||
// wait for the page element to appear | ||
await page.waitForSelector(elSelector) | ||
// Gets the URL of the page image | ||
const urls = await page.$$eval(`${elSelectorMap[id - 1]} img`, (imgEls) => { | ||
return imgEls.map((item) => item.src) | ||
}) | ||
targets.push(...urls) | ||
// Get the URL of the page image | ||
const urls = await page.$$eval(`${elSelector} picture img`, (imgEls) => | ||
imgEls.map((item) => item.src) | ||
) | ||
imgUrls.push(...urls.slice(0, 8)) | ||
// Close page | ||
// close the page | ||
page.close() | ||
} | ||
// Call the crawlFile API to crawl pictures | ||
myXCrawl.crawlFile({ targets, storeDirs: './upload' }) | ||
// Call crawlFile API to crawl pictures | ||
await myXCrawl.crawlFile({ targets: imgUrls, storeDirs: './upload' }) | ||
}) | ||
@@ -190,11 +190,7 @@ ``` | ||
<div align="center"> | ||
<img src="https://raw.githubusercontent.com/coder-hxl/x-crawl/main/assets/en/crawler.png" /> | ||
<img src="https://raw.githubusercontent.com/coder-hxl/x-crawl/main/assets/example.gif" /> | ||
</div> | ||
<div align="center"> | ||
<img src="https://raw.githubusercontent.com/coder-hxl/x-crawl/main/assets/en/crawler-result.png" /> | ||
</div> | ||
**Note:** Please do not crawl randomly, you can check the **robots.txt** protocol before crawling. The class name of the website may change, this is just to demonstrate how to use x-crawl. | ||
**Note:** Do not crawl at will, you can check the **robots.txt** protocol before crawling. This is just to demonstrate how to use x-crawl. | ||
## Core Concepts | ||
@@ -342,3 +338,3 @@ | ||
// Cancel running the browser in headless mode | ||
crawlPage: { launchBrowser: { headless: false } } | ||
crawlPage: { puppeteerLaunch: { headless: false } } | ||
}) | ||
@@ -1310,3 +1306,3 @@ | ||
crawlPage?: { | ||
launchBrowser?: PuppeteerLaunchOptions // puppeteer | ||
puppeteerLaunch?: PuppeteerLaunchOptions // puppeteer | ||
} | ||
@@ -1313,0 +1309,0 @@ } |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
200697
130
3161
1785
+ Added@puppeteer/browsers@1.7.0(transitive)
+ Added@tootallnate/quickjs-emscripten@0.23.0(transitive)
+ Addedagent-base@7.1.3(transitive)
+ Addedast-types@0.13.4(transitive)
+ Addedb4a@1.6.7(transitive)
+ Addedbare-events@2.5.4(transitive)
+ Addedbasic-ftp@5.0.5(transitive)
+ Addedchromium-bidi@0.4.20(transitive)
+ Addedcosmiconfig@8.2.0(transitive)
+ Addedcross-fetch@4.0.0(transitive)
+ Addeddata-uri-to-buffer@6.0.2(transitive)
+ Addeddegenerator@5.0.1(transitive)
+ Addeddevtools-protocol@0.0.1159816(transitive)
+ Addedescodegen@2.1.0(transitive)
+ Addedesprima@4.0.1(transitive)
+ Addedestraverse@5.3.0(transitive)
+ Addedesutils@2.0.3(transitive)
+ Addedfast-fifo@1.3.2(transitive)
+ Addedget-uri@6.0.4(transitive)
+ Addedhttp-proxy-agent@7.0.2(transitive)
+ Addedhttps-proxy-agent@7.0.6(transitive)
+ Addedip-address@9.0.5(transitive)
+ Addedjsbn@1.1.0(transitive)
+ Addedlru-cache@7.18.3(transitive)
+ Addedmitt@3.0.1(transitive)
+ Addednetmask@2.0.2(transitive)
+ Addednode-fetch@2.7.0(transitive)
+ Addedpac-proxy-agent@7.1.0(transitive)
+ Addedpac-resolver@7.0.1(transitive)
+ Addedproxy-agent@6.3.0(transitive)
+ Addedpuppeteer@21.1.0(transitive)
+ Addedpuppeteer-core@21.1.0(transitive)
+ Addedsmart-buffer@4.2.0(transitive)
+ Addedsocks@2.8.4(transitive)
+ Addedsocks-proxy-agent@8.0.5(transitive)
+ Addedsource-map@0.6.1(transitive)
+ Addedsprintf-js@1.1.3(transitive)
+ Addedstreamx@2.22.0(transitive)
+ Addedtar-fs@3.0.4(transitive)
+ Addedtar-stream@3.1.7(transitive)
+ Addedtext-decoder@1.2.3(transitive)
+ Addedtslib@2.8.1(transitive)
- Removed@puppeteer/browsers@0.4.1(transitive)
- Removedagent-base@6.0.2(transitive)
- Removedbl@4.1.0(transitive)
- Removedchownr@1.1.4(transitive)
- Removedchromium-bidi@0.4.6(transitive)
- Removedcosmiconfig@8.1.3(transitive)
- Removedcross-fetch@3.1.5(transitive)
- Removeddevtools-protocol@0.0.1107588(transitive)
- Removedfs-constants@1.0.0(transitive)
- Removedhttps-proxy-agent@5.0.1(transitive)
- Removedinherits@2.0.4(transitive)
- Removedmitt@3.0.0(transitive)
- Removednode-fetch@2.6.7(transitive)
- Removedpuppeteer@19.10.0(transitive)
- Removedpuppeteer-core@19.10.0(transitive)
- Removedreadable-stream@3.6.2(transitive)
- Removedsafe-buffer@5.2.1(transitive)
- Removedstring_decoder@1.3.0(transitive)
- Removedtar-fs@2.1.1(transitive)
- Removedtar-stream@2.2.0(transitive)
- Removedutil-deprecate@1.0.2(transitive)
Updatedhttps-proxy-agent@^7.0.1
Updatedpuppeteer@21.1.0