Comparing version 3.1.1 to 3.2.0
@@ -230,3 +230,3 @@ 'use strict'; | ||
_loop = /*#__PURE__*/_regeneratorRuntime().mark(function _loop() { | ||
var requestConfig, id, requesttem; | ||
var requestConfig, id, requestItem; | ||
return _regeneratorRuntime().wrap(function _loop$(_context5) { | ||
@@ -240,3 +240,3 @@ while (1) switch (_context5.prev = _context5.next) { | ||
case 4: | ||
requesttem = request(requestConfig)["catch"](function (error) { | ||
requestItem = request(requestConfig)["catch"](function (error) { | ||
errorTotal++; | ||
@@ -258,3 +258,3 @@ var message = "Request ".concat(id, " is an error: ").concat(error.message); | ||
}); | ||
requestQueue.push(requesttem); | ||
requestQueue.push(requestItem); | ||
case 6: | ||
@@ -457,3 +457,3 @@ case "end": | ||
var createBrowserState = null; | ||
var callTotal = 0; | ||
var haveCreateBrowser = false; | ||
function crawlPage(_x15, _x16) { | ||
@@ -468,6 +468,5 @@ return _crawlPage.apply(this, arguments); | ||
case 0: | ||
// 记录调用次数, 目的: 关闭浏览器 | ||
callTotal++; | ||
// 只创建一次浏览器 | ||
if (callTotal === 1) { | ||
// 创建浏览器 | ||
if (!haveCreateBrowser) { | ||
haveCreateBrowser = true; | ||
createBrowserState = puppeteer.launch().then(function (res) { | ||
@@ -479,15 +478,15 @@ browser = res; | ||
if (!createBrowserState) { | ||
_context.next = 6; | ||
_context.next = 5; | ||
break; | ||
} | ||
_context.next = 5; | ||
_context.next = 4; | ||
return Promise.all([createBrowserState]); | ||
case 4: | ||
createBrowserState = null; | ||
case 5: | ||
createBrowserState = null; | ||
case 6: | ||
_context.next = 8; | ||
_context.next = 7; | ||
return browser.newPage(); | ||
case 8: | ||
case 7: | ||
page = _context.sent; | ||
_context.next = 11; | ||
_context.next = 10; | ||
return page.setViewport({ | ||
@@ -497,3 +496,3 @@ width: 1280, | ||
}); | ||
case 11: | ||
case 10: | ||
// 合并 baseConfig 配置 | ||
@@ -505,38 +504,33 @@ _mergeConfig = mergeConfig(baseConfig, { | ||
if (!requestConfig.proxy) { | ||
_context.next = 18; | ||
_context.next = 17; | ||
break; | ||
} | ||
_context.next = 16; | ||
_context.next = 15; | ||
return browser.createIncognitoBrowserContext({ | ||
proxyServer: requestConfig.proxy | ||
}); | ||
case 16: | ||
_context.next = 20; | ||
case 15: | ||
_context.next = 19; | ||
break; | ||
case 18: | ||
_context.next = 20; | ||
case 17: | ||
_context.next = 19; | ||
return browser.createIncognitoBrowserContext({ | ||
proxyServer: undefined | ||
}); | ||
case 20: | ||
_context.next = 22; | ||
case 19: | ||
_context.next = 21; | ||
return page["goto"](requestConfig.url, { | ||
timeout: requestConfig.timeout | ||
}); | ||
case 22: | ||
case 21: | ||
httpResponse = _context.sent; | ||
_context.next = 25; | ||
_context.next = 24; | ||
return page.content(); | ||
case 25: | ||
case 24: | ||
content = _context.sent; | ||
// 关闭浏览器 | ||
if (--callTotal === 0) { | ||
browser.close(); | ||
} | ||
res = { | ||
httpResponse: httpResponse, | ||
data: { | ||
page: page, | ||
jsdom: new jsdom.JSDOM(content) | ||
} | ||
browser: browser, | ||
page: page, | ||
jsdom: new jsdom.JSDOM(content) | ||
}; | ||
@@ -547,3 +541,3 @@ if (callback) { | ||
return _context.abrupt("return", res); | ||
case 30: | ||
case 28: | ||
case "end": | ||
@@ -550,0 +544,0 @@ return _context.stop(); |
/// <reference types="node" /> | ||
import { IncomingHttpHeaders } from 'node:http' | ||
import { HTTPResponse, Page } from 'puppeteer' | ||
import { Browser, HTTPResponse, Page } from 'puppeteer' | ||
import { JSDOM } from 'jsdom' | ||
@@ -67,7 +67,6 @@ import { RequestConfigObject } from './request' | ||
httpResponse: HTTPResponse | null | ||
data: { | ||
page: Page | ||
jsdom: JSDOM | ||
} | ||
browser: Browser | ||
page: Page | ||
jsdom: JSDOM | ||
} | ||
export {} |
{ | ||
"name": "x-crawl", | ||
"version": "3.1.1", | ||
"version": "3.2.0", | ||
"author": "coderHXL", | ||
@@ -5,0 +5,0 @@ "description": "x-crawl is a flexible nodejs crawler library.", |
@@ -7,3 +7,3 @@ # x-crawl | ||
If it helps you, please give the [x-crawl repository](https://github.com/coder-hxl/x-crawl) a Star to support it. | ||
If it helps you, you can give the [x-crawl repository](https://github.com/coder-hxl/x-crawl) a Star to support it. | ||
@@ -18,3 +18,3 @@ ## Features | ||
- Anthropomorphic request interval. | ||
- Written in TypeScript, providing generics. | ||
- Written in TypeScript, with type hints and generics. | ||
@@ -41,2 +41,5 @@ ## Relationship with puppeteer | ||
* [Crawl page](#Crawl-page) | ||
+ [jsdom](#jsdom) | ||
+ [browser](#browser) | ||
+ [page](#page) | ||
* [Crawl interface](#Crawl-interface) | ||
@@ -55,3 +58,2 @@ * [Crawl files](#Crawl-files) | ||
+ [Example](#Example-2) | ||
+ [About page](#About-page) | ||
* [crawlData](#crawlData) | ||
@@ -114,3 +116,3 @@ + [Type](#Type-3) | ||
myXCrawl.crawlPage('https://www.youtube.com/').then((res) => { | ||
const { jsdom } = res.data // By default, the JSDOM library is used to parse Page | ||
const { browser, jsdom } = res // By default, the JSDOM library is used to parse Page | ||
@@ -135,2 +137,5 @@ // Get the cover image element of the Promoted Video | ||
}) | ||
// Close the browser | ||
browser.close() | ||
}) | ||
@@ -215,6 +220,23 @@ }) | ||
myXCrawl.crawlPage('https://xxx.com').then(res => { | ||
const { jsdom, page } = res.data | ||
const { jsdom, browser, page } = res | ||
// Close the browser | ||
browser.close() | ||
}) | ||
``` | ||
#### jsdom | ||
Refer to [jsdom](https://github.com/jsdom/jsdom) for specific usage. | ||
#### browser | ||
**Purpose of calling close:** The browser keeps running until it is closed, so the program will not terminate on its own. Do not call close if you still need to use [crawlPage](#crawlPage) or [page](#page) later. The browser is shared within the crawlPage API of a crawler instance, so modifying the properties of the browser object affects the browser used inside crawlPage as well as every returned page and browser. | ||
Refer to [browser](https://pptr.dev/api/puppeteer.browser) for specific usage. | ||
#### page | ||
The page attribute can be used for interactive operations such as events. For details, refer to [page](https://pptr.dev/api/puppeteer.page). | ||
### Crawl interface | ||
@@ -286,3 +308,6 @@ | ||
myXCrawl.crawlPage('https://xxx.com').then(res => { | ||
const { jsdom, page } = res.data | ||
const { jsdom, browser, page } = res | ||
// Close the browser | ||
browser.close() | ||
}) | ||
@@ -491,11 +516,10 @@ }) | ||
myXCrawl.crawlPage('https://xxx.com/xxxx').then((res) => { | ||
const { jsdom, page } = res.data | ||
const { jsdom, browser, page } = res | ||
console.log(jsdom.window.document.querySelector('title')?.textContent) | ||
// Close the browser | ||
browser.close() | ||
}) | ||
``` | ||
#### About page | ||
The page attribute can be used for interactive operations such as events. For details, refer to [page](https://pptr.dev/api/puppeteer.page). | ||
### crawlData | ||
@@ -783,6 +807,5 @@ | ||
httpResponse: HTTPResponse | null // The type of HTTPResponse in the puppeteer library | ||
data: { | ||
page: Page // The type of Page in the puppeteer library | ||
jsdom: JSDOM // The type of JSDOM in the jsdom library | ||
} | ||
browser: Browser // The type of Browser in the puppeteer library | ||
page: Page // The type of Page in the puppeteer library | ||
jsdom: JSDOM // The type of JSDOM in the jsdom library | ||
} | ||
@@ -789,0 +812,0 @@ ``` |
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
108964
807
1694