@xapp/arachne
Advanced tools
Comparing version 1.3.3 to 1.3.4
@@ -59,28 +59,17 @@ /*! Copyright (c) 2020, XAPP AI */ | ||
export declare class Arachne { | ||
/** | ||
* The current running crawler. If it is undefined then there is no crawler running. | ||
*/ | ||
private runCallback?; | ||
private pageHandler; | ||
private browser?; | ||
private pool?; | ||
private queue; | ||
private readonly stealth; | ||
private maxCrawlAttempts; | ||
private totalCrawlAttempts; | ||
private maxRetries; | ||
private maxConcurrentPages; | ||
private stealth; | ||
private pageHandlerTimeout; | ||
private launchOptions?; | ||
/** | ||
* Set by the run() method, it is the resolve callback. | ||
*/ | ||
private complete?; | ||
/** | ||
* Keeps track of the active promises awaiting a page | ||
*/ | ||
private activePromises; | ||
constructor(props: ArachneProps); | ||
/** | ||
* Check to see if we are done and if so, close out the promise on | ||
* the run() method. | ||
*/ | ||
private checkComplete; | ||
/** | ||
* Updates the request with the error, increasing the request attempt field | ||
@@ -103,5 +92,5 @@ * and then either fails the request or reclaims it in the queue. | ||
*/ | ||
run(): Promise<void>; | ||
run(): Promise<unknown>; | ||
/** | ||
* Stop the crawler. | ||
* Stop the crawler if it is running. | ||
*/ | ||
@@ -108,0 +97,0 @@ stop(): Promise<void>; |
@@ -13,5 +13,2 @@ "use strict"; | ||
exports.Arachne = exports.DEFAULT_MAX_RETRIES = exports.DEFAULT_MAX_PAGE_HANDLER_DURATION = exports.DEFAULT_MAX_CRAWL_ATTEMPTS = exports.DEFAULT_MAX_CONCURRENT_PAGES = void 0; | ||
/*! Copyright (c) 2020, XAPP AI */ | ||
// import * as puppeteer from "puppeteer"; | ||
const puppeteer_1 = require("puppeteer"); | ||
const puppeteer_extra_1 = require("puppeteer-extra"); | ||
@@ -26,15 +23,2 @@ const StealthPlugin = require("puppeteer-extra-plugin-stealth"); | ||
exports.DEFAULT_MAX_RETRIES = 3; | ||
class PagedError extends Error { | ||
constructor(error, page, request) { | ||
const message = typeof error === "string" ? error : error.message; | ||
super(message); | ||
this.name = "PagedError"; | ||
this.timedOut = false; | ||
this.page = page; | ||
this.request = request; | ||
if (error instanceof puppeteer_1.TimeoutError) { | ||
this.timedOut = true; | ||
} | ||
} | ||
} | ||
/** | ||
@@ -46,12 +30,6 @@ * A web crawler based on puppeteer. | ||
this.maxCrawlAttempts = exports.DEFAULT_MAX_CRAWL_ATTEMPTS; | ||
this.totalCrawlAttempts = 0; | ||
this.maxRetries = exports.DEFAULT_MAX_RETRIES; | ||
this.maxConcurrentPages = exports.DEFAULT_MAX_CONCURRENT_PAGES; | ||
this.stealth = false; | ||
this.pageHandlerTimeout = exports.DEFAULT_MAX_PAGE_HANDLER_DURATION; | ||
this.launchOptions = {}; | ||
/** | ||
* Keeps track of the active promises awaiting a page | ||
*/ | ||
this.activePromises = new Map(); | ||
this.browser = props.browser; | ||
@@ -66,15 +44,14 @@ this.launchOptions = props.launchOptions; | ||
this.stealth = !!props.stealth; | ||
} | ||
/** | ||
* Check to see if we are done and if so, close out the promise on | ||
* the run() method. | ||
*/ | ||
checkComplete() { | ||
if (this.activePromises.size === 0) { | ||
// This is the end. | ||
if (typeof this.complete === "function") { | ||
this.complete(); | ||
this.complete = undefined; | ||
} | ||
if (this.maxConcurrentPages < 0) { | ||
throw new Error("maxConcurrentPages must be greater than 0"); | ||
} | ||
if (this.maxCrawlAttempts < 0) { | ||
throw new Error("maxCrawlAttempts must be greater than 0"); | ||
} | ||
if (this.maxRetries < 0) { | ||
throw new Error("maxRetries must be greater than 0"); | ||
} | ||
if (this.pageHandlerTimeout < 0) { | ||
throw new Error("pageHandlerTimeout must be greater than 0"); | ||
} | ||
} | ||
@@ -116,97 +93,86 @@ /** | ||
*/ | ||
requestCurrentQueue() { | ||
requestCurrentQueue(runner) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
// First exit condition, no pool | ||
if (!this.pool) { | ||
return this.checkComplete(); | ||
} | ||
// Only fetch a nextRequest from the queue if we have not met our max crawl attempts | ||
if (this.totalCrawlAttempts >= this.maxCrawlAttempts) { | ||
return this.checkComplete(); | ||
} | ||
// The initial request | ||
let next = yield this.queue.nextRequest(); | ||
if (!next) { | ||
return this.checkComplete(); | ||
} | ||
while (next) { | ||
// Increment the attempts | ||
this.totalCrawlAttempts++; | ||
// Copy the request since next will be overwritten on next loop | ||
const currentRequest = Object.assign({}, next); | ||
// The pool will return a page when it is available | ||
const promiseForPage = this.pool.getPage(); | ||
// Set the active promise, used to track how many are going | ||
this.activePromises.set(currentRequest.id, promiseForPage); | ||
// Setup the then() for the page | ||
promiseForPage.then((page) => __awaiter(this, void 0, void 0, function* () { | ||
let response; | ||
try { | ||
(0, stentor_logger_1.log)().debug(`Loading page ${currentRequest.url}...`); | ||
// Sometimes networkidle2 never settles so we switch it to just load | ||
let waitUntil = "networkidle2"; | ||
if (typeof currentRequest.requestAttempts === "number" && currentRequest.requestAttempts > 0) { | ||
(0, stentor_logger_1.log)().debug(`Failed request on the first attempt, trying again with waitUntil: load`); | ||
waitUntil = "load"; | ||
} | ||
const pageResponse = yield page.goto(currentRequest.url, { timeout: 10000, waitUntil }); | ||
(0, stentor_logger_1.log)().debug(`Page ${currentRequest.url} loaded with status ${pageResponse === null || pageResponse === void 0 ? void 0 : pageResponse.status()}`); | ||
if (pageResponse) { | ||
response = pageResponse; | ||
} | ||
else { | ||
throw new Error('Response was not provided by page.goto()'); | ||
} | ||
/** | ||
* Performs the actions on a page. Returns true if it satisfied a request. False otherwise. | ||
* @param page | ||
* @param index | ||
*/ | ||
const crawlPage = (page) => __awaiter(this, void 0, void 0, function* () { | ||
// Always need to recycle the page on every exit. *very important* | ||
if (runner.totalCrawlAttempts >= this.maxCrawlAttempts) { | ||
// There are no more attempts to be made. | ||
return false; | ||
} | ||
const nextRequest = yield this.queue.nextRequest(); | ||
if (!nextRequest) { | ||
// There's nothing left in the queue. We are done. | ||
return false; | ||
} | ||
runner.totalCrawlAttempts++; | ||
const currentRequest = Object.assign({}, nextRequest); | ||
let response; | ||
try { | ||
(0, stentor_logger_1.log)().debug(`Loading page ${currentRequest.url}...`); | ||
// Sometimes networkidle2 never settles so we switch it to just load | ||
let waitUntil = "networkidle2"; | ||
if (typeof currentRequest.requestAttempts === "number" && currentRequest.requestAttempts > 0) { | ||
(0, stentor_logger_1.log)().debug(`Failed request on the first attempt, trying again with waitUntil: load`); | ||
waitUntil = "load"; | ||
} | ||
catch (e) { | ||
// Throw it again with the page so it gets caught! | ||
throw new PagedError(e, page, currentRequest); | ||
const pageResponse = yield page.goto(currentRequest.url, { timeout: 10000, waitUntil }); | ||
(0, stentor_logger_1.log)().debug(`Page ${currentRequest.url} loaded with status ${pageResponse === null || pageResponse === void 0 ? void 0 : pageResponse.status()}`); | ||
if (pageResponse) { | ||
response = pageResponse; | ||
} | ||
else { | ||
throw new Error('Response was not provided by page.goto()'); | ||
} | ||
// When pulling from a file (file:/path/to/file), the status is 0 | ||
if (response.status() !== 0 && response.status() < 200 || response.status() > 299) { | ||
throw new PagedError(`The page returned an invalid response code: ${response.status()}`, page, currentRequest); | ||
throw new Error(`The page returned an invalid response code: ${response.status()}`); | ||
} | ||
try { | ||
yield (0, arachne_utils_1.promiseWithTimeout)(this.pageHandlerTimeout, this.pageHandler.bind(this, page, currentRequest, response), `ArachnePageHandler function exceeded timeout of ${this.pageHandlerTimeout} ms`); | ||
yield (0, arachne_utils_1.promiseWithTimeout)(this.pageHandlerTimeout, this.pageHandler.bind(this, page, currentRequest, response), `ArachnePageHandler function exceeded timeout of ${this.pageHandlerTimeout} ms`); | ||
// We got through it all, so tell the queue that it was handled. | ||
yield this.queue.handledRequest(currentRequest); | ||
} | ||
catch (e) { | ||
(0, stentor_logger_1.log)().error("There was an error handling the current request.", e); | ||
if (e.name === "TimeoutError") { | ||
// eslint-disable-next-line no-console | ||
(0, stentor_logger_1.log)().debug(e.message); | ||
} | ||
catch (e) { | ||
// Throw it again with the page so it gets caught! | ||
console.error(e); | ||
if (e.name === "TimeoutError") { | ||
// eslint-disable-next-line no-console | ||
console.debug(e.message); | ||
} | ||
else { | ||
console.error(`Caught ${e.name} in pageHandler `); | ||
console.error(e); | ||
} | ||
// what happens here when we rethrow? | ||
throw new PagedError(e, page, currentRequest); | ||
else { | ||
(0, stentor_logger_1.log)().error(`Caught ${e.name} in pageHandler `); | ||
(0, stentor_logger_1.log)().error(e); | ||
} | ||
// Tell the queue it was handled | ||
yield this.queue.handledRequest(currentRequest); | ||
return page; | ||
})).then((page) => __awaiter(this, void 0, void 0, function* () { | ||
var _a; | ||
// Always recycle the page | ||
yield ((_a = this.pool) === null || _a === void 0 ? void 0 : _a.recyclePage(page)); | ||
})).catch((e) => __awaiter(this, void 0, void 0, function* () { | ||
var _b; | ||
if (e instanceof PagedError) { | ||
(_b = this.pool) === null || _b === void 0 ? void 0 : _b.recyclePage(e.page); | ||
} | ||
yield this.errorTheRequest(currentRequest, e); | ||
})).finally(() => { | ||
// Clear out the active promise | ||
this.activePromises.delete(currentRequest.id); | ||
this.requestCurrentQueue(); | ||
}); | ||
if (this.totalCrawlAttempts >= this.maxCrawlAttempts) { | ||
// break out before we get another request | ||
break; | ||
} | ||
// This kicks off the flow | ||
next = yield this.queue.nextRequest(); | ||
return true; | ||
}); | ||
const pages = []; | ||
try { | ||
for (let i = 0; i < this.maxConcurrentPages; i++) { | ||
pages.push(yield runner.pool.getPage()); | ||
} | ||
let handledRequests = 0; | ||
do { | ||
handledRequests = 0; | ||
const crawlPagePromises = []; | ||
for (const page of pages) { | ||
crawlPagePromises.push(crawlPage(page).then((result) => { | ||
if (result) { | ||
handledRequests++; | ||
} | ||
return result; | ||
})); | ||
} | ||
yield Promise.all(crawlPagePromises); | ||
} while (handledRequests > 0); | ||
} | ||
this.checkComplete(); | ||
finally { | ||
for (const page of pages) { | ||
yield runner.pool.recyclePage(page); | ||
} | ||
} | ||
}); | ||
@@ -222,28 +188,53 @@ } | ||
return __awaiter(this, void 0, void 0, function* () { | ||
return new Promise((resolve, reject) => __awaiter(this, void 0, void 0, function* () { | ||
this.complete = resolve; | ||
if (!this.browser) { | ||
// Create and launch the browser! | ||
try { | ||
if (this.stealth) { | ||
puppeteer_extra_1.default.use(StealthPlugin()); | ||
if (this.runCallback) { | ||
return this.runCallback.currentRun; | ||
} | ||
// Returning a Promise.resolve() first so all code, including the catch, is | ||
// in a promise chain. This allows us to catch any errors that occur in any portion | ||
// whether they were a rejected promise or thrown error. | ||
// const arachne = this; | ||
const runPromise = Promise.resolve().then(() => __awaiter(this, void 0, void 0, function* () { | ||
const customBrowser = !!this.browser; | ||
let browser; | ||
if (customBrowser) { | ||
browser = this.browser; | ||
} | ||
else { | ||
if (this.stealth) { | ||
puppeteer_extra_1.default.use(StealthPlugin()); | ||
} | ||
browser = yield puppeteer_extra_1.default.launch(this.launchOptions); | ||
} | ||
const pool = new ArachnePool_1.ArachnePool(browser, { maxPages: this.maxConcurrentPages }); | ||
return this.runCallback = { | ||
currentRun: runPromise, // trippy right? It's the top of the entire promise chain. Only necessary so the run method will return the same thing if called again | ||
totalCrawlAttempts: 0, | ||
pool, | ||
browser, | ||
activeCrawls: {}, | ||
cancel: () => { | ||
const cancelPromises = []; | ||
cancelPromises.push(pool.destroy().catch((e) => { | ||
(0, stentor_logger_1.log)().warn("Failed to destroy pool", e); | ||
})); | ||
if (!customBrowser) { | ||
// Do not close a browser that was given to us from the outside. | ||
cancelPromises.push(browser.close().catch((e) => { | ||
(0, stentor_logger_1.log)().warn("Failed to close browser", e); | ||
})); | ||
} | ||
// eslint-disable-next-line @typescript-eslint/ban-ts-comment | ||
// @ts-ignore The types are exactly the same | ||
this.browser = yield puppeteer_extra_1.default.launch(this.launchOptions); | ||
return Promise.all(cancelPromises) | ||
.then(() => { }); | ||
} | ||
catch (e) { | ||
console.error(e); | ||
reject(e); | ||
} | ||
} | ||
if (this.browser) { | ||
// Fill up the pool! | ||
this.pool = new ArachnePool_1.ArachnePool(this.browser, { maxPages: this.maxConcurrentPages }); | ||
// kick off the run loop | ||
this.requestCurrentQueue(); | ||
} | ||
})).then(() => { | ||
// Before we fully resolve the promise, we end it | ||
return this.stop(); | ||
}; | ||
})); | ||
const runCallback = this.runCallback = yield runPromise; | ||
return this.runCallback.currentRun | ||
.then(() => this.requestCurrentQueue(runCallback)) | ||
.catch((e) => { | ||
(0, stentor_logger_1.log)().error("Error performing crawl.", e); | ||
throw e; | ||
}).finally(() => { | ||
this.runCallback = undefined; | ||
return runCallback.cancel(); | ||
}); | ||
@@ -253,9 +244,9 @@ }); | ||
/** | ||
* Stop the crawler. | ||
* Stop the crawler if it is running. | ||
*/ | ||
stop() { | ||
var _a, _b; | ||
return __awaiter(this, void 0, void 0, function* () { | ||
yield ((_a = this.pool) === null || _a === void 0 ? void 0 : _a.destroy()); | ||
yield ((_b = this.browser) === null || _b === void 0 ? void 0 : _b.close()); | ||
if (this.runCallback) { | ||
return this.runCallback.cancel(); | ||
} | ||
}); | ||
@@ -262,0 +253,0 @@ } |
@@ -58,3 +58,3 @@ "use strict"; | ||
return __awaiter(this, void 0, void 0, function* () { | ||
return this.pool.drain(); | ||
return this.pool.drain().then(() => this.pool.clear()); | ||
}); | ||
@@ -61,0 +61,0 @@ } |
{ | ||
"name": "@xapp/arachne", | ||
"version": "1.3.3", | ||
"version": "1.3.4", | ||
"types": "lib/index", | ||
@@ -21,3 +21,3 @@ "main": "lib/index", | ||
"@types/mocha": "10.0.6", | ||
"@types/node": "20.11.7", | ||
"@types/node": "20.11.10", | ||
"@types/sinon": "17.0.3", | ||
@@ -28,3 +28,3 @@ "@types/sinon-chai": "3.2.12", | ||
"mocha": "10.2.0", | ||
"puppeteer": "21.6.1", | ||
"puppeteer": "21.10.0", | ||
"sinon": "17.0.1", | ||
@@ -37,3 +37,3 @@ "sinon-chai": "3.7.0", | ||
"dependencies": { | ||
"@xapp/arachne-utils": "1.3.3", | ||
"@xapp/arachne-utils": "1.3.4", | ||
"generic-pool": "3.9.0", | ||
@@ -44,3 +44,3 @@ "puppeteer-extra": "3.3.6", | ||
"peerDependencies": { | ||
"puppeteer": "21.6.1" | ||
"puppeteer": "21.10.0" | ||
}, | ||
@@ -53,3 +53,3 @@ "scripts": { | ||
}, | ||
"gitHead": "14a5571867b6172a795541e8a34437ffd43e6384" | ||
"gitHead": "8ca135c134ade218cab2e12d631acedcb7364d4d" | ||
} |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
69420
956
+ Added @puppeteer/browsers@1.9.1 (transitive)
+ Added @xapp/arachne-utils@1.3.4 (transitive)
+ Added chromium-bidi@0.5.6 (transitive)
+ Added cosmiconfig@9.0.0 (transitive)
+ Added devtools-protocol@0.0.1232444 (transitive)
+ Added env-paths@2.2.1 (transitive)
+ Added puppeteer@21.10.0 (transitive)
+ Added puppeteer-core@21.10.0 (transitive)
+ Added urlpattern-polyfill@10.0.0 (transitive)
+ Added ws@8.16.0 (transitive)
- Removed @puppeteer/browsers@1.9.0 (transitive)
- Removed @xapp/arachne-utils@1.3.3 (transitive)
- Removed chromium-bidi@0.5.1 (transitive)
- Removed cosmiconfig@8.3.6 (transitive)
- Removed devtools-protocol@0.0.1203626 (transitive)
- Removed path-type@4.0.0 (transitive)
- Removed puppeteer@21.6.1 (transitive)
- Removed puppeteer-core@21.6.1 (transitive)
- Removed urlpattern-polyfill@9.0.0 (transitive)
- Removed ws@8.15.1 (transitive)
Updated @xapp/arachne-utils@1.3.4