Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in · Demo · Install
Socket

@xapp/arachne

Package Overview
Dependencies
Maintainers
5
Versions
61
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@xapp/arachne - npm Package Compare versions

Comparing version 1.3.3 to 1.3.4

25

lib/Arachne.d.ts

@@ -59,28 +59,17 @@ /*! Copyright (c) 2020, XAPP AI */

export declare class Arachne {
/**
* The current running crawler. If it is undefined then there is no crawler running.
*/
private runCallback?;
private pageHandler;
private browser?;
private pool?;
private queue;
private readonly stealth;
private maxCrawlAttempts;
private totalCrawlAttempts;
private maxRetries;
private maxConcurrentPages;
private stealth;
private pageHandlerTimeout;
private launchOptions?;
/**
* Set by the run() method, it is the resolve callback.
*/
private complete?;
/**
* Keeps track of the active promises awaiting a page
*/
private activePromises;
constructor(props: ArachneProps);
/**
* Check to see if we are done and if so, close out the promise on
* the run() method.
*/
private checkComplete;
/**
* Updates the request with the error, increasing the request attempt field

@@ -103,5 +92,5 @@ * and then either fails the request or reclaims it in the queue.

*/
run(): Promise<void>;
run(): Promise<unknown>;
/**
* Stop the crawler.
* Stop the crawler if it is running.
*/

@@ -108,0 +97,0 @@ stop(): Promise<void>;

@@ -13,5 +13,2 @@ "use strict";

exports.Arachne = exports.DEFAULT_MAX_RETRIES = exports.DEFAULT_MAX_PAGE_HANDLER_DURATION = exports.DEFAULT_MAX_CRAWL_ATTEMPTS = exports.DEFAULT_MAX_CONCURRENT_PAGES = void 0;
/*! Copyright (c) 2020, XAPP AI */
// import * as puppeteer from "puppeteer";
const puppeteer_1 = require("puppeteer");
const puppeteer_extra_1 = require("puppeteer-extra");

@@ -26,15 +23,2 @@ const StealthPlugin = require("puppeteer-extra-plugin-stealth");

exports.DEFAULT_MAX_RETRIES = 3;
// Error wrapper that attaches the puppeteer page and the crawl request to a
// failure, so the catch handler can recycle the page and retry/fail the request.
class PagedError extends Error {
    /**
     * @param error   - The underlying error, or a plain message string.
     * @param page    - The puppeteer page that was being processed when the error occurred.
     * @param request - The queue request being handled; carried so the caller can retry or fail it.
     */
    constructor(error, page, request) {
        // Accept either a string or an Error-like object as the source message.
        const message = typeof error === "string" ? error : error.message;
        super(message);
        this.name = "PagedError";
        this.timedOut = false;
        this.page = page;
        this.request = request;
        // Flag puppeteer navigation/selector timeouts so callers can treat them
        // differently from other failures (e.g. retry with a looser waitUntil).
        if (error instanceof puppeteer_1.TimeoutError) {
            this.timedOut = true;
        }
    }
}
/**

@@ -46,12 +30,6 @@ * A web crawler based on puppeteer.

this.maxCrawlAttempts = exports.DEFAULT_MAX_CRAWL_ATTEMPTS;
this.totalCrawlAttempts = 0;
this.maxRetries = exports.DEFAULT_MAX_RETRIES;
this.maxConcurrentPages = exports.DEFAULT_MAX_CONCURRENT_PAGES;
this.stealth = false;
this.pageHandlerTimeout = exports.DEFAULT_MAX_PAGE_HANDLER_DURATION;
this.launchOptions = {};
/**
* Keeps track of the active promises awaiting a page
*/
this.activePromises = new Map();
this.browser = props.browser;

@@ -66,15 +44,14 @@ this.launchOptions = props.launchOptions;

this.stealth = !!props.stealth;
}
/**
* Check to see if we are done and if so, close out the promise on
* the run() method.
*/
checkComplete() {
if (this.activePromises.size === 0) {
// This is the end.
if (typeof this.complete === "function") {
this.complete();
this.complete = undefined;
}
if (this.maxConcurrentPages < 0) {
throw new Error("maxConcurrentPages must be greater than 0");
}
if (this.maxCrawlAttempts < 0) {
throw new Error("maxCrawlAttempts must be greater than 0");
}
if (this.maxRetries < 0) {
throw new Error("maxRetries must be greater than 0");
}
if (this.pageHandlerTimeout < 0) {
throw new Error("pageHandlerTimeout must be greater than 0");
}
}

@@ -116,97 +93,86 @@ /**

*/
requestCurrentQueue() {
requestCurrentQueue(runner) {
return __awaiter(this, void 0, void 0, function* () {
// First exit condition, no pool
if (!this.pool) {
return this.checkComplete();
}
// Only fetch a nextRequest from the queue if we have not met our max crawl attempts
if (this.totalCrawlAttempts >= this.maxCrawlAttempts) {
return this.checkComplete();
}
// The initial request
let next = yield this.queue.nextRequest();
if (!next) {
return this.checkComplete();
}
while (next) {
// Increment the attempts
this.totalCrawlAttempts++;
// Copy the request since next will be overwritten on next loop
const currentRequest = Object.assign({}, next);
// The pool will return a page when it is available
const promiseForPage = this.pool.getPage();
// Set the active promise, used to track how many are going
this.activePromises.set(currentRequest.id, promiseForPage);
// Setup the then() for the page
promiseForPage.then((page) => __awaiter(this, void 0, void 0, function* () {
let response;
try {
(0, stentor_logger_1.log)().debug(`Loading page ${currentRequest.url}...`);
// Sometimes networkidle2 never settles so we switch it to just load
let waitUntil = "networkidle2";
if (typeof currentRequest.requestAttempts === "number" && currentRequest.requestAttempts > 0) {
(0, stentor_logger_1.log)().debug(`Failed request on the first attempt, trying again with waitUntil: load`);
waitUntil = "load";
}
const pageResponse = yield page.goto(currentRequest.url, { timeout: 10000, waitUntil });
(0, stentor_logger_1.log)().debug(`Page ${currentRequest.url} loaded with status ${pageResponse === null || pageResponse === void 0 ? void 0 : pageResponse.status()}`);
if (pageResponse) {
response = pageResponse;
}
else {
throw new Error('Response was not provided by page.goto()');
}
/**
* Performs the actions on a page. Returns true if it satisfied a request. False otherwise.
* @param page
* @param index
*/
const crawlPage = (page) => __awaiter(this, void 0, void 0, function* () {
// Always need to recycle the page on every exit. *very important*
if (runner.totalCrawlAttempts >= this.maxCrawlAttempts) {
// There are no more attempts to be made.
return false;
}
const nextRequest = yield this.queue.nextRequest();
if (!nextRequest) {
// There's nothing left in the queue. We are done.
return false;
}
runner.totalCrawlAttempts++;
const currentRequest = Object.assign({}, nextRequest);
let response;
try {
(0, stentor_logger_1.log)().debug(`Loading page ${currentRequest.url}...`);
// Sometimes networkidle2 never settles so we switch it to just load
let waitUntil = "networkidle2";
if (typeof currentRequest.requestAttempts === "number" && currentRequest.requestAttempts > 0) {
(0, stentor_logger_1.log)().debug(`Failed request on the first attempt, trying again with waitUntil: load`);
waitUntil = "load";
}
catch (e) {
// Throw it again with the page so it gets caught!
throw new PagedError(e, page, currentRequest);
const pageResponse = yield page.goto(currentRequest.url, { timeout: 10000, waitUntil });
(0, stentor_logger_1.log)().debug(`Page ${currentRequest.url} loaded with status ${pageResponse === null || pageResponse === void 0 ? void 0 : pageResponse.status()}`);
if (pageResponse) {
response = pageResponse;
}
else {
throw new Error('Response was not provided by page.goto()');
}
// When pulling from a file (file:/path/to/file), the status is 0
if (response.status() !== 0 && response.status() < 200 || response.status() > 299) {
throw new PagedError(`The page returned an invalid response code: ${response.status()}`, page, currentRequest);
throw new Error(`The page returned an invalid response code: ${response.status()}`);
}
try {
yield (0, arachne_utils_1.promiseWithTimeout)(this.pageHandlerTimeout, this.pageHandler.bind(this, page, currentRequest, response), `ArachnePageHandler function exceeded timeout of ${this.pageHandlerTimeout} ms`);
yield (0, arachne_utils_1.promiseWithTimeout)(this.pageHandlerTimeout, this.pageHandler.bind(this, page, currentRequest, response), `ArachnePageHandler function exceeded timeout of ${this.pageHandlerTimeout} ms`);
// We got through it all, so tell the queue that it was handled.
yield this.queue.handledRequest(currentRequest);
}
catch (e) {
(0, stentor_logger_1.log)().error("There was an error handling the current request.", e);
if (e.name === "TimeoutError") {
// eslint-disable-next-line no-console
(0, stentor_logger_1.log)().debug(e.message);
}
catch (e) {
// Throw it again with the page so it gets caught!
console.error(e);
if (e.name === "TimeoutError") {
// eslint-disable-next-line no-console
console.debug(e.message);
}
else {
console.error(`Caught ${e.name} in pageHandler `);
console.error(e);
}
// what happens here when we rethrow?
throw new PagedError(e, page, currentRequest);
else {
(0, stentor_logger_1.log)().error(`Caught ${e.name} in pageHandler `);
(0, stentor_logger_1.log)().error(e);
}
// Tell the queue it was handled
yield this.queue.handledRequest(currentRequest);
return page;
})).then((page) => __awaiter(this, void 0, void 0, function* () {
var _a;
// Always recycle the page
yield ((_a = this.pool) === null || _a === void 0 ? void 0 : _a.recyclePage(page));
})).catch((e) => __awaiter(this, void 0, void 0, function* () {
var _b;
if (e instanceof PagedError) {
(_b = this.pool) === null || _b === void 0 ? void 0 : _b.recyclePage(e.page);
}
yield this.errorTheRequest(currentRequest, e);
})).finally(() => {
// Clear out the active promise
this.activePromises.delete(currentRequest.id);
this.requestCurrentQueue();
});
if (this.totalCrawlAttempts >= this.maxCrawlAttempts) {
// break out before we get another request
break;
}
// This kicks off the flow
next = yield this.queue.nextRequest();
return true;
});
const pages = [];
try {
for (let i = 0; i < this.maxConcurrentPages; i++) {
pages.push(yield runner.pool.getPage());
}
let handledRequests = 0;
do {
handledRequests = 0;
const crawlPagePromises = [];
for (const page of pages) {
crawlPagePromises.push(crawlPage(page).then((result) => {
if (result) {
handledRequests++;
}
return result;
}));
}
yield Promise.all(crawlPagePromises);
} while (handledRequests > 0);
}
this.checkComplete();
finally {
for (const page of pages) {
yield runner.pool.recyclePage(page);
}
}
});

@@ -222,28 +188,53 @@ }

return __awaiter(this, void 0, void 0, function* () {
return new Promise((resolve, reject) => __awaiter(this, void 0, void 0, function* () {
this.complete = resolve;
if (!this.browser) {
// Create and launch the browser!
try {
if (this.stealth) {
puppeteer_extra_1.default.use(StealthPlugin());
if (this.runCallback) {
return this.runCallback.currentRun;
}
// Returning a Promise.resolve() first so all code, including the catch, is
// in a promise chain. This allows us to catch any errors that occur in any portion
// whether they were a rejected promise or thrown error.
// const arachne = this;
const runPromise = Promise.resolve().then(() => __awaiter(this, void 0, void 0, function* () {
const customBrowser = !!this.browser;
let browser;
if (customBrowser) {
browser = this.browser;
}
else {
if (this.stealth) {
puppeteer_extra_1.default.use(StealthPlugin());
}
browser = yield puppeteer_extra_1.default.launch(this.launchOptions);
}
const pool = new ArachnePool_1.ArachnePool(browser, { maxPages: this.maxConcurrentPages });
return this.runCallback = {
currentRun: runPromise, // trippy right? It's the top of the entire promise chain. Only necessary so the run method will return the same thing if called again
totalCrawlAttempts: 0,
pool,
browser,
activeCrawls: {},
cancel: () => {
const cancelPromises = [];
cancelPromises.push(pool.destroy().catch((e) => {
(0, stentor_logger_1.log)().warn("Failed to destroy pool", e);
}));
if (!customBrowser) {
// Do not close a browser that was given to us from the outside.
cancelPromises.push(browser.close().catch((e) => {
(0, stentor_logger_1.log)().warn("Failed to close browser", e);
}));
}
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore The types are exactly the same
this.browser = yield puppeteer_extra_1.default.launch(this.launchOptions);
return Promise.all(cancelPromises)
.then(() => { });
}
catch (e) {
console.error(e);
reject(e);
}
}
if (this.browser) {
// Fill up the pool!
this.pool = new ArachnePool_1.ArachnePool(this.browser, { maxPages: this.maxConcurrentPages });
// kick off the run loop
this.requestCurrentQueue();
}
})).then(() => {
// Before we fully resolve the promise, we end it
return this.stop();
};
}));
const runCallback = this.runCallback = yield runPromise;
return this.runCallback.currentRun
.then(() => this.requestCurrentQueue(runCallback))
.catch((e) => {
(0, stentor_logger_1.log)().error("Error performing crawl.", e);
throw e;
}).finally(() => {
this.runCallback = undefined;
return runCallback.cancel();
});

@@ -253,9 +244,9 @@ });

/**
* Stop the crawler.
* Stop the crawler if it is running.
*/
stop() {
var _a, _b;
return __awaiter(this, void 0, void 0, function* () {
yield ((_a = this.pool) === null || _a === void 0 ? void 0 : _a.destroy());
yield ((_b = this.browser) === null || _b === void 0 ? void 0 : _b.close());
if (this.runCallback) {
return this.runCallback.cancel();
}
});

@@ -262,0 +253,0 @@ }

@@ -58,3 +58,3 @@ "use strict";

return __awaiter(this, void 0, void 0, function* () {
return this.pool.drain();
return this.pool.drain().then(() => this.pool.clear());
});

@@ -61,0 +61,0 @@ }

{
"name": "@xapp/arachne",
"version": "1.3.3",
"version": "1.3.4",
"types": "lib/index",

@@ -21,3 +21,3 @@ "main": "lib/index",

"@types/mocha": "10.0.6",
"@types/node": "20.11.7",
"@types/node": "20.11.10",
"@types/sinon": "17.0.3",

@@ -28,3 +28,3 @@ "@types/sinon-chai": "3.2.12",

"mocha": "10.2.0",
"puppeteer": "21.6.1",
"puppeteer": "21.10.0",
"sinon": "17.0.1",

@@ -37,3 +37,3 @@ "sinon-chai": "3.7.0",

"dependencies": {
"@xapp/arachne-utils": "1.3.3",
"@xapp/arachne-utils": "1.3.4",
"generic-pool": "3.9.0",

@@ -44,3 +44,3 @@ "puppeteer-extra": "3.3.6",

"peerDependencies": {
"puppeteer": "21.6.1"
"puppeteer": "21.10.0"
},

@@ -53,3 +53,3 @@ "scripts": {

},
"gitHead": "14a5571867b6172a795541e8a34437ffd43e6384"
"gitHead": "8ca135c134ade218cab2e12d631acedcb7364d4d"
}

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Socket — SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc