Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in Demo Install
Socket

@qualweb/crawler

Package Overview
Dependencies
Maintainers
0
Versions
38
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@qualweb/crawler - npm Package Compare versions

Comparing version 0.3.20 to 0.4.0

dist/Crawler.object.d.ts

359

dist/index.js
"use strict";
/**
 * Interop helper used for default imports of CommonJS modules.
 *
 * Returns `mod` unchanged when it carries a truthy `__esModule` flag;
 * otherwise wraps it as `{ default: mod }` so callers can always read
 * `.default`. An already-installed `this.__importDefault` takes precedence.
 *
 * @param {*} mod - The value returned by `require(...)`.
 * @returns {*} `mod` itself, or `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
/**
 * Re-export helper: exposes `source[key]` on `target` under `alias`
 * (or under `key` when no alias is given).
 *
 * When `Object.create` exists the property is installed with
 * `Object.defineProperty`. The source's own descriptor is reused only when it
 * is an accessor on an `__esModule` source, or a data property that is neither
 * writable nor configurable; in every other case an enumerable getter reading
 * `source[key]` is installed so later changes on the source stay visible.
 * Without `Object.create` the current value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        return function (target, source, key, alias) {
            var exportedName = alias === undefined ? key : alias;
            target[exportedName] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var exportedName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            needsGetter = !source.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; }
            };
        }
        Object.defineProperty(target, exportedName, descriptor);
    };
})();
/**
 * Implements `export * from ...`: binds every enumerable key of `m` onto
 * `exports` via `__createBinding`, skipping the key "default" and any key that
 * `exports` already owns.
 */
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    for (var name in m) {
        if (name === "default") {
            continue;
        }
        if (Object.prototype.hasOwnProperty.call(exports, name)) {
            continue;
        }
        __createBinding(exports, m, name);
    }
};
// Flag the exports object with `__esModule`; interop helpers such as
// `__importDefault` check this flag to decide whether a module needs wrapping.
Object.defineProperty(exports, "__esModule", { value: true });
// Initialise the `Crawler` export to undefined; the class is assigned to it
// after its definition.
exports.Crawler = void 0;
// `log-update` is called through `log_update_1.default` by `Crawler#log`.
const log_update_1 = __importDefault(require("log-update"));
/**
 * Breadth-first link crawler driven by a Puppeteer-style browser object.
 *
 * Starting from `startingUrl`, each page is opened with `browser.newPage()`,
 * its `body a` links are collected inside the page, candidate URLs are
 * re-visited to confirm they still resolve under the starting URL, and the
 * confirmed URLs are queued for the next depth level. Results are read with
 * `getResults()` after `crawl()` resolves.
 */
class Crawler {
    /**
     * @param {object} browser - Object exposing `newPage()`; the returned pages
     *   must provide `setViewport`, `goto`, `evaluate` and `close`.
     * @param {string} startingUrl - Absolute URL where crawling starts.
     * @param {object} [viewport] - Passed to `page.setViewport` when truthy.
     * @param {string} [waitUntil] - Passed to `page.goto`; defaults to
     *   'domcontentloaded' when null or undefined.
     */
    constructor(browser, startingUrl, viewport, waitUntil) {
        this.browser = browser;
        this.startingUrl = this.verifyStartingUrl(startingUrl);
        this.isDomain = this.isStaringUrlADomain(startingUrl);
        this.viewport = viewport;
        this.waitUntil = waitUntil != null ? waitUntil : 'domcontentloaded';
        this.urls = [];
    }
    /**
     * Reduces the starting URL to `origin + pathname` (query string and hash
     * are dropped) and guarantees a trailing slash.
     *
     * @param {string} startingUrl
     * @returns {string}
     * @throws {TypeError|URIError} When the value cannot be decoded or parsed.
     */
    verifyStartingUrl(startingUrl) {
        const url = new URL(decodeURIComponent(startingUrl));
        const base = url.origin + url.pathname;
        return base.endsWith('/') ? base : base + '/';
    }
    /**
     * Tells whether the starting URL points at the site root (pathname "/").
     * The misspelt name is kept because callers may depend on it.
     *
     * @param {string} startingUrl
     * @returns {boolean}
     */
    isStaringUrlADomain(startingUrl) {
        const url = new URL(startingUrl);
        return url.pathname === '/';
    }
    /**
     * Crawls the site level by level and stores the discovered URLs.
     *
     * @param {object} [options]
     * @param {number} [options.maxDepth=-1] - Stop after this depth (-1: no limit).
     * @param {number} [options.maxUrls=-1] - Stop once this many URLs are known
     *   (negative: no limit); the result is truncated to this size.
     * @param {number} [options.maxParallelCrawls=5] - Pages fetched per batch;
     *   non-positive values fall back to 5.
     * @param {number} [options.timeout=-1] - Seconds after which no new batch
     *   is started (checked after each batch); non-positive disables it.
     * @param {boolean} [options.logging] - Report progress through `log()`.
     * @param {number} [options.requestDelay=1000] - Milliseconds between the
     *   staggered page fetches inside a batch.
     * @returns {Promise<void>}
     */
    async crawl(options) {
        const opts = options || {};
        const maxDepth = opts.maxDepth != null ? opts.maxDepth : -1;
        const maxUrls = opts.maxUrls != null ? opts.maxUrls : -1;
        // A non-positive batch size could never be filled and would spin forever.
        const parallel = opts.maxParallelCrawls > 0 ? opts.maxParallelCrawls : 5;
        const timeout = opts.timeout != null ? opts.timeout : -1;
        const requestDelay = opts.requestDelay >= 0 ? opts.requestDelay : 1000;
        let currentDepth = 0;
        let currentUrlCount = 1;
        let continueCrawling = true;
        let surpassedMax = false;
        let timer = 0;
        const logProgress = () => {
            if (opts.logging) {
                this.log(currentDepth, currentUrlCount, timer);
            }
        };
        const limitReached = () => maxUrls >= 0 && currentUrlCount >= maxUrls;
        const timerHandle = setInterval(() => {
            timer += 2;
            logProgress();
        }, 2000);
        let timeoutHandle = null;
        let timeoutReached = false;
        if (timeout > 0) {
            timeoutHandle = setTimeout(() => {
                timeoutReached = true;
            }, timeout * 1000);
        }
        try {
            logProgress();
            const urlsByDepth = {};
            // Key: URL. Value: true once the URL was fetched, false while queued.
            const urlsCrawled = {};
            urlsCrawled[this.startingUrl] = true;
            const [firstPageUrls, relativePathsToTest] = await this.fetchPageLinks(this.startingUrl);
            const firstValidated = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePathsToTest));
            urlsByDepth[currentDepth] = [...firstPageUrls, ...firstValidated];
            this.addUrlsToCrawl(urlsCrawled, firstPageUrls);
            this.addUrlsToCrawl(urlsCrawled, firstValidated);
            // Count unique URLs, exactly as the per-batch accounting below does.
            currentUrlCount = Object.keys(urlsCrawled).length;
            logProgress();
            if (limitReached()) {
                // The first page alone already satisfies the limit: do not crawl further.
                surpassedMax = true;
                continueCrawling = false;
            }
            while (continueCrawling && currentDepth !== maxDepth) {
                currentDepth++;
                logProgress();
                const previousLevel = urlsByDepth[currentDepth - 1] || [];
                urlsByDepth[currentDepth] = [];
                let depthCompleted = false;
                while (!depthCompleted) {
                    const batch = [];
                    for (const url of previousLevel) {
                        if (!urlsCrawled[url]) {
                            urlsCrawled[url] = true;
                            batch.push(url);
                        }
                        if (batch.length >= parallel) {
                            break;
                        }
                    }
                    if (batch.length < parallel) {
                        depthCompleted = true;
                    }
                    // Only this batch's fetches are awaited and processed, so links
                    // found by earlier batches are not validated a second time.
                    const pending = batch.map((url, index) => new Promise((resolve) => {
                        setTimeout(() => {
                            resolve(this.fetchPageLinks(url));
                        }, (index + 1) * requestDelay);
                    }));
                    const results = await Promise.all(pending);
                    for (const [urls, relativePaths] of results) {
                        const validated = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePaths));
                        urlsByDepth[currentDepth] = urlsByDepth[currentDepth].concat(urls, validated);
                        this.addUrlsToCrawl(urlsCrawled, urls);
                        this.addUrlsToCrawl(urlsCrawled, validated);
                        currentUrlCount = Object.keys(urlsCrawled).length;
                        logProgress();
                        if (limitReached()) {
                            surpassedMax = true;
                            depthCompleted = true;
                            continueCrawling = false;
                            break;
                        }
                    }
                    if (timeoutReached) {
                        continueCrawling = false;
                        break;
                    }
                }
                if (urlsByDepth[currentDepth].length === 0) {
                    // Nothing new was found at this depth, so there is nothing left to visit.
                    continueCrawling = false;
                }
            }
            const found = Object.keys(urlsCrawled);
            this.urls = surpassedMax ? found.slice(0, maxUrls) : found;
        }
        finally {
            // Always release the timers so a failure cannot keep the process alive.
            if (timeoutHandle) {
                clearTimeout(timeoutHandle);
            }
            clearInterval(timerHandle);
        }
    }
    /**
     * Writes a one-line progress report through `log-update`.
     *
     * @param {number} currentDepth
     * @param {number} currentUrlCount
     * @param {number} timer - Elapsed seconds.
     */
    log(currentDepth, currentUrlCount, timer) {
        (0, log_update_1.default)(`Starting url: ${this.startingUrl} Current depth: ${currentDepth} Urls found: ${currentUrlCount} Time passed: ${timer} seconds`);
    }
    /**
     * Registers `urls` in `urlsCrawled` as "known but not fetched yet" (false)
     * without touching entries already marked as fetched (true).
     *
     * @param {Object<string, boolean>} urlsCrawled - Mutated in place.
     * @param {string[]} [urls]
     */
    addUrlsToCrawl(urlsCrawled, urls) {
        for (const url of urls != null ? urls : []) {
            if (!urlsCrawled[url]) {
                urlsCrawled[url] = false;
            }
        }
    }
    /**
     * Closes a page, logging (not throwing) any failure. Accepts null so it can
     * be called from `finally` even when `newPage()` itself failed.
     *
     * @param {object|null} page
     * @returns {Promise<void>}
     */
    async closePage(page) {
        if (!page) {
            return;
        }
        try {
            await page.close();
        }
        catch (err) {
            console.error(err);
        }
    }
    /**
     * Opens `url` and collects the candidate links of its `body a` elements.
     * Errors are logged and yield an empty result; the page is always closed.
     *
     * @param {string} url
     * @returns {Promise<[string[], string[]]>} A pair whose first element is
     *   always empty; the second holds every candidate URL, all of which the
     *   caller is expected to validate with `checkRelativePathsUrls`.
     */
    async fetchPageLinks(url) {
        let urls = [];
        let relativePathsToTest = [];
        let page = null;
        try {
            page = await this.browser.newPage();
            if (this.viewport) {
                await page.setViewport(this.viewport);
            }
            await page.goto(url, {
                waitUntil: this.waitUntil
            });
            [urls, relativePathsToTest] = await page.evaluate((startingUrl, isDomain) => {
                // This callback is handed to `page.evaluate`, so it only uses its
                // own parameters and page globals (`document`, `window`).
                function getUrlWithoutExtension(url) {
                    if (url.endsWith('/')) {
                        return url;
                    }
                    const parts = url.split('/');
                    parts.pop();
                    return parts.join('/') + '/';
                }
                const notHtml = 'css|jpg|jpeg|gif|svg|pdf|docx|js|png|ico|xml|mp4|mp3|mkv|wav|rss|json|pptx|txt|zip'.split('|');
                // False for links whose last segment is a fragment or whose path
                // carries a non-HTML file extension. The dot is required so that
                // pages such as "/nodejs" are not mistaken for ".js" files.
                function pointsToPage(href) {
                    const segments = href.split('/');
                    if (segments[segments.length - 1].startsWith('#')) {
                        return false;
                    }
                    return !notHtml.some((ext) => href.endsWith('.' + ext) || href.includes('.' + ext + '/'));
                }
                const urls = [];
                const relativePathsToTest = [];
                document.querySelectorAll('body a').forEach((link) => {
                    if (!link.hasAttribute('href')) {
                        return;
                    }
                    const raw = link.getAttribute('href');
                    let href = raw == null ? undefined : raw.trim();
                    if (href && href.startsWith('//')) {
                        // Protocol-relative link: assume https.
                        href = href.replace('//', 'https://');
                    }
                    if (!href ||
                        href.startsWith('#') ||
                        href.includes('javascript:') ||
                        href.includes('tel:') ||
                        href.includes('mailto:')) {
                        return;
                    }
                    const isAbsolute = href.startsWith('http');
                    if (isAbsolute && (!isDomain || !href.startsWith(startingUrl))) {
                        // Absolute links are only followed in domain mode, and only
                        // when they live under the starting URL.
                        return;
                    }
                    if (!pointsToPage(href)) {
                        return;
                    }
                    if (!isDomain) {
                        // Resolve against the current page; the caller validates the result.
                        const base = href.startsWith('/')
                            ? new URL(window.location.href).origin
                            : getUrlWithoutExtension(window.location.href);
                        relativePathsToTest.push(base + href);
                        return;
                    }
                    try {
                        let correctUrl = '';
                        if (href.startsWith(startingUrl)) {
                            correctUrl = href;
                        }
                        else if (href.startsWith('./')) {
                            correctUrl = startingUrl + href.slice(2);
                        }
                        else if (href.startsWith('/')) {
                            correctUrl = startingUrl + href.slice(1);
                        }
                        else {
                            correctUrl = startingUrl + href;
                        }
                        const parsedUrl = new URL(correctUrl);
                        if (parsedUrl.hash.trim() === '') {
                            urls.push(correctUrl);
                        }
                    }
                    catch (err) {
                        console.error(err);
                    }
                });
                return [urls, relativePathsToTest];
            }, this.startingUrl, this.isDomain);
        }
        catch (err) {
            console.error(err);
        }
        finally {
            // Without this every fetched page stayed open for the whole crawl.
            await this.closePage(page);
        }
        return [[], [...relativePathsToTest, ...this.normalizeAndSort(urls)]];
    }
    /**
     * Visits every candidate URL (all at once) and keeps the final location of
     * those that, after navigation, are still under the starting URL.
     * Failures are logged and the affected URL is skipped.
     *
     * @param {string[]} urls
     * @returns {Promise<string[]>} Final locations, in completion order.
     */
    async checkRelativePathsUrls(urls) {
        const validatedUrls = [];
        await Promise.all(urls.map(async (url) => {
            let page = null;
            try {
                page = await this.browser.newPage();
                if (this.viewport) {
                    await page.setViewport(this.viewport);
                }
                await page.goto(url, {
                    waitUntil: this.waitUntil
                });
                const newUrl = await page.evaluate((startingUrl) => {
                    function getUrlWithoutExtension(url) {
                        if (url.endsWith('/')) {
                            return url;
                        }
                        const parts = url.split('/');
                        parts.pop();
                        return parts.join('/') + '/';
                    }
                    if (window.location.href.startsWith(getUrlWithoutExtension(startingUrl))) {
                        return window.location.href;
                    }
                    return null;
                }, this.startingUrl);
                if (newUrl !== null) {
                    validatedUrls.push(newUrl);
                }
            }
            catch (err) {
                console.error(err);
            }
            finally {
                // Close even when navigation or evaluation failed.
                await this.closePage(page);
            }
        }));
        return validatedUrls;
    }
    /**
     * Strips the trailing fragment, prefixes the starting URL where missing,
     * percent-decodes, removes duplicates and sorts.
     * URLs that cannot be decoded are dropped.
     *
     * @param {string[]} urls
     * @returns {string[]}
     */
    normalizeAndSort(urls) {
        const decoded = [];
        for (const original of urls) {
            let u = original;
            const hashIndex = u.lastIndexOf('#');
            if (hashIndex !== -1) {
                u = u.slice(0, hashIndex);
            }
            const absolute = (u.startsWith(this.startingUrl) ? u : this.startingUrl + u).trim();
            try {
                decoded.push(decodeURIComponent(absolute));
            }
            catch (err) {
                // Malformed percent-encoding: skip this URL.
            }
        }
        // Deduplicate after decoding so "a%20b" and "a b" collapse into one entry.
        return [...new Set(decoded)].sort();
    }
    /**
     * @returns {string[]} URLs stored by the last `crawl()` call (empty before).
     */
    getResults() {
        return this.urls;
    }
}
// Publish the Crawler class as a named export of this module.
exports.Crawler = Crawler;
// Re-export the members of the sibling modules through this entry point;
// `__exportStar` skips "default" and any name `exports` already owns.
__exportStar(require("./Crawler.object"), exports);
__exportStar(require("./CrawlOptions"), exports);
//# sourceMappingURL=index.js.map
{
"name": "@qualweb/crawler",
"version": "0.3.20",
"version": "0.4.0",
"description": "Webpage crawler for qualweb",

@@ -9,2 +9,3 @@ "main": "dist/index.js",

],
"types": "dist/index.d.ts",
"scripts": {

@@ -14,7 +15,6 @@ "test": "mocha",

"tsc": "tsc",
"lint": "eslint src --ext .ts",
"lint:fix": "eslint src --ext .ts --fix",
"lint": "eslint .",
"format": "prettier src/**/*.ts --write",
"prebuild": "rimraf dist",
"build": "npm run prebuild && tsc --build tsconfig.prod.json",
"build": "tsc --build tsconfig.prod.json",
"prepublishOnly": "npm run build"

@@ -40,3 +40,3 @@ },

"engines": {
"node": ">=12.0.0"
"node": ">=18.0.0"
},

@@ -48,14 +48,13 @@ "dependencies": {

"@koa/router": "^12.0.1",
"@qualweb/types": "0.7.27",
"@tsconfig/recommended": "^1.0.3",
"@types/chai": "^4.3.16",
"@types/koa": "^2.14.0",
"@types/koa__router": "^12.0.4",
"@types/mocha": "^10.0.6",
"@types/node": "^16.11.2",
"chai": "^4.4.1",
"eslint": "^8.56.0",
"@types/node": "^20.12.12",
"chai": "4.5.0",
"koa": "^2.15.0",
"mocha": "^10.2.0",
"prettier": "^3.1.1",
"puppeteer": "^21.6.1",
"puppeteer": "^22.10.0",
"puppeteer-extra": "^3.2.3",

@@ -65,4 +64,4 @@ "puppeteer-extra-plugin-stealth": "^2.9.0",

"ts-node": "^10.9.2",
"typescript": "^4.4.4"
"typescript": "^5.6.3"
}
}

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc