@qualweb/crawler
Comparing version 0.3.20 to 0.4.0
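dist/index.js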
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __exportStar = (this && this.__exportStar) || function(m, exports) { | ||
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p); | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.Crawler = void 0; | ||
const log_update_1 = __importDefault(require("log-update")); | ||
class Crawler {
    constructor(browser, startingUrl, viewport, waitUntil) {
        this.browser = browser;
        this.startingUrl = this.verifyStartingUrl(startingUrl);
        this.isDomain = this.isStartingUrlADomain(startingUrl);
        this.viewport = viewport;
        this.waitUntil = waitUntil ?? 'domcontentloaded';
        this.urls = [];
    }
    // Normalizes the starting URL to origin + pathname with a trailing slash.
    verifyStartingUrl(startingUrl) {
        const url = new URL(decodeURIComponent(startingUrl));
        const newStartingUrl = url.origin + url.pathname;
        return newStartingUrl.endsWith('/') ? newStartingUrl : newStartingUrl + '/';
    }
    // The starting URL is treated as a whole domain when it has no path component.
    isStartingUrlADomain(startingUrl) {
        const url = new URL(startingUrl);
        return url.pathname === '/';
    }
    async crawl(options) {
        const maxDepth = options?.maxDepth ?? -1;
        const maxUrls = options?.maxUrls ?? -1;
        const parallel = options?.maxParallelCrawls || 5;
        const timeout = options?.timeout ?? -1;
        let currentDepth = 0;
        let currentUrlCount = 1;
        let continueCrawling = true;
        let surpassedMax = false;
        // Progress counter, in seconds, advanced every two-second tick.
        let timer = 0;
        const timerHandle = setInterval(() => {
            timer += 2;
            if (options?.logging) {
                this.log(currentDepth, currentUrlCount, timer);
            }
        }, 2000);
        let timeoutHandle = null;
        let timeoutReached = false;
        if (timeout > 0) {
            timeoutHandle = setTimeout(() => (timeoutReached = true), timeout * 1000);
        }
        if (options?.logging) {
            this.log(currentDepth, currentUrlCount, timer);
        }
        const urlsByDepth = {};
        const urlsCrawled = {};
        urlsCrawled[this.startingUrl] = true;
        // Depth 0: links found on the starting page itself.
        const [firstPageUrls, relativePathsToTest] = await this.fetchPageLinks(this.startingUrl);
        urlsByDepth[currentDepth] = [...firstPageUrls];
        const newUrls = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePathsToTest));
        urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...newUrls];
        this.addUrlsToCrawl(urlsCrawled, firstPageUrls);
        this.addUrlsToCrawl(urlsCrawled, newUrls);
        currentUrlCount += firstPageUrls.length + newUrls.length;
        if (options?.logging) {
            this.log(currentDepth, currentUrlCount, timer);
        }
        if (maxUrls >= 0 && currentUrlCount >= maxUrls) {
            surpassedMax = true;
        }
        // Breadth-first crawl: each iteration visits the URLs discovered at the previous depth.
        while (currentDepth !== maxDepth && currentUrlCount !== maxUrls && continueCrawling) {
            const promises = [];
            currentDepth++;
            let depthCompleted = false;
            if (options?.logging) {
                this.log(currentDepth, currentUrlCount, timer);
            }
            while (!depthCompleted) {
                // Take up to `parallel` not-yet-crawled URLs from the previous depth.
                const letsCrawl = [];
                let count = 0;
                for (const url of urlsByDepth[currentDepth - 1] ?? []) {
                    if (!urlsCrawled[url]) {
                        urlsCrawled[url] = true;
                        letsCrawl.push(url);
                        count++;
                    }
                    if (count === parallel) {
                        break;
                    }
                }
                if (count < parallel) {
                    depthCompleted = true;
                }
                // Stagger page loads one second apart to avoid hammering the server.
                let delay = 0;
                const delayIncrement = 1000;
                for (const url of letsCrawl) {
                    delay += delayIncrement;
                    promises.push(new Promise((resolve) => {
                        setTimeout(async () => {
                            resolve(await this.fetchPageLinks(url));
                        }, delay);
                    }));
                }
                const listUrls = await Promise.all(promises);
                urlsByDepth[currentDepth] = [];
                for (const [urls, relativePaths] of listUrls) {
                    urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...urls];
                    const newUrls = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePaths));
                    urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...newUrls];
                    this.addUrlsToCrawl(urlsCrawled, urls);
                    this.addUrlsToCrawl(urlsCrawled, newUrls);
                    currentUrlCount = Object.keys(urlsCrawled).length;
                    if (options?.logging) {
                        this.log(currentDepth, currentUrlCount, timer);
                    }
                    if (maxUrls >= 0 && currentUrlCount >= maxUrls) {
                        surpassedMax = true;
                        depthCompleted = true;
                        continueCrawling = false;
                        break;
                    }
                }
                if (timeoutReached) {
                    continueCrawling = false;
                    break;
                }
            }
            // Stop when a depth level yields no new URLs.
            if (!urlsByDepth[currentDepth]?.length) {
                continueCrawling = false;
            }
        }
        if (timeoutHandle) {
            clearTimeout(timeoutHandle);
        }
        clearInterval(timerHandle);
        this.urls = surpassedMax
            ? Object.keys(urlsCrawled).slice(0, maxUrls)
            : Object.keys(urlsCrawled);
    }
    log(currentDepth, currentUrlCount, timer) {
        (0, log_update_1.default)(`Starting url: ${this.startingUrl} Current depth: ${currentDepth} Urls found: ${currentUrlCount} Time passed: ${timer} seconds`);
    }
    // Registers newly discovered URLs as pending (false = not yet crawled).
    addUrlsToCrawl(urlsCrawled, urls) {
        for (const url of urls ?? []) {
            if (!urlsCrawled[url]) {
                urlsCrawled[url] = false;
            }
        }
    }
    // Opens `url` in a new tab and extracts candidate links from its anchors.
    // Returns a tuple of [absolute urls, relative paths still needing resolution].
    async fetchPageLinks(url) {
        let urls = [];
        let relativePathsToTest = [];
        try {
            const page = await this.browser.newPage();
            if (this.viewport) {
                await page.setViewport(this.viewport);
            }
            await page.goto(url, {
                waitUntil: this.waitUntil
            });
            [urls, relativePathsToTest] = await page.evaluate((startingUrl, isDomain) => {
                // Drops the last path segment so relative hrefs can be resolved against it.
                function getUrlWithoutExtension(url) {
                    if (!url.endsWith('/')) {
                        const parts = url.split('/');
                        parts.pop();
                        return parts.join('/') + '/';
                    }
                    return url;
                }
                // File extensions that are not HTML pages and should be skipped.
                const notHtml = 'css|jpg|jpeg|gif|svg|pdf|docx|js|png|ico|xml|mp4|mp3|mkv|wav|rss|json|pptx|txt|zip'.split('|');
                const links = document.querySelectorAll('body a');
                const urls = [];
                const relativePathsToTest = [];
                links.forEach((link) => {
                    if (link.hasAttribute('href')) {
                        let href = link.getAttribute('href')?.trim();
                        // Protocol-relative links: assume https.
                        if (href?.startsWith('//'))
                            href = href.replace('//', 'https://');
                        // Starting URL has a path: collect relative links for later validation.
                        if (href &&
                            !isDomain &&
                            !href.startsWith('http') &&
                            !href.startsWith('#') &&
                            !href.includes('javascript:') &&
                            !href.includes('tel:') &&
                            !href.includes('mailto:')) {
                            let valid = true;
                            for (const not of notHtml) {
                                if (href.endsWith(not) || href.includes('.' + not + '/')) {
                                    valid = false;
                                    break;
                                }
                                const parts = href.split('/');
                                if (parts.length > 0) {
                                    const lastPart = parts[parts.length - 1];
                                    if (lastPart.startsWith('#')) {
                                        valid = false;
                                        break;
                                    }
                                }
                            }
                            if (valid) {
                                if (href.startsWith('/')) {
                                    const url = new URL(window.location.href);
                                    relativePathsToTest.push(url.origin + href);
                                }
                                else {
                                    relativePathsToTest.push(getUrlWithoutExtension(window.location.href) + href);
                                }
                            }
                        }
                        // Starting URL is a whole domain: resolve links against the domain root.
                        if (href &&
                            isDomain &&
                            (href.startsWith(startingUrl) ||
                                href.startsWith('/') ||
                                href.startsWith('./') ||
                                (!href.startsWith('http') && !href.startsWith('#'))) &&
                            !href.includes('javascript:') &&
                            !href.includes('tel:') &&
                            !href.includes('mailto:')) {
                            let valid = true;
                            for (const not of notHtml) {
                                if (href.endsWith(not) || href.includes('.' + not + '/')) {
                                    valid = false;
                                    break;
                                }
                                const parts = href.split('/');
                                if (parts.length > 0) {
                                    const lastPart = parts[parts.length - 1];
                                    if (lastPart.startsWith('#')) {
                                        valid = false;
                                        break;
                                    }
                                }
                            }
                            if (valid) {
                                try {
                                    let correctUrl = '';
                                    if (href.startsWith(startingUrl)) {
                                        correctUrl = href;
                                    }
                                    else if (href.startsWith('./')) {
                                        correctUrl = startingUrl + href.slice(2);
                                    }
                                    else if (href.startsWith('/')) {
                                        correctUrl = startingUrl + href.slice(1);
                                    }
                                    else {
                                        correctUrl = startingUrl + href;
                                    }
                                    const parsedUrl = new URL(correctUrl);
                                    // Keep only fragment-free URLs.
                                    if (parsedUrl.hash.trim() === '') {
                                        urls.push(correctUrl);
                                    }
                                }
                                catch (err) {
                                    console.error(err);
                                }
                            }
                        }
                    }
                });
                return [urls, relativePathsToTest];
            }, this.startingUrl, this.isDomain);
            // Close the tab so parallel crawls do not leak open pages.
            await page.close();
        }
        catch (err) {
            console.error(err);
        }
        return [[], [...relativePathsToTest, ...this.normalizeAndSort(urls)]];
    }
    // Visits each candidate URL and keeps it only if, after any redirects,
    // the browser lands inside the starting URL's subtree.
    async checkRelativePathsUrls(urls) {
        const newUrlsToValidate = [];
        await Promise.all(urls.map(async (url) => {
            try {
                const page = await this.browser.newPage();
                if (this.viewport) {
                    await page.setViewport(this.viewport);
                }
                await page.goto(url, {
                    waitUntil: this.waitUntil
                });
                const newUrl = await page.evaluate((startingUrl) => {
                    function getUrlWithoutExtension(url) {
                        if (!url.endsWith('/')) {
                            const parts = url.split('/');
                            parts.pop();
                            return parts.join('/') + '/';
                        }
                        return url;
                    }
                    return window.location.href.startsWith(getUrlWithoutExtension(startingUrl))
                        ? window.location.href
                        : null;
                }, this.startingUrl);
                if (newUrl !== null) {
                    newUrlsToValidate.push(newUrl);
                }
                await page.close();
            }
            catch (err) {
                console.error(err);
            }
        }));
        return newUrlsToValidate;
    }
    // Strips fragments, prefixes the starting URL where missing,
    // deduplicates, decodes, and sorts the result.
    normalizeAndSort(urls) {
        const normalizedUrls = urls.map((u) => {
            if (u.includes('#')) {
                const parts = u.split('#');
                parts.pop();
                u = parts.join('#');
            }
            return u.startsWith(this.startingUrl) ? u.trim() : (this.startingUrl + u).trim();
        });
        const unique = [...new Set(normalizedUrls)]
            .map((u) => {
                try {
                    return decodeURIComponent(u);
                }
                catch (err) {
                    return null;
                }
            })
            .filter((u) => u !== null);
        return unique.sort();
    }
    getResults() {
        return this.urls;
    }
}
exports.Crawler = Crawler;
__exportStar(require("./Crawler.object"), exports);
__exportStar(require("./CrawlOptions"), exports);
//# sourceMappingURL=index.js.map
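For context, a minimal usage sketch of the exported class (not from the package docs): it assumes a Puppeteer browser instance, and `https://example.com/` is a placeholder. Constructor and option names are taken from the implementation above.

const puppeteer = require('puppeteer');
const { Crawler } = require('@qualweb/crawler');

(async () => {
  const browser = await puppeteer.launch();
  // Constructor signature per the class above: (browser, startingUrl, viewport?, waitUntil?).
  const crawler = new Crawler(browser, 'https://example.com/', undefined, 'domcontentloaded');
  // Options as read by crawl(): maxDepth/maxUrls default to -1 (unlimited),
  // maxParallelCrawls defaults to 5, timeout is in seconds.
  await crawler.crawl({ maxDepth: 2, maxUrls: 50, maxParallelCrawls: 5, timeout: 60, logging: true });
  console.log(crawler.getResults());
  await browser.close();
})();

package.json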
{
  "name": "@qualweb/crawler",
- "version": "0.3.20",
+ "version": "0.4.0",
  "description": "Webpage crawler for qualweb",
@@ -9,2 +9,3 @@ "main": "dist/index.js",
  ],
+ "types": "dist/index.d.ts",
  "scripts": {
@@ -14,7 +15,6 @@ "test": "mocha",
  "tsc": "tsc",
- "lint": "eslint src --ext .ts",
- "lint:fix": "eslint src --ext .ts --fix",
+ "lint": "eslint .",
  "format": "prettier src/**/*.ts --write",
- "prebuild": "rimraf dist",
- "build": "npm run prebuild && tsc --build tsconfig.prod.json",
+ "build": "tsc --build tsconfig.prod.json",
  "prepublishOnly": "npm run build"
@@ -40,3 +40,3 @@ },
  "engines": {
- "node": ">=12.0.0"
+ "node": ">=18.0.0"
  },
@@ -48,14 +48,13 @@ "dependencies": {
  "@koa/router": "^12.0.1",
  "@qualweb/types": "0.7.27",
  "@tsconfig/recommended": "^1.0.3",
  "@types/chai": "^4.3.16",
  "@types/koa": "^2.14.0",
  "@types/koa__router": "^12.0.4",
  "@types/mocha": "^10.0.6",
- "@types/node": "^16.11.2",
- "chai": "^4.4.1",
- "eslint": "^8.56.0",
+ "@types/node": "^20.12.12",
+ "chai": "4.5.0",
  "koa": "^2.15.0",
  "mocha": "^10.2.0",
  "prettier": "^3.1.1",
- "puppeteer": "^21.6.1",
+ "puppeteer": "^22.10.0",
  "puppeteer-extra": "^3.2.3",
@@ -65,4 +64,4 @@ "puppeteer-extra-plugin-stealth": "^2.9.0",
  "ts-node": "^10.9.2",
- "typescript": "^4.4.4"
+ "typescript": "^5.6.3"
  }
}
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package