@qualweb/crawler
Advanced tools
Comparing version 0.3.9 to 0.3.10
@@ -9,17 +9,24 @@ "use strict"; | ||
class Crawler { | ||
constructor(browser, domain, viewport) { | ||
constructor(browser, startingUrl, viewport, waitUntil) { | ||
this.browser = browser; | ||
this.domain = this.verifyDomain(domain); | ||
this.startingUrl = this.verifyStartingUrl(startingUrl); | ||
this.isDomain = this.isStaringUrlADomain(startingUrl); | ||
this.viewport = viewport; | ||
this.waitUntil = waitUntil !== null && waitUntil !== void 0 ? waitUntil : 'domcontentloaded'; | ||
this.urls = new Array(); | ||
} | ||
verifyDomain(domain) { | ||
domain = decodeURIComponent(domain); | ||
if (domain.endsWith('/')) { | ||
return domain.substring(0, domain.length - 1); | ||
verifyStartingUrl(startingUrl) { | ||
const url = new URL(decodeURIComponent(startingUrl)); | ||
const newStartingUrl = url.origin + url.pathname; | ||
if (!newStartingUrl.endsWith('/')) { | ||
return newStartingUrl + '/'; | ||
} | ||
else { | ||
return domain; | ||
return newStartingUrl; | ||
} | ||
} | ||
isStaringUrlADomain(startingUrl) { | ||
const url = new URL(startingUrl); | ||
return url.pathname === '/'; | ||
} | ||
async crawl(options) { | ||
@@ -52,7 +59,10 @@ var _a, _b, _c, _d, _e; | ||
const urlsCrawled = {}; | ||
urlsCrawled[this.domain] = true; | ||
const firstPageUrls = await this.fetchPageLinks(this.domain); | ||
urlsCrawled[this.startingUrl] = true; | ||
const [firstPageUrls, relativePathsToTest] = await this.fetchPageLinks(this.startingUrl); | ||
urlsByDepth[currentDepth] = [...firstPageUrls]; | ||
const newUrls = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePathsToTest)); | ||
urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...newUrls]; | ||
this.addUrlsToCrawl(urlsCrawled, firstPageUrls); | ||
currentUrlCount += firstPageUrls.length; | ||
this.addUrlsToCrawl(urlsCrawled, newUrls); | ||
currentUrlCount += firstPageUrls.length + newUrls.length; | ||
if (options === null || options === void 0 ? void 0 : options.logging) { | ||
@@ -92,5 +102,8 @@ this.log(currentDepth, currentUrlCount, timer); | ||
urlsByDepth[currentDepth] = new Array(); | ||
for (const urls of listUrls !== null && listUrls !== void 0 ? listUrls : []) { | ||
for (const [urls, relativePaths] of listUrls !== null && listUrls !== void 0 ? listUrls : []) { | ||
urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...urls]; | ||
const newUrls = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePaths)); | ||
urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...newUrls]; | ||
this.addUrlsToCrawl(urlsCrawled, urls); | ||
this.addUrlsToCrawl(urlsCrawled, newUrls); | ||
currentUrlCount = Object.keys(urlsCrawled).length; | ||
@@ -128,3 +141,3 @@ if (options === null || options === void 0 ? void 0 : options.logging) { | ||
log(currentDepth, currentUrlCount, timer) { | ||
log_update_1.default(`Domain: ${this.domain} Current depth: ${currentDepth} Urls found: ${currentUrlCount} Time passed: ${timer} seconds`); | ||
(0, log_update_1.default)(`Starting url: ${this.startingUrl} Current depth: ${currentDepth} Urls found: ${currentUrlCount} Time passed: ${timer} seconds`); | ||
} | ||
@@ -140,2 +153,3 @@ addUrlsToCrawl(urlsCrawled, urls) { | ||
let urls = new Array(); | ||
let relativePathsToTest = new Array(); | ||
try { | ||
@@ -147,14 +161,59 @@ const page = await this.browser.newPage(); | ||
await page.goto(url, { | ||
waitUntil: 'domcontentloaded' | ||
waitUntil: this.waitUntil | ||
}); | ||
urls = await page.evaluate((domain) => { | ||
[urls, relativePathsToTest] = await page.evaluate((startingUrl, isDomain) => { | ||
/**
 * Returns the "directory" portion of a URL, always terminated by '/'.
 * It is used as the base against which relative hrefs are resolved.
 *
 * The query string and fragment are removed first: they may themselves
 * contain '/' characters (e.g. "?next=/a/b" or "#/route"), which would
 * otherwise be mistaken for path separators and yield a bogus base.
 *
 * @param {string} url - Absolute URL, typically window.location.href.
 * @returns {string} The URL up to and including the last '/' of its path.
 */
function getUrlWithoutExtension(url) {
    const cut = url.search(/[?#]/);
    const base = cut === -1 ? url : url.substring(0, cut);
    if (base.endsWith('/')) {
        return base;
    }
    // Drop the last path segment (the document name) and keep its directory.
    const parts = base.split('/');
    parts.pop();
    return parts.join('/') + '/';
}
const notHtml = 'css|jpg|jpeg|gif|svg|pdf|docx|js|png|ico|xml|mp4|mp3|mkv|wav|rss|json|pptx|txt'.split('|'); | ||
const links = document.querySelectorAll('body a'); | ||
const urls = new Array(); | ||
const relativePathsToTest = new Array(); | ||
links.forEach((link) => { | ||
var _a; | ||
console.log(link); | ||
if (link.hasAttribute('href')) { | ||
const href = (_a = link.getAttribute('href')) === null || _a === void 0 ? void 0 : _a.trim(); | ||
if (href && | ||
(href.startsWith(domain) || | ||
!isDomain && | ||
!href.startsWith('http') && | ||
!href.startsWith('#') && | ||
!href.includes('javascript:') && | ||
!href.includes('tel:') && | ||
!href.includes('mailto:')) { | ||
let valid = true; | ||
for (const not of notHtml || []) { | ||
if (href.endsWith(not)) { | ||
valid = false; | ||
break; | ||
} | ||
const parts = href.split('/'); | ||
if (parts.length > 0) { | ||
const lastPart = parts[parts.length - 1]; | ||
if (lastPart.startsWith('#')) { | ||
valid = false; | ||
break; | ||
} | ||
} | ||
} | ||
if (valid) { | ||
if (href.startsWith('/')) { | ||
const url = new URL(window.location.href); | ||
relativePathsToTest.push(url.origin + href); | ||
} | ||
else { | ||
relativePathsToTest.push(getUrlWithoutExtension(window.location.href) + href); | ||
} | ||
} | ||
} | ||
if (href && | ||
isDomain && | ||
(href.startsWith(startingUrl) || | ||
href.startsWith('/') || | ||
@@ -184,13 +243,13 @@ href.startsWith('./') || | ||
let correctUrl = ''; | ||
if (href.startsWith(domain)) { | ||
if (href.startsWith(startingUrl)) { | ||
correctUrl = href; | ||
} | ||
else if (href.startsWith('./')) { | ||
correctUrl = domain + href.slice(1); | ||
correctUrl = startingUrl + href.slice(2); | ||
} | ||
else if (!href.startsWith('/')) { | ||
correctUrl = domain + '/' + href; | ||
else if (href.startsWith('/')) { | ||
correctUrl = startingUrl + href.slice(1); | ||
} | ||
else { | ||
correctUrl = domain + href; | ||
correctUrl = startingUrl + href; | ||
} | ||
@@ -209,4 +268,4 @@ const parsedUrl = new URL(correctUrl); | ||
}); | ||
return urls; | ||
}, this.domain); | ||
return [urls, relativePathsToTest]; | ||
}, this.startingUrl, this.isDomain); | ||
await page.close(); | ||
@@ -216,14 +275,60 @@ } | ||
} | ||
return this.normalizeAndSort(urls); | ||
console.log(urls); | ||
return [this.normalizeAndSort(urls), relativePathsToTest]; | ||
} | ||
async checkRelativePathsUrls(urls) { | ||
const newUrlsToValidate = new Array(); | ||
for (const url of urls !== null && urls !== void 0 ? urls : []) { | ||
try { | ||
const page = await this.browser.newPage(); | ||
if (this.viewport) { | ||
await page.setViewport(this.viewport); | ||
} | ||
await page.goto(url, { | ||
waitUntil: this.waitUntil | ||
}); | ||
const newUrl = await page.evaluate((startingUrl) => { | ||
function getUrlWithoutExtension(url) { | ||
if (!url.endsWith('/')) { | ||
const parts = url.split('/'); | ||
parts.pop(); | ||
return parts.join('/') + '/'; | ||
} | ||
else { | ||
return url; | ||
} | ||
} | ||
if (window.location.href.startsWith(getUrlWithoutExtension(startingUrl))) { | ||
return window.location.href; | ||
} | ||
else { | ||
return null; | ||
} | ||
}, this.startingUrl); | ||
if (newUrl !== null) { | ||
newUrlsToValidate.push(newUrl); | ||
} | ||
await page.close(); | ||
} | ||
catch (err) { | ||
console.error(err); | ||
} | ||
} | ||
return newUrlsToValidate; | ||
} | ||
normalizeAndSort(urls) { | ||
const normalizedUrls = urls.map((u) => { | ||
if (u.endsWith('/')) { | ||
u = u.substring(0, u.length - 1); | ||
if (u.includes('#')) { | ||
const parts = u.split('#'); | ||
parts.pop(); | ||
u = parts.join('#'); | ||
} | ||
if (u.startsWith(this.domain)) { | ||
if (!u.endsWith('/')) { | ||
u = u + '/'; | ||
} | ||
if (u.startsWith(this.startingUrl)) { | ||
return u.trim(); | ||
} | ||
else { | ||
return (this.domain + u).trim(); | ||
return (this.startingUrl + u).trim(); | ||
} | ||
@@ -230,0 +335,0 @@ }); |
{ | ||
"name": "@qualweb/crawler", | ||
"version": "0.3.9", | ||
"version": "0.3.10", | ||
"description": "Webpage crawler for qualweb", | ||
@@ -41,22 +41,23 @@ "main": "dist/index.js", | ||
"dependencies": { | ||
"log-update": "^4.0.0" | ||
"log-update": "4.0.0" | ||
}, | ||
"devDependencies": { | ||
"@qualweb/types": "^0.6.9", | ||
"@qualweb/types": "0.7.14", | ||
"@tsconfig/recommended": "^1.0.1", | ||
"@types/node": "^16.3.3", | ||
"@typescript-eslint/eslint-plugin": "^4.28.3", | ||
"@typescript-eslint/parser": "^4.28.3", | ||
"@types/node": "^16.11.2", | ||
"@typescript-eslint/eslint-plugin": "^5.1.0", | ||
"@typescript-eslint/parser": "^5.1.0", | ||
"chai": "^4.3.4", | ||
"eslint": "^7.31.0", | ||
"eslint": "^8.0.1", | ||
"eslint-config-prettier": "^8.3.0", | ||
"eslint-plugin-prettier": "^3.4.0", | ||
"eslint-plugin-prettier": "^4.0.0", | ||
"eslint-plugin-sonarjs": "^0.9.1", | ||
"esm": "^3.2.25", | ||
"mocha": "^9.0.2", | ||
"prettier": "^2.3.2", | ||
"puppeteer": "^10.1.0", | ||
"mocha": "^9.1.3", | ||
"prettier": "^2.4.1", | ||
"puppeteer": "^10.4.0", | ||
"puppeteer-extra": "^3.2.3", | ||
"puppeteer-extra-plugin-stealth": "^2.9.0", | ||
"rimraf": "^3.0.2", | ||
"typescript": "^4.3.5" | ||
"typescript": "^4.4.4" | ||
} | ||
} |
Sorry, the diff of this file is not supported yet
29967
334
17
Updated log-update@4.0.0