Socket
Socket
Sign inDemoInstall

@qualweb/crawler

Package Overview
Dependencies
19
Maintainers
4
Versions
26
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.3.9 to 0.3.10

159

dist/index.js

@@ -9,17 +9,24 @@ "use strict";

class Crawler {
constructor(browser, domain, viewport) {
constructor(browser, startingUrl, viewport, waitUntil) {
this.browser = browser;
this.domain = this.verifyDomain(domain);
this.startingUrl = this.verifyStartingUrl(startingUrl);
this.isDomain = this.isStaringUrlADomain(startingUrl);
this.viewport = viewport;
this.waitUntil = waitUntil !== null && waitUntil !== void 0 ? waitUntil : 'domcontentloaded';
this.urls = new Array();
}
verifyDomain(domain) {
domain = decodeURIComponent(domain);
if (domain.endsWith('/')) {
return domain.substring(0, domain.length - 1);
verifyStartingUrl(startingUrl) {
const url = new URL(decodeURIComponent(startingUrl));
const newStartingUrl = url.origin + url.pathname;
if (!newStartingUrl.endsWith('/')) {
return newStartingUrl + '/';
}
else {
return domain;
return newStartingUrl;
}
}
isStaringUrlADomain(startingUrl) {
const url = new URL(startingUrl);
return url.pathname === '/';
}
async crawl(options) {

@@ -52,7 +59,10 @@ var _a, _b, _c, _d, _e;

const urlsCrawled = {};
urlsCrawled[this.domain] = true;
const firstPageUrls = await this.fetchPageLinks(this.domain);
urlsCrawled[this.startingUrl] = true;
const [firstPageUrls, relativePathsToTest] = await this.fetchPageLinks(this.startingUrl);
urlsByDepth[currentDepth] = [...firstPageUrls];
const newUrls = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePathsToTest));
urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...newUrls];
this.addUrlsToCrawl(urlsCrawled, firstPageUrls);
currentUrlCount += firstPageUrls.length;
this.addUrlsToCrawl(urlsCrawled, newUrls);
currentUrlCount += firstPageUrls.length + newUrls.length;
if (options === null || options === void 0 ? void 0 : options.logging) {

@@ -92,5 +102,8 @@ this.log(currentDepth, currentUrlCount, timer);

urlsByDepth[currentDepth] = new Array();
for (const urls of listUrls !== null && listUrls !== void 0 ? listUrls : []) {
for (const [urls, relativePaths] of listUrls !== null && listUrls !== void 0 ? listUrls : []) {
urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...urls];
const newUrls = this.normalizeAndSort(await this.checkRelativePathsUrls(relativePaths));
urlsByDepth[currentDepth] = [...urlsByDepth[currentDepth], ...newUrls];
this.addUrlsToCrawl(urlsCrawled, urls);
this.addUrlsToCrawl(urlsCrawled, newUrls);
currentUrlCount = Object.keys(urlsCrawled).length;

@@ -128,3 +141,3 @@ if (options === null || options === void 0 ? void 0 : options.logging) {

log(currentDepth, currentUrlCount, timer) {
log_update_1.default(`Domain: ${this.domain} Current depth: ${currentDepth} Urls found: ${currentUrlCount} Time passed: ${timer} seconds`);
(0, log_update_1.default)(`Starting url: ${this.startingUrl} Current depth: ${currentDepth} Urls found: ${currentUrlCount} Time passed: ${timer} seconds`);
}

@@ -140,2 +153,3 @@ addUrlsToCrawl(urlsCrawled, urls) {

let urls = new Array();
let relativePathsToTest = new Array();
try {

@@ -147,14 +161,59 @@ const page = await this.browser.newPage();

await page.goto(url, {
waitUntil: 'domcontentloaded'
waitUntil: this.waitUntil
});
urls = await page.evaluate((domain) => {
[urls, relativePathsToTest] = await page.evaluate((startingUrl, isDomain) => {
function getUrlWithoutExtension(url) {
if (!url.endsWith('/')) {
const parts = url.split('/');
parts.pop();
return parts.join('/') + '/';
}
else {
return url;
}
}
const notHtml = 'css|jpg|jpeg|gif|svg|pdf|docx|js|png|ico|xml|mp4|mp3|mkv|wav|rss|json|pptx|txt'.split('|');
const links = document.querySelectorAll('body a');
const urls = new Array();
const relativePathsToTest = new Array();
links.forEach((link) => {
var _a;
console.log(link);
if (link.hasAttribute('href')) {
const href = (_a = link.getAttribute('href')) === null || _a === void 0 ? void 0 : _a.trim();
if (href &&
(href.startsWith(domain) ||
!isDomain &&
!href.startsWith('http') &&
!href.startsWith('#') &&
!href.includes('javascript:') &&
!href.includes('tel:') &&
!href.includes('mailto:')) {
let valid = true;
for (const not of notHtml || []) {
if (href.endsWith(not)) {
valid = false;
break;
}
const parts = href.split('/');
if (parts.length > 0) {
const lastPart = parts[parts.length - 1];
if (lastPart.startsWith('#')) {
valid = false;
break;
}
}
}
if (valid) {
if (href.startsWith('/')) {
const url = new URL(window.location.href);
relativePathsToTest.push(url.origin + href);
}
else {
relativePathsToTest.push(getUrlWithoutExtension(window.location.href) + href);
}
}
}
if (href &&
isDomain &&
(href.startsWith(startingUrl) ||
href.startsWith('/') ||

@@ -184,13 +243,13 @@ href.startsWith('./') ||

let correctUrl = '';
if (href.startsWith(domain)) {
if (href.startsWith(startingUrl)) {
correctUrl = href;
}
else if (href.startsWith('./')) {
correctUrl = domain + href.slice(1);
correctUrl = startingUrl + href.slice(2);
}
else if (!href.startsWith('/')) {
correctUrl = domain + '/' + href;
else if (href.startsWith('/')) {
correctUrl = startingUrl + href.slice(1);
}
else {
correctUrl = domain + href;
correctUrl = startingUrl + href;
}

@@ -209,4 +268,4 @@ const parsedUrl = new URL(correctUrl);

});
return urls;
}, this.domain);
return [urls, relativePathsToTest];
}, this.startingUrl, this.isDomain);
await page.close();

@@ -216,14 +275,60 @@ }

}
return this.normalizeAndSort(urls);
console.log(urls);
return [this.normalizeAndSort(urls), relativePathsToTest];
}
async checkRelativePathsUrls(urls) {
const newUrlsToValidate = new Array();
for (const url of urls !== null && urls !== void 0 ? urls : []) {
try {
const page = await this.browser.newPage();
if (this.viewport) {
await page.setViewport(this.viewport);
}
await page.goto(url, {
waitUntil: this.waitUntil
});
const newUrl = await page.evaluate((startingUrl) => {
function getUrlWithoutExtension(url) {
if (!url.endsWith('/')) {
const parts = url.split('/');
parts.pop();
return parts.join('/') + '/';
}
else {
return url;
}
}
if (window.location.href.startsWith(getUrlWithoutExtension(startingUrl))) {
return window.location.href;
}
else {
return null;
}
}, this.startingUrl);
if (newUrl !== null) {
newUrlsToValidate.push(newUrl);
}
await page.close();
}
catch (err) {
console.error(err);
}
}
return newUrlsToValidate;
}
normalizeAndSort(urls) {
const normalizedUrls = urls.map((u) => {
if (u.endsWith('/')) {
u = u.substring(0, u.length - 1);
if (u.includes('#')) {
const parts = u.split('#');
parts.pop();
u = parts.join('#');
}
if (u.startsWith(this.domain)) {
if (!u.endsWith('/')) {
u = u + '/';
}
if (u.startsWith(this.startingUrl)) {
return u.trim();
}
else {
return (this.domain + u).trim();
return (this.startingUrl + u).trim();
}

@@ -230,0 +335,0 @@ });

27

package.json
{
"name": "@qualweb/crawler",
"version": "0.3.9",
"version": "0.3.10",
"description": "Webpage crawler for qualweb",

@@ -41,22 +41,23 @@ "main": "dist/index.js",

"dependencies": {
"log-update": "^4.0.0"
"log-update": "4.0.0"
},
"devDependencies": {
"@qualweb/types": "^0.6.9",
"@qualweb/types": "0.7.14",
"@tsconfig/recommended": "^1.0.1",
"@types/node": "^16.3.3",
"@typescript-eslint/eslint-plugin": "^4.28.3",
"@typescript-eslint/parser": "^4.28.3",
"@types/node": "^16.11.2",
"@typescript-eslint/eslint-plugin": "^5.1.0",
"@typescript-eslint/parser": "^5.1.0",
"chai": "^4.3.4",
"eslint": "^7.31.0",
"eslint": "^8.0.1",
"eslint-config-prettier": "^8.3.0",
"eslint-plugin-prettier": "^3.4.0",
"eslint-plugin-prettier": "^4.0.0",
"eslint-plugin-sonarjs": "^0.9.1",
"esm": "^3.2.25",
"mocha": "^9.0.2",
"prettier": "^2.3.2",
"puppeteer": "^10.1.0",
"mocha": "^9.1.3",
"prettier": "^2.4.1",
"puppeteer": "^10.4.0",
"puppeteer-extra": "^3.2.3",
"puppeteer-extra-plugin-stealth": "^2.9.0",
"rimraf": "^3.0.2",
"typescript": "^4.3.5"
"typescript": "^4.4.4"
}
}

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc