Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in · Demo · Install
Socket

scrape-emails

Package Overview
Dependencies
Maintainers
1
Versions
3
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

scrape-emails - npm Package Compare versions

Comparing version 1.0.1 to 1.0.2

2

package.json
{
"name": "scrape-emails",
"version": "1.0.1",
"version": "1.0.2",
"description": "Scrape emails from whole rendered website with puppeteer",

@@ -5,0 +5,0 @@ "main": "src/cli.js",

@@ -13,2 +13,8 @@ #!/usr/bin/env node

.default("w", 2500)
.alias("n", "navigation-timeout")
.describe("n", "Navigation timeout (milliseconds)")
.default("n", 30000)
.alias("l", "levels")
.describe("l", "Path levels to follow. Set 0 for all levels.")
.default("l", 0)
.boolean("json")

@@ -31,4 +37,6 @@ .describe("json", "Return array in JSON format")

const scraper = new Scraper({
levels: argv.levels,
concurrency: argv.concurrency,
waitForPageLoad: argv.wait
waitForPageLoad: argv.wait,
navigationTimeout: argv.navigationTimeout
});

@@ -35,0 +43,0 @@ const emails = await scraper.scrape(argv._[0]);

@@ -9,4 +9,6 @@ // Dependencies

const OPTIONS = {
levels: 0,
concurrency: 2,
waitForPageLoad: 2500,
waitForPageLoad: 500,
navigationTimeout: 30000,
puppeteer: {}

@@ -27,3 +29,3 @@ };

const emails = [].concat.apply([], data).filter((v, i, a) => {
return a.indexOf(v) === i;
return a.indexOf(v) === i && !!v;
});

@@ -42,34 +44,79 @@

async _fetchUrl(link, callback) {
const page = await this._browser.newPage();
await page.goto(link);
await page.waitFor(this._options.waitForPageLoad);
_shouldAbortRequest(request) {
return (
["stylesheet", "image", "media", "font", "websocket"].indexOf(
request.resourceType()
) >= 0
);
}
const data = await page.evaluate(() => {
return {
origin: window.location.origin,
html: document.documentElement.outerHTML,
mailto: [].slice
.call(document.querySelectorAll('a[href^="mailto:"]'))
.map(element => {
return element.pathname;
}),
links: Array.from(document.getElementsByTagName("a"))
.filter(element => {
return (
element.hostname === window.location.hostname &&
(element.protocol === "http:" || element.protocol === "https:") &&
element.pathname
);
})
.map(element => {
return element.pathname;
})
.filter((v, i, a) => {
return a.indexOf(v) === i;
})
};
async _addPageInterception(page) {
await page.setRequestInterception(true);
page.on("request", request => {
this._shouldAbortRequest(request) ? request.abort() : request.continue();
});
}
async _fetchUrl(link, callback) {
let page, data;
try {
page = await this._browser.newPage();
await this._addPageInterception(page);
await page.goto(link, {
waitUntil: ["load", "domcontentloaded"],
timeout: this._options.navigationTimeout
});
await page.waitFor(this._options.waitForPageLoad);
data = await page.evaluate(
regex => {
regex = new RegExp(regex.source, regex.flags);
return {
origin: window.location.origin,
emails: [].slice
.call(document.querySelectorAll('a[href^="mailto:"]'))
.map(element => {
return element.pathname;
})
.concat(document.documentElement.outerHTML.match(regex) || []),
links: Array.from(document.getElementsByTagName("a"))
.filter(element => {
return (
element.hostname === window.location.hostname &&
(element.protocol === "http:" ||
element.protocol === "https:") &&
element.pathname
);
})
.map(element => {
return element.pathname;
})
.filter((v, i, a) => {
return a.indexOf(v) === i;
})
};
},
{
source: EMAIL_REGEX.source,
flags: EMAIL_REGEX.flags
}
);
} catch (_) {
callback(null);
return;
} finally {
try {
await page.close();
} catch (_) {}
}
data.links.forEach(link => {
if (
this._options.levels > 0 &&
link.split("/").filter(path => !!path).length > this._options.levels
) {
return;
}
if (!this._links.has(link)) {

@@ -83,6 +130,3 @@ this._links.add(link);

const emails = [...data.mailto, ...(data.html.match(EMAIL_REGEX) || [])];
await page.close();
callback(emails);
callback(data.emails);
}

@@ -89,0 +133,0 @@

Socket · Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc