scrape-emails
Advanced tools
Comparing version 1.0.1 to 1.0.2
{ | ||
"name": "scrape-emails", | ||
"version": "1.0.1", | ||
"version": "1.0.2", | ||
"description": "Scrape emails from whole rendered website with puppeteer", | ||
@@ -5,0 +5,0 @@ "main": "src/cli.js", |
@@ -13,2 +13,8 @@ #!/usr/bin/env node | ||
.default("w", 2500) | ||
.alias("n", "navigation-timeout") | ||
.describe("n", "Navigation timeout (milliseconds)") | ||
.default("n", 30000) | ||
.alias("l", "levels") | ||
.describe("l", "Path levels to follow. Set 0 for all levels.") | ||
.default("l", 0) | ||
.boolean("json") | ||
@@ -31,4 +37,6 @@ .describe("json", "Return array in JSON format") | ||
const scraper = new Scraper({ | ||
levels: argv.levels, | ||
concurrency: argv.concurrency, | ||
waitForPageLoad: argv.wait | ||
waitForPageLoad: argv.wait, | ||
navigationTimeout: argv.navigationTimeout | ||
}); | ||
@@ -35,0 +43,0 @@ const emails = await scraper.scrape(argv._[0]); |
@@ -9,4 +9,6 @@ // Dependencies | ||
const OPTIONS = { | ||
levels: 0, | ||
concurrency: 2, | ||
waitForPageLoad: 2500, | ||
waitForPageLoad: 500, | ||
navigationTimeout: 30000, | ||
puppeteer: {} | ||
@@ -27,3 +29,3 @@ }; | ||
const emails = [].concat.apply([], data).filter((v, i, a) => { | ||
return a.indexOf(v) === i; | ||
return a.indexOf(v) === i && !!v; | ||
}); | ||
@@ -42,34 +44,79 @@ | ||
async _fetchUrl(link, callback) { | ||
const page = await this._browser.newPage(); | ||
await page.goto(link); | ||
await page.waitFor(this._options.waitForPageLoad); | ||
_shouldAbortRequest(request) { | ||
return ( | ||
["stylesheet", "image", "media", "font", "websocket"].indexOf( | ||
request.resourceType() | ||
) >= 0 | ||
); | ||
} | ||
const data = await page.evaluate(() => { | ||
return { | ||
origin: window.location.origin, | ||
html: document.documentElement.outerHTML, | ||
mailto: [].slice | ||
.call(document.querySelectorAll('a[href^="mailto:"]')) | ||
.map(element => { | ||
return element.pathname; | ||
}), | ||
links: Array.from(document.getElementsByTagName("a")) | ||
.filter(element => { | ||
return ( | ||
element.hostname === window.location.hostname && | ||
(element.protocol === "http:" || element.protocol === "https:") && | ||
element.pathname | ||
); | ||
}) | ||
.map(element => { | ||
return element.pathname; | ||
}) | ||
.filter((v, i, a) => { | ||
return a.indexOf(v) === i; | ||
}) | ||
}; | ||
async _addPageInterception(page) { | ||
await page.setRequestInterception(true); | ||
page.on("request", request => { | ||
this._shouldAbortRequest(request) ? request.abort() : request.continue(); | ||
}); | ||
} | ||
async _fetchUrl(link, callback) { | ||
let page, data; | ||
try { | ||
page = await this._browser.newPage(); | ||
await this._addPageInterception(page); | ||
await page.goto(link, { | ||
waitUntil: ["load", "domcontentloaded"], | ||
timeout: this._options.navigationTimeout | ||
}); | ||
await page.waitFor(this._options.waitForPageLoad); | ||
data = await page.evaluate( | ||
regex => { | ||
regex = new RegExp(regex.source, regex.flags); | ||
return { | ||
origin: window.location.origin, | ||
emails: [].slice | ||
.call(document.querySelectorAll('a[href^="mailto:"]')) | ||
.map(element => { | ||
return element.pathname; | ||
}) | ||
.concat(document.documentElement.outerHTML.match(regex) || []), | ||
links: Array.from(document.getElementsByTagName("a")) | ||
.filter(element => { | ||
return ( | ||
element.hostname === window.location.hostname && | ||
(element.protocol === "http:" || | ||
element.protocol === "https:") && | ||
element.pathname | ||
); | ||
}) | ||
.map(element => { | ||
return element.pathname; | ||
}) | ||
.filter((v, i, a) => { | ||
return a.indexOf(v) === i; | ||
}) | ||
}; | ||
}, | ||
{ | ||
source: EMAIL_REGEX.source, | ||
flags: EMAIL_REGEX.flags | ||
} | ||
); | ||
} catch (_) { | ||
callback(null); | ||
return; | ||
} finally { | ||
try { | ||
await page.close(); | ||
} catch (_) {} | ||
} | ||
data.links.forEach(link => { | ||
if ( | ||
this._options.levels > 0 && | ||
link.split("/").filter(path => !!path).length > this._options.levels | ||
) { | ||
return; | ||
} | ||
if (!this._links.has(link)) { | ||
@@ -83,6 +130,3 @@ this._links.add(link); | ||
const emails = [...data.mailto, ...(data.html.match(EMAIL_REGEX) || [])]; | ||
await page.close(); | ||
callback(emails); | ||
callback(data.emails); | ||
} | ||
@@ -89,0 +133,0 @@ |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
7840
181