node-crawler
Crawls web URLs from a list.
Example
const options = new CrawlerOptions({
  name: 'node-crawler-agent',
  concurrency: 1,
  readRobotsTxt: true,
  dataPath: 'data/crawler',
});
const crawler = new Crawler(options);
const links = [{ url: "https://www.google.com" }];

/**
 * Feeds every link into the crawler queue, throttling enqueueing so the
 * queue never grows far beyond the configured concurrency, then waits for
 * the queue to drain.
 */
async function init() {
  for (const link of links) {
    // Deliberately not awaited: we want crawls to run while we keep
    // enqueueing. A .catch is attached so a failed crawl logs instead of
    // surfacing as an unhandled rejection.
    crawler.add(link.url).then((result) => {
      if (result) {
        console.info('Crawled', link.url);
      }
    }).catch((err) => {
      console.error('Failed to crawl', link.url, err);
    });
    // To avoid saturating the CPU immediately on startup we don't fill the queue up all the way.
    await crawler.queue.onSizeLessThan(options.concurrency * 2);
  }
  // Block until every queued crawl has finished.
  await crawler.queue.onEmpty();
}

init()
  .then(async () => {
    console.info('Crawling complete');
    await crawler.close();
    process.exit();
  })
  .catch(async (err) => {
    // Surface startup/crawl-loop failures and exit non-zero so callers
    // (CI, shell scripts) can detect the failure.
    console.error('Crawling failed', err);
    await crawler.close();
    process.exit(1);
  });
TODO