😎 nest-crawler 😎
Crawler and Scraper Module for NestJS
Installation
$ npm install --save nest-crawler
Usage
First, register it in the application module so that Nest can handle dependencies:
import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';
/**
 * Root application module. Listing `NestCrawlerModule` under `imports`
 * registers it with Nest's dependency injection.
 */
@Module({ imports: [NestCrawlerModule] })
export class AppModule {}
Then, just import it and use it:
crawler.module.ts
import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';
/**
 * Feature module that pulls in `NestCrawlerModule` through its `imports`.
 */
@Module({ imports: [NestCrawlerModule] })
export class CrawlerModule {}
crawler.service.ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
/**
 * Example service that demonstrates a single-page scrape and a multi-page
 * crawl through the injected `NestCrawlerService`.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Fetches `http://example.com` and logs the scraped `title`, `info` and
   * `content` fields.
   */
  public async scrape(): Promise<void> {
    interface ExampleCom {
      title: string;
      info: string;
      content: string;
    }

    const result: ExampleCom = await this.crawler.fetch({
      target: 'http://example.com',
      fetch: {
        title: 'h1',
        // `info` asks for the `href` attribute of the matched element.
        info: { selector: 'p > a', attr: 'href' },
        // `content` is requested with `how: 'html'`.
        content: { selector: '.content', how: 'html' },
      },
    });

    console.log(result);
  }

  /**
   * Starts from the Hacker News front page, turns every `span.age > a` match
   * into a URL via `convert`, and logs the title scraped from each target.
   */
  public async crawl(): Promise<void> {
    interface HackerNewsPage {
      title: string;
    }

    const crawled: HackerNewsPage[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (value: string) => `https://news.ycombinator.com/${value}`,
        },
      },
      // The callback form receives `data`, `index` and `url`; none of them
      // is needed here.
      fetch: (data: any, index: number, url: string) => {
        return { title: '.title > a' };
      },
    });

    console.log(crawled);
  }
}
Recipe
Single Page Scraping
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
/**
 * Recipe: scrape a single page with a static `fetch` description.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Fetches `http://example.com` and logs the scraped `title`, `info` and
   * `content` fields.
   */
  public async scrape(): Promise<void> {
    interface ExampleCom {
      title: string;
      info: string;
      content: string;
    }

    const result: ExampleCom = await this.crawler.fetch({
      target: 'http://example.com',
      fetch: {
        title: 'h1',
        // `info` asks for the `href` attribute of the matched element.
        info: { selector: 'p > a', attr: 'href' },
        // `content` is requested with `how: 'html'`.
        content: { selector: '.content', how: 'html' },
      },
    });

    console.log(result);
  }
}
Multi Pages Crawling
You Already Know the Target URLs
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
/**
 * Recipe: crawl a fixed list of target URLs.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Passes three explicit URLs as `target` and logs the `title` scraped
   * from each one.
   */
  public async crawl(): Promise<void> {
    interface Site {
      title: string;
    }

    const scrapedSites: Site[] = await this.crawler.fetch({
      target: [
        'https://example1.com',
        'https://example2.com',
        'https://example3.com',
      ],
      // The callback form receives `data`, `index` and `url`; none of them
      // is needed here.
      fetch: (data: any, index: number, url: string) => {
        return { title: 'h1' };
      },
    });

    console.log(scrapedSites);
  }
}
You Don't Know the Target URLs and Want to Crawl Dynamically
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
/**
 * Recipe: discover the target URLs from an entry page, then crawl them.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Starts from the Hacker News front page, turns every `span.age > a` match
   * into a URL via `convert`, and logs the title scraped from each target.
   */
  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const crawled: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (value: string) => `https://news.ycombinator.com/${value}`,
        },
      },
      // The callback form receives `data`, `index` and `url`; none of them
      // is needed here.
      fetch: (data: any, index: number, url: string) => {
        return { title: '.title > a' };
      },
    });

    console.log(crawled);
  }
}
You Need to Pass Data Dynamically
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
/**
 * Recipe: scrape data from the entry page first (`target.fetch`) and feed it
 * into the per-page `fetch` callback.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Collects image ids from `https://some.image.com`, then builds one image
   * URL per crawled page from those ids and logs the resulting list.
   */
  public async crawl(): Promise<void> {
    interface Img {
      src: string;
    }

    const images: Img[] = await this.crawler.fetch({
      target: {
        url: 'https://some.image.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://some.image.com${x}`,
        },
        // Scraped from the entry page; the result is handed to the `fetch`
        // callback below as `data`.
        fetch: {
          imageIds: {
            listItem: 'div.image',
            data: {
              id: {
                selector: 'div.image-wrapper',
                attr: 'data-image-id',
              },
            },
          },
        },
      },
      fetch: (data: any, index: number, url: string) => ({
        src: {
          // With `listItem` + `data`, each `imageIds` entry is an object
          // shaped like `{ id }`, so read `.id`; interpolating the entry
          // itself would render "[object Object]" in the URL.
          convert: () => `https://some.image.com/images/${data.imageIds[index].id}.png`,
        },
      }),
    });

    console.log(images);
  }
}
Waitable (by using `puppeteer`)
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
/**
 * Recipe: the dynamic Hacker News crawl with a `waitFor` option added.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Starts from the Hacker News front page, turns every `span.age > a` match
   * into a URL via `convert`, and logs the title scraped from each target.
   */
  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const crawled: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (value: string) => `https://news.ycombinator.com/${value}`,
        },
      },
      // Evaluates to 3000; presumably milliseconds (confirm against the
      // library documentation).
      waitFor: 3 * 1000,
      // The callback form receives `data`, `index` and `url`; none of them
      // is needed here.
      fetch: (data: any, index: number, url: string) => {
        return { title: '.title > a' };
      },
    });

    console.log(crawled);
  }
}
Related