Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

nest-crawler

Package Overview
Dependencies
Maintainers
1
Versions
19
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

nest-crawler

The easiest crawling and scraping module for NestJS

  • 1.9.0
  • latest
  • Source
  • npm
  • Socket score

Version published
Weekly downloads
36
decreased by 10%
Maintainers
1
Weekly downloads
 
Created
Source

Nest Logo

😎 nest-crawler 😎

Crawler and Scraper Module for NestJS

Package License (MIT)

Installation

$ npm install --save nest-crawler

Usage

First, register it in the application module so that Nest can handle dependencies:

import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';

/**
 * Application module that registers `NestCrawlerModule` so that Nest can
 * handle its dependencies.
 */
@Module({ imports: [NestCrawlerModule] })
export class AppModule {}

Then, just import it and use it:

crawler.module.ts

import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';
/**
 * Feature module that imports `NestCrawlerModule` for use by the crawler
 * service shown in `crawler.service.ts`.
 */
@Module({ imports: [NestCrawlerModule] })
export class CrawlerModule {}

crawler.service.ts

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

/**
 * Example service showing both single-page scraping and multi-page crawling
 * through the injected `NestCrawlerService`.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Scrapes one specific page (`http://example.com`) and logs the result.
   */
  public async scrape(): Promise<void> {
    // Result shape: one property per key of the `fetch` map below.
    interface ExampleCom {
      title: string;
      info: string;
      content: string;
    }

    const scraped: ExampleCom = await this.crawler.fetch({
      target: 'http://example.com',
      fetch: {
        title: 'h1',
        info: { selector: 'p > a', attr: 'href' },
        content: { selector: '.content', how: 'html' },
      },
    });

    console.log(scraped);
    // {
    //   title: 'Example Domain',
    //   info: 'http://www.iana.org/domains/example',
    //   content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
    // }
  }

  /**
   * Crawls multiple pages starting from `https://news.ycombinator.com` and
   * logs one `{ title }` entry per crawled page.
   */
  public async crawl(): Promise<void> {
    interface HackerNewsPage {
      title: string;
    }

    const posts: HackerNewsPage[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        // `convert` turns each value matched by `selector` into a page URL.
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      fetch: (data: any, index: number, url: string) => {
        return { title: '.title > a' };
      },
    });

    console.log(posts);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}

Recipe

Single Page Scraping

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

/**
 * Recipe: scrape a single page with the injected `NestCrawlerService`.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Fetches `http://example.com`, extracts the heading text, a link's `href`
   * attribute and the `.content` element as HTML, then logs the result.
   */
  public async scrape(): Promise<void> {
    interface ExampleCom {
      title: string;
      info: string;
      content: string;
    }

    const page: ExampleCom = await this.crawler.fetch({
      target: 'http://example.com',
      fetch: {
        title: 'h1',
        info: { selector: 'p > a', attr: 'href' },
        content: { selector: '.content', how: 'html' },
      },
    });

    console.log(page);
    // {
    //   title: 'Example Domain',
    //   info: 'http://www.iana.org/domains/example',
    //   content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
    // }
  }
}

Multi Pages Crawling

You Already Know the Target URLs
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

/**
 * Recipe: crawl a fixed list of target URLs that is known up front.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Fetches every URL in `target`, extracts each page's `h1` as `title`,
   * and logs the collected results.
   */
  public async crawl(): Promise<void> {
    interface Site {
      title: string;
    }

    const results: Site[] = await this.crawler.fetch({
      target: [
        'https://example1.com',
        'https://example2.com',
        'https://example3.com',
      ],
      fetch: (data: any, index: number, url: string) => {
        return { title: 'h1' };
      },
    });

    console.log(results);
    // [
    //   { title: 'An easiest crawling and scraping module for NestJS' },
    //   { title: 'A minimalistic boilerplate on top of Webpack, Babel, TypeScript and React' },
    //   { title: '[Experimental] React SSR as a view template engine' }
    // ]
  }
}
You Don't Know the Target URLs and Want to Crawl Dynamically
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

/**
 * Recipe: discover the target URLs dynamically from an entry page, then
 * scrape each discovered page.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Starts at `https://news.ycombinator.com`, builds the page URLs through
   * the `iterator`, scrapes a `title` from each one and logs the list.
   */
  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const crawled: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      // fetch each `https://news.ycombinator.com/${x}` and scrape data
      fetch: (data: any, index: number, url: string) => {
        return { title: '.title > a' };
      },
    });

    console.log(crawled);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}
You Need to Pass Data Dynamically
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

/**
 * Recipe: scrape data from the entry page and pass it into the per-page
 * `fetch` callback.
 */
@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  /**
   * Crawls `https://some.image.com`: the entry page yields both the page URLs
   * (via `iterator`) and a list of image ids (via `target.fetch`); the ids are
   * then used to build each image's `src`. The result is logged.
   */
  public async crawl(): Promise<void> {
    interface Img {
      src: string;
    }

    const images: Img[] = await this.crawler.fetch({
      target: {
        url: 'https://some.image.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://some.image.com${x}`,
        },
        // data scraped here is what the `fetch` callback below receives as `data`
        fetch: {
          imageIds: {
            listItem: 'div.image',
            data: {
              id: {
                selector: 'div.image-wrapper',
                attr: 'data-image-id',
              },
            },
          },
        },
      },
      // fetch each `https://some.image.com${x}`, pass data and scrape data
      fetch: (data: any, index: number, url: string) => ({
        src: {
          // NOTE(review): assumes each `imageIds` entry is an object shaped like
          // the `data` map above (`{ id }`) — confirm against the scraper's
          // `listItem` semantics. Interpolating the entry itself would render
          // `[object Object]` rather than the id shown in the sample output.
          convert: () => `https://some.image.com/images/${data.imageIds[index].id}.png`,
        },
      }),
    });

    console.log(images);
    // [
    //   { src: 'https://some.image.com/images/1.png' },
    //   { src: 'https://some.image.com/images/2.png' },
    //   ...
    //   ...
    //   { src: 'https://some.image.com/images/100.png' }
    // ]
  }
}
Waitable (by using puppeteer)
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

/**
 * Recipe: crawl pages whose content needs time to load, using the `waitFor`
 * option.
 */
@Injectable()
export class CrawlerService {
  constructor(private readonly crawler: NestCrawlerService) {}

  /**
   * Crawls the pages discovered from `https://news.ycombinator.com`, waiting
   * before scraping each page's `title`, and logs the collected results.
   */
  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const loaded: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      // wait for the content to be loaded (e.g. single page apps)
      waitFor: 3 * 1000,
      fetch: (data: any, index: number, url: string) => {
        return { title: '.title > a' };
      },
    });

    console.log(loaded);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}

Keywords

FAQs

Package last updated on 01 Nov 2019

Did you know?

Socket

Socket for GitHub automatically highlights issues in each pull request and monitors the health of all your open source dependencies. Discover the contents of your packages and block harmful activity before you install or update your dependencies.

Install

Related posts

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc