import { PupCrawler } from 'pup-crawler'
// Usage example: crawl a list page and log the href of every matched link.
// The async function is wrapped in parentheses and invoked immediately so the
// example actually runs; the leading semicolon guards against ASI in this
// semicolon-free code style.
;(async () => {
  const crawler = new PupCrawler()
  // open() launches the browser and waits until it has finished starting.
  await crawler.open()
  try {
    await crawler.crawlPage({
      name: 'list',
      url: 'https://www.example.com/list',
      target: {
        // all: true collects the href of every element matching the selector.
        values: [{ label: 'detailData', attr: 'href', css: '.list-item > a', all: true }],
      },
      // The crawl result is keyed by the labels declared in target.values.
      callback: async (result: any) => {
        const { detailData } = result
        console.log(detailData)
      },
    })
  } finally {
    // Always close the browser, even when crawling throws.
    await crawler.close()
  }
})().catch((err: unknown) => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error(err)
})

复杂用法：详见 example.ts 文件，其中以腾讯动漫为例，爬取列表页、详情页和内容页。

// Target type (illustrative pseudo-code: `...` marks omitted entries, and before/after show type signatures rather than values)
target: {
    actives: [
      { type: 'click', css: 'a.btn-startchat', delay: 1000 },
      { type: 'input', css: '#chat-input', value: 'input text...', delay: 1000 },
      ...
    ],
    values: [
        // 1. Plain value: e.g. read the text content of .item > a. When attr is omitted, textContent is read
        {label: 'val', css: '.item > a'},
        // 2. Attribute value: e.g. read the href attribute of .item > a. attr = getAttribute('xxx')
        {label: 'val2', attr: 'href', css: '.item > a'},
        // 3. Equivalent of document.querySelectorAll('.item > a'). all: true = querySelectorAll, false = querySelector
        {label: 'val3', attr: 'href', css: '.item > a', all: true},
        // 4. Equivalent of document.querySelectorAll('.item > a')[3]. Set all: true, allIdx: 3
        {label: 'val4', attr: 'href', css: '.item > a', all: true, allIdx: 3},
        // 5. Equivalent of document.querySelectorAll('.item > a')[3].querySelector('.sub-item > a'). Set all: true, allIdx: 3
        {label: 'val5', attr: 'href', css: ['.item > a', '.sub-item > a'], all: true, allIdx: 3},
        // 6. Read window.location.href. No css is needed; the attr path must start from the window object
        {label: 'val6',  attr: 'window.location.href'},
        // 7. Read the href of multiple <a> tags and loop over them. Add loopOpt: CrawlOptions; the result of running loopOpt is the object built from the next target.values, and it is assigned to this label
        {label: 'val7', attr: 'href', css: '.list-item > ul > li > a', all: true, loopOpt: NextPageOpt},
        ...
    ],
    // Loop within this same page type, e.g. fetch the playback sources for the episode list of a TV series
    // loopKey: (1) the label from values above whose value is looped over (usually one with all: true; loopOpt: no further nested loop is performed)
    // loopVals: (2) the labels from values above that each loop should return. E.g. the last page does not need many values, only val2 and val4
    recursion: { loopKey: 'playList', loopVals: ['val2', 'val4'] },
    // Pre-hook: crawling continues when it returns true. Commonly used to gate page crawling, e.g. checking a database for whether the current value already exists
    before: () => boolean | Promise<boolean>,
    // Post-hook: if it returns false the crawl result is returned immediately; otherwise it returns the same object it received. Can be used to filter out parameters that are not needed downstream
    after: (obj: object) => false | Promise<obj>,
}

配置项

/**
 * Options describing one page crawl.
 *
 * The hook signatures use `object` for their promised/returned values: a
 * parameter name such as `obj` is not a type and cannot appear in a type position.
 */
export interface CrawlOptions {
  /** Name of this crawl configuration. */
  name?: string
  /** Crawl mode; defaults to 'dynamic'. */
  mode?: 'static' | 'dynamic'
  /** URL of the page to crawl. */
  url?: string
  /** Timeout; defaults to 60s. NOTE(review): unit of the numeric value is not stated here — confirm. */
  timeout?: number
  /** Delay time. */
  delayTime?: number
  /** Page-load options; currently used when configuring dynamic pages. */
  pageOpts?: GoToOptions
  /** The target to crawl. */
  target: Target
  /** Pre-hook. */
  before?: () => boolean | Promise<boolean>
  /** Post-hook: returning false stops the crawl; otherwise the value is passed on for further processing. */
  after?: (obj: object) => false | Promise<object>
  /** Callback. */
  callback?: (obj: object) => object | Promise<object>
  /** Error handler. */
  error?: (err: any) => void
  /** Handler that runs last. */
  finally?: (obj?: object) => void
  /** Self-looping recursion. */
  recursion?: {
    /** Key to loop over. */
    loopKey: string
    /** Labels from target.values whose values each loop collects; they are gathered into one object. */
    loopVals: string[]
  }
}
/** NOTE(review): presumably the options accepted when creating a crawler instance — confirm against the library source. */
export interface IProps {
  /** Web page URL prefix. */
  host?: string
  /** Whether to print log output. */
  showLog?: boolean
}

API

PupCrawler 类：用于创建爬虫实例，并提供一些方法用于控制爬虫的运行。
open 方法：打开浏览器，并等待浏览器启动完成。
close 方法：关闭浏览器，并等待浏览器关闭完成。
crawlPage 方法：爬取页面。

⚠ 注意事项

- 如果在回调函数中打印了结果，但控制台没有任何输出，请检查 label 属性名是否存在冲突，以及 css 选择器是否正确。

🚀 更新日志

------------------ 2025-05-01 ------------------

1、优化代码
2、增加静态页面爬取 mode=static，避免浏览器过多请求
3、去除 console，默认打印
4、抽象爬取功能和解析功能
5、调整 scroll 传参
6、优化 after 函数的功能

------------------ 2025-05-09 ------------------

1、去除 autoScroll, 操作归纳到 target.actives 中
2、增加 actives 和 finally

Keywords

FAQs

What is pup-crawler?

Is pup-crawler well maintained?

Package last updated on 09 May 2025

Did you know?

Socket for GitHub automatically highlights issues in each pull request and monitors the health of all your open source dependencies. Discover the contents of your packages and block harmful activity before you install or update your dependencies.

Install

pup-crawler

PUP Crawler

Usage

配置项

API

⚠ 注意事项

🚀 更新日志

Keywords

Related posts

Axios Maintainer Confirms Social Engineering Attack Behind npm Compromise

Node.js Drops Bug Bounty Rewards After Funding Dries Up