@web-master/node-web-crawler
Advanced tools
Comparing version 0.4.0 to 0.5.0
@@ -1,16 +0,4 @@ | ||
import { ScraperConfig, ScrapeOptions, ScrapeOptionElement, ScrapeOptionList, ScrapeResult } from '@web-master/node-web-scraper'; | ||
interface CrawlLinkOptions { | ||
url: string; | ||
crawl: string | { | ||
selector: string; | ||
convert?: (link: string) => string; | ||
}; | ||
fetch?: ScrapeOptions; | ||
} | ||
interface CrawlerConfig { | ||
target: string[] | CrawlLinkOptions; | ||
fetch: (data?: any, index?: number) => ScrapeOptions; | ||
} | ||
declare function crawl<T>(config: CrawlerConfig): Promise<T[]>; | ||
export { crawl, CrawlerConfig, CrawlLinkOptions, ScraperConfig, ScrapeOptions, ScrapeOptionElement, ScrapeOptionList, ScrapeResult, }; | ||
import { CrawlConfig, CrawlConfigPuppeteer, CrawlLinkOptions } from './interfaces'; | ||
declare function crawl<T>(config: CrawlConfig | CrawlConfigPuppeteer): Promise<T[]>; | ||
export { crawl, CrawlConfig, CrawlConfigPuppeteer, CrawlLinkOptions, }; | ||
export default crawl; |
@@ -34,5 +34,5 @@ "use strict"; | ||
} | ||
const { url, crawl, fetch } = possibleUrls; | ||
const { url, iterator, fetch } = possibleUrls; | ||
let holder; | ||
if (typeof crawl === 'string') { | ||
if (typeof iterator === 'string') { | ||
holder = await node_web_scraper_1.scrape({ | ||
@@ -42,3 +42,3 @@ target: url, | ||
urls: { | ||
listItem: crawl, | ||
listItem: iterator, | ||
data: { | ||
@@ -57,3 +57,3 @@ url: { attr: 'href' }, | ||
else { | ||
const { selector, convert } = crawl; | ||
const { selector, convert } = iterator; | ||
holder = await node_web_scraper_1.scrape({ | ||
@@ -78,14 +78,26 @@ target: url, | ||
async function crawl(config) { | ||
const { target, fetch } = config; | ||
const [urls, data] = await resolve(target); | ||
return crawlAll(urls, fetch, data); | ||
if (node_web_scraper_1.isScrapeConfigDefault(config)) { | ||
const { target, fetch } = config; | ||
const [urls, data] = await resolve(target); | ||
return crawlAll(urls, fetch, data); | ||
} | ||
if (node_web_scraper_1.isScrapeConfigPuppeteer(config)) { | ||
const { target, fetch, waitFor } = config; | ||
const [urls, data] = await resolve(target); | ||
return crawlAll(urls, fetch, data, waitFor); | ||
} | ||
throw new Error('InvalidProgramException'); | ||
} | ||
exports.crawl = crawl; | ||
async function crawlAll(urls, fetch, data) { | ||
async function crawlAll(urls, fetch, data, waitFor) { | ||
const results = []; | ||
for (let i = 0; i < urls.length; i++) { | ||
results.push(await node_web_scraper_1.scrape({ | ||
let config = { | ||
target: urls[i], | ||
fetch: fetch(data, i), | ||
})); | ||
}; | ||
if (waitFor) { | ||
config = Object.assign(config, { waitFor }); | ||
} | ||
results.push(await node_web_scraper_1.scrape(config)); | ||
} | ||
@@ -92,0 +104,0 @@ return results; |
import { | ||
crawl, | ||
CrawlerConfig, | ||
CrawlConfig, | ||
CrawlConfigPuppeteer, | ||
CrawlLinkOptions, | ||
ScraperConfig, | ||
ScrapeOptions, | ||
ScrapeOptionElement, | ||
ScrapeOptionList, | ||
ScrapeResult, | ||
} from './dist'; | ||
export { | ||
crawl, | ||
CrawlerConfig, | ||
CrawlConfig, | ||
CrawlConfigPuppeteer, | ||
CrawlLinkOptions, | ||
ScraperConfig, | ||
ScrapeOptions, | ||
ScrapeOptionElement, | ||
ScrapeOptionList, | ||
ScrapeResult, | ||
}; | ||
export default crawl; |
{ | ||
"name": "@web-master/node-web-crawler", | ||
"version": "0.4.0", | ||
"version": "0.5.0", | ||
"description": "Crawl web as easy as possible", | ||
@@ -8,3 +8,3 @@ "repository": "git@github.com:saltyshiomix/web-master.git", | ||
"license": "MIT", | ||
"homepage": "https://github.com/saltyshiomix/web-master/tree/master/packages/node-web-crawler", | ||
"homepage": "https://github.com/saltyshiomix/web-master/tree/master/packages/node-web-crawler/README.md", | ||
"keywords": [ | ||
@@ -37,3 +37,3 @@ "crawler", | ||
"dependencies": { | ||
"@web-master/node-web-scraper": "^0.4.0" | ||
"@web-master/node-web-scraper": "^0.5.0" | ||
}, | ||
@@ -56,3 +56,3 @@ "devDependencies": { | ||
}, | ||
"gitHead": "8b5b10b85e10637a2e80a89a3a214003c923833b" | ||
"gitHead": "3330c23e8934e241f32c906c9b94d26a12500b18" | ||
} |
@@ -25,2 +25,4 @@ <p align="center">😎 @web-master/node-web-crawler 😎</p> | ||
### Basic | ||
```js | ||
@@ -53,2 +55,32 @@ import crawl from '@web-master/node-web-crawler'; | ||
### Waitable (by using `puppeteer`) | ||
```js | ||
import crawl from '@web-master/node-web-crawler'; | ||
// crawl data on each link | ||
const data = await crawl({ | ||
target: { | ||
url: 'https://news.ycombinator.com', | ||
iterator: { | ||
selector: 'span.age > a', | ||
convert: (path) => `https://news.ycombinator.com/${path}`, | ||
}, | ||
}, | ||
waitFor: 3 * 1000, // wait for the content to load (e.g. single-page apps) | ||
fetch: () => ({ | ||
title: '.title', | ||
}), | ||
}); | ||
console.log(data); | ||
// [ | ||
// { title: 'An easiest crawling and scraping module for NestJS' }, | ||
// { title: 'A minimalistic boilerplate on top of Webpack, Babel, TypeScript and React' }, | ||
// ... | ||
// ... | ||
// { title: '[Experimental] React SSR as a view template engine' } | ||
// ] | ||
``` | ||
## TypeScript Support | ||
@@ -55,0 +87,0 @@ |
Sorry, the diff of this file is not supported yet
13143
11
148
120
7
+ Added@sindresorhus/is@0.14.0(transitive)
+ Added@szmarczak/http-timer@1.1.2(transitive)
+ Added@web-master/node-web-scraper@0.5.0(transitive)
+ Addedagent-base@4.3.0(transitive)
+ Addedasync-limiter@1.0.1(transitive)
+ Addedbalanced-match@1.0.2(transitive)
+ Addedbrace-expansion@1.1.11(transitive)
+ Addedbuffer-crc32@0.2.13(transitive)
+ Addedbuffer-from@1.1.2(transitive)
+ Addedcacheable-request@6.1.0(transitive)
+ Addedclone-response@1.0.3(transitive)
+ Addedconcat-map@0.0.1(transitive)
+ Addedconcat-stream@1.6.2(transitive)
+ Addedcss-select@2.1.0(transitive)
+ Addedcss-what@3.4.2(transitive)
+ Addeddebug@2.6.9, 3.2.7, 4.3.4(transitive)
+ Addeddecompress-response@3.3.0(transitive)
+ Addeddefer-to-connect@1.1.3(transitive)
+ Addeddom-serializer@0.2.2, 1.4.1(transitive)
+ Addeddomhandler@3.3.0, 4.3.1(transitive)
+ Addeddomutils@1.7.0, 2.8.0(transitive)
+ Addedduplexer3@0.1.5(transitive)
+ Addedend-of-stream@1.4.4(transitive)
+ Addedentities@2.2.0(transitive)
+ Addedes6-promise@4.2.8(transitive)
+ Addedes6-promisify@5.0.0(transitive)
+ Addedextract-zip@1.7.0(transitive)
+ Addedfd-slicer@1.1.0(transitive)
+ Addedfs.realpath@1.0.0(transitive)
+ Addedget-stream@4.1.0, 5.2.0(transitive)
+ Addedglob@7.2.3(transitive)
+ Addedgot@9.6.0(transitive)
+ Addedhtmlparser2@4.1.0(transitive)
+ Addedhttp-cache-semantics@4.1.1(transitive)
+ Addedhttps-proxy-agent@2.2.4(transitive)
+ Addedinflight@1.0.6(transitive)
+ Addedisarray@1.0.0(transitive)
+ Addedjson-buffer@3.0.0(transitive)
+ Addedkeyv@3.1.0(transitive)
+ Addedlowercase-keys@1.0.1, 2.0.0(transitive)
+ Addedmime@2.6.0(transitive)
+ Addedmimic-response@1.0.1(transitive)
+ Addedminimatch@3.1.2(transitive)
+ Addedminimist@1.2.8(transitive)
+ Addedmkdirp@0.5.6(transitive)
+ Addedms@2.0.0, 2.1.2(transitive)
+ Addednormalize-url@4.5.1(transitive)
+ Addedonce@1.4.0(transitive)
+ Addedp-cancelable@1.1.0(transitive)
+ Addedpath-is-absolute@1.0.1(transitive)
+ Addedpend@1.2.0(transitive)
+ Addedprepend-http@2.0.0(transitive)
+ Addedprocess-nextick-args@2.0.1(transitive)
+ Addedprogress@2.0.3(transitive)
+ Addedproxy-from-env@1.1.0(transitive)
+ Addedpump@3.0.0(transitive)
+ Addedpuppeteer@1.20.0(transitive)
+ Addedreadable-stream@2.3.8(transitive)
+ Addedresponselike@1.0.2(transitive)
+ Addedrimraf@2.7.1(transitive)
+ Addedsafe-buffer@5.1.2(transitive)
+ Addedstring_decoder@1.1.1(transitive)
+ Addedto-readable-stream@1.0.0(transitive)
+ Addedtypedarray@0.0.6(transitive)
+ Addedurl-parse-lax@3.0.0(transitive)
+ Addedutil-deprecate@1.0.2(transitive)
+ Addedwrappy@1.0.2(transitive)
+ Addedws@6.2.2(transitive)
+ Addedyauzl@2.10.0(transitive)
- Removed@types/cheerio@0.22.35(transitive)
- Removed@types/node@20.12.12(transitive)
- Removed@web-master/node-web-scraper@0.4.0(transitive)
- Removedabab@1.0.4(transitive)
- Removedacorn@2.7.0(transitive)
- Removedacorn-globals@1.0.9(transitive)
- Removedajv@6.12.6(transitive)
- Removedasn1@0.2.6(transitive)
- Removedassert-plus@1.0.0(transitive)
- Removedassured@1.0.15(transitive)
- Removedasynckit@0.4.0(transitive)
- Removedaws-sign2@0.7.0(transitive)
- Removedaws4@1.13.0(transitive)
- Removedbarbe@3.0.16(transitive)
- Removedbcrypt-pbkdf@1.0.2(transitive)
- Removedcaseless@0.12.0(transitive)
- Removedcheerio@0.20.0, 1.0.0-rc.12(transitive)
- Removedcheerio-req@1.2.4(transitive)
- Removedcheerio-select@2.1.0(transitive)
- Removedcombined-stream@1.0.8(transitive)
- Removedcore-util-is@1.0.2(transitive)
- Removedcss-select@1.2.0, 5.1.0(transitive)
- Removedcss-what@2.1.3, 6.1.0(transitive)
- Removedcssom@0.3.8(transitive)
- Removedcssstyle@0.2.37(transitive)
- Removeddashdash@1.14.1(transitive)
- Removeddeep-is@0.1.4(transitive)
- Removeddeffy@2.2.4(transitive)
- Removeddelayed-stream@1.0.0(transitive)
- Removeddom-serializer@0.1.1, 2.0.0(transitive)
- Removeddomhandler@2.3.0, 5.0.3(transitive)
- Removeddomutils@1.5.1, 3.1.0(transitive)
- Removedecc-jsbn@0.1.2(transitive)
- Removedentities@1.0.0, 1.1.2, 4.5.0(transitive)
- Removederr@2.1.12(transitive)
- Removedescodegen@1.14.3(transitive)
- Removedesprima@4.0.1(transitive)
- Removedestraverse@4.3.0(transitive)
- Removedesutils@2.0.3(transitive)
- Removedextend@3.0.2(transitive)
- Removedextsprintf@1.3.0(transitive)
- Removedfast-deep-equal@3.1.3(transitive)
- Removedfast-json-stable-stringify@2.1.0(transitive)
- Removedfast-levenshtein@2.0.6(transitive)
- Removedfollow-redirects@1.15.6(transitive)
- Removedforever-agent@0.6.1(transitive)
- Removedform-data@2.3.3(transitive)
- Removedfunction.name@1.0.13(transitive)
- Removedgetpass@0.1.7(transitive)
- Removedhar-schema@2.0.0(transitive)
- Removedhar-validator@5.1.5(transitive)
- Removedhtmlparser2@3.8.3, 8.0.2(transitive)
- Removedhttp-signature@1.2.0(transitive)
- Removedis-empty-obj@1.0.13(transitive)
- Removedis-typedarray@1.0.0(transitive)
- Removedisarray@0.0.1(transitive)
- Removedisstream@0.1.2(transitive)
- Removediterate-object@1.3.4(transitive)
- Removedjsbn@0.1.1(transitive)
- Removedjsdom@7.2.2(transitive)
- Removedjson-schema@0.4.0(transitive)
- Removedjson-schema-traverse@0.4.1(transitive)
- Removedjson-stringify-safe@5.0.1(transitive)
- Removedjsprim@1.4.2(transitive)
- Removedlevn@0.3.0(transitive)
- Removedlodash@4.17.21(transitive)
- Removedmime-db@1.52.0(transitive)
- Removedmime-types@2.1.35(transitive)
- Removednoop6@1.0.9(transitive)
- Removednth-check@2.1.1(transitive)
- Removednwmatcher@1.4.4(transitive)
- Removedoauth-sign@0.9.0(transitive)
- Removedobj-def@1.0.9(transitive)
- Removedoptionator@0.8.3(transitive)
- Removedparse5@1.5.1, 7.1.2(transitive)
- Removedparse5-htmlparser2-tree-adapter@7.0.0(transitive)
- Removedperformance-now@2.1.0(transitive)
- Removedprelude-ls@1.1.2(transitive)
- Removedpsl@1.9.0(transitive)
- Removedpunycode@2.3.1(transitive)
- Removedqs@6.5.3(transitive)
- Removedreadable-stream@1.1.14(transitive)
- Removedregex-escape@3.4.10(transitive)
- Removedrequest@2.88.2(transitive)
- Removedsafe-buffer@5.2.1(transitive)
- Removedsafer-buffer@2.1.2(transitive)
- Removedsax@1.4.1(transitive)
- Removedscrape-it@5.3.2(transitive)
- Removedscrape-it-core@1.0.0(transitive)
- Removedsliced@1.0.1(transitive)
- Removedsource-map@0.6.1(transitive)
- Removedsshpk@1.18.0(transitive)
- Removedstring_decoder@0.10.31(transitive)
- Removedsymbol-tree@3.2.4(transitive)
- Removedtinyreq@3.4.2(transitive)
- Removedtough-cookie@2.5.0(transitive)
- Removedtr46@0.0.3(transitive)
- Removedtunnel-agent@0.6.0(transitive)
- Removedtweetnacl@0.14.5(transitive)
- Removedtype-check@0.3.2(transitive)
- Removedtyppy@2.3.13(transitive)
- Removedul@5.2.15(transitive)
- Removedundici-types@5.26.5(transitive)
- Removeduri-js@4.4.1(transitive)
- Removeduuid@3.4.0(transitive)
- Removedverror@1.10.0(transitive)
- Removedwebidl-conversions@2.0.1(transitive)
- Removedwhatwg-url-compat@0.6.5(transitive)
- Removedword-wrap@1.2.5(transitive)
- Removedxml-name-validator@2.0.1(transitive)