Comparing version
@@ -10,2 +10,3 @@ const Bottleneck = require('bottleneck') | ||
const defaults = { | ||
parseDOM: true, | ||
json: false, | ||
@@ -35,2 +36,4 @@ maxConcurrent: 5, | ||
async function getPage (page, emitter, opts) { | ||
emitter.emit('beforePageLoad', page) | ||
if (opts.json) { | ||
@@ -47,4 +50,4 @@ try { | ||
const body = (await got(page.url)).body | ||
const $ = cheerio.load(body) | ||
const pageCopy = Object.assign({}, page, { body, $ }) | ||
const pageCopy = Object.assign({}, page, { body }) | ||
if (opts.parseDOM) pageCopy.$ = cheerio.load(body) | ||
emitter.emit('page', pageCopy) | ||
@@ -51,0 +54,0 @@ } catch (err) { |
{ | ||
"name": "domwaiter", | ||
"description": "A well-behaved URL scraper that brings you delicious DOM objects", | ||
"version": "1.1.0", | ||
"version": "1.4.0", | ||
"repository": "https://github.com/zeke/domwaiter", | ||
@@ -14,3 +14,3 @@ "main": "index.js", | ||
"cheerio": "^1.0.0-rc.3", | ||
"got": "^10.6.0" | ||
"got": "^11.8.5" | ||
}, | ||
@@ -27,3 +27,8 @@ "devDependencies": { | ||
} | ||
}, | ||
"release": { | ||
"branches": [ | ||
"main" | ||
] | ||
} | ||
} |
@@ -13,3 +13,3 @@ # domwaiter | ||
- Rate limiting powered by [bottleneck](https://ghub.io/bottleneck) | ||
- DOM parsing powered by [cheerio](https://ghub.io/cheerio) | ||
- DOM parsing powered by [cheerio](https://ghub.io/cheerio) (optional; can be disabled) | ||
- HTTP requests powered by [got](https://ghub.io/got) | ||
@@ -54,3 +54,4 @@ | ||
- `opts` Object (optional) | ||
- `json` Boolean - Set to `true` if you're fetching JSON instead of HTML. If `true`, a `json` property will be present on each emitted `page` object (and the `$` and `body` properties will NOT be present). | ||
- `parseDOM` Boolean - Defaults to `true`. Set to `false` if you don't need the parsed `page.$` DOM object. Disabling DOM parsing will boost performance. | ||
- `json` Boolean - Defaults to `false`. Set to `true` if you're fetching JSON instead of HTML. If `true`, a `json` property will be present on each emitted `page` object (and the `$` and `body` properties will NOT be present). | ||
- `maxConcurrent` Number - How many jobs can be executing at the same time. Defaults to `5`. This option is passed to the underlying [bottleneck](https://ghub.io/bottleneck#docs) instance. | ||
@@ -63,3 +64,4 @@ - `minTime`: Number - How long to wait after launching a job before launching another one. Defaults to `500` (milliseconds). This option is passed to the underlying [bottleneck](https://ghub.io/bottleneck#docs) instance. | ||
- `page` - Emitted as each page has been requested and parsed. Returns an object which is a shallow clone of the original `page` object you provided, but with two added properties: | ||
- `beforePageLoad` - Emitted with the `page` object just before its request is made, for any optional prehandling you want to do, e.g. setting up a request timer.
- `page` - Emitted after the page has been requested and the response is parsed. The listener receives a shallow clone of the original `page` object you provided, but with two added properties:
- `body`: the raw HTTP response body text | ||
@@ -66,0 +68,0 @@ - `$`: The body parsed into a jQuery-like [cheerio](https://ghub.io/cheerio) DOM object. |
50
test.js
@@ -12,3 +12,3 @@ const domwaiter = require('.') | ||
test('emits events', (done) => { | ||
test('emits `page` and `done` events', (done) => { | ||
const mock = nock('https://example.com') | ||
@@ -47,2 +47,21 @@ .get('/foo') | ||
// `beforePageLoad` is emitted before the HTTP request is issued, so a listener
// can do per-page setup (e.g. start a timer) using the page it was given.
test('emits a `beforePageLoad` event with page object', (done) => {
  const mock = nock('https://example.com')
    .get('/foo')
    .reply(200)
  const pages = [
    { url: 'https://example.com/foo' }
  ]
  const waiter = domwaiter(pages, { minTime: 10 })
  waiter
    .on('beforePageLoad', (page) => {
      // A bare `expect(value)` asserts nothing; every expectation needs a matcher.
      // The event fires before the request is made, so the mock is still pending.
      expect(mock.isDone()).toBe(false)
      expect(page.url).toBe('https://example.com/foo')
      done()
    })
})
test('emits errors for failed requests', (done) => { | ||
@@ -78,2 +97,31 @@ const mock = nock('https://example.com') | ||
// With `parseDOM: false`, the emitted page still carries the raw `body`
// but no cheerio `$` property.
test('allows `parseDOM` option to skip cheerio parsing', (done) => {
  const scope = nock('https://example.com')
    .get('/foo')
    .reply(200, '<html><title>Hello, foo</title></html>')

  // Every page emitted by the waiter, in arrival order.
  const emitted = []

  const collectPage = (page) => {
    emitted.push(page)
  }

  const logError = (err) => {
    console.error('domwaiter error')
    console.error(err)
  }

  // Runs once the waiter reports that all pages have been processed.
  const verify = () => {
    expect(scope.isDone()).toBe(true)
    expect(emitted.length).toBe(1)
    const [first] = emitted
    expect(first.body).toContain('Hello, foo')
    expect(first.$).toBe(undefined)
    done()
  }

  domwaiter(
    [{ url: 'https://example.com/foo' }],
    { minTime: 10, parseDOM: false }
  )
    .on('page', collectPage)
    .on('done', verify)
    .on('error', logError)
})
test('supports json responses', (done) => { | ||
@@ -80,0 +128,0 @@ const mock = nock('https://example.com') |
Sorry, the diff of this file is not supported yet
URL strings
Supply chain riskPackage contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
No License Found
License(Experimental) License information could not be found.
Found 1 instance in 1 package
URL strings
Supply chain riskPackage contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
12090
30.1%8
14.29%0
-100%196
27.27%86
2.38%+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
- Removed
Updated