website-scraper
Advanced tools
Comparing version 3.2.0 to 3.3.0
@@ -52,2 +52,3 @@ 'use strict'; | ||
}, | ||
requestConcurrency: Infinity, | ||
urlFilter: null, | ||
@@ -54,0 +55,0 @@ recursive: false, |
'use strict'; | ||
const Promise = require('bluebird'); | ||
const PromiseQueue = require('p-queue'); | ||
const _ = require('lodash'); | ||
@@ -20,10 +21,8 @@ | ||
function Scraper (options) { | ||
const self = this; | ||
this.options = u.extend(defaults, options); | ||
this.options.request = u.extend(defaults.request, options.request); | ||
this.options.urls = Array.isArray(this.options.urls) ? this.options.urls : [this.options.urls]; | ||
self.options = u.extend(defaults, options); | ||
self.options.request = u.extend(defaults.request, options.request); | ||
self.options.urls = Array.isArray(self.options.urls) ? self.options.urls : [self.options.urls]; | ||
if (self.options.subdirectories) { | ||
self.options.subdirectories.forEach((element) => { | ||
if (this.options.subdirectories) { | ||
this.options.subdirectories.forEach((element) => { | ||
element.extensions = element.extensions.map((ext) => ext.toLowerCase()); | ||
@@ -33,23 +32,24 @@ }); | ||
self.options.recursiveSources = recursiveSources; | ||
if (self.options.recursive) { | ||
self.options.sources = u.union(self.options.sources, self.options.recursiveSources); | ||
this.options.recursiveSources = recursiveSources; | ||
if (this.options.recursive) { | ||
this.options.sources = u.union(this.options.sources, this.options.recursiveSources); | ||
} | ||
logger.info('init with options', self.options); | ||
logger.info('init with options', this.options); | ||
self.request = new Request(self.options); | ||
self.resourceHandler = new ResourceHandler(self.options, self); | ||
self.filenameGenerator = new FilenameGenerator(self.options); | ||
self.resourceSaver = self.options.resourceSaver ? new self.options.resourceSaver(u.clone(self.options)) : new ResourceSaver(self.options); | ||
this.request = new Request(this.options); | ||
this.resourceHandler = new ResourceHandler(this.options, this); | ||
this.filenameGenerator = new FilenameGenerator(this.options); | ||
this.resourceSaver = this.options.resourceSaver ? new this.options.resourceSaver(u.clone(this.options)) : new ResourceSaver(this.options); | ||
// Array of Resources for downloading | ||
self.resources = self.options.urls.map((obj) => { | ||
this.resources = this.options.urls.map((obj) => { | ||
const url = (obj && obj.url) ? obj.url : obj; | ||
const filename = (obj && obj.filename) ? obj.filename : self.options.defaultFilename; | ||
const filename = (obj && obj.filename) ? obj.filename : this.options.defaultFilename; | ||
return new Resource(url, filename); | ||
}); | ||
self.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise | ||
self.loadedResources = new NormalizedUrlMap(); // Map url -> resource | ||
this.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise | ||
this.loadedResources = new NormalizedUrlMap(); // Map url -> resource | ||
this.requestQueue = new PromiseQueue({concurrency: this.options.requestConcurrency}); | ||
} | ||
@@ -95,3 +95,3 @@ | ||
const referer = resource.parent ? resource.parent.getUrl() : null; | ||
return self.request.get(url, referer); | ||
return self.requestQueue.add(() => self.request.get(url, referer)); | ||
}).then(function requestCompleted (responseData) { | ||
@@ -98,0 +98,0 @@ |
{ | ||
"name": "website-scraper", | ||
"version": "3.2.0", | ||
"version": "3.3.0", | ||
"description": "Download website to a local directory (including all css, images, js, etc.)", | ||
@@ -44,2 +44,3 @@ "readmeFilename": "README.md", | ||
"normalize-url": "^1.5.3", | ||
"p-queue": "^1.1.0", | ||
"request": "^2.81.0", | ||
@@ -46,0 +47,0 @@ "srcset": "^1.0.0" |
@@ -66,2 +66,3 @@ ## Introduction | ||
* [updateMissingSources](#updatemissingsources) - update url for missing sources with absolute url | ||
* [requestConcurrency](#requestconcurrency) - set maximum concurrent requests | ||
@@ -281,3 +282,6 @@ Default options you can find in [lib/config/defaults.js](https://github.com/website-scraper/node-website-scraper/blob/master/lib/config/defaults.js) or get them using `scrape.defaults`. | ||
#### requestConcurrency | ||
Number, maximum number of concurrent requests. Defaults to `Infinity` (no limit). | |
## callback | ||
@@ -284,0 +288,0 @@ Callback function, optional, includes following parameters: |
53486
1058
301
11
+ Added p-queue@^1.1.0
+ Added p-queue@1.2.0 (transitive)