website-scraper
Advanced tools
Comparing version 2.3.0 to 2.4.0
@@ -35,3 +35,5 @@ 'use strict'; | ||
{ selector: 'audio source', attr: 'src' }, | ||
{ selector: 'audio track', attr: 'src' } | ||
{ selector: 'audio track', attr: 'src' }, | ||
{ selector: 'frame', attr: 'src' }, | ||
{ selector: 'iframe', attr: 'src' } | ||
], | ||
@@ -52,2 +54,3 @@ subdirectories: [ | ||
recursive: false, | ||
maxRecursiveDepth: null, | ||
maxDepth: null, | ||
@@ -54,0 +57,0 @@ ignoreErrors: true, |
@@ -1,8 +0,9 @@ | ||
var ImgSrcsetTag = require('../path-containers/html-img-srcset-tag'); | ||
var CommonTag = require('../path-containers/html-common-tag'); | ||
var CssText = require('../path-containers/css-text'); | ||
var _ = require('lodash'); | ||
var utils = require('../../utils'); | ||
'use strict'; | ||
var pathContainersByRule = [ | ||
const ImgSrcsetTag = require('../path-containers/html-img-srcset-tag'); | ||
const CommonTag = require('../path-containers/html-common-tag'); | ||
const CssText = require('../path-containers/css-text'); | ||
const utils = require('../../utils'); | ||
const pathContainersByRule = [ | ||
{ selector: '[style]', attr: 'style', containerClass: CssText }, | ||
@@ -15,40 +16,56 @@ { selector: 'style', containerClass: CssText }, | ||
* Represents pair of cheerio element and rule to find text with children resources | ||
* @param {Object} el - cheerio obj for dom element | ||
* @param {Object} rule - rule used to find current element | ||
* @param {string} rule.selector - cheerio selector | ||
* @param {string} rule.attr - attribute to find text which contains resources. if not set - use inner html | ||
* @constructor | ||
*/ | ||
function HtmlSourceElement (el, rule) { | ||
this.el = el; | ||
this.rule = rule; | ||
} | ||
class HtmlSourceElement { | ||
/** | ||
* @param {Object} el - cheerio obj for dom element | ||
* @param {Object} rule - rule used to find current element | ||
* @param {string} rule.selector - cheerio selector | ||
* @param {string} rule.attr - attribute to find text which contains resources. if not set - use inner html | ||
*/ | ||
constructor (el, rule) { | ||
this.el = el; | ||
this.rule = rule; | ||
} | ||
/** | ||
* Get text from attr or from innerHtml of element based on rule | ||
* @returns {string} | ||
*/ | ||
HtmlSourceElement.prototype.getData = function getData () { | ||
var text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text(); | ||
return utils.decodeHtmlEntities(text); | ||
}; | ||
/** | ||
* Get resource data from element using rule | ||
* @returns {string} | ||
*/ | ||
getData () { | ||
const text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text(); | ||
return utils.decodeHtmlEntities(text); | ||
} | ||
HtmlSourceElement.prototype.setData = function setData (newData) { | ||
this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData); | ||
}; | ||
/** | ||
* Update attribute or inner text of el with new data | ||
* @param {string} newData | ||
*/ | ||
setData (newData) { | ||
this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData); | ||
} | ||
HtmlSourceElement.prototype.getPathContainerClass = function getPathContainerClass () { | ||
var selectedRule = _.find(pathContainersByRule, (containerByRule) => { | ||
return this.el.is(containerByRule.selector) && this.rule.attr === containerByRule.attr; | ||
}); | ||
/** | ||
* Returns PathContainer instance for element | ||
* @returns {CssText|HtmlCommonTag|HtmlImgSrcSetTag|null} | ||
*/ | ||
getPathContainer () { | ||
const selectedRule = this.findMatchedRule(pathContainersByRule); | ||
const ContainerClass = selectedRule ? selectedRule.containerClass : CommonTag; | ||
const textWithResources = this.getData(); | ||
return textWithResources ? new ContainerClass(textWithResources) : null; | ||
} | ||
return selectedRule ? selectedRule.containerClass : CommonTag; | ||
}; | ||
matchesRule (rule) { | ||
return this.el.is(rule.selector) && this.rule.attr === rule.attr; | ||
} | ||
HtmlSourceElement.prototype.getPathContainer = function getPathContainer () { | ||
var ContainerClass = this.getPathContainerClass(); | ||
var textWithResources = this.getData(); | ||
return textWithResources ? new ContainerClass(textWithResources) : null; | ||
}; | ||
findMatchedRule (rulesArray) { | ||
return rulesArray.find(this.matchesRule, this); | ||
} | ||
toString () { | ||
return JSON.stringify({selector: this.rule.selector, attr: this.rule.attr, data: this.getData()}); | ||
} | ||
} | ||
module.exports = HtmlSourceElement; |
@@ -1,42 +0,55 @@ | ||
var cheerio = require('cheerio'); | ||
var Promise = require('bluebird'); | ||
var utils = require('../../utils'); | ||
var HtmlSourceElement = require('./html-source-element'); | ||
'use strict'; | ||
function HtmlResourceHandler (options, handleChildrenPaths) { | ||
this.options = options; | ||
this.handleChildrenPaths = handleChildrenPaths; | ||
} | ||
const cheerio = require('cheerio'); | ||
const Promise = require('bluebird'); | ||
const utils = require('../../utils'); | ||
const logger = require('../../logger'); | ||
const HtmlSourceElement = require('./html-source-element'); | ||
HtmlResourceHandler.prototype.handle = function handle (resource) { | ||
var $ = loadTextToCheerio(resource.getText()); | ||
prepareToLoad($, resource); | ||
class HtmlResourceHandler { | ||
constructor (options, handleChildrenPaths) { | ||
this.options = options; | ||
this.handleChildrenPaths = handleChildrenPaths; | ||
} | ||
return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource)) | ||
.then(function updateResource () { | ||
resource.setText($.html()); | ||
return resource; | ||
}); | ||
}; | ||
handle (resource) { | ||
const $ = loadTextToCheerio(resource.getText()); | ||
prepareToLoad($, resource); | ||
HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRule ($, parentResource, rule) { | ||
var self = this; | ||
var promises = $(rule.selector).map(function loadForElement () { | ||
var el = new HtmlSourceElement($(this), rule); | ||
var pathContainer = el.getPathContainer(); | ||
if (!pathContainer) { | ||
return Promise.resolve(); | ||
} | ||
return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el)); | ||
}).get(); | ||
return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource)) | ||
.then(function updateResource () { | ||
resource.setText($.html()); | ||
return resource; | ||
}); | ||
} | ||
return utils.waitAllFulfilled(promises); | ||
}; | ||
loadResourcesForRule ($, parentResource, rule) { | ||
const self = this; | ||
const promises = $(rule.selector).map(function loadForElement () { | ||
const el = new HtmlSourceElement($(this), rule); | ||
const isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources)); | ||
const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth; | ||
if (isRecursive && isDepthGreaterThanMax) { | ||
logger.debug(`filtering out ${el} by max recursive depth`); | ||
return Promise.resolve(); | ||
} | ||
const pathContainer = el.getPathContainer(); | ||
if (!pathContainer) { | ||
return Promise.resolve(); | ||
} | ||
return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el)); | ||
}).get(); | ||
return utils.waitAllFulfilled(promises); | ||
} | ||
} | ||
function prepareToLoad ($, resource) { | ||
$('base').each(function handleBaseTag () { | ||
var el = $(this); | ||
var href = el.attr('href'); | ||
const el = $(this); | ||
const href = el.attr('href'); | ||
if (href) { | ||
var newUrl = utils.getUrl(resource.getUrl(), href); | ||
const newUrl = utils.getUrl(resource.getUrl(), href); | ||
resource.setUrl(newUrl); | ||
@@ -43,0 +56,0 @@ el.remove(); |
@@ -9,11 +9,10 @@ var _ = require('lodash'); | ||
var supportedOptions = ['prettifyUrls', 'sources', 'defaultFilename']; | ||
var supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename']; | ||
function ResourceHandler (options, context) { | ||
var self = this; | ||
self.options = _.pick(options, supportedOptions); | ||
self.context = context; | ||
this.options = _.pick(options, supportedOptions); | ||
this.context = context; | ||
self.htmlHandler = new HtmlHandler(self.options, self.handleChildrenResources.bind(self)); | ||
self.cssHandler = new CssHandler(self.options, self.handleChildrenResources.bind(self)); | ||
this.htmlHandler = new HtmlHandler(this.options, this.handleChildrenResources.bind(this)); | ||
this.cssHandler = new CssHandler(this.options, this.handleChildrenResources.bind(this)); | ||
} | ||
@@ -20,0 +19,0 @@ |
@@ -6,6 +6,3 @@ 'use strict'; | ||
const Promise = require('bluebird'); | ||
const fs = require('fs-extra'); | ||
const outputFileAsync = Promise.promisify(fs.outputFile); | ||
const removeAsync = Promise.promisify(fs.remove); | ||
@@ -39,3 +36,3 @@ const supportedOptions = [ 'directory' ]; | ||
const text = resource.getText(); | ||
return outputFileAsync(filename, text, { encoding: 'binary' }).then(() => { | ||
return fs.outputFile(filename, text, { encoding: 'binary' }).then(() => { | ||
this.loadedResources.push(resource); | ||
@@ -51,3 +48,3 @@ }); | ||
if (!_.isEmpty(this.loadedResources)) { | ||
return removeAsync(this.absoluteDirectoryPath); | ||
return fs.remove(this.absoluteDirectoryPath); | ||
} | ||
@@ -54,0 +51,0 @@ return Promise.resolve(); |
@@ -32,4 +32,5 @@ 'use strict'; | ||
self.options.recursiveSources = recursiveSources; | ||
if (self.options.recursive) { | ||
self.options.sources = _.union(self.options.sources, recursiveSources); | ||
self.options.sources = _.union(self.options.sources, self.options.recursiveSources); | ||
} | ||
@@ -36,0 +37,0 @@ |
{ | ||
"name": "website-scraper", | ||
"version": "2.3.0", | ||
"version": "2.4.0", | ||
"description": "Download website to a local directory (including all css, images, js, etc.)", | ||
@@ -40,3 +40,3 @@ "readmeFilename": "README.md", | ||
"debug": "^2.4.5", | ||
"fs-extra": "^2.0.0", | ||
"fs-extra": "^3.0.0", | ||
"he": "^1.1.0", | ||
@@ -58,3 +58,7 @@ "lodash": "^4.11.1", | ||
"sinon": "^2.1.0" | ||
} | ||
}, | ||
"files": [ | ||
"index.js", | ||
"lib" | ||
] | ||
} |
@@ -51,4 +51,5 @@ ## Introduction | ||
* [sources](#sources) - selects which resources should be downloaded | ||
* [recursive](#recursive) - follow anchors in html files | ||
* [maxDepth](#maxdepth) - maximum depth for dependencies | ||
* [recursive](#recursive) - follow hyperlinks in html files | ||
* [maxRecursiveDepth](#maxrecursivedepth) - maximum depth for hyperlinks | ||
* [maxDepth](#maxdepth) - maximum depth for all dependencies | ||
* [request](#request) - custom options for [request](https://github.com/request/request) | ||
@@ -100,6 +101,9 @@ * [subdirectories](#subdirectories) - subdirectories for file extensions | ||
#### recursive | ||
Boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading. Defaults to `false`. | ||
Boolean, if `true` scraper will follow hyperlinks in html files. Don't forget to set `maxRecursiveDepth` to avoid infinite downloading. Defaults to `false`. | ||
#### maxRecursiveDepth | ||
Positive number, maximum allowed depth for hyperlinks. Other dependencies will be saved regardless of their depth. Defaults to `null` - no maximum recursive depth set. | ||
#### maxDepth | ||
Positive number, maximum allowed depth for dependencies. Defaults to `null` - no maximum depth set. | ||
Positive number, maximum allowed depth for all dependencies. Defaults to `null` - no maximum depth set. | ||
@@ -106,0 +110,0 @@ #### request |
49408
968
272
27
+ Added fs-extra@3.0.1 (transitive)
+ Added jsonfile@3.0.1 (transitive)
+ Added universalify@0.1.2 (transitive)
- Removed fs-extra@2.1.2 (transitive)
- Removed jsonfile@2.4.0 (transitive)
Updated fs-extra@^3.0.0