New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign in | Demo | Install
Socket

website-scraper

Package Overview
Dependencies
Maintainers
1
Versions
60
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

website-scraper - npm Package Compare versions

Comparing version 2.3.0 to 2.4.0

5

lib/config/defaults.js

@@ -35,3 +35,5 @@ 'use strict';

{ selector: 'audio source', attr: 'src' },
{ selector: 'audio track', attr: 'src' }
{ selector: 'audio track', attr: 'src' },
{ selector: 'frame', attr: 'src' },
{ selector: 'iframe', attr: 'src' }
],

@@ -52,2 +54,3 @@ subdirectories: [

recursive: false,
maxRecursiveDepth: null,
maxDepth: null,

@@ -54,0 +57,0 @@ ignoreErrors: true,

91

lib/resource-handler/html/html-source-element.js

@@ -1,8 +0,9 @@

var ImgSrcsetTag = require('../path-containers/html-img-srcset-tag');
var CommonTag = require('../path-containers/html-common-tag');
var CssText = require('../path-containers/css-text');
var _ = require('lodash');
var utils = require('../../utils');
'use strict';
var pathContainersByRule = [
const ImgSrcsetTag = require('../path-containers/html-img-srcset-tag');
const CommonTag = require('../path-containers/html-common-tag');
const CssText = require('../path-containers/css-text');
const utils = require('../../utils');
const pathContainersByRule = [
{ selector: '[style]', attr: 'style', containerClass: CssText },

@@ -15,40 +16,56 @@ { selector: 'style', containerClass: CssText },

* Represents pair of cheerio element and rule to find text with children resources
* @param {Object} el - cheerio obj for dom element
* @param {Object} rule - rule used to find current element
* @param {string} rule.selector - cheerio selector
* @param {string} rule.attr - attribute to find text which contains resources. if not set - use inner html
* @constructor
*/
function HtmlSourceElement (el, rule) {
this.el = el;
this.rule = rule;
}
class HtmlSourceElement {
/**
* @param {Object} el - cheerio obj for dom element
* @param {Object} rule - rule used to find current element
* @param {string} rule.selector - cheerio selector
* @param {string} rule.attr - attribute to find text which contains resources. if not set - use inner html
*/
constructor (el, rule) {
this.el = el;
this.rule = rule;
}
/**
* Get text from attr or from innerHtml of element based on rule
* @returns {string}
*/
HtmlSourceElement.prototype.getData = function getData () {
var text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text();
return utils.decodeHtmlEntities(text);
};
/**
* Get resource data from element using rule
* @returns {string}
*/
getData () {
const text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text();
return utils.decodeHtmlEntities(text);
}
HtmlSourceElement.prototype.setData = function setData (newData) {
this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData);
};
/**
* Update attribute or inner text of el with new data
* @param {string} newData
*/
setData (newData) {
this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData);
}
HtmlSourceElement.prototype.getPathContainerClass = function getPathContainerClass () {
var selectedRule = _.find(pathContainersByRule, (containerByRule) => {
return this.el.is(containerByRule.selector) && this.rule.attr === containerByRule.attr;
});
/**
* Returns PathContainer instance for element
* @returns {CssText|HtmlCommonTag|HtmlImgSrcSetTag|null}
*/
getPathContainer () {
const selectedRule = this.findMatchedRule(pathContainersByRule);
const ContainerClass = selectedRule ? selectedRule.containerClass : CommonTag;
const textWithResources = this.getData();
return textWithResources ? new ContainerClass(textWithResources) : null;
}
return selectedRule ? selectedRule.containerClass : CommonTag;
};
matchesRule (rule) {
return this.el.is(rule.selector) && this.rule.attr === rule.attr;
}
HtmlSourceElement.prototype.getPathContainer = function getPathContainer () {
var ContainerClass = this.getPathContainerClass();
var textWithResources = this.getData();
return textWithResources ? new ContainerClass(textWithResources) : null;
};
findMatchedRule (rulesArray) {
return rulesArray.find(this.matchesRule, this);
}
toString () {
return JSON.stringify({selector: this.rule.selector, attr: this.rule.attr, data: this.getData()});
}
}
module.exports = HtmlSourceElement;

@@ -1,42 +0,55 @@

var cheerio = require('cheerio');
var Promise = require('bluebird');
var utils = require('../../utils');
var HtmlSourceElement = require('./html-source-element');
'use strict';
function HtmlResourceHandler (options, handleChildrenPaths) {
this.options = options;
this.handleChildrenPaths = handleChildrenPaths;
}
const cheerio = require('cheerio');
const Promise = require('bluebird');
const utils = require('../../utils');
const logger = require('../../logger');
const HtmlSourceElement = require('./html-source-element');
HtmlResourceHandler.prototype.handle = function handle (resource) {
var $ = loadTextToCheerio(resource.getText());
prepareToLoad($, resource);
class HtmlResourceHandler {
constructor (options, handleChildrenPaths) {
this.options = options;
this.handleChildrenPaths = handleChildrenPaths;
}
return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource))
.then(function updateResource () {
resource.setText($.html());
return resource;
});
};
handle (resource) {
const $ = loadTextToCheerio(resource.getText());
prepareToLoad($, resource);
HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRule ($, parentResource, rule) {
var self = this;
var promises = $(rule.selector).map(function loadForElement () {
var el = new HtmlSourceElement($(this), rule);
var pathContainer = el.getPathContainer();
if (!pathContainer) {
return Promise.resolve();
}
return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el));
}).get();
return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource))
.then(function updateResource () {
resource.setText($.html());
return resource;
});
}
return utils.waitAllFulfilled(promises);
};
loadResourcesForRule ($, parentResource, rule) {
const self = this;
const promises = $(rule.selector).map(function loadForElement () {
const el = new HtmlSourceElement($(this), rule);
const isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources));
const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth;
if (isRecursive && isDepthGreaterThanMax) {
logger.debug(`filtering out ${el} by max recursive depth`);
return Promise.resolve();
}
const pathContainer = el.getPathContainer();
if (!pathContainer) {
return Promise.resolve();
}
return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el));
}).get();
return utils.waitAllFulfilled(promises);
}
}
function prepareToLoad ($, resource) {
$('base').each(function handleBaseTag () {
var el = $(this);
var href = el.attr('href');
const el = $(this);
const href = el.attr('href');
if (href) {
var newUrl = utils.getUrl(resource.getUrl(), href);
const newUrl = utils.getUrl(resource.getUrl(), href);
resource.setUrl(newUrl);

@@ -43,0 +56,0 @@ el.remove();

@@ -9,11 +9,10 @@ var _ = require('lodash');

var supportedOptions = ['prettifyUrls', 'sources', 'defaultFilename'];
var supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename'];
function ResourceHandler (options, context) {
var self = this;
self.options = _.pick(options, supportedOptions);
self.context = context;
this.options = _.pick(options, supportedOptions);
this.context = context;
self.htmlHandler = new HtmlHandler(self.options, self.handleChildrenResources.bind(self));
self.cssHandler = new CssHandler(self.options, self.handleChildrenResources.bind(self));
this.htmlHandler = new HtmlHandler(this.options, this.handleChildrenResources.bind(this));
this.cssHandler = new CssHandler(this.options, this.handleChildrenResources.bind(this));
}

@@ -20,0 +19,0 @@

@@ -6,6 +6,3 @@ 'use strict';

const Promise = require('bluebird');
const fs = require('fs-extra');
const outputFileAsync = Promise.promisify(fs.outputFile);
const removeAsync = Promise.promisify(fs.remove);

@@ -39,3 +36,3 @@ const supportedOptions = [ 'directory' ];

const text = resource.getText();
return outputFileAsync(filename, text, { encoding: 'binary' }).then(() => {
return fs.outputFile(filename, text, { encoding: 'binary' }).then(() => {
this.loadedResources.push(resource);

@@ -51,3 +48,3 @@ });

if (!_.isEmpty(this.loadedResources)) {
return removeAsync(this.absoluteDirectoryPath);
return fs.remove(this.absoluteDirectoryPath);
}

@@ -54,0 +51,0 @@ return Promise.resolve();

@@ -32,4 +32,5 @@ 'use strict';

self.options.recursiveSources = recursiveSources;
if (self.options.recursive) {
self.options.sources = _.union(self.options.sources, recursiveSources);
self.options.sources = _.union(self.options.sources, self.options.recursiveSources);
}

@@ -36,0 +37,0 @@

{
"name": "website-scraper",
"version": "2.3.0",
"version": "2.4.0",
"description": "Download website to a local directory (including all css, images, js, etc.)",

@@ -40,3 +40,3 @@ "readmeFilename": "README.md",

"debug": "^2.4.5",
"fs-extra": "^2.0.0",
"fs-extra": "^3.0.0",
"he": "^1.1.0",

@@ -58,3 +58,7 @@ "lodash": "^4.11.1",

"sinon": "^2.1.0"
}
},
"files": [
"index.js",
"lib"
]
}

@@ -51,4 +51,5 @@ ## Introduction

* [sources](#sources) - selects which resources should be downloaded
* [recursive](#recursive) - follow anchors in html files
* [maxDepth](#maxdepth) - maximum depth for dependencies
* [recursive](#recursive) - follow hyperlinks in html files
* [maxRecursiveDepth](#maxrecursivedepth) - maximum depth for hyperlinks
* [maxDepth](#maxdepth) - maximum depth for all dependencies
* [request](#request) - custom options for [request](https://github.com/request/request)

@@ -100,6 +101,9 @@ * [subdirectories](#subdirectories) - subdirectories for file extensions

#### recursive
Boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading. Defaults to `false`.
Boolean, if `true` scraper will follow hyperlinks in html files. Don't forget to set `maxRecursiveDepth` to avoid infinite downloading. Defaults to `false`.
#### maxRecursiveDepth
Positive number, maximum allowed depth for hyperlinks. Other dependencies will be saved regardless of their depth. Defaults to `null` - no maximum recursive depth set.
#### maxDepth
Positive number, maximum allowed depth for dependencies. Defaults to `null` - no maximum depth set.
Positive number, maximum allowed depth for all dependencies. Defaults to `null` - no maximum depth set.

@@ -106,0 +110,0 @@ #### request

Socket | Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc