website-scraper - npm Package Compare versions

Comparing version 0.3.6 to 1.0.0

lib/config/resource-extensions-by-type.js

index.js
var Scraper = require('./lib/scraper.js');
- module.exports.scrape = function (options, callback) {
-   return new Scraper(options).scrape(callback);
- };
+ module.exports.scrape = function scrape (options, callback) {
+   return new Scraper(options).scrape(callback);
+ };
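
The public entry point only changes by giving the exported function a name in 1.0.0, so existing callers keep working. A minimal usage sketch of this wrapper (the URL and directory are placeholders), using the callback it forwards to `Scraper.prototype.scrape`:

```javascript
// Minimal sketch of calling the exported wrapper above; values are placeholders.
var scraper = require('website-scraper');

scraper.scrape({
  urls: ['http://example.com/'],
  directory: '/path/to/save'
}, function (error, result) {
  // callback form: error is an Error on failure, result is the array of page objects
  if (error) { return console.error(error); }
  console.log(result);
});
```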
var config = {
-   defaultFilename: 'index.html',
-   sources: [
-     {
-       selector: 'img',
-       attr: 'src'
-     },
-     {
-       selector: 'img',
-       attr: 'srcset'
-     },
-     {
-       selector: 'input',
-       attr: 'src'
-     },
-     {
-       selector: 'object',
-       attr: 'data'
-     },
-     {
-       selector: 'embed',
-       attr: 'src'
-     },
-     {
-       selector: 'param[name="movie"]',
-       attr: 'value'
-     },
-     {
-       selector: 'script',
-       attr: 'src'
-     },
-     {
-       selector: 'link[rel="stylesheet"]',
-       attr: 'href'
-     },
-     {
-       selector: 'link[rel*="icon"]',
-       attr: 'href'
-     }
-   ],
-   subdirectories: [
-     {
-       directory: 'images',
-       extensions: ['.png', '.jpg', '.jpeg', '.gif']
-     },
-     {
-       directory: 'js',
-       extensions: ['.js']
-     },
-     {
-       directory: 'css',
-       extensions: ['.css']
-     },
-     {
-       directory: 'fonts',
-       extensions: ['.ttf', '.woff', '.eot', '.svg']
-     }
-   ]
+   filenameGenerator: 'byType',
+   defaultFilename: 'index.html',
+   prettifyUrls: false,
+   sources: [
+     { selector: 'img', attr: 'src' },
+     { selector: 'img', attr: 'srcset' },
+     { selector: 'input', attr: 'src' },
+     { selector: 'object', attr: 'data' },
+     { selector: 'embed', attr: 'src' },
+     { selector: 'param[name="movie"]', attr: 'value' },
+     { selector: 'script', attr: 'src' },
+     { selector: 'link[rel="stylesheet"]', attr: 'href' },
+     { selector: 'link[rel*="icon"]', attr: 'href' }
+   ],
+   subdirectories: [
+     { directory: 'images', extensions: ['.png', '.jpg', '.jpeg', '.gif'] },
+     { directory: 'js', extensions: ['.js'] },
+     { directory: 'css', extensions: ['.css'] },
+     { directory: 'fonts', extensions: ['.ttf', '.woff', '.eot', '.svg'] }
+   ],
+   request: {
+     encoding: 'binary',
+     strictSSL: false,
+     jar: true,
+     gzip: true
+   },
+   urlFilter: function urlFilter () {
+     return true;
+   }
};
module.exports = config;
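
These defaults are not used directly by callers; the 1.0.0 constructor (shown further down in the scraper.js hunks) merges user options over them, with `request` merged one level deeper. A small sketch of that merge, with illustrative user options and an illustrative deep require path:

```javascript
// Sketch of how user options are layered over the defaults above
// (mirrors `new Scraper(options)` in 1.0.0; values and require path are illustrative).
var _ = require('lodash');
var defaults = require('website-scraper/lib/config/defaults');

var userOptions = {
  urls: ['http://example.com/'],
  directory: '/path/to/save',
  request: { strictSSL: true }   // only this key of defaults.request is overridden
};

var options = _.extend({}, defaults, userOptions);
options.request = _.extend({}, defaults.request, userOptions.request);
// options.request -> { encoding: 'binary', strictSSL: true, jar: true, gzip: true }
```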
module.exports = [
{ selector: 'a', attr: 'href' }
];

@@ -7,2 +7,3 @@ var types = require('./resource-types');

{ tagName: 'link', attributeName: 'href' }
];

@@ -9,0 +10,0 @@ typesByHtmlTag[types.html] = [

@@ -1,6 +0,16 @@

var _ = require('underscore');
var Promise = require('bluebird');
var _ = require('lodash');
var format = require('util').format;
var getCssUrls = require('css-url-parser');
var utils = require('../utils');
function changeExactlyMatchedUrl (text, oldUrl, newUrl) {
// starts with ' " ( ends with ' " )
var exactlyMatchedPattern = format('([\'"\\(\\s])%s([\'"\\)\\s])', _.escapeRegExp(oldUrl));
var exactlyMatchedRegexp = new RegExp(exactlyMatchedPattern, 'g');
text = text.replace(exactlyMatchedRegexp, function changeUrl (match, g1, g2) {
return g1 + newUrl + g2;
});
return text;
}
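
`changeExactlyMatchedUrl` is new in 1.0.0 and only rewrites occurrences of the old url that are delimited by quotes, parentheses, or whitespace, so a url that happens to be a prefix of another is left untouched. Roughly (a hypothetical call, since the function is module-private):

```javascript
// Hypothetical call illustrating the exact-match behavior of the function above.
var css = 'a { background: url("bg.png"); } b { background: url("bg.png.bak"); }';
var updated = changeExactlyMatchedUrl(css, 'bg.png', 'images/bg.png');
// "bg.png" is delimited by quotes and gets rewritten;
// "bg.png.bak" is left alone because the match is followed by '.' rather than a delimiter.
// updated -> 'a { background: url("images/bg.png"); } b { background: url("bg.png.bak"); }'
```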
function loadCss (context, resource) {

@@ -13,13 +23,16 @@ var url = resource.getUrl();

var promises = _.map(cssUrls, function loadResourceFromCssUrl (cssUrl) {
var resourceUrl = utils.getUrl(url, cssUrl);
var cssResource = resource.createChild(resourceUrl);
var childUrl = utils.getUrl(url, cssUrl);
var childResource = resource.createChild(childUrl);
return context.loadResource(cssResource).then(function handleLoadedSource (loadedResource) {
var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
text = text.replace(new RegExp(cssUrl, 'g'), relativePath);
return Promise.resolve();
return context.loadResource(childResource).then(function handleLoadedSource (loadedResource) {
if(loadedResource){
resource.updateChild(childResource, loadedResource);
var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
text = changeExactlyMatchedUrl(text, cssUrl, relativePath);
}
});
});
return utils.waitAllFulfilled(promises).then(function () {
return utils.waitAllFulfilled(promises).then(function updateCssText () {
resource.setText(text);

@@ -26,0 +39,0 @@ return resource;

@@ -10,3 +10,3 @@ var cheerio = require('cheerio');

rules.forEach(function (rule) {
rules.forEach(function loadForRule (rule) {
p = p.then(function loadResources () {

@@ -19,8 +19,13 @@ return loadResourcesForRule(context, resource, rule);

function loadTextToCheerio (text) {
return cheerio.load(text, {
decodeEntities: false
});
}
function beforeHandle (resource) {
var text = resource.getText();
var $ = cheerio.load(text);
var $ = loadTextToCheerio(text);
// Handle <base> tag
$('base').each(function () {
$('base').each(function handleBaseTag () {
var el = $(this);

@@ -62,3 +67,3 @@ var href = el.attr('href');

attributeValue: el.attr(attrName)
}
};
}

@@ -82,3 +87,6 @@

return context.loadResource(childResource).then(function updateSrcsetPart (loadedResource) {
imgScrsetPart.url = loadedResource.getFilename();
if(loadedResource){
parentResource.updateChild(childResource, loadedResource);
imgScrsetPart.url = loadedResource.getFilename();
}
});

@@ -105,3 +113,11 @@ }).then(function updateSrcset () {

return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
if(!loadedResource){
return attr;
}
parentResource.updateChild(htmlResource, loadedResource);
var relativePath = utils.getRelativePath(parentResource.getFilename(), loadedResource.getFilename());
if(context.options.prettifyUrls){
relativePath = relativePath.replace(context.options.defaultFilename, '');
}
var hash = utils.getHashFromUrl(attr);

@@ -113,3 +129,3 @@

return Promise.resolve(relativePath);
return relativePath;
});

@@ -120,3 +136,3 @@ }

var text = resource.getText();
var $ = cheerio.load(text);
var $ = loadTextToCheerio(text);

@@ -134,5 +150,5 @@ var promises = $(rule.selector).map(function loadForElement () {

return Promise.reject();
});
}).get();
return utils.waitAllFulfilled(promises).then(function () {
return utils.waitAllFulfilled(promises).then(function updateHtmlText () {
text = $.html();

@@ -139,0 +155,0 @@ resource.setText(text);

@@ -1,2 +0,2 @@

var _ = require('underscore');
var _ = require('lodash');
var Promise = require('bluebird');

@@ -6,23 +6,7 @@ var request = require('request');

- var defaultOptions = {
-   method: 'GET',
-   encoding: 'binary',
-   strictSSL: false,
-   jar: true,
-   gzip: true
- };
- function getDefaultOptions() {
-   return defaultOptions;
- }
- function getCustomOptions(options) {
-   return _.extend({}, defaultOptions, options);
- }
- function makeRequest(options, url) {
-   var requestOptions = getCustomOptions(options);
+ function makeRequest (options, url) {
+   var requestOptions = _.clone(options);
requestOptions.url = url;
- return get(requestOptions).then(function handleResponse(data) {
+ return get(requestOptions).then(function handleResponse (data) {
return {

@@ -35,4 +19,2 @@ url: data.request.href,

- module.exports.makeRequest = makeRequest;
- module.exports.getDefaultOptions = getDefaultOptions;
- module.exports.getCustomOptions = getCustomOptions;
+ module.exports = makeRequest;
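
In 0.3.6 this module owned the default request options and exported three helpers; in 1.0.0 it exports `makeRequest` directly and expects fully merged options from the caller (the defaults now live in `lib/config/defaults.js`). A sketch of the new call pattern, mirroring the `bind` in scraper.js further down; the URL is a placeholder:

```javascript
// Sketch of the 1.0.0 request module usage; the url is a placeholder.
var makeRequest = require('./lib/request');

var requestOptions = { encoding: 'binary', strictSSL: false, jar: true, gzip: true };
var boundRequest = makeRequest.bind(null, requestOptions); // as scraper.js does

boundRequest('http://example.com/').then(function (data) {
  // handleResponse above resolves with at least { url, body }
  console.log(data.url, data.body.length);
});
```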

@@ -1,2 +0,2 @@

var _ = require('underscore');
var _ = require('lodash');
var path = require('path');

@@ -8,3 +8,3 @@ var types = require('./config/resource-types');

var type = _.findKey(typesByHtmlData, function containsHtmlData (rules) {
return _.findWhere(rules, htmlData);
return _.find(rules, htmlData);
});

@@ -17,2 +17,3 @@ return type || types.other;

this.filename = filename;
this.children = [];
}

@@ -28,5 +29,18 @@

this.children.push(child);
return child;
};
Resource.prototype.updateChild = function updateChild (oldChild, newChild) {
var index = _.indexOf(this.children, oldChild);
if (index >= 0) {
this.children[index] = newChild;
}
};
Resource.prototype.getChildren = function getChildren () {
return this.children;
};
Resource.prototype.getUrl = function getUrl () {

@@ -95,3 +109,3 @@ return this.url;

var parent = this.parent;
var hasHtmlData = !!this.htmlData;
var hasHtmlData = !_.isEmpty(this.htmlData);

@@ -98,0 +112,0 @@ switch (true) {

@@ -9,3 +9,3 @@ var Promise = require('bluebird');

var path = require('path');
var _ = require('underscore');
var _ = require('lodash');

@@ -15,4 +15,6 @@ var defaults = require('./config/defaults');

var utils = require('./utils.js');
var request = require('./request');
var Resource = require('./resource');
var getFilenameGenerator = require('./filename-generators/filename-generator-getter');
var makeRequest = require('./request');
var compareUrls = require('compare-urls');

@@ -23,3 +25,3 @@

function loadHtmlAndCss (context, po) {
return loadHtml(context, po).then(function (loaded) {
return loadHtml(context, po).then(function loadCssFromHtml (loaded) {
return loadCss(context, loaded);

@@ -34,7 +36,9 @@ });

this.options = _.extend({}, defaults, options);
this.options.request = _.extend({}, defaults.request, options.request);
this.options.directory = path.resolve(process.cwd(), this.options.directory || '');
this.options.filenameGenerator = getFilenameGenerator(this.options.filenameGenerator);
}
Scraper.prototype.getLoadedResource = function getLoadedResource (resource) {
return _.find(this.loadedResources, function(lr) {
return _.find(this.loadedResources, function checkUrlsEqual (lr) {
return compareUrls(resource.getUrl(), lr.getUrl());

@@ -48,8 +52,2 @@ });

- Scraper.prototype.getOccupiedFilenames = function getOccupiedFilenames () {
-   var subdirectories = _.map(this.options.subdirectories, function (dir) { return dir.directory; });
-   var loadedFiles = _.map(this.loadedResources, function(r) { return r.getFilename(); });
-   return subdirectories.concat(loadedFiles);
- };
Scraper.prototype.getHtmlSources = function getHtmlSources () {

@@ -59,32 +57,2 @@ return this.options.sources;

- Scraper.prototype.generateFilename = function generateFilename (resource) {
-   var self = this;
-   var occupiedFilenames = self.getOccupiedFilenames();
-   var preferredFilename = resource.getFilename(); // which was set in options
-   var urlFilename = utils.getFilenameFromUrl(resource.getUrl()); // try to get filename from url
-   var filename = preferredFilename || urlFilename || self.options.defaultFilename;
-   var ext = path.extname(filename);
-   var dir = self.getDirectoryByExtension(ext);
-   var currentFilename = path.join(dir, filename);
-   var basename = path.basename(filename, ext);
-   var index = 1;
-   while (_.contains(occupiedFilenames, currentFilename)) {
-     currentFilename = path.join(dir, basename + '_' + index + ext);
-     index++;
-   }
-   return currentFilename;
- };
- Scraper.prototype.getDirectoryByExtension = function getDirectoryByExtension (ext) {
-   return _.chain(this.options.subdirectories)
-     .filter(function (dir) { return _.contains(dir.extensions, ext); })
-     .map(function (dir) { return dir.directory; })
-     .first()
-     .value() || '';
- };
Scraper.prototype.getResourceHandler = function getHandler (resource) {

@@ -106,10 +74,12 @@ var self = this;

var loaded = self.getLoadedResource(resource); // try to find already loaded
if(!self.options.urlFilter(resource.url)){
return Promise.resolve(null);
}
var url = resource.getUrl();
var filename;
var handleFile;
// try to find already loaded
var loaded = self.getLoadedResource(resource);
if (!loaded) {
filename = self.generateFilename(resource);
var url = resource.getUrl();
var filename = self.options.filenameGenerator(resource, self.options, self.loadedResources);
resource.setFilename(filename);

@@ -120,12 +90,12 @@

// Request -> processing -> save to fs
return self.makeRequest(url).then(function requestCompleted(data) {
return self.makeRequest(url).then(function requestCompleted (data) {
resource.setUrl(data.url); // Url may be changed in redirects
resource.setText(data.body);
handleFile = self.getResourceHandler(resource);
var handleFile = self.getResourceHandler(resource);
return handleFile(self, resource);
}).then(function fileHandled() {
}).then(function fileHandled () {
var filename = path.join(self.options.directory, resource.getFilename());
var text = resource.getText();
return outputFileAsync(filename, text, { encoding: 'binary' });
}).then(function fileSaved() {
}).then(function fileSaved () {
return Promise.resolve(resource);

@@ -150,7 +120,7 @@ });

// Create makeRequest function with custom request params
self.makeRequest = request.makeRequest.bind(null, self.options.request);
self.makeRequest = makeRequest.bind(null, self.options.request);
// Create array of Resource for downloading
self.options.urls = _.isArray(self.options.urls) ? self.options.urls : [self.options.urls];
self.originalResources = _.map(self.options.urls, function createResource(obj) {
self.originalResources = _.map(self.options.urls, function createResource (obj) {
var url = _.isObject(obj) && _.has(obj, 'url') ? obj.url : obj;

@@ -171,8 +141,3 @@ var filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename;

return Promise.map(self.originalResources, function loadPage (po) {
return self.loadResource(po).then(function pageLoaded (loaded) {
return Promise.resolve({
url: loaded.getUrl(),
filename: loaded.getFilename()
});
});
return self.loadResource(po).then(utils.createOutputObject);
});

@@ -188,3 +153,3 @@ };

Scraper.prototype.scrape = function scrape(callback) {
Scraper.prototype.scrape = function scrape (callback) {
var self = this;

@@ -191,0 +156,0 @@ return Promise.bind(self)

var url = require('url');
var path = require('path');
var _ = require('lodash');
var Promise = require('bluebird');
function isUrl(path) {
function isUrl (path) {
var urlRegexp = /^((http[s]?:)?\/\/)/;

@@ -20,7 +21,7 @@ return urlRegexp.test(path);

function getUnixPath(filepath) {
function getUnixPath (filepath) {
return filepath.replace(/\\/g, '/');
}
function getRelativePath(path1, path2) {
function getRelativePath (path1, path2) {
var dirname = path.dirname(path1);

@@ -39,4 +40,4 @@ var relativePath = path.relative(dirname, path2);

function waitAllFulfilled(promises) {
return Promise.all(promises.map(function(promise) {
function waitAllFulfilled (promises) {
return Promise.all(promises.map(function returnWhenFulfilled (promise) {
return promise.reflect();

@@ -46,2 +47,15 @@ }));

function createOutputObject (resource) {
var assets = _(resource.getChildren())
.map(createOutputObject)
.uniq()
.value();
return {
url: resource.getUrl(),
filename: resource.getFilename(),
assets: assets
};
}
module.exports = {

@@ -54,3 +68,4 @@ isUrl: isUrl,

getHashFromUrl: getHashFromUrl,
waitAllFulfilled: waitAllFulfilled
waitAllFulfilled: waitAllFulfilled,
createOutputObject: createOutputObject
};
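
Two utils changes matter for callers: `waitAllFulfilled` uses Bluebird's `.reflect()` so a single failed resource no longer rejects the whole batch, and the new `createOutputObject` builds the nested result described in the README section below. A small sketch of the settling behavior, with made-up promises:

```javascript
// Sketch of what .reflect() buys waitAllFulfilled: Promise.all waits for every
// promise to settle instead of rejecting on the first failure (promises are made up).
var Promise = require('bluebird');

var promises = [
  Promise.resolve('loaded'),
  Promise.reject(new Error('one resource failed'))
];

Promise.all(promises.map(function (p) { return p.reflect(); }))
  .then(function (inspections) {
    inspections.forEach(function (i) {
      console.log(i.isFulfilled() ? i.value() : i.reason().message);
    });
  });
// -> 'loaded', then 'one resource failed'; the chain itself never rejects
```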
{
"name": "website-scraper",
"version": "0.3.6",
"version": "1.0.0",
"description": "Download website to a local directory (including all css, images, js, etc.)",

@@ -8,3 +8,4 @@ "readmeFilename": "README.md",

"scripts": {
"test": "istanbul cover ./node_modules/mocha/bin/_mocha --dir ./coverage --report lcov -- -R spec --recursive ./test"
"test": "istanbul cover _mocha --dir ./coverage --report lcov -- -R spec --recursive --timeout 7000 ./test && npm run eslint",
"eslint": "eslint lib/** index.js"
},

@@ -36,17 +37,19 @@ "repository": {

"bluebird": "^3.0.1",
"cheerio": "0.11.0",
"cheerio": "0.20.0",
"compare-urls": "^1.0.0",
"css-url-parser": "^0.1.0",
"fs-extra": "^0.26.0",
"css-url-parser": "^1.0.0",
"fs-extra": "^0.29.0",
"lodash": "^4.11.1",
"request": "^2.42.0",
"srcset": "^1.0.0",
"underscore": "^1.7.0"
"srcset": "^1.0.0"
},
"devDependencies": {
"codeclimate-test-reporter": "^0.1.0",
"codeclimate-test-reporter": "^0.3.1",
"coveralls": "^2.11.8",
"eslint": "^2.8.0",
"istanbul": "^0.4.0",
"mocha": "^2.2.5",
"nock": "^2.9.1",
"nock": "^8.0.0",
"proxyquire": "^1.7.3",
"should": "^7.0.2",
"should": "^8.2.2",
"sinon": "^1.15.4",

@@ -53,0 +56,0 @@ "sinon-as-promised": "^4.0.0"

@@ -10,2 +10,3 @@ ## Introduction

[![Dependency Status](https://david-dm.org/s0ph1e/node-website-scraper.svg?style=flat)](https://david-dm.org/s0ph1e/node-website-scraper)
[![Gitter](https://badges.gitter.im/s0ph1e/node-website-scraper.svg)](https://gitter.im/s0ph1e/node-website-scraper?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)

@@ -46,7 +47,10 @@ [![NPM Stats](https://nodei.co/npm/website-scraper.png?downloadRank=true&stars=true)](https://www.npmjs.org/package/website-scraper)

- `urls:` array of urls to load and filenames for them *(required, see example below)*
- `directory:` path to save loaded files *(required)*
- `defaultFilename:` filename for index page *(optional, default: 'index.html')*
- `sources:` array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)*
- `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- `urls`: array of urls to load and filenames for them *(required, see example below)*
- `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)*
- `directory`: path to save loaded files *(required)*
- `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')*
- `defaultFilename`: filename for index page *(optional, default: 'index.html')*
- `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)*
- `sources`: array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)*
- `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*

@@ -59,8 +63,22 @@ - `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*

- `error:` if error - `Error` object, if success - `null`
- `result:` if error - `null`, if success - array if objects containing:
- `url:` url of loaded page
- `filename:` filename where page was saved (relative to `directory`)
- `error`: if error - `Error` object, if success - `null`
- `result`: if error - `null`, if success - array of objects containing:
- `url`: url of loaded page
- `filename`: filename where page was saved (relative to `directory`)
- `assets`: array of children resources (each of them contains `url`, `filename`, `assets`)
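
A hypothetical result for a single page, showing the new `assets` nesting (urls and filenames are illustrative and depend on your options):

```javascript
// Hypothetical 1.0.0 result shape.
[
  {
    url: 'http://example.com/',
    filename: 'index.html',
    assets: [
      { url: 'http://example.com/style.css', filename: 'css/style.css', assets: [] },
      { url: 'http://example.com/logo.png', filename: 'images/logo.png', assets: [] }
    ]
  }
]
```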
### Filename Generators
The filename generator determines where the scraped files are saved.
#### byType (default)
When the `byType` filenameGenerator is used, the downloaded files are saved by type (as defined by the `subdirectories` setting)
or directly in the `directory` folder if no subdirectory is specified for that type.
#### bySiteStructure
When the `bySiteStructure` filenameGenerator is used, the downloaded files are saved in `directory` using the same structure as on the website:
- `/` => `DIRECTORY/index.html`
- `/about` => `DIRECTORY/about/index.html`
- `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js`
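
The options list above also allows a custom filenameGenerator function. Based on the scraper.js call site in this diff, it is invoked as `filenameGenerator(resource, options, loadedResources)` and should return a filename relative to `directory`. A hedged sketch (the generator name and its no-collision simplification are ours, not the package's):

```javascript
// Sketch of a custom filenameGenerator; it ignores filename collisions,
// which the bundled generators handle for you.
var path = require('path');
var url = require('url');
var scraper = require('website-scraper');

function flatFilenameGenerator (resource, options, loadedResources) {
  var pathname = url.parse(resource.getUrl()).pathname || '/';
  var name = path.basename(pathname);       // '' for the site root
  return name || options.defaultFilename;   // fall back to index.html
}

scraper.scrape({
  urls: ['http://example.com/'],
  directory: '/path/to/save',
  filenameGenerator: flatFilenameGenerator
}).then(console.log).catch(console.log);
```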
## Examples

@@ -123,1 +141,33 @@ #### Example 1

```
#### Example 3. Filtering out external resources
```javascript
// Links to other websites are filtered out by the urlFilter
var scraper = require('website-scraper');
scraper.scrape({
urls: ['http://example.com/'],
urlFilter: function(url){
return url.indexOf('http://example.com') === 0;
},
directory: '/path/to/save'
}).then(console.log).catch(console.log);
```
#### Example 4. Downloading an entire website
```javascript
// Downloads all the crawlable files of example.com.
// The files are saved in the same structure as the structure of the website, by using the `bySiteStructure` filenameGenerator.
// Links to other websites are filtered out by the urlFilter
var scraper = require('website-scraper');
scraper.scrape({
urls: ['http://example.com/'],
urlFilter: function(url){
return url.indexOf('http://example.com') === 0;
},
recursive: true,
maxDepth: 100,
prettifyUrls: true,
filenameGenerator: 'bySiteStructure',
directory: '/path/to/save'
}).then(console.log).catch(console.log);
```
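
For reference, `prettifyUrls: true` in Example 4 means the links the scraper rewrites have the `defaultFilename` stripped, roughly as the html handler diff above does it (values assumed):

```javascript
// Illustrative effect of prettifyUrls on a rewritten link.
var defaultFilename = 'index.html';
var relativePath = 'about/index.html';
relativePath = relativePath.replace(defaultFilename, ''); // -> 'about/'
```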

