website-scraper - npm Package Compare versions

Comparing version 2.1.1 to 2.2.0


lib/config/defaults.js

@@ -33,10 +33,11 @@ var config = {
 	},
-	urlFilter: function urlFilter () {
-		return true;
-	},
+	urlFilter: null,
 	recursive: false,
 	maxDepth: null,
-	ignoreErrors: true
+	ignoreErrors: true,
+	httpResponseHandler: null,
+	onResourceSaved: null,
+	onResourceError: null
 };
 
 module.exports = config;

lib/request.js

@@ -1,7 +1,9 @@
-var _ = require('lodash');
-var Promise = require('bluebird');
-var request = require('request');
-var get = Promise.promisify(request.get);
-var logger = require('./logger');
+'use strict';
+
+const _ = require('lodash');
+const Promise = require('bluebird');
+const request = require('request');
+const get = Promise.promisify(request.get);
+const logger = require('./logger');
 
 function getMimeType (contentType) {

@@ -11,23 +13,68 @@ return contentType ? contentType.split(';')[0] : null;

-function makeRequest (options, url, referer) {
-	var requestOptions = _.clone(options);
-	requestOptions.url = url;
-
-	if (referer) {
-		requestOptions.headers = requestOptions.headers || {};
-		requestOptions.headers.referer = referer;
-	}
-
-	logger.debug(`[request] sending request for url ${url}, referer ${referer}`);
-
-	return get(requestOptions).then(function handleResponse (data) {
-		logger.debug(`[request] received response for ${data.request.href}, statusCode ${data.statusCode}`);
-		return {
-			url: data.request.href,
-			mimeType: getMimeType(data.headers['content-type']),
-			body: data.body
-		};
-	});
-}
+function defaultResponseHandler (response) {
+	return Promise.resolve(response.body);
+}
+
+function transformResult (result) {
+	switch (true) {
+		case _.isString(result):
+			return {
+				body: result,
+				metadata: null
+			};
+		case _.isPlainObject(result):
+			return {
+				body: result.body,
+				metadata: result.metadata || null
+			};
+		default:
+			throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
+	}
+}
+
+class Request {
+	/**
+	 *
+	 * @param {Object} options
+	 * @param {function} options.httpResponseHandler - custom response handler
+	 * @param {Object} options.request - custom options for request module
+	 */
+	constructor (options) {
+		this.handleResponse = options && options.httpResponseHandler ? options.httpResponseHandler : defaultResponseHandler;
+		this.options = options && options.request ? _.clone(options.request) : {};
+	}
+
+	/**
+	 * Performs get request to url and returns data for resource
+	 * @param {string} url - url of resource
+	 * @param {string} referer - url of parent resource
+	 * @return {Promise}
+	 */
+	get (url, referer) {
+		let requestOptions = _.clone(this.options);
+		requestOptions.url = url;
+
+		if (referer) {
+			requestOptions.headers = requestOptions.headers || {};
+			requestOptions.headers.referer = referer;
+		}
+
+		logger.debug(`[request] sending request for url ${url}, referer ${referer}`);
+
+		return get(requestOptions).then((response) => {
+			logger.debug(`[request] received response for ${response.request.href}, statusCode ${response.statusCode}`);
+			return this.handleResponse(response)
+				.then(transformResult)
+				.then((responseHandlerResult) => {
+					return {
+						url: response.request.href,
+						mimeType: getMimeType(response.headers['content-type']),
+						body: responseHandlerResult.body,
+						metadata: responseHandlerResult.metadata
+					};
+				});
+		});
+	}
+}
+
-module.exports = makeRequest;
+module.exports = Request;
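A quick sketch of the contract that `transformResult` enforces above: a custom `httpResponseHandler` may resolve with either a plain string or a `{body, metadata}` object, and both shapes are normalized before the resource data is built (handler names here are illustrative, not part of the package):

```javascript
// Resolving with a string: transformResult wraps it as {body, metadata: null}.
const stringHandler = (response) => Promise.resolve(response.body);

// Resolving with an object: body is required, metadata is kept as-is
// and later attached to the saved Resource via resource.setMetadata().
const objectHandler = (response) => Promise.resolve({
	body: response.body,
	metadata: {headers: response.headers}
});
```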

lib/resource.js

@@ -91,2 +91,6 @@ var types = require('./config/resource-types');
 
+Resource.prototype.setMetadata = function setMetadata (metadata) {
+	this.metadata = metadata;
+};
+
 module.exports = Resource;

lib/scraper.js

@@ -11,3 +11,3 @@ var Promise = require('bluebird');
 var FilenameGenerator = require('./filename-generator');
-var makeRequest = require('./request');
+var Request = require('./request');
 var ResourceHandler = require('./resource-handler');

@@ -32,3 +32,3 @@ var FSAdapter = require('./fs-adaper');
-	self.makeRequest = makeRequest.bind(null, self.options.request);
+	self.request = new Request(self.options);
 	self.resourceHandler = new ResourceHandler(self.options, self);

@@ -72,5 +72,9 @@ self.filenameGenerator = new FilenameGenerator(self.options);
 		return self.fsAdapter.saveResource(resource);
+	}).then(function afterResourceSaved () {
+		if (self.options.onResourceSaved) {
+			self.options.onResourceSaved(resource);
+		}
 	}).catch(function handleError (err) {
 		logger.warn('failed to save resource ' + resource);
-		return self.handleError(err);
+		return self.handleError(err, resource);
 	});

@@ -86,3 +90,3 @@ };
 	var referer = resource.parent ? resource.parent.getUrl() : null;
-	return self.makeRequest(url, referer);
+	return self.request.get(url, referer);
 }).then(function requestCompleted (responseData) {

@@ -111,2 +115,6 @@
+	if (responseData.metadata) {
+		resource.setMetadata(responseData.metadata);
+	}
+
 	resource.setText(responseData.body);

@@ -117,3 +125,3 @@ self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
 	logger.warn('failed to request resource ' + resource);
-	return self.handleError(err);
+	return self.handleError(err, resource);
 });

@@ -129,3 +137,3 @@
-	if (!self.options.urlFilter(url)) {
+	if (self.options.urlFilter && !self.options.urlFilter(url)) {
 		logger.debug('filtering out ' + resource + ' by url filter');

@@ -175,3 +183,6 @@ return Promise.resolve(null);
-Scraper.prototype.handleError = function handleError (err) {
+Scraper.prototype.handleError = function handleError (err, resource) {
+	if (resource && this.options.onResourceError) {
+		this.options.onResourceError(resource, err);
+	}
 	if (this.options.ignoreErrors) {

@@ -178,0 +189,0 @@ logger.warn('ignoring error: ' + err.message);

package.json

 {
 	"name": "website-scraper",
-	"version": "2.1.1",
+	"version": "2.2.0",
 	"description": "Download website to a local directory (including all css, images, js, etc.)",

@@ -29,3 +29,3 @@ "readmeFilename": "README.md",
 	],
-	"author": "s0ph1e",
+	"author": "Sophia Antipenko <sophia@antipenko.pp.ua>",
 	"license": "MIT",

@@ -32,0 +32,0 @@ "bugs": {

README.md

@@ -34,9 +34,11 @@ ## Introduction

-// with callback
-scrape(options, function (error, result) {
+// with promise
+scrape(options).then((result) => {
 	/* some code here */
+}).catch((err) => {
+	/* some code here */
+});
 
-// or with promise
-scrape(options).then(function (result) {
+// or with callback
+scrape(options, (error, result) => {
 	/* some code here */

@@ -46,62 +48,24 @@ });

 ## API
-### scrape(options, callback)
-Makes requests to `urls` and saves all files found with `sources` to `directory`.
-
-**options** - object containing next options:
- - `urls`: array of urls to load and filenames for them *(required, see example below)*
- - `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)*
- - `directory`: path to save loaded files *(required)*
- - `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')*
- - `defaultFilename`: filename for index page *(optional, default: 'index.html')*
- - `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)*
- - `sources`: array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see example below)*
- - `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- - `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
- - `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
- - `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*
- - `ignoreErrors`: boolean, if `true` scraper will continue downloading resources after an error occurred, if `false` - scraper will finish the process and return the error *(optional, default: true)*
+## options
+* [urls](#urls) - urls to download, *required*
+* [directory](#directory) - path to save files, *required*
+* [sources](#sources) - selects which resources should be downloaded
+* [recursive](#recursive) - follow anchors in html files
+* [maxDepth](#maxdepth) - maximum depth for dependencies
+* [request](#request) - custom options for [request](https://github.com/request/request)
+* [subdirectories](#subdirectories) - subdirectories for file extensions
+* [defaultFilename](#defaultfilename) - filename for index page
+* [prettifyUrls](#prettifyurls) - prettify urls
+* [ignoreErrors](#ignoreerrors) - whether to ignore errors on resource downloading
+* [urlFilter](#urlfilter) - skip some urls
+* [filenameGenerator](#filenamegenerator) - generate filename for downloaded resource
+* [httpResponseHandler](#httpresponsehandler) - customize http response handling
+* [onResourceSaved](#onresourcesaved) - callback called when resource is saved
+* [onResourceError](#onresourceerror) - callback called when resource's downloading has failed
+
+Default options can be found in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js).
-
-**callback** - callback function *(optional)*, includes the following parameters:
- - `error`: if error - `Error` object, if success - `null`
- - `result`: if error - `null`, if success - array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects containing:
-   - `url`: url of loaded page
-   - `filename`: filename where page was saved (relative to `directory`)
-   - `children`: array of children Resources
-
-### Filename Generators
-The filename generator determines where the scraped files are saved.
-
-#### byType (default)
-When the `byType` filenameGenerator is used the downloaded files are saved by type (as defined by the `subdirectories` setting)
-or directly in the `directory` folder, if no subdirectory is specified for the specific type.
-
-#### bySiteStructure
-When the `bySiteStructure` filenameGenerator is used the downloaded files are saved in `directory` using the same structure as on the website:
- - `/` => `DIRECTORY/index.html`
- - `/about` => `DIRECTORY/about/index.html`
- - `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js`
-
-## Examples
-#### Example 1
-Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
-Imagine we want to load:
- - [Home page](http://nodejs.org/) to `index.html`
- - [About page](http://nodejs.org/about/) to `about.html`
- - [Blog](http://blog.nodejs.org/) to `blog.html`
-
-and separate files into directories:
- - `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
- - `js` for .js (full path `/path/to/save/js`)
- - `css` for .css (full path `/path/to/save/css`)
+
+#### urls
+Array of objects which contain urls to download and filenames for them. **_Required_**.
+
 ```javascript
 var scrape = require('website-scraper');
 scrape({

@@ -113,8 +77,16 @@ urls: [

 	],
 	directory: '/path/to/save'
 }).then(console.log).catch(console.log);
 ```
+
+#### directory
+String, absolute path to directory where downloaded files will be saved. The directory should not exist - it will be created by the scraper. **_Required_**.
+
+#### sources
+Array of objects to download, specifies selectors and attribute values to select files for downloading. By default the scraper tries to download all possible resources.
+
 ```javascript
+// Downloading images, css files and scripts
 scrape({
 	urls: ['http://nodejs.org/'],
 	directory: '/path/to/save',
-	subdirectories: [
-		{directory: 'img', extensions: ['.jpg', '.png', '.svg']},
-		{directory: 'js', extensions: ['.js']},
-		{directory: 'css', extensions: ['.css']}
-	],
 	sources: [

@@ -124,3 +96,18 @@ {selector: 'img', attr: 'src'},

 	{selector: 'script', attr: 'src'}
-	],
+	]
 }).then(console.log).catch(console.log);
```
+#### recursive
+Boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading. Defaults to `false`.
+
+#### maxDepth
+Positive number, maximum allowed depth for dependencies. Defaults to `null` - no maximum depth set.
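These two options work together; a minimal sketch (urls and paths are placeholders, mirroring the `Example 2. Recursive downloading` snippet removed later in this diff):

```javascript
// Anchors on the start page are followed; links found on those pages
// are ignored because their depth of 2 exceeds maxDepth.
var scrape = require('website-scraper');
scrape({
	urls: ['http://example.com/'],
	directory: '/path/to/save',
	recursive: true,
	maxDepth: 1
}).then(console.log).catch(console.log);
```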
+#### request
+Object, custom options for [request](https://github.com/request/request#requestoptions-callback). Allows setting cookies, userAgent, etc.
+
+```javascript
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	request: {

@@ -131,23 +118,35 @@ headers: {

 	}
-}).then(function (result) {
-	console.log(result);
-}).catch(function(err){
-	console.log(err);
-});
+}).then(console.log).catch(console.log);
```
-#### Example 2. Recursive downloading
+#### subdirectories
+Array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory`.
+
 ```javascript
-// Links from example.com will be followed
-// Links from links will be ignored because their depth = 2 is greater than maxDepth
-var scrape = require('website-scraper');
+/* Separate files into directories:
+  - `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
+  - `js` for .js (full path `/path/to/save/js`)
+  - `css` for .css (full path `/path/to/save/css`)
+*/
 scrape({
-	urls: ['http://example.com/'],
+	urls: ['http://example.com'],
 	directory: '/path/to/save',
-	recursive: true,
-	maxDepth: 1
+	subdirectories: [
+		{directory: 'img', extensions: ['.jpg', '.png', '.svg']},
+		{directory: 'js', extensions: ['.js']},
+		{directory: 'css', extensions: ['.css']}
+	]
 }).then(console.log).catch(console.log);
```
-#### Example 3. Filtering out external resources
+#### defaultFilename
+String, filename for index page. Defaults to `index.html`.
+
+#### prettifyUrls
+Boolean, whether urls should be 'prettified' by having the `defaultFilename` removed. Defaults to `false`.
+
+#### ignoreErrors
+Boolean, if `true` scraper will continue downloading resources after an error occurs; if `false` scraper will finish the process and return the error. Defaults to `true`.
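A short sketch combining the three options above (values are illustrative only):

```javascript
// Save index pages as main.html, drop that filename from generated
// links, and abort the whole scrape on the first failed resource.
var scrape = require('website-scraper');
scrape({
	urls: ['http://example.com/'],
	directory: '/path/to/save',
	defaultFilename: 'main.html',
	prettifyUrls: true,
	ignoreErrors: false
}).then(console.log).catch(console.log);
```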
+
+#### urlFilter
+Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied.
```javascript

@@ -165,6 +164,16 @@ // Links to other websites are filtered out by the urlFilter

-#### Example 4. Downloading an entire website
+#### filenameGenerator
+String, name of one of the bundled filenameGenerators, or a custom filenameGenerator function. The filename generator determines where the scraped files are saved.
+
+###### byType (default)
+When the `byType` filenameGenerator is used the downloaded files are saved by type (as defined by the `subdirectories` setting) or directly in the `directory` folder, if no subdirectory is specified for the specific type.
+
+###### bySiteStructure
+When the `bySiteStructure` filenameGenerator is used the downloaded files are saved in `directory` using the same structure as on the website:
+- `/` => `DIRECTORY/index.html`
+- `/about` => `DIRECTORY/about/index.html`
+- `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js`
```javascript
-// Downloads all the crawlable files of example.com.
-// The files are saved in the same structure as the structure of the website, by using the `bySiteStructure` filenameGenerator.
+// Downloads all the crawlable files. The files are saved in the same structure as the structure of the website
 // Links to other websites are filtered out by the urlFilter

@@ -174,8 +183,5 @@ var scrape = require('website-scraper');

 	urls: ['http://example.com/'],
-	urlFilter: function(url){
-		return url.indexOf('http://example.com') === 0;
-	},
+	urlFilter: function(url){ return url.indexOf('http://example.com') === 0; },
 	recursive: true,
 	maxDepth: 100,
 	prettifyUrls: true,
 	filenameGenerator: 'bySiteStructure',

@@ -186,2 +192,63 @@ directory: '/path/to/save'

+#### httpResponseHandler
+Function which is called on each response; allows customizing the resource or rejecting its download.
+It takes 1 argument - the response object of the [request](https://github.com/request/request) module - and should return a resolved `Promise` if the resource should be downloaded, or a `Promise` rejected with an Error if it should be skipped.
+The Promise should be resolved with:
+* a `string` which contains the response body
+* or an object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.). The scraper will not use this field at all; it is only passed through to the result.
+```javascript
+// Rejecting resources with 404 status and adding metadata to other resources
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	httpResponseHandler: (response) => {
+		if (response.statusCode === 404) {
+			return Promise.reject(new Error('status is 404'));
+		} else {
+			// if you don't need metadata - you can just return Promise.resolve(response.body)
+			return Promise.resolve({
+				body: response.body,
+				metadata: {
+					headers: response.headers,
+					someOtherData: [ 1, 2, 3 ]
+				}
+			});
+		}
+	}
+}).then(console.log).catch(console.log);
+```
+The scrape function resolves with an array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects which contain the `metadata` property from `httpResponseHandler`.
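For example, the metadata attached by a handler could be read back from the result like this (a sketch reusing a simplified version of the handler above):

```javascript
var scrape = require('website-scraper');
scrape({
	urls: ['http://example.com/'],
	directory: '/path/to/save',
	httpResponseHandler: (response) => Promise.resolve({
		body: response.body,
		metadata: {headers: response.headers}
	})
}).then((resources) => {
	// metadata set by the handler is available on each saved Resource
	resources.forEach((r) => console.log(r.url, r.metadata.headers));
});
```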
+#### onResourceSaved
+Function called each time a resource is saved to the file system. The callback is called with a [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) object. Defaults to `null` - no callback will be called.
+
+```javascript
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	onResourceSaved: (resource) => {
+		console.log(`Resource ${resource} was saved to fs`);
+	}
+})
+```
+#### onResourceError
+Function called each time a resource's downloading/handling/saving to fs has failed. The callback is called with a [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) object and an `Error` object. Defaults to `null` - no callback will be called.
+
+```javascript
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	onResourceError: (resource, err) => {
+		console.log(`Resource ${resource} was not saved because of ${err}`);
+	}
+})
+```
+## callback
+Callback function, optional, includes the following parameters:
+- `error`: if error - `Error` object, if success - `null`
+- `result`: if error - `null`, if success - array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects containing:
+  - `url`: url of loaded page
+  - `filename`: filename where page was saved (relative to `directory`)
+  - `children`: array of children Resources
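A callback-style sketch of a basic call, mirroring the promise examples above (urls and paths are placeholders):

```javascript
var scrape = require('website-scraper');
scrape({
	urls: ['http://example.com/'],
	directory: '/path/to/save'
}, (error, result) => {
	if (error) {
		return console.log('scraping failed:', error.message);
	}
	result.forEach((resource) => console.log(resource.url, resource.filename));
});
```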
## Log and debug

@@ -188,0 +255,0 @@ This module uses [debug](https://github.com/visionmedia/debug) to log events. To enable logs you should use environment variable `DEBUG`.

Sorry, the diff of this file is not supported yet
