website-scraper
Comparing version 2.1.1 to 2.2.0
lib/config/defaults.js

@@ -33,10 +33,11 @@ var config = {
 	},
-	urlFilter: function urlFilter () {
-		return true;
-	},
+	urlFilter: null,
 	recursive: false,
 	maxDepth: null,
-	ignoreErrors: true
+	ignoreErrors: true,
+	httpResponseHandler: null,
+	onResourceSaved: null,
+	onResourceError: null
 };
 module.exports = config;
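All three new options default to `null`, so the new hooks are strictly opt-in. A minimal sketch of turning them on (assuming, as elsewhere in the docs, that user options simply override these defaults):

```javascript
const scrape = require('website-scraper');

scrape({
  urls: ['http://example.com/'],
  directory: '/path/to/save',
  // each of these replaces a null default from lib/config/defaults.js
  urlFilter: (url) => url.indexOf('http://example.com') === 0,
  onResourceSaved: (resource) => console.log(`saved ${resource}`),
  onResourceError: (resource, err) => console.log(`failed ${resource}: ${err.message}`)
}).then(console.log).catch(console.log);
```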
lib/request.js

@@ -1,7 +1,9 @@
-var _ = require('lodash');
-var Promise = require('bluebird');
-var request = require('request');
-var get = Promise.promisify(request.get);
-var logger = require('./logger');
+'use strict';
+
+const _ = require('lodash');
+const Promise = require('bluebird');
+const request = require('request');
+const get = Promise.promisify(request.get);
+const logger = require('./logger');
 
 function getMimeType (contentType) {
@@ -11,23 +13,68 @@ return contentType ? contentType.split(';')[0] : null;
-function makeRequest (options, url, referer) {
-	var requestOptions = _.clone(options);
-	requestOptions.url = url;
+function defaultResponseHandler (response) {
+	return Promise.resolve(response.body);
+}
 
-	if (referer) {
-		requestOptions.headers = requestOptions.headers || {};
-		requestOptions.headers.referer = referer;
-	}
+function transformResult (result) {
+	switch (true) {
+		case _.isString(result):
+			return {
+				body: result,
+				metadata: null
+			};
+		case _.isPlainObject(result):
+			return {
+				body: result.body,
+				metadata: result.metadata || null
+			};
+		default:
+			throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
+	}
+}
 
-	logger.debug(`[request] sending request for url ${url}, referer ${referer}`);
+class Request {
+	/**
+	 * @param {Object} options
+	 * @param {function} options.httpResponseHandler - custom response handler
+	 * @param {Object} options.request - custom options for request module
+	 */
+	constructor (options) {
+		this.handleResponse = options && options.httpResponseHandler ? options.httpResponseHandler : defaultResponseHandler;
+		this.options = options && options.request ? _.clone(options.request) : {};
+	}
 
-	return get(requestOptions).then(function handleResponse (data) {
-		logger.debug(`[request] received response for ${data.request.href}, statusCode ${data.statusCode}`);
-		return {
-			url: data.request.href,
-			mimeType: getMimeType(data.headers['content-type']),
-			body: data.body
-		};
-	});
-}
+	/**
+	 * Performs get request to url and returns data for resource
+	 * @param {string} url - url of resource
+	 * @param {string} referer - url of parent resource
+	 * @return {Promise}
+	 */
+	get (url, referer) {
+		let requestOptions = _.clone(this.options);
+		requestOptions.url = url;
+		if (referer) {
+			requestOptions.headers = requestOptions.headers || {};
+			requestOptions.headers.referer = referer;
+		}
+		logger.debug(`[request] sending request for url ${url}, referer ${referer}`);
+		return get(requestOptions).then((response) => {
+			logger.debug(`[request] received response for ${response.request.href}, statusCode ${response.statusCode}`);
+			return this.handleResponse(response)
+				.then(transformResult)
+				.then((responseHandlerResult) => {
+					return {
+						url: response.request.href,
+						mimeType: getMimeType(response.headers['content-type']),
+						body: responseHandlerResult.body,
+						metadata: responseHandlerResult.metadata
+					};
+				});
+		});
+	}
+}
 
-module.exports = makeRequest;
+module.exports = Request;
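`transformResult` defines the contract for `httpResponseHandler` return values. For illustration (it is private to `lib/request.js`, so it cannot actually be imported; the behavior follows directly from the switch above):

```javascript
// A plain string becomes the body, with no metadata:
transformResult('<html></html>');
// => { body: '<html></html>', metadata: null }

// A plain object must carry `body`; `metadata` is optional pass-through data:
transformResult({ body: '<html></html>', metadata: { headers: {} } });
// => { body: '<html></html>', metadata: { headers: {} } }

// Anything else throws, which inside the promise chain becomes a rejection
// and skips the resource:
transformResult(42);
// => throws Error('Wrong response handler result. Expected string or object, but received number')
```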
lib/resource.js

@@ -91,2 +91,6 @@ var types = require('./config/resource-types');
+Resource.prototype.setMetadata = function setMetadata (metadata) {
+	this.metadata = metadata;
+};
+
 module.exports = Resource;
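Downstream, `scrape` resolves with these same Resource objects, so whatever an `httpResponseHandler` stored via `setMetadata` is readable on the result. A sketch (`metadata` stays undefined unless a handler supplied it):

```javascript
const scrape = require('website-scraper');

scrape({ urls: ['http://example.com/'], directory: '/path/to/save' }).then((result) => {
  result.forEach((resource) => {
    // metadata was attached via resource.setMetadata(...) during download,
    // and is only present when an httpResponseHandler provided it
    console.log(resource.url, resource.filename, resource.metadata);
  });
});
```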
lib/scraper.js

@@ -11,3 +11,3 @@ var Promise = require('bluebird');
 var FilenameGenerator = require('./filename-generator');
-var makeRequest = require('./request');
+var Request = require('./request');
 var ResourceHandler = require('./resource-handler');
@@ -32,3 +32,3 @@ var FSAdapter = require('./fs-adaper');
-	self.makeRequest = makeRequest.bind(null, self.options.request);
+	self.request = new Request(self.options);
 	self.resourceHandler = new ResourceHandler(self.options, self);
@@ -72,5 +72,9 @@ self.filenameGenerator = new FilenameGenerator(self.options);
 		return self.fsAdapter.saveResource(resource);
+	}).then(function afterResourceSaved () {
+		if (self.options.onResourceSaved) {
+			self.options.onResourceSaved(resource);
+		}
 	}).catch(function handleError (err) {
 		logger.warn('failed to save resource ' + resource);
-		return self.handleError(err);
+		return self.handleError(err, resource);
 	});
@@ -86,3 +90,3 @@ };
 	var referer = resource.parent ? resource.parent.getUrl() : null;
-	return self.makeRequest(url, referer);
+	return self.request.get(url, referer);
 }).then(function requestCompleted (responseData) {
@@ -111,2 +115,6 @@
+		if (responseData.metadata) {
+			resource.setMetadata(responseData.metadata);
+		}
+
 		resource.setText(responseData.body);
@@ -117,3 +125,3 @@ self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
 		logger.warn('failed to request resource ' + resource);
-		return self.handleError(err);
+		return self.handleError(err, resource);
 	});
@@ -129,3 +137,3 @@
-	if (!self.options.urlFilter(url)) {
+	if (self.options.urlFilter && !self.options.urlFilter(url)) {
 		logger.debug('filtering out ' + resource + ' by url filter');
@@ -175,3 +183,6 @@ return Promise.resolve(null);
-Scraper.prototype.handleError = function handleError (err) {
+Scraper.prototype.handleError = function handleError (err, resource) {
+	if (resource && this.options.onResourceError) {
+		this.options.onResourceError(resource, err);
+	}
 	if (this.options.ignoreErrors) {
 		logger.warn('ignoring error: ' + err.message);
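In the flow above, both the request `catch` and the save `catch` now pass the failing resource along, so `onResourceError` fires for every failed download or save before `ignoreErrors` decides whether to continue. A usage sketch of that flow:

```javascript
const scrape = require('website-scraper');

scrape({
  urls: ['http://example.com/'],
  directory: '/path/to/save',
  ignoreErrors: true, // the default: individual failures do not abort the scrape
  onResourceError: (resource, err) => {
    console.log(`resource ${resource} failed: ${err.message}`);
  }
}).then((result) => {
  // resolves even if some resources failed, since ignoreErrors is true
  console.log(`finished with ${result.length} root resources`);
});
```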
package.json

@@ -1,4 +1,4 @@
 {
 	"name": "website-scraper",
-	"version": "2.1.1",
+	"version": "2.2.0",
 	"description": "Download website to a local directory (including all css, images, js, etc.)",
@@ -29,3 +29,3 @@ "readmeFilename": "README.md",
 	],
-	"author": "s0ph1e",
+	"author": "Sophia Antipenko <sophia@antipenko.pp.ua>",
 	"license": "MIT",
 	"bugs": {
README.md
@@ -34,9 +34,11 @@ ## Introduction
-// with callback
-scrape(options, function (error, result) {
+// with promise
+scrape(options).then((result) => {
+	/* some code here */
+}).catch((err) => {
 	/* some code here */
 });
 
-// or with promise
-scrape(options).then(function (result) {
+// or with callback
+scrape(options, (error, result) => {
 	/* some code here */
@@ -46,62 +48,24 @@ });
-## API
-### scrape(options, callback)
-Makes requests to `urls` and saves all files found with `sources` to `directory`.
-
-**options** - object containing next options:
- - `urls`: array of urls to load and filenames for them *(required, see example below)*
- - `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)*
- - `directory`: path to save loaded files *(required)*
- - `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')*
- - `defaultFilename`: filename for index page *(optional, default: 'index.html')*
- - `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)*
- - `sources`: array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see example below)*
- - `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- - `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
- - `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
- - `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*
- - `ignoreErrors`: boolean, if `true` scraper will continue downloading resources after error occured, if `false` - scraper will finish process and return error *(optional, default: true)*
+## options
+* [urls](#urls) - urls to download, *required*
+* [directory](#directory) - path to save files, *required*
+* [sources](#sources) - selects which resources should be downloaded
+* [recursive](#recursive) - follow anchors in html files
+* [maxDepth](#maxdepth) - maximum depth for dependencies
+* [request](#request) - custom options for [request](https://github.com/request/request)
+* [subdirectories](#subdirectories) - subdirectories for file extensions
+* [defaultFilename](#defaultfilename) - filename for index page
+* [prettifyUrls](#prettifyurls) - prettify urls
+* [ignoreErrors](#ignoreerrors) - whether to ignore errors on resource downloading
+* [urlFilter](#urlfilter) - skip some urls
+* [filenameGenerator](#filenamegenerator) - generate filename for downloaded resource
+* [httpResponseHandler](#httpresponsehandler) - customize http response handling
+* [onResourceSaved](#onresourcesaved) - callback called when a resource is saved
+* [onResourceError](#onresourceerror) - callback called when a resource's download fails
+
+Default options you can find in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js).
-
-**callback** - callback function *(optional)*, includes following parameters:
- - `error`: if error - `Error` object, if success - `null`
- - `result`: if error - `null`, if success - array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects containing:
-  - `url`: url of loaded page
-  - `filename`: filename where page was saved (relative to `directory`)
-  - `children`: array of children Resources
-
-### Filename Generators
-The filename generator determines where the scraped files are saved.
-
-#### byType (default)
-When the `byType` filenameGenerator is used the downloaded files are saved by type (as defined by the `subdirectories` setting)
-or directly in the `directory` folder, if no subdirectory is specified for the specific type.
-
-#### bySiteStructure
-When the `bySiteStructure` filenameGenerator is used the downloaded files are saved in `directory` using same structure as on the website:
-- `/` => `DIRECTORY/index.html`
-- `/about` => `DIRECTORY/about/index.html`
-- `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js`
-
-## Examples
-#### Example 1
-Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
-Imagine we want to load:
-- [Home page](http://nodejs.org/) to `index.html`
-- [About page](http://nodejs.org/about/) to `about.html`
-- [Blog](http://blog.nodejs.org/) to `blog.html`
-
-and separate files into directories:
-- `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
-- `js` for .js (full path `/path/to/save/js`)
-- `css` for .css (full path `/path/to/save/css`)
+#### urls
+Array of objects which contain urls to download and filenames for them. **_Required_**.
 
 ```javascript
 var scrape = require('website-scraper');
 scrape({
@@ -113,8 +77,16 @@ urls: [
 	],
 	directory: '/path/to/save'
 }).then(console.log).catch(console.log);
 ```
+
+#### directory
+String, absolute path to directory where downloaded files will be saved. Directory should not exist. It will be created by scraper. **_Required_**.
+
+#### sources
+Array of objects to download, specifies selectors and attribute values to select files for downloading. By default scraper tries to download all possible resources.
+
+```javascript
+// Downloading images, css files and scripts
+scrape({
+	urls: ['http://nodejs.org/'],
+	directory: '/path/to/save',
-	subdirectories: [
-		{directory: 'img', extensions: ['.jpg', '.png', '.svg']},
-		{directory: 'js', extensions: ['.js']},
-		{directory: 'css', extensions: ['.css']}
-	],
 	sources: [
@@ -124,3 +96,18 @@ {selector: 'img', attr: 'src'},
 		{selector: 'script', attr: 'src'}
-	],
+	]
 }).then(console.log).catch(console.log);
 ```
+#### recursive
+Boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading. Defaults to `false`.
+
+#### maxDepth
+Positive number, maximum allowed depth for dependencies. Defaults to `null` - no maximum depth set.
+
+#### request
+Object, custom options for [request](https://github.com/request/request#requestoptions-callback). Allows setting cookies, userAgent, etc.
+
 ```javascript
 scrape({
 	urls: ['http://example.com/'],
 	directory: '/path/to/save',
 	request: {
@@ -131,23 +118,35 @@ headers: {
 	}
-}).then(function (result) {
-	console.log(result);
-}).catch(function(err){
-	console.log(err);
-});
+}).then(console.log).catch(console.log);
 ```
 
-#### Example 2. Recursive downloading
+#### subdirectories
+Array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory`.
+
 ```javascript
-// Links from example.com will be followed
-// Links from links will be ignored because theirs depth = 2 is greater than maxDepth
-var scrape = require('website-scraper');
+/* Separate files into directories:
+	- `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
+	- `js` for .js (full path `/path/to/save/js`)
+	- `css` for .css (full path `/path/to/save/css`)
+*/
 scrape({
-	urls: ['http://example.com/'],
+	urls: ['http://example.com'],
 	directory: '/path/to/save',
-	recursive: true,
-	maxDepth: 1
+	subdirectories: [
+		{directory: 'img', extensions: ['.jpg', '.png', '.svg']},
+		{directory: 'js', extensions: ['.js']},
+		{directory: 'css', extensions: ['.css']}
+	]
 }).then(console.log).catch(console.log);
 ```
 
-#### Example 3. Filtering out external resources
+#### defaultFilename
+String, filename for index page. Defaults to `index.html`.
+
+#### prettifyUrls
+Boolean, whether urls should be 'prettified' by having the `defaultFilename` removed. Defaults to `false`.
+
+#### ignoreErrors
+Boolean, if `true` scraper will continue downloading resources after an error occurs; if `false` scraper will finish the process and return the error. Defaults to `true`.
+
+#### urlFilter
+Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied.
+
 ```javascript
@@ -165,6 +164,16 @@ // Links to other websites are filtered out by the urlFilter
-#### Example 4. Downloading an entire website
+#### filenameGenerator
+String, name of one of the bundled filenameGenerators, or a custom filenameGenerator function. The filename generator determines where the scraped files are saved.
+
+###### byType (default)
+When the `byType` filenameGenerator is used, the downloaded files are saved by type (as defined by the `subdirectories` setting) or directly in the `directory` folder if no subdirectory is specified for the specific type.
+
+###### bySiteStructure
+When the `bySiteStructure` filenameGenerator is used, the downloaded files are saved in `directory` using the same structure as on the website:
+- `/` => `DIRECTORY/index.html`
+- `/about` => `DIRECTORY/about/index.html`
+- `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js`
+
 ```javascript
-// Downloads all the crawlable files of example.com.
-// The files are saved in the same structure as the structure of the website, by using the `bySiteStructure` filenameGenerator.
+// Downloads all the crawlable files. The files are saved in the same structure as the structure of the website
+// Links to other websites are filtered out by the urlFilter
@@ -174,8 +183,5 @@ var scrape = require('website-scraper');
 	urls: ['http://example.com/'],
-	urlFilter: function(url){
-		return url.indexOf('http://example.com') === 0;
-	},
+	urlFilter: function(url){ return url.indexOf('http://example.com') === 0; },
 	recursive: true,
 	maxDepth: 100,
 	prettifyUrls: true,
 	filenameGenerator: 'bySiteStructure',
@@ -186,2 +192,63 @@ directory: '/path/to/save'
+#### httpResponseHandler
+Function which is called on each response; allows customizing the saved resource or rejecting its download.
+It takes one argument - the response object of the [request](https://github.com/request/request) module - and should return a `Promise` resolved if the resource should be downloaded, or rejected with an `Error` if it should be skipped.
+The `Promise` should be resolved with:
+* a `string` which contains the response body
+* or an object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.); scraper will not use this field at all, it is only passed through to the result.
+
+```javascript
+// Rejecting resources with 404 status and adding metadata to other resources
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	httpResponseHandler: (response) => {
+		if (response.statusCode === 404) {
+			return Promise.reject(new Error('status is 404'));
+		} else {
+			// if you don't need metadata, you can just return Promise.resolve(response.body)
+			return Promise.resolve({
+				body: response.body,
+				metadata: {
+					headers: response.headers,
+					someOtherData: [ 1, 2, 3 ]
+				}
+			});
+		}
+	}
+}).then(console.log).catch(console.log);
+```
+The scrape function resolves with an array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects which contain the `metadata` property from `httpResponseHandler`.
+
+#### onResourceSaved
+Function called each time a resource is saved to the file system. Callback is called with a [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) object. Defaults to `null` - no callback will be called.
+```javascript
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	onResourceSaved: (resource) => {
+		console.log(`Resource ${resource} was saved to fs`);
+	}
+})
+```
+
+#### onResourceError
+Function called each time a resource's downloading, handling, or saving to fs fails. Callback is called with a [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) object and an `Error` object. Defaults to `null` - no callback will be called.
+```javascript
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	onResourceError: (resource, err) => {
+		console.log(`Resource ${resource} was not saved because of ${err}`);
+	}
+})
+```
+
+## callback
+Callback function, optional; includes the following parameters:
+- `error`: if error - `Error` object, if success - `null`
+- `result`: if error - `null`, if success - array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects containing:
+  - `url`: url of loaded page
+  - `filename`: filename where page was saved (relative to `directory`)
+  - `children`: array of children Resources
+
 ## Log and debug
 This module uses [debug](https://github.com/visionmedia/debug) to log events. To enable logs you should use environment variable `DEBUG`.
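For example, a sketch of enabling debug output programmatically (the exact namespace is an assumption; check the module's logger for the names it actually registers):

```javascript
// The DEBUG variable must be set before the debug module is loaded,
// so do it at the very top of your script (or run
// `DEBUG=website-scraper* node app.js` from the shell instead).
process.env.DEBUG = 'website-scraper*';

const scrape = require('website-scraper');

scrape({
  urls: ['http://example.com/'],
  directory: '/path/to/save'
}).then(console.log).catch(console.log);
```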