website-scraper
Comparing version 2.1.1 to 2.2.0
lib/config/defaults.js

@@ -33,10 +33,11 @@ var config = {
 	},
-	urlFilter: function urlFilter () {
-		return true;
-	},
+	urlFilter: null,
 	recursive: false,
 	maxDepth: null,
-	ignoreErrors: true
+	ignoreErrors: true,
+	httpResponseHandler: null,
+	onResourceSaved: null,
+	onResourceError: null
 };
 module.exports = config;
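All three new options default to `null`, so the new hooks are strictly opt-in. A minimal sketch of turning them on (assuming, as elsewhere in the docs, that user options simply override these defaults):

```javascript
const scrape = require('website-scraper');

scrape({
  urls: ['http://example.com/'],
  directory: '/path/to/save',
  // each of these replaces a null default from lib/config/defaults.js
  urlFilter: (url) => url.indexOf('http://example.com') === 0,
  onResourceSaved: (resource) => console.log(`saved ${resource}`),
  onResourceError: (resource, err) => console.log(`failed ${resource}: ${err.message}`)
}).then(console.log).catch(console.log);
```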
lib/request.js

@@ -1,7 +1,9 @@
-var _ = require('lodash');
-var Promise = require('bluebird');
-var request = require('request');
-var get = Promise.promisify(request.get);
-var logger = require('./logger');
+'use strict';
+
+const _ = require('lodash');
+const Promise = require('bluebird');
+const request = require('request');
+const get = Promise.promisify(request.get);
+const logger = require('./logger');
 
 function getMimeType (contentType) {
@@ -11,23 +13,68 @@ return contentType ? contentType.split(';')[0] : null;
-function makeRequest (options, url, referer) {
-	var requestOptions = _.clone(options);
-	requestOptions.url = url;
+function defaultResponseHandler (response) {
+	return Promise.resolve(response.body);
+}
 
-	if (referer) {
-		requestOptions.headers = requestOptions.headers || {};
-		requestOptions.headers.referer = referer;
-	}
+function transformResult (result) {
+	switch (true) {
+		case _.isString(result):
+			return {
+				body: result,
+				metadata: null
+			};
+		case _.isPlainObject(result):
+			return {
+				body: result.body,
+				metadata: result.metadata || null
+			};
+		default:
+			throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
+	}
+}
 
-	logger.debug(`[request] sending request for url ${url}, referer ${referer}`);
+class Request {
+	/**
+	 * @param {Object} options
+	 * @param {function} options.httpResponseHandler - custom response handler
+	 * @param {Object} options.request - custom options for request module
+	 */
+	constructor (options) {
+		this.handleResponse = options && options.httpResponseHandler ? options.httpResponseHandler : defaultResponseHandler;
+		this.options = options && options.request ? _.clone(options.request) : {};
+	}
 
-	return get(requestOptions).then(function handleResponse (data) {
-		logger.debug(`[request] received response for ${data.request.href}, statusCode ${data.statusCode}`);
-		return {
-			url: data.request.href,
-			mimeType: getMimeType(data.headers['content-type']),
-			body: data.body
-		};
-	});
-}
+	/**
+	 * Performs get request to url and returns data for resource
+	 * @param {string} url - url of resource
+	 * @param {string} referer - url of parent resource
+	 * @return {Promise}
+	 */
+	get (url, referer) {
+		let requestOptions = _.clone(this.options);
+		requestOptions.url = url;
+		if (referer) {
+			requestOptions.headers = requestOptions.headers || {};
+			requestOptions.headers.referer = referer;
+		}
+		logger.debug(`[request] sending request for url ${url}, referer ${referer}`);
+		return get(requestOptions).then((response) => {
+			logger.debug(`[request] received response for ${response.request.href}, statusCode ${response.statusCode}`);
+			return this.handleResponse(response)
+				.then(transformResult)
+				.then((responseHandlerResult) => {
+					return {
+						url: response.request.href,
+						mimeType: getMimeType(response.headers['content-type']),
+						body: responseHandlerResult.body,
+						metadata: responseHandlerResult.metadata
+					};
+				});
+		});
+	}
+}
 
-module.exports = makeRequest;
+module.exports = Request;
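`transformResult` defines the contract for `httpResponseHandler` return values. For illustration (it is private to `lib/request.js`, so it cannot actually be imported; the behavior follows directly from the switch above):

```javascript
// A plain string becomes the body, with no metadata:
transformResult('<html></html>');
// => { body: '<html></html>', metadata: null }

// A plain object must carry `body`; `metadata` is optional pass-through data:
transformResult({ body: '<html></html>', metadata: { headers: {} } });
// => { body: '<html></html>', metadata: { headers: {} } }

// Anything else throws, which inside the promise chain becomes a rejection
// and skips the resource:
transformResult(42);
// => throws Error('Wrong response handler result. Expected string or object, but received number')
```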
lib/resource.js

@@ -91,2 +91,6 @@ var types = require('./config/resource-types');
+Resource.prototype.setMetadata = function setMetadata (metadata) {
+	this.metadata = metadata;
+};
+
 module.exports = Resource;
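Downstream, `scrape` resolves with these same Resource objects, so whatever an `httpResponseHandler` stored via `setMetadata` is readable on the result. A sketch (`metadata` stays undefined unless a handler supplied it):

```javascript
const scrape = require('website-scraper');

scrape({ urls: ['http://example.com/'], directory: '/path/to/save' }).then((result) => {
  result.forEach((resource) => {
    // metadata was attached via resource.setMetadata(...) during download,
    // and is only present when an httpResponseHandler provided it
    console.log(resource.url, resource.filename, resource.metadata);
  });
});
```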
lib/scraper.js

@@ -11,3 +11,3 @@ var Promise = require('bluebird');
 var FilenameGenerator = require('./filename-generator');
-var makeRequest = require('./request');
+var Request = require('./request');
 var ResourceHandler = require('./resource-handler');
@@ -32,3 +32,3 @@ var FSAdapter = require('./fs-adaper');
-	self.makeRequest = makeRequest.bind(null, self.options.request);
+	self.request = new Request(self.options);
 	self.resourceHandler = new ResourceHandler(self.options, self);
@@ -72,5 +72,9 @@ self.filenameGenerator = new FilenameGenerator(self.options);
 		return self.fsAdapter.saveResource(resource);
+	}).then(function afterResourceSaved () {
+		if (self.options.onResourceSaved) {
+			self.options.onResourceSaved(resource);
+		}
 	}).catch(function handleError (err) {
 		logger.warn('failed to save resource ' + resource);
-		return self.handleError(err);
+		return self.handleError(err, resource);
 	});
@@ -86,3 +90,3 @@ };
 	var referer = resource.parent ? resource.parent.getUrl() : null;
-	return self.makeRequest(url, referer);
+	return self.request.get(url, referer);
 }).then(function requestCompleted (responseData) {
@@ -111,2 +115,6 @@
+		if (responseData.metadata) {
+			resource.setMetadata(responseData.metadata);
+		}
+
 		resource.setText(responseData.body);
@@ -117,3 +125,3 @@ self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
 		logger.warn('failed to request resource ' + resource);
-		return self.handleError(err);
+		return self.handleError(err, resource);
 	});
@@ -129,3 +137,3 @@
-	if (!self.options.urlFilter(url)) {
+	if (self.options.urlFilter && !self.options.urlFilter(url)) {
 		logger.debug('filtering out ' + resource + ' by url filter');
@@ -175,3 +183,6 @@ return Promise.resolve(null);
-Scraper.prototype.handleError = function handleError (err) {
+Scraper.prototype.handleError = function handleError (err, resource) {
+	if (resource && this.options.onResourceError) {
+		this.options.onResourceError(resource, err);
+	}
 	if (this.options.ignoreErrors) {
 		logger.warn('ignoring error: ' + err.message);
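In the flow above, both the request `catch` and the save `catch` now pass the failing resource along, so `onResourceError` fires for every failed download or save before `ignoreErrors` decides whether to continue. A usage sketch of that flow:

```javascript
const scrape = require('website-scraper');

scrape({
  urls: ['http://example.com/'],
  directory: '/path/to/save',
  ignoreErrors: true, // the default: individual failures do not abort the scrape
  onResourceError: (resource, err) => {
    console.log(`resource ${resource} failed: ${err.message}`);
  }
}).then((result) => {
  // resolves even if some resources failed, since ignoreErrors is true
  console.log(`finished with ${result.length} root resources`);
});
```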
package.json

@@ -1,4 +1,4 @@
 {
 	"name": "website-scraper",
-	"version": "2.1.1",
+	"version": "2.2.0",
 	"description": "Download website to a local directory (including all css, images, js, etc.)",
@@ -29,3 +29,3 @@ "readmeFilename": "README.md",
 	],
-	"author": "s0ph1e",
+	"author": "Sophia Antipenko <sophia@antipenko.pp.ua>",
 	"license": "MIT",
 	"bugs": {
README.md
@@ -34,9 +34,11 @@ ## Introduction
-// with callback
-scrape(options, function (error, result) {
+// with promise
+scrape(options).then((result) => {
+	/* some code here */
+}).catch((err) => {
 	/* some code here */
 });
 
-// or with promise
-scrape(options).then(function (result) {
+// or with callback
+scrape(options, (error, result) => {
 	/* some code here */
@@ -46,62 +48,24 @@ });
-## API
-### scrape(options, callback)
-Makes requests to `urls` and saves all files found with `sources` to `directory`.
-
-**options** - object containing next options:
- - `urls`: array of urls to load and filenames for them *(required, see example below)*
- - `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)*
- - `directory`: path to save loaded files *(required)*
- - `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')*
- - `defaultFilename`: filename for index page *(optional, default: 'index.html')*
- - `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)*
- - `sources`: array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see example below)*
- - `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- - `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
- - `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
- - `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*
- - `ignoreErrors`: boolean, if `true` scraper will continue downloading resources after error occured, if `false` - scraper will finish process and return error *(optional, default: true)*
+## options
+* [urls](#urls) - urls to download, *required*
+* [directory](#directory) - path to save files, *required*
+* [sources](#sources) - selects which resources should be downloaded
+* [recursive](#recursive) - follow anchors in html files
+* [maxDepth](#maxdepth) - maximum depth for dependencies
+* [request](#request) - custom options for [request](https://github.com/request/request)
+* [subdirectories](#subdirectories) - subdirectories for file extensions
+* [defaultFilename](#defaultfilename) - filename for index page
+* [prettifyUrls](#prettifyurls) - prettify urls
+* [ignoreErrors](#ignoreerrors) - whether to ignore errors on resource downloading
+* [urlFilter](#urlfilter) - skip some urls
+* [filenameGenerator](#filenamegenerator) - generate filename for downloaded resource
+* [httpResponseHandler](#httpresponsehandler) - customize http response handling
+* [onResourceSaved](#onresourcesaved) - callback called when a resource is saved
+* [onResourceError](#onresourceerror) - callback called when a resource's download fails
+
+Default options you can find in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js).
-
-**callback** - callback function *(optional)*, includes following parameters:
- - `error`: if error - `Error` object, if success - `null`
- - `result`: if error - `null`, if success - array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects containing:
-  - `url`: url of loaded page
-  - `filename`: filename where page was saved (relative to `directory`)
-  - `children`: array of children Resources
-
-### Filename Generators
-The filename generator determines where the scraped files are saved.
-
-#### byType (default)
-When the `byType` filenameGenerator is used the downloaded files are saved by type (as defined by the `subdirectories` setting)
-or directly in the `directory` folder, if no subdirectory is specified for the specific type.
-
-#### bySiteStructure
-When the `bySiteStructure` filenameGenerator is used the downloaded files are saved in `directory` using same structure as on the website:
-- `/` => `DIRECTORY/index.html`
-- `/about` => `DIRECTORY/about/index.html`
-- `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js`
-
-## Examples
-#### Example 1
-Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
-Imagine we want to load:
-- [Home page](http://nodejs.org/) to `index.html`
-- [About page](http://nodejs.org/about/) to `about.html`
-- [Blog](http://blog.nodejs.org/) to `blog.html`
-
-and separate files into directories:
-- `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
-- `js` for .js (full path `/path/to/save/js`)
-- `css` for .css (full path `/path/to/save/css`)
+#### urls
+Array of objects which contain urls to download and filenames for them. **_Required_**.
 
 ```javascript
 var scrape = require('website-scraper');
 scrape({
@@ -113,8 +77,16 @@ urls: [
 	],
 	directory: '/path/to/save'
 }).then(console.log).catch(console.log);
 ```
+
+#### directory
+String, absolute path to directory where downloaded files will be saved. Directory should not exist. It will be created by scraper. **_Required_**.
+
+#### sources
+Array of objects to download, specifies selectors and attribute values to select files for downloading. By default scraper tries to download all possible resources.
+
+```javascript
+// Downloading images, css files and scripts
+scrape({
+	urls: ['http://nodejs.org/'],
+	directory: '/path/to/save',
-	subdirectories: [
-		{directory: 'img', extensions: ['.jpg', '.png', '.svg']},
-		{directory: 'js', extensions: ['.js']},
-		{directory: 'css', extensions: ['.css']}
-	],
 	sources: [
@@ -124,3 +96,18 @@ {selector: 'img', attr: 'src'},
 		{selector: 'script', attr: 'src'}
-	],
+	]
 }).then(console.log).catch(console.log);
 ```
+#### recursive
+Boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading. Defaults to `false`.
+
+#### maxDepth
+Positive number, maximum allowed depth for dependencies. Defaults to `null` - no maximum depth set.
+
+#### request
+Object, custom options for [request](https://github.com/request/request#requestoptions-callback). Allows setting cookies, userAgent, etc.
+
 ```javascript
 scrape({
 	urls: ['http://example.com/'],
 	directory: '/path/to/save',
 	request: {
@@ -131,23 +118,35 @@ headers: {
 	}
-}).then(function (result) {
-	console.log(result);
-}).catch(function(err){
-	console.log(err);
-});
+}).then(console.log).catch(console.log);
 ```
 
-#### Example 2. Recursive downloading
+#### subdirectories
+Array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory`.
+
 ```javascript
-// Links from example.com will be followed
-// Links from links will be ignored because theirs depth = 2 is greater than maxDepth
-var scrape = require('website-scraper');
+/* Separate files into directories:
+	- `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
+	- `js` for .js (full path `/path/to/save/js`)
+	- `css` for .css (full path `/path/to/save/css`)
+*/
 scrape({
-	urls: ['http://example.com/'],
+	urls: ['http://example.com'],
 	directory: '/path/to/save',
-	recursive: true,
-	maxDepth: 1
+	subdirectories: [
+		{directory: 'img', extensions: ['.jpg', '.png', '.svg']},
+		{directory: 'js', extensions: ['.js']},
+		{directory: 'css', extensions: ['.css']}
+	]
 }).then(console.log).catch(console.log);
 ```
 
-#### Example 3. Filtering out external resources
+#### defaultFilename
+String, filename for index page. Defaults to `index.html`.
+
+#### prettifyUrls
+Boolean, whether urls should be 'prettified' by having the `defaultFilename` removed. Defaults to `false`.
+
+#### ignoreErrors
+Boolean, if `true` scraper will continue downloading resources after an error occurs; if `false` scraper will finish the process and return the error. Defaults to `true`.
+
+#### urlFilter
+Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied.
+
 ```javascript
@@ -165,6 +164,16 @@ // Links to other websites are filtered out by the urlFilter
-#### Example 4. Downloading an entire website
+#### filenameGenerator
+String, name of one of the bundled filenameGenerators, or a custom filenameGenerator function. The filename generator determines where the scraped files are saved.
+
+###### byType (default)
+When the `byType` filenameGenerator is used, the downloaded files are saved by type (as defined by the `subdirectories` setting) or directly in the `directory` folder if no subdirectory is specified for the specific type.
+
+###### bySiteStructure
+When the `bySiteStructure` filenameGenerator is used, the downloaded files are saved in `directory` using the same structure as on the website:
+- `/` => `DIRECTORY/index.html`
+- `/about` => `DIRECTORY/about/index.html`
+- `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js`
+
 ```javascript
-// Downloads all the crawlable files of example.com.
-// The files are saved in the same structure as the structure of the website, by using the `bySiteStructure` filenameGenerator.
+// Downloads all the crawlable files. The files are saved in the same structure as the structure of the website
+// Links to other websites are filtered out by the urlFilter
@@ -174,8 +183,5 @@ var scrape = require('website-scraper');
 	urls: ['http://example.com/'],
-	urlFilter: function(url){
-		return url.indexOf('http://example.com') === 0;
-	},
+	urlFilter: function(url){ return url.indexOf('http://example.com') === 0; },
 	recursive: true,
 	maxDepth: 100,
 	prettifyUrls: true,
 	filenameGenerator: 'bySiteStructure',
@@ -186,2 +192,63 @@ directory: '/path/to/save'
+#### httpResponseHandler
+Function which is called on each response; allows customizing the saved resource or rejecting its download.
+It takes one argument - the response object of the [request](https://github.com/request/request) module - and should return a `Promise` resolved if the resource should be downloaded, or rejected with an `Error` if it should be skipped.
+The `Promise` should be resolved with:
+* a `string` which contains the response body
+* or an object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.); scraper will not use this field at all, it is only passed through to the result.
+
+```javascript
+// Rejecting resources with 404 status and adding metadata to other resources
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	httpResponseHandler: (response) => {
+		if (response.statusCode === 404) {
+			return Promise.reject(new Error('status is 404'));
+		} else {
+			// if you don't need metadata, you can just return Promise.resolve(response.body)
+			return Promise.resolve({
+				body: response.body,
+				metadata: {
+					headers: response.headers,
+					someOtherData: [ 1, 2, 3 ]
+				}
+			});
+		}
+	}
+}).then(console.log).catch(console.log);
+```
+The scrape function resolves with an array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects which contain the `metadata` property from `httpResponseHandler`.
+
+#### onResourceSaved
+Function called each time a resource is saved to the file system. Callback is called with a [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) object. Defaults to `null` - no callback will be called.
+```javascript
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	onResourceSaved: (resource) => {
+		console.log(`Resource ${resource} was saved to fs`);
+	}
+})
+```
+
+#### onResourceError
+Function called each time a resource's downloading, handling, or saving to fs fails. Callback is called with a [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) object and an `Error` object. Defaults to `null` - no callback will be called.
+```javascript
+scrape({
+	urls: ['http://example.com/'],
+	directory: '/path/to/save',
+	onResourceError: (resource, err) => {
+		console.log(`Resource ${resource} was not saved because of ${err}`);
+	}
+})
+```
+
+## callback
+Callback function, optional; includes the following parameters:
+- `error`: if error - `Error` object, if success - `null`
+- `result`: if error - `null`, if success - array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects containing:
+  - `url`: url of loaded page
+  - `filename`: filename where page was saved (relative to `directory`)
+  - `children`: array of children Resources
+
 ## Log and debug
 This module uses [debug](https://github.com/visionmedia/debug) to log events. To enable logs you should use environment variable `DEBUG`.
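For example, a sketch of enabling debug output programmatically (the exact namespace is an assumption; check the module's logger for the names it actually registers):

```javascript
// The DEBUG variable must be set before the debug module is loaded,
// so do it at the very top of your script (or run
// `DEBUG=website-scraper* node app.js` from the shell instead).
process.env.DEBUG = 'website-scraper*';

const scrape = require('website-scraper');

scrape({
  urls: ['http://example.com/'],
  directory: '/path/to/save'
}).then(console.log).catch(console.log);
```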