website-scraper
Advanced tools
Comparing version 5.1.0 to 5.2.0
@@ -52,2 +52,3 @@ import fs from 'fs'; | ||
responseType: 'buffer', | ||
//cookieJar: true, | ||
decompress: true, | ||
@@ -54,0 +55,0 @@ headers: { |
@@ -23,4 +23,3 @@ import path from 'path'; | ||
const text = resource.getText(); | ||
const encoding = typeof text === 'string' ? 'utf-8' : 'binary'; | ||
await fs.outputFile(filename, text, { encoding }); | ||
await fs.outputFile(filename, text, { encoding: resource.getEncoding() }); | ||
loadedResources.push(resource); | ||
@@ -27,0 +26,0 @@ }); |
import got from 'got'; | ||
import logger from './logger.js'; | ||
import types from './config/resource-types.js'; | ||
import { extend, isPlainObject, getTypeByMime } from './utils/index.js'; | ||
import { extend } from './utils/index.js'; | ||
const TEXT_RESOURCE_TYPES = [types.html, types.css]; | ||
function getMimeType (contentType) { | ||
@@ -12,26 +9,69 @@ return contentType ? contentType.split(';')[0] : null; | ||
function defaultResponseHandler ({response, type}) { | ||
if (TEXT_RESOURCE_TYPES.includes(type)) { | ||
return response.body.toString(); | ||
function defaultResponseHandler ({response}) { | ||
return Promise.resolve(response); | ||
} | ||
function extractEncodingFromHeader (headers) { | ||
const contentTypeHeader = headers['content-type']; | ||
return contentTypeHeader && contentTypeHeader.includes('utf-8') ? 'utf8' : 'binary'; | ||
} | ||
function getEncoding (response) { | ||
if (response && typeof response === 'object') { | ||
if (response.headers && typeof response.headers === 'object') { | ||
return extractEncodingFromHeader(response.headers); | ||
} else if (response.encoding) { | ||
return response.encoding; | ||
} | ||
} | ||
return response.body; | ||
return 'binary'; | ||
} | ||
function throwTypeError (result) { | ||
let type = typeof result; | ||
if (result instanceof Error) { | ||
throw result; | ||
} else if (type === 'object' && Array.isArray(result)) { | ||
type = 'array'; | ||
} | ||
throw new Error(`Wrong response handler result. Expected string or object, but received ${type}`); | ||
} | ||
function getData (result) { | ||
let data = result; | ||
if (result && typeof result === 'object' && 'body' in result) { | ||
data = result.body; | ||
} | ||
return data; | ||
} | ||
function transformResult (result) { | ||
switch (true) { | ||
case typeof result === 'string' || Buffer.isBuffer(result): | ||
return { | ||
body: result, | ||
metadata: null | ||
}; | ||
case isPlainObject(result): | ||
return { | ||
body: result.body, | ||
metadata: result.metadata || null | ||
}; | ||
case result === null: | ||
return null; | ||
default: | ||
throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result); | ||
const encoding = getEncoding(result); | ||
const data = getData(result); | ||
// Check for no data | ||
if (data === null || data === undefined) { | ||
return null; | ||
} | ||
// Then stringify it. | ||
let body = null; | ||
if (data instanceof Buffer) { | ||
body = data.toString(encoding); | ||
} else if (typeof data === 'string') { | ||
body = data; | ||
} else { | ||
throwTypeError(result); | ||
} | ||
return { | ||
body, | ||
encoding, | ||
metadata: result.metadata || data.metadata || null | ||
}; | ||
} | ||
@@ -51,8 +91,4 @@ | ||
logger.debug(`[request] received response for ${response.url}, statusCode ${response.statusCode}`); | ||
const responseHandlerResult = transformResult(await afterResponse({response})); | ||
const mimeType = getMimeType(response.headers['content-type']); | ||
const resourceType = getTypeByMime(mimeType); | ||
const responseHandlerResult = transformResult(await afterResponse({ response, type: resourceType })); | ||
if (!responseHandlerResult) { | ||
@@ -63,6 +99,6 @@ return null; | ||
url: response.url, | ||
type: resourceType, | ||
mimeType, | ||
mimeType: getMimeType(response.headers['content-type']), | ||
body: responseHandlerResult.body, | ||
metadata: responseHandlerResult.metadata | ||
metadata: responseHandlerResult.metadata, | ||
encoding: responseHandlerResult.encoding | ||
}; | ||
@@ -72,3 +108,5 @@ } | ||
export default { | ||
get: getRequest | ||
get: getRequest, | ||
getEncoding, | ||
transformResult | ||
}; |
@@ -15,2 +15,3 @@ import types from './config/resource-types.js'; | ||
this.saved = false; | ||
this.encoding = 'binary'; | ||
} | ||
@@ -73,2 +74,10 @@ | ||
setEncoding (encoding) { | ||
this.encoding = encoding; | ||
} | ||
getEncoding () { | ||
return this.encoding; | ||
} | ||
isHtml () { | ||
@@ -75,0 +84,0 @@ return this.getType() === types.html; |
@@ -16,3 +16,3 @@ import PromiseQueue from 'p-queue'; | ||
import * as utils from './utils/index.js'; | ||
const { extend, union, urlsEqual, getTypeByFilename, series } = utils; | ||
const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils; | ||
import NormalizedUrlMap from './utils/normalized-url-map.js'; | ||
@@ -174,3 +174,4 @@ | ||
resource.setType(responseData.type); | ||
resource.setEncoding(responseData.encoding); | ||
resource.setType(getTypeByMime(responseData.mimeType)); | ||
@@ -177,0 +178,0 @@ const { filename } = await self.runActions('generateFilename', { resource, responseData }); |
{ | ||
"name": "website-scraper", | ||
"version": "5.1.0", | ||
"version": "5.2.0", | ||
"description": "Download website to a local directory (including all css, images, js, etc.)", | ||
@@ -5,0 +5,0 @@ "readmeFilename": "README.md", |
@@ -329,4 +329,8 @@ [data:image/s3,"s3://crabby-images/0c269/0c269f4ebc2b1302a8db6583df6012673424576f" alt="Version"](https://www.npmjs.org/package/website-scraper) | ||
Promise should be resolved with: | ||
* `string` which contains response body | ||
* or object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result. | ||
* the `response` object with the `body` modified in place as necessary. | ||
* or object with properties | ||
* `body` (response body, string) | ||
* `encoding` (`binary` or `utf8`) used to save the file, binary used by default. | ||
* `metadata` (object) - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for result. | ||
* a binary `string`. This is advised against because the binary assumption being made can foul up saving of `utf8` responses to the filesystem. | ||
@@ -346,3 +350,4 @@ If multiple actions `afterResponse` added - scraper will use result from last one. | ||
someOtherData: [ 1, 2, 3 ] | ||
} | ||
}, | ||
encoding: 'utf8' | ||
} | ||
@@ -349,0 +354,0 @@ } |
Deprecated
Maintenance: The maintainer of the package marked it as deprecated. This could indicate that a single version should not be used, or that the package is no longer maintained and any new vulnerabilities will not be fixed.
Found 1 instance in 1 package
65150
1171
0
452