@crawlee/utils
Advanced tools
Comparing version 3.11.2-beta.25 to 3.11.2-beta.26
@@ -0,1 +1,3 @@ | ||
// @ts-ignore optional peer dependency or compatibility with es2022 | ||
import type { Delays } from 'got-scraping'; | ||
interface SitemapUrlData { | ||
@@ -23,5 +25,5 @@ loc: string; | ||
}; | ||
interface ParseSitemapOptions { | ||
export interface ParseSitemapOptions { | ||
/** | ||
* If set to `true`, elements referring to other sitemaps will be emitted as special objects with a `bouba` property. | ||
* If set to `true`, elements referring to other sitemaps will be emitted as special objects with `originSitemapUrl` set to `null`. | ||
*/ | ||
@@ -33,2 +35,10 @@ emitNestedSitemaps?: true | false; | ||
maxDepth?: number; | ||
/** | ||
* Number of retries for fetching sitemaps. The counter resets for each nested sitemap. | ||
*/ | ||
sitemapRetries?: number; | ||
/** | ||
* Network timeouts for sitemap fetching. See [Got documentation](https://github.com/sindresorhus/got/blob/main/documentation/6-timeout.md) for more details. | ||
*/ | ||
networkTimeouts?: Delays; | ||
} | ||
@@ -63,3 +73,3 @@ export declare function parseSitemap<T extends ParseSitemapOptions>(initialSources: SitemapSource[], proxyUrl?: string, options?: T): AsyncIterable<T['emitNestedSitemaps'] extends true ? SitemapUrl | NestedSitemap : SitemapUrl>; | ||
*/ | ||
static load(urls: string | string[], proxyUrl?: string): Promise<Sitemap>; | ||
static load(urls: string | string[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>; | ||
/** | ||
@@ -71,5 +81,5 @@ * Parse XML sitemap content from a string and return URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP. | ||
static fromXmlString(content: string, proxyUrl?: string): Promise<Sitemap>; | ||
protected static parse(sources: SitemapSource[], proxyUrl?: string): Promise<Sitemap>; | ||
protected static parse(sources: SitemapSource[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>; | ||
} | ||
export {}; | ||
//# sourceMappingURL=sitemap.d.ts.map |
@@ -166,2 +166,3 @@ "use strict"; | ||
const { fileTypeStream } = await import('file-type'); | ||
const { emitNestedSitemaps = false, maxDepth = Infinity, sitemapRetries = 3, networkTimeouts } = options ?? {}; | ||
const sources = [...initialSources]; | ||
@@ -187,4 +188,4 @@ const visitedSitemapUrls = new Set(); | ||
const source = sources.shift(); | ||
if ((source?.depth ?? 0) > (options?.maxDepth ?? Infinity)) { | ||
log_1.default.debug(`Skipping sitemap ${source.type === 'url' ? source.url : ''} because it reached max depth ${options.maxDepth}.`); | ||
if ((source?.depth ?? 0) > maxDepth) { | ||
log_1.default.debug(`Skipping sitemap ${source.type === 'url' ? source.url : ''} because it reached max depth ${maxDepth}.`); | ||
continue; | ||
@@ -196,33 +197,52 @@ } | ||
visitedSitemapUrls.add(sitemapUrl.toString()); | ||
try { | ||
const sitemapStream = await new Promise((resolve, reject) => { | ||
const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' }); | ||
request.on('response', () => resolve(request)); | ||
request.on('error', reject); | ||
}); | ||
if (sitemapStream.response.statusCode === 200) { | ||
let contentType = sitemapStream.response.headers['content-type']; | ||
const streamWithType = await fileTypeStream(sitemapStream); | ||
if (streamWithType.fileType !== undefined) { | ||
contentType = streamWithType.fileType.mime; | ||
} | ||
let isGzipped = false; | ||
if (contentType !== undefined | ||
? contentType === 'application/gzip' | ||
: sitemapUrl.pathname.endsWith('.gz')) { | ||
isGzipped = true; | ||
if (sitemapUrl.pathname.endsWith('.gz')) { | ||
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3); | ||
let retriesLeft = sitemapRetries + 1; | ||
while (retriesLeft-- > 0) { | ||
try { | ||
const sitemapStream = await new Promise((resolve, reject) => { | ||
const request = gotScraping.stream({ | ||
url: sitemapUrl, | ||
proxyUrl, | ||
method: 'GET', | ||
timeout: networkTimeouts, | ||
headers: { | ||
'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8', | ||
}, | ||
}); | ||
request.on('response', () => resolve(request)); | ||
request.on('error', reject); | ||
}); | ||
let error = null; | ||
if (sitemapStream.response.statusCode >= 200 && sitemapStream.response.statusCode < 300) { | ||
let contentType = sitemapStream.response.headers['content-type']; | ||
const streamWithType = await fileTypeStream(sitemapStream); | ||
if (streamWithType.fileType !== undefined) { | ||
contentType = streamWithType.fileType.mime; | ||
} | ||
let isGzipped = false; | ||
if (contentType !== undefined | ||
? contentType === 'application/gzip' | ||
: sitemapUrl.pathname.endsWith('.gz')) { | ||
isGzipped = true; | ||
if (sitemapUrl.pathname.endsWith('.gz')) { | ||
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3); | ||
} | ||
} | ||
items = (0, node_stream_1.pipeline)(streamWithType, isGzipped ? (0, node_zlib_1.createGunzip)() : new node_stream_1.PassThrough(), createParser(contentType, sitemapUrl), (e) => { | ||
if (e !== undefined) { | ||
error = e; | ||
} | ||
}); | ||
} | ||
items = (0, node_stream_1.pipeline)(streamWithType, isGzipped ? (0, node_zlib_1.createGunzip)() : new node_stream_1.PassThrough(), createParser(contentType, sitemapUrl), (error) => { | ||
if (error !== undefined) { | ||
log_1.default.warning(`Malformed sitemap content: ${sitemapUrl}, ${error}`); | ||
} | ||
}); | ||
else { | ||
error = new Error(`Failed to fetch sitemap: ${sitemapUrl}, status code: ${sitemapStream.response.statusCode}`); | ||
} | ||
if (error !== null) { | ||
throw error; | ||
} | ||
break; | ||
} | ||
catch (e) { | ||
log_1.default.warning(`Malformed sitemap content: ${sitemapUrl}, ${retriesLeft === 0 ? 'no retries left.' : 'retrying...'} (${e})`); | ||
} | ||
} | ||
catch (e) { | ||
log_1.default.warning(`Malformed sitemap content: ${sitemapUrl}, ${e}`); | ||
} | ||
} | ||
@@ -242,3 +262,3 @@ else if (source.type === 'raw') { | ||
sources.push({ type: 'url', url: item.url, depth: (source.depth ?? 0) + 1 }); | ||
if (options?.emitNestedSitemaps) { | ||
if (emitNestedSitemaps) { | ||
// @ts-ignore | ||
@@ -301,4 +321,4 @@ yield { loc: item.url, originSitemapUrl: null }; | ||
*/ | ||
static async load(urls, proxyUrl) { | ||
return await this.parse((Array.isArray(urls) ? urls : [urls]).map((url) => ({ type: 'url', url })), proxyUrl); | ||
static async load(urls, proxyUrl, parseSitemapOptions) { | ||
return await this.parse((Array.isArray(urls) ? urls : [urls]).map((url) => ({ type: 'url', url })), proxyUrl, parseSitemapOptions); | ||
} | ||
@@ -313,6 +333,6 @@ /** | ||
} | ||
static async parse(sources, proxyUrl) { | ||
static async parse(sources, proxyUrl, parseSitemapOptions) { | ||
const urls = []; | ||
try { | ||
for await (const item of parseSitemap(sources, proxyUrl)) { | ||
for await (const item of parseSitemap(sources, proxyUrl, parseSitemapOptions)) { | ||
urls.push(item.loc); | ||
@@ -319,0 +339,0 @@ } |
{ | ||
"name": "@crawlee/utils", | ||
"version": "3.11.2-beta.25", | ||
"version": "3.11.2-beta.26", | ||
"description": "A set of shared utilities that can be used by crawlers", | ||
@@ -52,3 +52,3 @@ "engines": { | ||
"@apify/ps-tree": "^1.2.0", | ||
"@crawlee/types": "3.11.2-beta.25", | ||
"@crawlee/types": "3.11.2-beta.26", | ||
"@types/sax": "^1.2.7", | ||
@@ -74,3 +74,3 @@ "cheerio": "1.0.0-rc.12", | ||
}, | ||
"gitHead": "3e6d190fa8346754867ddfeb7060c6539dbcd559" | ||
"gitHead": "4e774e1e3f89ed0fecf996734a93c24ba0292f84" | ||
} |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
321747
2935
+ Added @crawlee/types@3.11.2-beta.26 (transitive)
- Removed @crawlee/types@3.11.2-beta.25 (transitive)