Socket
Socket
Sign in · Demo · Install

@crawlee/utils

Package Overview
Dependencies
Maintainers
0
Versions
1179
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@crawlee/utils - npm Package Compare versions

Comparing version 3.11.2-beta.25 to 3.11.2-beta.26

18

internals/sitemap.d.ts

@@ -0,1 +1,3 @@

// @ts-ignore optional peer dependency or compatibility with es2022
import type { Delays } from 'got-scraping';
interface SitemapUrlData {

@@ -23,5 +25,5 @@ loc: string;

};
interface ParseSitemapOptions {
export interface ParseSitemapOptions {
/**
* If set to `true`, elements referring to other sitemaps will be emitted as special objects with a `bouba` property.
* If set to `true`, elements referring to other sitemaps will be emitted as special objects with `originSitemapUrl` set to `null`.
*/

@@ -33,2 +35,10 @@ emitNestedSitemaps?: true | false;

maxDepth?: number;
/**
* Number of retries for fetching sitemaps. The counter resets for each nested sitemap.
*/
sitemapRetries?: number;
/**
* Network timeouts for sitemap fetching. See [Got documentation](https://github.com/sindresorhus/got/blob/main/documentation/6-timeout.md) for more details.
*/
networkTimeouts?: Delays;
}

@@ -63,3 +73,3 @@ export declare function parseSitemap<T extends ParseSitemapOptions>(initialSources: SitemapSource[], proxyUrl?: string, options?: T): AsyncIterable<T['emitNestedSitemaps'] extends true ? SitemapUrl | NestedSitemap : SitemapUrl>;

*/
static load(urls: string | string[], proxyUrl?: string): Promise<Sitemap>;
static load(urls: string | string[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>;
/**

@@ -71,5 +81,5 @@ * Parse XML sitemap content from a string and return URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP.

static fromXmlString(content: string, proxyUrl?: string): Promise<Sitemap>;
protected static parse(sources: SitemapSource[], proxyUrl?: string): Promise<Sitemap>;
protected static parse(sources: SitemapSource[], proxyUrl?: string, parseSitemapOptions?: ParseSitemapOptions): Promise<Sitemap>;
}
export {};
//# sourceMappingURL=sitemap.d.ts.map

@@ -166,2 +166,3 @@ "use strict";

const { fileTypeStream } = await import('file-type');
const { emitNestedSitemaps = false, maxDepth = Infinity, sitemapRetries = 3, networkTimeouts } = options ?? {};
const sources = [...initialSources];

@@ -187,4 +188,4 @@ const visitedSitemapUrls = new Set();

const source = sources.shift();
if ((source?.depth ?? 0) > (options?.maxDepth ?? Infinity)) {
log_1.default.debug(`Skipping sitemap ${source.type === 'url' ? source.url : ''} because it reached max depth ${options.maxDepth}.`);
if ((source?.depth ?? 0) > maxDepth) {
log_1.default.debug(`Skipping sitemap ${source.type === 'url' ? source.url : ''} because it reached max depth ${maxDepth}.`);
continue;

@@ -196,33 +197,52 @@ }

visitedSitemapUrls.add(sitemapUrl.toString());
try {
const sitemapStream = await new Promise((resolve, reject) => {
const request = gotScraping.stream({ url: sitemapUrl, proxyUrl, method: 'GET' });
request.on('response', () => resolve(request));
request.on('error', reject);
});
if (sitemapStream.response.statusCode === 200) {
let contentType = sitemapStream.response.headers['content-type'];
const streamWithType = await fileTypeStream(sitemapStream);
if (streamWithType.fileType !== undefined) {
contentType = streamWithType.fileType.mime;
}
let isGzipped = false;
if (contentType !== undefined
? contentType === 'application/gzip'
: sitemapUrl.pathname.endsWith('.gz')) {
isGzipped = true;
if (sitemapUrl.pathname.endsWith('.gz')) {
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3);
let retriesLeft = sitemapRetries + 1;
while (retriesLeft-- > 0) {
try {
const sitemapStream = await new Promise((resolve, reject) => {
const request = gotScraping.stream({
url: sitemapUrl,
proxyUrl,
method: 'GET',
timeout: networkTimeouts,
headers: {
'accept': 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8',
},
});
request.on('response', () => resolve(request));
request.on('error', reject);
});
let error = null;
if (sitemapStream.response.statusCode >= 200 && sitemapStream.response.statusCode < 300) {
let contentType = sitemapStream.response.headers['content-type'];
const streamWithType = await fileTypeStream(sitemapStream);
if (streamWithType.fileType !== undefined) {
contentType = streamWithType.fileType.mime;
}
let isGzipped = false;
if (contentType !== undefined
? contentType === 'application/gzip'
: sitemapUrl.pathname.endsWith('.gz')) {
isGzipped = true;
if (sitemapUrl.pathname.endsWith('.gz')) {
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3);
}
}
items = (0, node_stream_1.pipeline)(streamWithType, isGzipped ? (0, node_zlib_1.createGunzip)() : new node_stream_1.PassThrough(), createParser(contentType, sitemapUrl), (e) => {
if (e !== undefined) {
error = e;
}
});
}
items = (0, node_stream_1.pipeline)(streamWithType, isGzipped ? (0, node_zlib_1.createGunzip)() : new node_stream_1.PassThrough(), createParser(contentType, sitemapUrl), (error) => {
if (error !== undefined) {
log_1.default.warning(`Malformed sitemap content: ${sitemapUrl}, ${error}`);
}
});
else {
error = new Error(`Failed to fetch sitemap: ${sitemapUrl}, status code: ${sitemapStream.response.statusCode}`);
}
if (error !== null) {
throw error;
}
break;
}
catch (e) {
log_1.default.warning(`Malformed sitemap content: ${sitemapUrl}, ${retriesLeft === 0 ? 'no retries left.' : 'retrying...'} (${e})`);
}
}
catch (e) {
log_1.default.warning(`Malformed sitemap content: ${sitemapUrl}, ${e}`);
}
}

@@ -242,3 +262,3 @@ else if (source.type === 'raw') {

sources.push({ type: 'url', url: item.url, depth: (source.depth ?? 0) + 1 });
if (options?.emitNestedSitemaps) {
if (emitNestedSitemaps) {
// @ts-ignore

@@ -301,4 +321,4 @@ yield { loc: item.url, originSitemapUrl: null };

*/
static async load(urls, proxyUrl) {
return await this.parse((Array.isArray(urls) ? urls : [urls]).map((url) => ({ type: 'url', url })), proxyUrl);
static async load(urls, proxyUrl, parseSitemapOptions) {
return await this.parse((Array.isArray(urls) ? urls : [urls]).map((url) => ({ type: 'url', url })), proxyUrl, parseSitemapOptions);
}

@@ -313,6 +333,6 @@ /**

}
static async parse(sources, proxyUrl) {
static async parse(sources, proxyUrl, parseSitemapOptions) {
const urls = [];
try {
for await (const item of parseSitemap(sources, proxyUrl)) {
for await (const item of parseSitemap(sources, proxyUrl, parseSitemapOptions)) {
urls.push(item.loc);

@@ -319,0 +339,0 @@ }

{
"name": "@crawlee/utils",
"version": "3.11.2-beta.25",
"version": "3.11.2-beta.26",
"description": "A set of shared utilities that can be used by crawlers",

@@ -52,3 +52,3 @@ "engines": {

"@apify/ps-tree": "^1.2.0",
"@crawlee/types": "3.11.2-beta.25",
"@crawlee/types": "3.11.2-beta.26",
"@types/sax": "^1.2.7",

@@ -74,3 +74,3 @@ "cheerio": "1.0.0-rc.12",

},
"gitHead": "3e6d190fa8346754867ddfeb7060c6539dbcd559"
"gitHead": "4e774e1e3f89ed0fecf996734a93c24ba0292f84"
}

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Socket — SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc