🚀 Socket Launch Week Day 3: Socket Firewall Now Blocks Malicious VS Code and Open VSX Extensions. Learn more
Sign In

srcfull

Package Overview
Dependencies
Maintainers
1
Versions
2
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

srcfull - npm Package Compare versions

Package was removed
Sorry, it seems this package was removed from the registry
Comparing version
2.0.0
to
2.0.1
+1345
dist/chunk-QHQAB7M5.js
import {
RetryableStatusError,
emitDebug,
isRetryableRequestError,
shouldRetryStatus,
sleep,
validatePublicUrl
} from "./chunk-PUJK33LY.js";
// data/patterns.json
/**
 * Curated CDN rewrite rules, keyed by rule name (bundled from data/patterns.json).
 *
 * Fields read by the pattern-matcher code in this file:
 * - `domain` / `domains`: substrings tested against the whole URL with
 *   `String.prototype.includes`; `domains`, when present, is used instead of
 *   `domain`. A `domain` of "*" matches every URL.
 * - `extractSource`: `{ pattern, replacement }`; `pattern` is compiled with
 *   `new RegExp` and applied through `String.prototype.replace`.
 * - `stripParams`: regex sources whose matches are removed globally.
 * - `stripSuffixes`: regex sources removed when they sit directly before the
 *   trailing file extension.
 *
 * `description`, `confidence`, `notes` and `examples` are not read by any code
 * visible in this file. The `generic` entry is skipped by
 * `matchCuratedPattern` and is only reachable by name through
 * `applyCuratedPattern`.
 */
var patterns_default = {
conde_nast: {
domain: "media.",
description: "Cond\xE9 Nast media platform (House & Garden, Vogue, GQ, Wired, Vanity Fair, etc.)",
extractSource: {
pattern: "^(https?://media\\.[^/]+/photos/[a-f0-9]{24})(?:/.*)?$",
replacement: "$1/master/w_2560,c_limit/image.jpg"
},
confidence: "high",
notes: "Raw ID returns 503; must use transformation params. master/w_2560 gives highest quality.",
examples: [
"https://media.houseandgarden.co.uk/photos/63e509b43404638ef031982b/1:1/w_1600%2Cc_limit/filename.jpg",
"https://media.vogue.com/photos/5f3e0c5e9c1a7b2d3c4e5f6a/4:3/w_800/image.jpg"
]
},
cloudinary_path: {
domain: "res.cloudinary.com",
description: "Cloudinary with path-based transformations",
extractSource: {
pattern: "^(https?://res\\.cloudinary\\.com/[^/]+/image/upload)/[^/]+/(.+)$",
replacement: "$1/$2"
},
confidence: "high",
examples: [
"https://res.cloudinary.com/demo/image/upload/w_400,h_300,c_fill/v1234/sample.jpg"
]
},
cloudflare_images: {
domain: "cdn-cgi/image",
description: "Cloudflare Image Resizing",
extractSource: {
pattern: "^https?://[^/]+/cdn-cgi/image/[^/]+/(.+)$",
replacement: "$1"
},
confidence: "high",
examples: [
"https://example.com/cdn-cgi/image/width=800,quality=80/https://example.com/original.jpg"
]
},
fastly_io: {
domain: "fastly.io",
description: "Fastly Image Optimizer with query parameter transforms",
stripParams: [
"[?&]width=\\d+",
"[?&]height=\\d+",
"[?&]quality=\\d+",
"[?&]fit=[^&#]+",
"[?&]crop=[^&#]+",
"[?&]dpr=[\\d.]+",
"[?&]format=[^&#]+",
"[?&]bg-color=[^&#]+"
],
confidence: "medium",
examples: ["https://www.fastly.io/image.jpg?width=300"]
},
shopify: {
domain: "/cdn/shop/",
domains: ["cdn.shopify.com", "/cdn/shop/"],
description: "Shopify CDN",
stripParams: [
"[?&]v=[^&#]+",
"[?&]width=\\d+",
"[?&]originalWidth=\\d+",
"[?&]originalHeight=\\d+"
],
stripSuffixes: [
"_small",
"_medium",
"_large",
"_grande",
"_master",
"_\\d+x\\d*",
"_\\d*x\\d+"
],
confidence: "high",
examples: [
"https://polinas-potent-potions.myshopify.com/cdn/shop/files/science-beakers-blue-light-new_large.jpg?v=1683744744"
]
},
sanity: {
domain: "cdn.sanity.io",
description: "Sanity.io CDN",
stripParams: ["\\?.*"],
confidence: "high",
examples: [
"https://cdn.sanity.io/images/project/production/abc123-800x600.jpg?w=400"
]
},
imgix: {
domain: ".imgix.net",
description: "imgix image CDN",
stripParams: [
"[?&]w=\\d+",
"[?&]h=\\d+",
"[?&]fit=\\w+",
"[?&]auto=[\\w,]+",
"[?&]q=\\d+",
"[?&]fm=\\w+",
"[?&]dpr=[\\d.]+",
"[?&]crop=[\\w,]+"
],
confidence: "high",
examples: [
"https://assets.imgix.net/image.jpg?w=800&h=600&fit=crop&auto=format"
]
},
wordpress: {
domain: "wp-content/uploads",
description: "WordPress media uploads",
stripSuffixes: ["-\\d+x\\d+", "-scaled"],
confidence: "medium",
examples: [
"https://example.com/wp-content/uploads/2024/01/image-800x600.jpg"
]
},
squarespace: {
domain: "squarespace.com/static/",
domains: [
"images.squarespace-cdn.com/content/",
"squarespace.com/static/"
],
description: "Squarespace image CDN",
stripParams: ["[?&]format=\\w+", "[?&]content-type=[^&#]+"],
confidence: "medium",
examples: [
"https://static1.squarespace.com/static/5134cbefe4b0c6fb04df8065/t/69b2d4e7b86fcf5aa700ae16/1773327591839/2025-homepage-thumbnail.png?format=1500w"
]
},
contentful: {
domain: "images.ctfassets.net",
domains: ["images.ctfassets.net", "images-www.contentful.com"],
description: "Contentful CDN",
stripParams: [
"[?&]w=\\d+",
"[?&]h=\\d+",
"[?&]q=\\d+",
"[?&]fm=\\w+",
"[?&]fit=\\w+"
],
confidence: "high",
examples: [
"https://images.ctfassets.net/space/asset.jpg?w=800&h=600&fm=webp"
]
},
prismic: {
domain: "images.prismic.io",
description: "Prismic CDN",
stripParams: [
"[?&]w=\\d+",
"[?&]h=\\d+",
"[?&]q=\\d+",
"[?&]auto=[\\w,]+"
],
confidence: "high",
examples: [
"https://images.prismic.io/project/image.jpg?w=800&h=600&auto=format"
]
},
bunny_cdn: {
domain: "b-cdn.net",
description: "Bunny CDN",
stripParams: ["[?&]width=\\d+", "[?&]height=\\d+", "[?&]quality=\\d+"],
confidence: "medium",
examples: ["https://example.b-cdn.net/image.jpg?width=800&height=600"]
},
imagekit: {
domain: "ik.imagekit.io",
description: "ImageKit URL transformations",
extractSource: {
pattern: "^(https?://ik\\.imagekit\\.io/[^/]+)/(?:tr:[^/]+/)(.+)$",
replacement: "$1/$2"
},
confidence: "high",
examples: [
"https://ik.imagekit.io/ikmedia/tr:w-200/docs_images/examples/example_fashion_1.jpg"
]
},
storyblok: {
domain: "a.storyblok.com",
description: "Storyblok Image Service",
extractSource: {
pattern: "^(https?://a\\.storyblok\\.com/f/\\d+/\\d+x\\d+/[^/]+/[^/?]+)(?:/m/[^?]+)?(?:\\?.*)?$",
replacement: "$1"
},
confidence: "high",
examples: [
"https://a.storyblok.com/f/212319/3174x2381/f525b9e092/demo-sunflower.png/m/500x300/filters:quality(80):brightness(-10)"
]
},
builder: {
domain: "cdn.builder.io/api/v1/image/",
domains: ["cdn.builder.io/api/v1/image/", "builder.io/api/v1/image/"],
description: "Builder.io Image API",
stripParams: [
"[?&]width=\\d+",
"[?&]height=\\d+",
"[?&]quality=\\d+",
"[?&]fit=[^&#]+",
"[?&]position=[^&#]+",
"[?&]format=[^&#]+"
],
confidence: "high",
examples: [
"https://cdn.builder.io/api/v1/image/assets%2FYJIGb4i01jvw0SRdL5Bt%2F869bfbaec9c64415ae68235d9b7b1425?width=500"
]
},
generic: {
domain: "*",
stripParams: [
"[?&]w=\\d+",
"[?&]h=\\d+",
"[?&]width=\\d+",
"[?&]height=\\d+",
"[?&]resize=\\d+(?:,\\d+)?",
"[?&]size=\\d+",
"[?&]quality=\\d+",
"[?&]q=\\d+"
],
confidence: "low",
description: "Common resizing parameters to try stripping"
}
};
// src/pattern-matcher.ts
// Alias for the bundled rule table; looked up by name in applyCuratedPattern
// and iterated in matchCuratedPattern.
var curatedPatterns = patterns_default;
// Query-string repair regexes applied by tryStripParams after parameters have
// been removed:
// "?&" is collapsed to "?".
var FIX_QUERY_PARAM_SEPARATORS_REGEX = /\?&/g;
// Runs of two or more "&" are collapsed to a single "&".
var FIX_DUPLICATE_AMPERSANDS_REGEX = /&&+/g;
// Any "?" / "&" characters left dangling at the very end of the URL are dropped.
var STRIP_TRAILING_QUERY_SEPARATORS_REGEX = /[?&]+$/g;
/**
 * Rewrites `url` with an `extractSource` rule (`{ pattern, replacement }`).
 *
 * When the rewritten value is not an absolute http(s) URL it is resolved
 * against the origin of the input URL.
 *
 * @param {string} url - URL to rewrite.
 * @param {{pattern: string, replacement: string}} extractSource - Regex source
 *   and replacement string.
 * @returns {string|null} The rewritten URL, or `null` when the pattern does
 *   not match or anything throws (invalid regex, unparsable URL, ...).
 */
function tryExtractSource(url, extractSource) {
  try {
    const { pattern, replacement } = extractSource;
    const matcher = new RegExp(pattern);
    if (matcher.test(url)) {
      const rewritten = url.replace(matcher, replacement);
      const isAbsolute = /^https?:\/\//.test(rewritten);
      // Relative results (e.g. a bare path captured from the URL) are
      // anchored at the root of the original origin.
      return isAbsolute
        ? rewritten
        : new URL(rewritten, `${new URL(url).origin}/`).toString();
    }
    return null;
  } catch {
    return null;
  }
}
/**
 * Removes every match of the given query-parameter regexes from `url` and
 * repairs the separators that are left behind.
 *
 * A match that begins with "?" is replaced by a bare "?" instead of being
 * deleted outright; otherwise stripping the first parameter of
 * `a.jpg?w=800&id=5` would yield the broken `a.jpg&id=5`. The clean-up pass
 * then collapses "&&" runs, turns "?&" into "?", and drops separators that
 * dangle at the end of the URL or right before a "#fragment".
 *
 * @param {string} url - URL to clean.
 * @param {Iterable<string>} stripParams - Regex sources (compiled with the
 *   "g" flag); invalid sources are skipped.
 * @returns {string|null} The cleaned URL, or `null` when nothing changed.
 */
function tryStripParams(url, stripParams) {
  let cleanUrl = url;
  for (const param of stripParams) {
    try {
      cleanUrl = cleanUrl.replace(
        new RegExp(param, "g"),
        // Keep the query introducer so any remaining parameters stay in the query.
        (match) => (match.startsWith("?") ? "?" : "")
      );
    } catch {
      // Deliberate best-effort: a malformed rule must not block the other rules.
    }
  }
  cleanUrl = cleanUrl
    .replace(/&&+/g, "&")
    .replace(/\?&/g, "?")
    .replace(/[?&]+(?=#|$)/g, "");
  return cleanUrl === url ? null : cleanUrl;
}
/**
 * Removes size-style suffixes that sit directly before the trailing file
 * extension of `url` (e.g. `image-800x600.jpg` -> `image.jpg`).
 *
 * Each suffix is a regex source; a suffix that fails to compile is skipped.
 *
 * @param {string} url - URL to clean.
 * @param {Iterable<string>} stripSuffixes - Regex sources to remove.
 * @returns {string|null} The cleaned URL, or `null` when nothing changed.
 */
function tryStripSuffixes(url, stripSuffixes) {
  const stripped = [...stripSuffixes].reduce((current, suffix) => {
    try {
      const beforeExtension = new RegExp(`${suffix}(\\.\\w+)$`);
      return current.replace(beforeExtension, "$1");
    } catch {
      return current;
    }
  }, url);
  return stripped !== url ? stripped : null;
}
/**
 * Runs the transformation stages of one curated rule against `url`, in the
 * fixed order extractSource -> stripParams -> stripSuffixes. Each stage works
 * on the output of the previous successful stage.
 *
 * @param {string} url - URL to transform.
 * @param {object} pattern - Curated rule; missing stages are skipped.
 * @returns {string|null} The transformed URL if at least one stage produced a
 *   result, otherwise `null`.
 */
function tryApplyCuratedPattern(url, pattern) {
  const stages = [
    [pattern.extractSource, tryExtractSource],
    [pattern.stripParams, tryStripParams],
    [pattern.stripSuffixes, tryStripSuffixes]
  ];
  let result = url;
  let anyStageApplied = false;
  for (const [rule, applyStage] of stages) {
    if (!rule) {
      continue;
    }
    const next = applyStage(result, rule);
    if (next) {
      result = next;
      anyStageApplied = true;
    }
  }
  return anyStageApplied ? result : null;
}
/**
 * Tells whether a curated rule applies to `url`.
 *
 * A rule whose `domain` is "*" always applies. Otherwise the rule applies when
 * the URL contains any entry of `domains` (or `domain` when `domains` is
 * absent) as a plain substring.
 *
 * @param {string} url - URL to test.
 * @param {{domain: string, domains?: string[]}} pattern - Curated rule.
 * @returns {boolean} Whether the rule applies.
 */
function patternMatchesUrl(url, pattern) {
  const { domain, domains } = pattern;
  if (domain === "*") {
    return true;
  }
  for (const needle of domains ?? [domain]) {
    if (url.includes(needle)) {
      return true;
    }
  }
  return false;
}
/**
 * Applies one curated rule, selected by name, to `url`.
 *
 * @param {string} url - URL to transform.
 * @param {string} patternName - Key into the curated rule table.
 * @returns {string|null} The transformed URL, or `null` when the rule does not
 *   exist, does not apply to the URL, or changes nothing.
 */
function applyCuratedPattern(url, patternName) {
  const pattern = curatedPatterns[patternName];
  const isApplicable = Boolean(pattern) && patternMatchesUrl(url, pattern);
  return isApplicable ? tryApplyCuratedPattern(url, pattern) : null;
}
/**
 * Tries every curated rule except `generic`, in table order, and returns the
 * first transformation that succeeds.
 *
 * @param {string} url - URL to transform.
 * @returns {string|null} The first successful transformation, or `null`.
 */
function matchCuratedPattern(url) {
  for (const name of Object.keys(curatedPatterns)) {
    if (name === "generic") {
      continue;
    }
    const pattern = curatedPatterns[name];
    const resolved = patternMatchesUrl(url, pattern)
      ? tryApplyCuratedPattern(url, pattern)
      : null;
    if (resolved) {
      return resolved;
    }
  }
  return null;
}
/**
 * Applies a learned pattern (`{ matchRegex, transform }`) to `url`.
 *
 * @param {string} url - URL to transform.
 * @param {{matchRegex: string, transform: string}} pattern - Regex source and
 *   replacement string.
 * @returns {string|null} The transformed URL, or `null` when the regex does
 *   not match or anything throws (e.g. an invalid regex).
 */
function applyPattern(url, pattern) {
  try {
    const { matchRegex, transform } = pattern;
    const matcher = new RegExp(matchRegex);
    if (!matcher.test(url)) {
      return null;
    }
    return url.replace(matcher, transform);
  } catch {
    return null;
  }
}
// src/concurrency.ts
/**
 * Creates a concurrency limiter.
 *
 * The returned `limit(fn)` queues `fn` and returns a promise that settles with
 * the outcome of `fn()`. At most `concurrency` queued functions run at the
 * same time; the rest start, in FIFO order, as running ones finish.
 *
 * @param {number} concurrency - Maximum simultaneous tasks. Floored; values
 *   that floor to 0, NaN, or less than 1 are treated as 1.
 * @returns {(fn: () => any) => Promise<any>} The limiter function.
 */
function createLimiter(concurrency) {
  const maxRunning = Math.max(1, Math.floor(concurrency) || 1);
  const pending = [];
  let running = 0;

  // Starts queued tasks while there is spare capacity.
  const pump = () => {
    while (running < maxRunning && pending.length > 0) {
      const task = pending.shift();
      if (task) {
        void execute(task);
      }
    }
  };

  // Runs one task, forwards its outcome, then frees the slot.
  const execute = async (task) => {
    running += 1;
    try {
      const value = await task.fn();
      task.resolve(value);
    } catch (error) {
      task.reject(error);
    } finally {
      running -= 1;
      pump();
    }
  };

  return function limit(fn) {
    return new Promise((resolve, reject) => {
      pending.push({ fn, resolve, reject });
      pump();
    });
  };
}
// Module-wide limiter with a concurrency of 5; validateImageUrl runs its
// HEAD/GET probing through it.
var httpLimiter = createLimiter(5);
// src/validator.ts
// Abort timeout applied to each individual request attempt in requestImage (ms).
var REQUEST_TIMEOUT_MS = 5e3;
// User-Agent header sent with validation requests.
var USER_AGENT = "Mozilla/5.0 (compatible; Srcfull/2.0)";
// Retries performed in addition to the first attempt when options.retryCount is nullish.
var DEFAULT_RETRY_COUNT = 1;
// Base delay between attempts when options.retryDelayMs is nullish;
// requestImage multiplies it by the attempt number.
var DEFAULT_RETRY_DELAY_MS = 500;
/**
 * Extracts the full resource size in bytes from response headers.
 *
 * The total from `Content-Range` (`bytes 0-0/12345` -> 12345) is preferred,
 * because on a ranged response `Content-Length` only describes the returned
 * slice (1 byte for a `Range: bytes=0-0` request). When the header is absent
 * or its total is unknown (`*`) or unparsable, `Content-Length` is used.
 *
 * @param {{headers: {get(name: string): string|null}}} response - Response-like
 *   object exposing `headers.get`.
 * @returns {number|undefined} The size in bytes, or `undefined` when neither
 *   header yields a number.
 */
function parseSize(response) {
  const toByteCount = (raw) => {
    if (!raw) {
      return void 0;
    }
    const parsed = Number.parseInt(raw, 10);
    return Number.isFinite(parsed) ? parsed : void 0;
  };
  const contentRange = response.headers.get("content-range");
  const total = toByteCount(contentRange?.split("/")[1]);
  if (total !== void 0) {
    return total;
  }
  return toByteCount(response.headers.get("content-length"));
}
/**
 * Sends one HTTP request for `url`, retrying on retryable statuses and
 * retryable request errors.
 *
 * @param {string} url - URL passed to `fetch` unchanged.
 * @param {string} method - HTTP method; "GET" additionally sends
 *   `Range: bytes=0-0`.
 * @param {object} options - Reads `retryCount`, `retryDelayMs` and `onDebug`.
 * @returns {Promise<Response>} The first response that is not retried; it may
 *   still be a non-OK response.
 * @throws Rethrows the request error when it is not retryable or when no
 *   retries are left.
 *
 * NOTE(review): `fetch` is called without a `redirect` option, so redirects
 * are presumably followed automatically; confirm whether redirect targets
 * need the same public-URL validation that callers apply to `url`.
 */
async function requestImage(url, method, options) {
const retryCount = Math.max(
0,
Math.floor(options.retryCount ?? DEFAULT_RETRY_COUNT)
);
const retryDelayMs = Math.max(
0,
Math.floor(options.retryDelayMs ?? DEFAULT_RETRY_DELAY_MS)
);
// One initial attempt plus `retryCount` retries.
for (let attempt = 1; attempt <= retryCount + 1; attempt += 1) {
// Each attempt gets its own abort timer.
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
try {
const response = await fetch(url, {
method,
headers: {
Accept: "image/*",
"User-Agent": USER_AGENT,
...method === "GET" ? { Range: "bytes=0-0" } : {}
},
signal: controller.signal
});
// Retry on a retryable status only while retries remain; on the last
// attempt the response is returned as-is.
if (attempt <= retryCount && shouldRetryStatus(response.status)) {
emitDebug(options.onDebug, {
type: "validate:retry",
message: `${method} returned ${response.status} for ${url}`,
url,
method,
status: response.status,
attempt
});
// The wait grows linearly with the attempt number.
await sleep(retryDelayMs * attempt);
continue;
}
return response;
} catch (error) {
if (!isRetryableRequestError(error) || attempt > retryCount) {
throw error;
}
emitDebug(options.onDebug, {
type: "validate:retry",
message: `${method} failed for ${url}`,
url,
method,
attempt,
error: error instanceof Error ? error.message : String(error)
});
await sleep(retryDelayMs * attempt);
} finally {
// Runs on every exit from the try/catch, including `continue` and `return`.
clearTimeout(timeoutId);
}
}
// Only reached when the loop body never runs (e.g. retryCount evaluates to NaN).
throw new RetryableStatusError(503, `Unable to validate ${url}`);
}
/**
 * Checks that `url` is a public URL that serves an image.
 *
 * The URL is first vetted with `validatePublicUrl`. The request work then runs
 * inside the shared `httpLimiter`: a HEAD request is tried first, then a GET.
 * The first method that returns an OK response whose content type starts with
 * "image/" wins.
 *
 * @param {string} url - URL to validate.
 * @param {object} [options] - `onDebug` receives debug events; the whole
 *   object is forwarded to `requestImage`.
 * @returns {Promise<{valid: boolean, contentType?: string, size?: number}>}
 *   `{ valid: true, contentType, size }` on success, otherwise
 *   `{ valid: false }`. Request errors are reported through `onDebug` and never
 *   rejected.
 */
async function validateImageUrl(url, options = {}) {
  const debug = (event) => emitDebug(options.onDebug, event);
  const publicUrl = validatePublicUrl(url);
  const safeUrl = publicUrl.url?.href;
  if (!(publicUrl.valid && safeUrl)) {
    debug({
      type: "validate:rejected",
      message: publicUrl.error ?? `Rejected ${url}`,
      url
    });
    return { valid: false };
  }

  // Tries HEAD, then GET, and returns on the first image response.
  const probe = async () => {
    for (const method of ["HEAD", "GET"]) {
      const response = await requestImage(safeUrl, method, options);
      if (!response.ok) {
        debug({
          type: "validate:status",
          message: `${method} returned ${response.status} for ${safeUrl}`,
          url: safeUrl,
          method,
          status: response.status
        });
        continue;
      }
      const contentType = response.headers.get("content-type") ?? "";
      if (!contentType.startsWith("image/")) {
        debug({
          type: "validate:content_type",
          message: `${method} returned non-image content for ${safeUrl}`,
          url: safeUrl,
          method,
          metadata: {
            contentType
          }
        });
        continue;
      }
      const size = parseSize(response);
      debug({
        type: "validate:success",
        message: `${method} validated ${safeUrl}`,
        url: safeUrl,
        method,
        metadata: {
          contentType,
          size
        }
      });
      return {
        valid: true,
        contentType,
        size
      };
    }
    debug({
      type: "validate:failed",
      message: `Validation failed for ${safeUrl}`,
      url: safeUrl
    });
    return { valid: false };
  };

  return httpLimiter(async () => {
    try {
      return await probe();
    } catch (error) {
      debug({
        type: "validate:error",
        message: `Validation threw for ${safeUrl}`,
        url: safeUrl,
        error: error instanceof Error ? error.message : String(error)
      });
      return { valid: false };
    }
  });
}
// src/prober.ts
// Matches a whole path segment of the form "<digits>:<digits>" (e.g. "4:3");
// pathVariantCandidates replaces such segments with each PATH_VARIANTS entry.
var ASPECT_RATIO_SEGMENT_REGEX = /^\d+:\d+$/;
// Query parameter names deleted by stripResizeParamsCandidate.
var RESIZE_PARAMS = [
"w",
"h",
"width",
"height",
"size",
"resize",
"q",
"quality",
"fit",
"crop",
"auto",
"fm",
"format",
"dpr",
"scale",
"blur",
"sharp"
];
// Filename suffixes removed by stripSizeSuffixCandidates. Capture group 1 is
// the file extension, which is kept.
var SIZE_SUFFIXES = [
/_\d+x\d+(\.\w+)$/,
/-\d+x\d+(\.\w+)$/,
/_(?:small|medium|large|thumb|thumbnail)(\.\w+)$/i,
/-(?:small|medium|large|thumb|thumbnail)(\.\w+)$/i
];
// Replacement segment names tried in place of an aspect-ratio path segment.
var PATH_VARIANTS = ["master", "original", "full", "source", "raw"];
/**
 * Appends `url` to `candidates` unless it is falsy or already present.
 *
 * @param {string[]} candidates - List mutated in place.
 * @param {string|null|undefined} url - Value to append.
 */
function pushUnique(candidates, url) {
  if (!url) {
    return;
  }
  if (candidates.includes(url)) {
    return;
  }
  candidates.push(url);
}
/**
 * Appends every entry of `urls` to `candidates` via `pushUnique`, preserving
 * order and skipping falsy values and duplicates.
 *
 * @param {string[]} candidates - List mutated in place.
 * @param {Iterable<string|null|undefined>} urls - Values to append.
 */
function pushManyUnique(candidates, urls) {
  Array.from(urls).forEach((url) => {
    pushUnique(candidates, url);
  });
}
/**
 * Builds a probe candidate by deleting all known resize query parameters.
 *
 * A candidate is only produced when at least one parameter was actually
 * removed. Comparing the serialized URL with the raw input is not reliable for
 * this: parsing alone can normalize the string (host case, a missing trailing
 * slash, query re-encoding) and would yield a "candidate" that points at the
 * very same resource.
 *
 * @param {string} originalUrl - Absolute URL to derive the candidate from.
 * @returns {string|null} The URL without resize parameters, or `null` when it
 *   carries none of them.
 * @throws {TypeError} When `originalUrl` cannot be parsed as a URL.
 */
function stripResizeParamsCandidate(originalUrl) {
  const strippedUrl = new URL(originalUrl);
  let removedAny = false;
  for (const param of RESIZE_PARAMS) {
    // Guarding with has() also leaves the query string untouched when there
    // is nothing to delete.
    if (strippedUrl.searchParams.has(param)) {
      strippedUrl.searchParams.delete(param);
      removedAny = true;
    }
  }
  return removedAny ? strippedUrl.href : null;
}
/**
 * Builds probe candidates that request a larger rendition: for each of the
 * `w` and `width` parameters present, the value is set to "2560" and the
 * matching `h` / `height` parameter is removed.
 *
 * @param {URL} urlObj - Parsed source URL (not mutated).
 * @returns {string[]} Zero, one or two candidate URLs, `w` first.
 */
function largerDimensionCandidates(urlObj) {
  const dimensionPairs = [
    ["w", "h"],
    ["width", "height"]
  ];
  return dimensionPairs
    .filter(([widthParam]) => urlObj.searchParams.has(widthParam))
    .map(([widthParam, heightParam]) => {
      const enlarged = new URL(urlObj.href);
      enlarged.searchParams.set(widthParam, "2560");
      enlarged.searchParams.delete(heightParam);
      return enlarged.href;
    });
}
/**
 * Builds a probe candidate consisting of origin and path only (query string
 * and fragment dropped).
 *
 * @param {URL} urlObj - Parsed source URL.
 * @param {string} originalUrl - The raw source URL, used for comparison.
 * @returns {string|null} The candidate, or `null` when it equals `originalUrl`.
 */
function noQueryCandidate(urlObj, originalUrl) {
  const { origin, pathname } = urlObj;
  const bareUrl = `${origin}${pathname}`;
  if (bareUrl === originalUrl) {
    return null;
  }
  return bareUrl;
}
/**
 * Builds probe candidates by removing a size suffix from the file name, one
 * candidate per `SIZE_SUFFIXES` pattern that matches the path. The query
 * string is kept.
 *
 * @param {URL} urlObj - Parsed source URL.
 * @returns {string[]} Candidate URLs in `SIZE_SUFFIXES` order.
 */
function stripSizeSuffixCandidates(urlObj) {
  const { origin, pathname, search } = urlObj;
  return SIZE_SUFFIXES.flatMap((suffixPattern) => {
    const match = pathname.match(suffixPattern);
    if (!match) {
      return [];
    }
    // match[1] is the captured file extension.
    return [origin + pathname.replace(suffixPattern, match[1]) + search];
  });
}
/**
 * Builds probe candidates by replacing every aspect-ratio path segment
 * (e.g. "4:3") with each entry of `PATH_VARIANTS`. The query string is kept.
 *
 * @param {URL} urlObj - Parsed source URL.
 * @returns {string[]} One candidate per variant, or an empty list when the
 *   path has no aspect-ratio segment.
 */
function pathVariantCandidates(urlObj) {
  const { origin, pathname, search } = urlObj;
  const segments = pathname.split("/");
  const originalPath = segments.join("/");
  return PATH_VARIANTS
    .map((variant) =>
      segments
        .map((segment) => (ASPECT_RATIO_SEGMENT_REGEX.test(segment) ? variant : segment))
        .join("/")
    )
    .filter((variantPath) => variantPath !== originalPath)
    .map((variantPath) => origin + variantPath + search);
}
/**
 * Produces the ordered, de-duplicated list of URLs worth probing for a larger
 * version of `url`: resize parameters stripped, larger dimensions requested,
 * query removed, size suffixes removed, and aspect-ratio path variants.
 *
 * @param {string} url - Source image URL.
 * @returns {string[]} Candidate URLs; empty when `url` cannot be parsed or any
 *   generator throws.
 */
function generateProbeCandidates(url) {
  try {
    const parsed = new URL(url);
    const ordered = [
      stripResizeParamsCandidate(url),
      ...largerDimensionCandidates(parsed),
      noQueryCandidate(parsed, url),
      ...stripSizeSuffixCandidates(parsed),
      ...pathVariantCandidates(parsed)
    ];
    const unique = [];
    for (const candidate of ordered) {
      // pushUnique drops null entries and duplicates.
      pushUnique(unique, candidate);
    }
    return unique;
  } catch {
    return [];
  }
}
/**
 * Validates each probe candidate in turn and keeps the one reporting the
 * largest size, provided it is larger than `originalSize`.
 *
 * @param {string} originalUrl - Source image URL.
 * @param {number} [originalSize=0] - Size to beat, in bytes.
 * @param {(url: string) => Promise<{valid: boolean, size?: number}>} [validate]
 *   Validator; defaults to `validateImageUrl`.
 * @returns {Promise<{url: string, size: number, method: "probed"}|null>} The
 *   best candidate, or `null` when none beats the original.
 */
async function probeForSource(originalUrl, originalSize = 0, validate = validateImageUrl) {
  let best = { url: originalUrl, size: originalSize };
  // Candidates are validated one at a time, in generation order.
  for (const candidate of generateProbeCandidates(originalUrl)) {
    const validation = await validate(candidate);
    const isImprovement = validation.valid && validation.size && validation.size > best.size;
    if (isImprovement) {
      best = { url: candidate, size: validation.size };
    }
  }
  if (best.url === originalUrl) {
    return null;
  }
  return { url: best.url, size: best.size, method: "probed" };
}
// src/resolve.ts
/**
 * Formats how many times larger the resolved image is than the original,
 * with one decimal place (e.g. "2.5x").
 *
 * @param {number|undefined} original - Original size in bytes.
 * @param {number|undefined} resolved - Resolved size in bytes.
 * @returns {string|undefined} The ratio, or `undefined` when either size is
 *   missing or zero.
 */
function calculateSizeIncrease(original, resolved) {
  if (!original || !resolved) {
    return void 0;
  }
  const ratio = resolved / original;
  return `${ratio.toFixed(1)}x`;
}
/**
 * Stores a resolution in the optional cache, best-effort.
 *
 * Nothing happens (and no debug event is emitted) when no cache is
 * configured. A failing `cache.set` is reported as a "cache:error" debug event
 * instead of rejecting, so a cache outage cannot turn a successful resolution
 * into a failure; cache reads are treated as best-effort in the same way.
 *
 * @param {string} original - Source image URL (cache key).
 * @param {string} resolved - Resolved image URL.
 * @param {object} options - Reads `cache` (object with `set`) and `onDebug`.
 * @param {*} [patternId] - Identifier of the learned pattern that produced
 *   the result, forwarded to `cache.set`.
 * @returns {Promise<void>}
 */
async function cacheResult(original, resolved, options, patternId) {
  if (!options.cache) {
    return;
  }
  try {
    await options.cache.set(original, resolved, patternId);
  } catch (error) {
    emitDebug(options.onDebug, {
      type: "cache:error",
      message: `Failed to store cache entry for ${original}`,
      url: original,
      error: error instanceof Error ? error.message : String(error)
    });
    return;
  }
  emitDebug(options.onDebug, {
    type: "cache:write",
    message: `Stored cache entry for ${original}`,
    url: original,
    metadata: {
      resolved,
      patternId
    }
  });
}
/**
 * Persists an exact-match pattern mapping `original` to `resolved` in the
 * optional pattern store.
 *
 * The stored regex matches `original` literally. Because `applyPattern` uses
 * the stored transform as a `String.prototype.replace` replacement string,
 * every "$" in `resolved` is escaped as "$$"; otherwise sequences such as
 * "$&" or "$1" inside the URL would be expanded when the pattern is applied.
 *
 * @param {string} original - Source image URL.
 * @param {string} resolved - Resolved image URL.
 * @param {object} options - Reads `patternStore` (object with `save`).
 * @returns {Promise<void>} Never rejects; failures are ignored.
 */
async function learnPattern(original, resolved, options) {
  if (!options.patternStore) {
    return;
  }
  try {
    const domain = new URL(original).hostname;
    const escaped = original.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const transform = resolved.replace(/\$/g, "$$$$");
    await options.patternStore.save(domain, `^${escaped}$`, transform);
  } catch {
    // Deliberate best-effort: learning is an optimisation and must not fail
    // the resolution that triggered it.
  }
}
/**
 * Resolves an image URL to a higher-quality source URL.
 *
 * Strategies are tried in this order; the first one that succeeds returns:
 *   1. cache lookup (`options.cache.get`) -> method "cached"
 *   2. curated CDN pattern, validated -> method "pattern", confidence 0.95
 *   3. learned patterns from `options.patternStore`, validated -> method
 *      "learned", confidence taken from the pattern
 *   4. probing generated candidates -> method "probed", confidence 0.5
 *   5. otherwise the input URL is returned unchanged -> method "fallback"
 *
 * @param {string} imageUrl - Source image URL.
 * @param {object} [options] - Reads `validate`, `cache`, `patternStore`,
 *   `onDebug`, `retryCount` and `retryDelayMs`.
 * @returns {Promise<object>} `{ original, resolved, method }` plus
 *   `confidence` and `sizeIncrease` for the pattern/learned/probed methods.
 */
async function resolveImageUrl(imageUrl, options = {}) {
// A caller-supplied validator takes precedence over the built-in one.
const validate = options.validate ?? ((url) => validateImageUrl(url, {
onDebug: options.onDebug,
retryCount: options.retryCount,
retryDelayMs: options.retryDelayMs
}));
const original = imageUrl;
emitDebug(options.onDebug, {
type: "resolve:start",
message: `Resolving ${imageUrl}`,
url: imageUrl
});
// 1. Cache lookup. Errors from the cache are ignored.
try {
const cached = await options.cache?.get(imageUrl);
if (cached) {
emitDebug(options.onDebug, {
type: "resolve:cached",
message: `Cache hit for ${imageUrl}`,
url: imageUrl,
metadata: {
resolved: cached
}
});
return { original, resolved: cached, method: "cached" };
}
} catch {
}
// The original's size is the baseline for sizeIncrease and for probing;
// it is 0 when the validator reports no size.
const originalValidation = await validate(imageUrl);
const originalSize = originalValidation.size ?? 0;
// 2. Curated pattern: accepted as soon as the rewritten URL validates.
const curated = matchCuratedPattern(imageUrl);
if (curated) {
emitDebug(options.onDebug, {
type: "pattern:curated",
message: `Curated pattern matched ${imageUrl}`,
url: imageUrl,
metadata: {
resolved: curated
}
});
const validation = await validate(curated);
if (validation.valid) {
await cacheResult(imageUrl, curated, options);
return {
original,
resolved: curated,
method: "pattern",
confidence: 0.95,
sizeIncrease: calculateSizeIncrease(originalSize, validation.size)
};
}
emitDebug(options.onDebug, {
type: "pattern:rejected",
message: `Curated pattern candidate was rejected for ${imageUrl}`,
url: imageUrl,
metadata: {
resolved: curated
}
});
}
// 3. Learned patterns for the URL's hostname. Any error in this stage is
// ignored and resolution continues with probing.
if (options.patternStore) {
try {
const domain = new URL(imageUrl).hostname;
const patterns = await options.patternStore.findByDomain(domain);
emitDebug(options.onDebug, {
type: "pattern:loaded",
message: `Loaded ${patterns.length} learned patterns for ${domain}`,
url: imageUrl,
metadata: {
domain,
count: patterns.length
}
});
for (const pattern of patterns) {
const resolved = applyPattern(imageUrl, pattern);
if (!resolved) {
continue;
}
const validation = await validate(resolved);
if (!validation.valid) {
// incrementFailure is optional on the store; the domain is used as the
// key when the pattern has no id.
await options.patternStore.incrementFailure?.(
pattern.id ?? pattern.domain
);
emitDebug(options.onDebug, {
type: "pattern:rejected",
message: `Learned pattern candidate was rejected for ${imageUrl}`,
url: imageUrl,
metadata: {
patternId: pattern.id,
resolved
}
});
continue;
}
if (pattern.id !== void 0) {
await options.patternStore.incrementSuccess(pattern.id);
}
await cacheResult(imageUrl, resolved, options, pattern.id);
emitDebug(options.onDebug, {
type: "pattern:applied",
message: `Learned pattern resolved ${imageUrl}`,
url: imageUrl,
metadata: {
patternId: pattern.id,
resolved
}
});
return {
original,
resolved,
method: "learned",
confidence: pattern.confidence,
sizeIncrease: calculateSizeIncrease(originalSize, validation.size)
};
}
} catch {
}
}
// 4. Probing: a hit is learned and cached before it is returned. Errors are
// reported through onDebug and lead to the fallback below.
try {
const probeResult = await probeForSource(imageUrl, originalSize, validate);
if (probeResult) {
await learnPattern(imageUrl, probeResult.url, options);
await cacheResult(imageUrl, probeResult.url, options);
emitDebug(options.onDebug, {
type: "probe:resolved",
message: `Probe improved ${imageUrl}`,
url: imageUrl,
metadata: {
resolved: probeResult.url,
size: probeResult.size
}
});
return {
original,
resolved: probeResult.url,
method: "probed",
confidence: 0.5,
sizeIncrease: calculateSizeIncrease(originalSize, probeResult.size)
};
}
} catch (error) {
emitDebug(options.onDebug, {
type: "probe:failed",
message: `Probe failed for ${imageUrl}`,
url: imageUrl,
error: error instanceof Error ? error.message : String(error)
});
}
// 5. Nothing better was found: hand back the input URL.
emitDebug(options.onDebug, {
type: "resolve:fallback",
message: `Falling back to original URL for ${imageUrl}`,
url: imageUrl
});
return { original, resolved: imageUrl, method: "fallback" };
}
// src/extract.ts
import { load } from "cheerio";
// Finds absolute http(s) URLs that end in a known image extension, optionally
// followed by a query string. Used by extractImageUrlsFromRaw on raw HTML text.
var IMAGE_URL_REGEX = /https?:\/\/[^\s"'<>]+\.(jpg|jpeg|png|webp|gif|avif|jfif)(?:\?[^\s"'<>]*)?/gi;
// A raw URL that is not on the source domain is kept only if it matches one of
// these patterns (see extractImageUrlsFromRaw).
var CDN_PATTERNS = [
/amazonaws\.com/,
/cloudfront\.net/,
/cloudinary\.com/,
/imgix\.net/,
/akamaihd\.net/,
/fastly\.net/,
/staticmedia\./,
/media\..*\.com/,
/cdn\.shopify\.com/,
/squarespace-cdn\.com/,
/ctfassets\.net/,
/sanity\.io/
];
// Raw URLs matching any of these patterns are dropped by extractImageUrlsFromRaw.
// The patterns are tested against the whole URL, not just the file name.
var EXCLUDED_PATTERNS = [
/favicon/i,
/\/icon/i,
/\/logo/i,
/tracking/i,
/pixel/i,
/1x1/i,
/social/i,
/facebook/i,
/twitter/i,
/instagram/i,
/linkedin/i,
/pinterest/i,
/youtube/i,
/flag/i,
/chat/i,
/badge/i,
/avatar/i,
/emoji/i,
/spinner/i,
/loading/i
];
// <img> attributes read as single image URLs, in this order.
var IMAGE_ATTRIBUTE_NAMES = [
"src",
"data-src",
"data-lazy-src",
"data-original",
"data-image",
"data-url"
];
// Attributes parsed as srcset lists on <img> and <picture><source> elements.
var SRCSET_ATTRIBUTE_NAMES = ["srcset", "data-srcset", "data-lazy-srcset"];
// <meta> selectors whose `content` attribute is read as an image URL.
var META_IMAGE_SELECTORS = [
'meta[property="og:image"]',
'meta[property="og:image:url"]',
'meta[name="twitter:image"]',
'meta[name="twitter:image:src"]',
'meta[itemprop="image"]'
];
// <link> selectors whose `href` attribute is read as an image URL.
var LINK_IMAGE_SELECTORS = [
'link[rel="image_src"]',
'link[rel="preload"][as="image"]'
];
/**
 * Normalizes a raw attribute value into an absolute URL without fragment.
 *
 * - Empty values and `data:` / `javascript:` URLs are rejected. The scheme
 *   check is case-insensitive because URL schemes are ("JavaScript:" parses
 *   to the same URL as "javascript:").
 * - Protocol-relative values ("//host/path") are resolved as https.
 * - Other values are resolved against `baseUrl` when one is given, otherwise
 *   they must already be absolute.
 *
 * @param {string|null|undefined} rawUrl - Value taken from the markup.
 * @param {string} [baseUrl] - Base for relative values.
 * @returns {string|null} The normalized URL, or `null` when the value is
 *   rejected or cannot be parsed.
 */
function normalizeCandidateUrl(rawUrl, baseUrl) {
  if (!rawUrl) {
    return null;
  }
  const candidate = rawUrl.trim();
  if (candidate.length === 0 || /^(?:data|javascript):/i.test(candidate)) {
    return null;
  }
  try {
    let normalized;
    if (candidate.startsWith("//")) {
      normalized = new URL(`https:${candidate}`);
    } else if (baseUrl) {
      normalized = new URL(candidate, baseUrl);
    } else {
      normalized = new URL(candidate);
    }
    normalized.hash = "";
    return normalized.href;
  } catch {
    return null;
  }
}
/**
 * Extracts the URLs from a `srcset` attribute value.
 *
 * Follows the candidate-splitting rules of the HTML srcset algorithm instead
 * of splitting on every comma: a URL is a run of non-whitespace characters,
 * so commas inside a URL (CDN transforms such as `w_400,h_300`, data URIs)
 * stay part of it. A comma ends a candidate only when it trails the URL
 * directly or follows the descriptors.
 *
 * @param {string} srcset - Raw attribute value.
 * @returns {string[]} The candidate URLs in document order, descriptors removed.
 */
function parseSrcsetUrls(srcset) {
  const input = srcset ?? "";
  const urls = [];
  let position = 0;
  while (position < input.length) {
    // Skip separators between candidates.
    while (position < input.length && /[\s,]/.test(input[position])) {
      position += 1;
    }
    if (position >= input.length) {
      break;
    }
    const start = position;
    while (position < input.length && !/\s/.test(input[position])) {
      position += 1;
    }
    let url = input.slice(start, position);
    if (url.endsWith(",")) {
      // "a.jpg, b.jpg": the comma terminates a candidate without descriptors.
      url = url.replace(/,+$/, "");
    } else {
      // Skip the descriptors up to the next comma outside parentheses.
      let depth = 0;
      while (position < input.length) {
        const char = input[position];
        if (char === "(") {
          depth += 1;
        } else if (char === ")" && depth > 0) {
          depth -= 1;
        } else if (char === "," && depth === 0) {
          break;
        }
        position += 1;
      }
    }
    if (url) {
      urls.push(url);
    }
  }
  return urls;
}
/**
 * Collects the arguments of every `url(...)` function in an inline style
 * string. Optional matching quotes around the argument are removed and the
 * value is trimmed; empty values are skipped.
 *
 * @param {string|null|undefined} style - Inline `style` attribute value.
 * @returns {string[]} The URL arguments in order of appearance.
 */
function extractCssUrls(style) {
  if (!style) {
    return [];
  }
  const urls = [];
  for (const match of style.matchAll(/url\((['"]?)(.*?)\1\)/g)) {
    const value = match[2]?.trim() ?? "";
    if (value) {
      urls.push(value);
    }
  }
  return urls;
}
/**
 * Appends `candidate` to `candidates` unless it is falsy or an entry with the
 * same `url` is already present.
 *
 * @param {Array<{url: string}>} candidates - List mutated in place.
 * @param {{url: string}|null|undefined} candidate - Entry to append.
 */
function pushCandidate(candidates, candidate) {
  const isNew = Boolean(candidate) && candidates.every((entry) => entry.url !== candidate.url);
  if (isNew) {
    candidates.push(candidate);
  }
}
/**
 * Builds an image candidate record from a raw URL.
 *
 * @param {string|null|undefined} url - Raw URL from the markup.
 * @param {string} source - Where the URL was found (e.g. "img", "raw").
 * @param {string} [baseUrl] - Base for relative URLs.
 * @param {object} [extra] - Additional fields; `url` and `source` override
 *   same-named keys in it.
 * @returns {object|null} The candidate, or `null` when the URL cannot be
 *   normalized.
 */
function createCandidate(url, source, baseUrl, extra = {}) {
  const normalizedUrl = normalizeCandidateUrl(url, baseUrl);
  return normalizedUrl
    ? { ...extra, url: normalizedUrl, source }
    : null;
}
/**
 * Scans raw HTML text for image URLs.
 *
 * URLs are de-duplicated by their part before "?" (first occurrence wins),
 * dropped when they match an excluded pattern, and kept only when they
 * contain `sourceDomain` or match a known CDN pattern.
 *
 * @param {string} html - Raw page source.
 * @param {string} [sourceDomain] - Domain of the page being scraped.
 * @returns {string[]} Matching URLs in order of appearance.
 */
function extractImageUrlsFromRaw(html, sourceDomain) {
  const seenBases = /* @__PURE__ */ new Set();
  const matchesAny = (patterns, value) => patterns.some((pattern) => pattern.test(value));
  return (html.match(IMAGE_URL_REGEX) ?? []).filter((url) => {
    const [base] = url.split("?");
    if (!base || seenBases.has(base)) {
      return false;
    }
    // A base is marked as seen even when the URL is rejected below.
    seenBases.add(base);
    if (matchesAny(EXCLUDED_PATTERNS, url)) {
      return false;
    }
    const isSameDomain = Boolean(sourceDomain && url.includes(sourceDomain));
    return isSameDomain || matchesAny(CDN_PATTERNS, url);
  });
}
/**
 * Parses HTML and collects image candidates from, in order: `<img>` elements
 * (single-URL attributes, then srcset entries), `<picture><source>` srcsets,
 * inline `background-image` styles, image `<meta>` tags and image `<link>`
 * tags. Candidates are de-duplicated by URL; the first occurrence wins.
 *
 * @param {string} html - Page markup.
 * @param {string} [baseUrl] - Base for relative URLs.
 * @returns {object[]} Candidate records with `url`, `source` and, for `<img>`
 *   and `<picture>`, additional metadata.
 */
function extractImageCandidatesFromHtml(html, baseUrl) {
  const $ = load(html);
  const candidates = [];
  const add = (candidate) => {
    pushCandidate(candidates, candidate);
  };
  // Normalized URLs from all srcset-style attributes of one element.
  const readSrcsetUrls = (node) =>
    SRCSET_ATTRIBUTE_NAMES.flatMap((attribute) =>
      parseSrcsetUrls(node.attr(attribute) ?? "")
        .map((url) => normalizeCandidateUrl(url, baseUrl))
        .filter((url) => Boolean(url))
    );
  // Missing, zero or non-numeric dimensions become undefined.
  const readDimension = (value) => Number.parseInt(value || "0", 10) || void 0;

  $("img").each((_, element) => {
    const node = $(element);
    const width = readDimension(node.attr("width"));
    const height = readDimension(node.attr("height"));
    const alt = node.attr("alt") ?? null;
    const srcsetCandidates = readSrcsetUrls(node);
    const details = {
      width,
      height,
      alt,
      srcset: srcsetCandidates.length > 0 ? srcsetCandidates : void 0
    };
    for (const attribute of IMAGE_ATTRIBUTE_NAMES) {
      add(createCandidate(node.attr(attribute), "img", baseUrl, details));
    }
    for (const srcsetUrl of srcsetCandidates) {
      add({
        url: srcsetUrl,
        source: "img",
        width,
        height,
        alt,
        srcset: srcsetCandidates
      });
    }
  });

  $("picture source").each((_, element) => {
    const urls = readSrcsetUrls($(element));
    for (const url of urls) {
      add({
        url,
        source: "picture",
        srcset: urls
      });
    }
  });

  $('[style*="background-image"]').each((_, element) => {
    for (const url of extractCssUrls($(element).attr("style"))) {
      add(createCandidate(url, "background", baseUrl));
    }
  });

  // Meta and link tags carry their URL in different attributes.
  const taggedSources = [
    [META_IMAGE_SELECTORS, "content"],
    [LINK_IMAGE_SELECTORS, "href"]
  ];
  for (const [selectors, attribute] of taggedSources) {
    for (const selector of selectors) {
      $(selector).each((_, element) => {
        add(createCandidate($(element).attr(attribute), "raw", baseUrl));
      });
    }
  }
  return candidates;
}
/**
 * Extracts image candidates from HTML, optionally adding URLs found in the
 * raw text and optionally ordering the result by reported size.
 *
 * @param {string} html - Page markup.
 * @param {object} [options] - `includeRaw` (default false) adds raw-text
 *   URLs whose query-less form is not yet present; `sortBySize` (default
 *   false) validates every http(s) candidate and sorts largest first;
 *   `sourceDomain`, `baseUrl` and `validate` (default `validateImageUrl`)
 *   are forwarded to the helpers.
 * @returns {Promise<object[]>} Candidate records. With `sortBySize`, http(s)
 *   candidates come first (largest reported size first), then all others.
 */
async function extractImageCandidates(html, options = {}) {
  const {
    includeRaw = false,
    sortBySize = false,
    sourceDomain,
    validate = validateImageUrl
  } = options;
  const withoutQuery = (value) => value.split("?")[0];
  const candidates = [...extractImageCandidatesFromHtml(html, options.baseUrl)];
  const seenUrls = new Set(candidates.map((candidate) => withoutQuery(candidate.url)));

  if (includeRaw) {
    for (const url of extractImageUrlsFromRaw(html, sourceDomain)) {
      const key = withoutQuery(url);
      if (key && !seenUrls.has(key)) {
        candidates.push({
          url,
          source: "raw"
        });
        seenUrls.add(key);
      }
    }
  }
  if (!sortBySize) {
    return candidates;
  }

  const httpCandidates = [];
  const otherCandidates = [];
  for (const candidate of candidates) {
    const bucket = candidate.url.startsWith("http") ? httpCandidates : otherCandidates;
    bucket.push(candidate);
  }
  const measured = await Promise.all(
    httpCandidates.map(async (candidate) => {
      const validation = await validate(candidate.url);
      return { candidate, size: validation.size ?? 0 };
    })
  );
  measured.sort((left, right) => right.size - left.size);
  return [...measured.map((entry) => entry.candidate), ...otherCandidates];
}
// src/scrape.ts
// Minimum declared width/height for a candidate to be kept by scrapePage when
// options.minSize is not given.
var DEFAULT_MIN_SIZE = 200;
// Page fetch timeout used when options.timeoutMs is nullish (ms).
var DEFAULT_FETCH_TIMEOUT_MS = 1e4;
// User-Agent sent by the default HTML fetcher when none is configured.
var DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; Srcfull/2.0)";
// Retries performed by the default HTML fetcher when options.retryCount is nullish.
var DEFAULT_RETRY_COUNT2 = 1;
// Base retry delay for the default HTML fetcher, multiplied by the attempt number.
var DEFAULT_RETRY_DELAY_MS2 = 500;
// Candidates whose URL matches any of these patterns are dropped by filterMainImages.
var LOGO_PATTERNS = [
/logo/i,
/icon/i,
/favicon/i,
/badge/i,
/sprite/i,
/thumbnail/i,
/avatar/i,
/social/i,
/button/i
];
/**
 * Creates an `AbortController` that aborts itself after `timeoutMs`.
 *
 * @param {number} timeoutMs - Delay before the automatic abort, in ms.
 * @returns {{controller: AbortController, timeoutId: *}} The controller and
 *   the timer handle; pass the handle to `clearTimeout` to cancel the abort.
 */
function createAbortController(timeoutMs) {
  const controller = new AbortController();
  const abortOnTimeout = () => controller.abort();
  return { controller, timeoutId: setTimeout(abortOnTimeout, timeoutMs) };
}
/**
 * Creates a function that downloads a page's HTML with timeout and retries.
 *
 * @param {object} [options] - Reads `timeoutMs` (floored, at least 1),
 *   `retryCount` and `retryDelayMs` (floored, at least 0), `userAgent`,
 *   `accept`, `headers` (spread after the Accept and User-Agent headers, so
 *   it can override them), `allowNonHtml` and `onDebug`.
 * @returns {(url: string) => Promise<{html: string, metadata: object}>}
 *   Fetcher. It throws when the URL fails `validatePublicUrl`, when the
 *   final response is not OK, when the content type is not HTML/XHTML (unless
 *   `allowNonHtml` is set), on timeout after the last attempt, and on
 *   non-retryable request errors.
 */
function createDefaultHtmlFetcher(options = {}) {
const timeoutMs = Math.max(
1,
Math.floor(options.timeoutMs ?? DEFAULT_FETCH_TIMEOUT_MS)
);
const retryCount = Math.max(
0,
Math.floor(options.retryCount ?? DEFAULT_RETRY_COUNT2)
);
const retryDelayMs = Math.max(
0,
Math.floor(options.retryDelayMs ?? DEFAULT_RETRY_DELAY_MS2)
);
// Blank or whitespace-only overrides fall back to the defaults.
const userAgent = options.userAgent?.trim() || DEFAULT_USER_AGENT;
const accept = options.accept?.trim() || "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
return async (url) => {
const validation = validatePublicUrl(url);
if (!validation.valid || !validation.url) {
throw new Error(validation.error ?? "Invalid page URL");
}
// One initial attempt plus `retryCount` retries, each with its own timer.
for (let attempt = 1; attempt <= retryCount + 1; attempt += 1) {
const { controller, timeoutId } = createAbortController(timeoutMs);
try {
const response = await fetch(validation.url.href, {
headers: {
Accept: accept,
"User-Agent": userAgent,
...options.headers ?? {}
},
signal: controller.signal
});
if (attempt <= retryCount && shouldRetryStatus(response.status)) {
emitDebug(options.onDebug, {
type: "fetch:retry",
message: `Page fetch returned ${response.status} for ${validation.url.href}`,
url: validation.url.href,
status: response.status,
attempt
});
await sleep(retryDelayMs * attempt);
continue;
}
// Errors thrown from here on are handled by the catch block below like
// any other error of this attempt.
if (!response.ok) {
throw new Error(`Failed to fetch page: ${response.status}`);
}
const contentType = response.headers.get("content-type") ?? "";
// A missing content type is accepted; only an explicit non-HTML type is rejected.
if (!options.allowNonHtml && contentType && !/text\/html|application\/xhtml\+xml/i.test(contentType)) {
throw new Error(`Expected HTML response but received ${contentType}`);
}
emitDebug(options.onDebug, {
type: "fetch:success",
message: `Fetched page HTML for ${validation.url.href}`,
url: validation.url.href,
attempt,
metadata: {
contentType,
status: response.status
}
});
return {
html: await response.text(),
metadata: {
fetcher: "default",
status: response.status,
contentType
}
};
} catch (error) {
// An AbortError is treated as a timeout of this attempt.
if (error instanceof Error && error.name === "AbortError") {
if (attempt <= retryCount) {
emitDebug(options.onDebug, {
type: "fetch:retry",
message: `Page fetch timed out for ${validation.url.href}`,
url: validation.url.href,
attempt,
error: error.message
});
await sleep(retryDelayMs * attempt);
continue;
}
throw new Error(`Timed out fetching page after ${timeoutMs}ms`);
}
if (attempt <= retryCount && isRetryableRequestError(error)) {
emitDebug(options.onDebug, {
type: "fetch:retry",
message: `Page fetch failed for ${validation.url.href}`,
url: validation.url.href,
attempt,
error: error instanceof Error ? error.message : String(error)
});
await sleep(retryDelayMs * attempt);
continue;
}
throw error;
} finally {
clearTimeout(timeoutId);
}
}
// Only reached when the loop body never runs (e.g. retryCount evaluates to NaN).
throw new RetryableStatusError(
503,
`Unable to fetch ${validation.url.href}`
);
};
}
// Ready-made fetcher built with all default options.
var defaultHtmlFetcher = createDefaultHtmlFetcher();
/**
 * Keeps the candidates that look like main content images.
 *
 * A candidate is dropped when its URL is a `data:` URL, when the URL matches
 * any `LOGO_PATTERNS` entry, or when a declared width or height is below
 * `minSize`. Candidates without declared dimensions pass the size check.
 *
 * @param {Array<{url: string, width?: number, height?: number}>} candidates
 * @param {number} minSize - Minimum declared width and height.
 * @returns {object[]} The surviving candidates, in input order.
 */
function filterMainImages(candidates, minSize) {
  const isTooSmall = (dimension) => Boolean(dimension) && dimension < minSize;
  return candidates.filter(({ url, width, height }) => {
    const isRejected =
      url.startsWith("data:") ||
      LOGO_PATTERNS.some((pattern) => pattern.test(url)) ||
      isTooSmall(width) ||
      isTooSmall(height);
    return !isRejected;
  });
}
/**
 * Looks up the size of an image URL.
 *
 * Uses `options.validate` when provided, otherwise `validateImageUrl` with the
 * debug and retry settings from `options`.
 *
 * @param {string} url - Image URL.
 * @param {object} options - Reads `validate`, `onDebug`, `retryCount` and
 *   `retryDelayMs`.
 * @returns {Promise<number|null>} The reported size, or `null` when the
 *   validator reports none.
 */
async function getImageSize(url, options) {
  let validation;
  if (options.validate) {
    validation = await options.validate(url);
  } else {
    const { onDebug, retryCount, retryDelayMs } = options;
    validation = await validateImageUrl(url, { onDebug, retryCount, retryDelayMs });
  }
  return validation.size ?? null;
}
/**
 * Scrapes a page for its main images and resolves each to the best source URL
 * that can be found.
 *
 * @param {string} url - Page URL; must pass `validatePublicUrl`.
 * @param {object} [options] - Reads `fetchHtml`, `validate`, `resolve`,
 *   `imageFallback`, `minSize`, `maxImages` (default 20),
 *   `resolveConcurrency` (default 5), `onDebug`, `retryCount` and
 *   `retryDelayMs`.
 * @returns {Promise<object>} `{ url, images, stats, metadata }`; `images` is
 *   sorted by `resolvedSize`, largest first.
 * @throws {Error} When the URL is rejected, the HTML fetch fails, or no image
 *   candidate survives filtering.
 */
async function scrapePage(url, options = {}) {
const start = Date.now();
const validation = validatePublicUrl(url);
if (!validation.valid) {
throw new Error(validation.error);
}
const fetchHtml = options.fetchHtml ?? createDefaultHtmlFetcher({
onDebug: options.onDebug,
retryCount: options.retryCount,
retryDelayMs: options.retryDelayMs
});
const htmlResult = await fetchHtml(url);
// Hostname without a leading "www." is used to recognise same-site raw URLs.
const sourceDomain = new URL(url).hostname.replace(/^www\./, "");
let candidates = filterMainImages(
await extractImageCandidates(htmlResult.html, {
includeRaw: true,
sortBySize: true,
baseUrl: url,
sourceDomain,
validate: options.validate
}),
options.minSize ?? DEFAULT_MIN_SIZE
);
let fallbackMetadata;
// The optional fallback is consulted only when the HTML yielded nothing.
if (candidates.length === 0 && options.imageFallback) {
const fallback = await options.imageFallback(url);
candidates = filterMainImages(
fallback.images,
options.minSize ?? DEFAULT_MIN_SIZE
);
fallbackMetadata = fallback.metadata;
}
if (candidates.length === 0) {
throw new Error("Failed to extract images");
}
emitDebug(options.onDebug, {
type: "scrape:candidates",
message: `Collected ${candidates.length} candidates for ${url}`,
url,
metadata: {
sourceDomain
}
});
const resolve = options.resolve ?? ((imageUrl) => resolveImageUrl(imageUrl, {
onDebug: options.onDebug,
retryCount: options.retryCount,
retryDelayMs: options.retryDelayMs
}));
// Only the first `maxImages` candidates are resolved.
const toResolve = candidates.slice(0, options.maxImages ?? 20);
// A limiter private to this call bounds concurrent resolutions.
const limit = createLimiter(options.resolveConcurrency ?? 5);
let resolved = 0;
let failed = 0;
const images = (await Promise.all(
toResolve.map(
(candidate) => limit(async () => {
try {
const result = await resolve(candidate.url);
// "fallback" means the resolver returned the input URL unchanged.
if (result.method !== "fallback") {
resolved += 1;
}
return {
original: result.original,
resolved: result.resolved,
originalSize: null,
resolvedSize: await getImageSize(result.resolved, options),
sizeIncrease: result.sizeIncrease ?? null,
alt: candidate.alt ?? null,
method: result.method
};
} catch (error) {
// A failing candidate is counted and skipped; it does not abort the scrape.
failed += 1;
emitDebug(options.onDebug, {
type: "scrape:resolve_failed",
message: `Failed to resolve ${candidate.url}`,
url: candidate.url,
error: error instanceof Error ? error.message : String(error)
});
return null;
}
})
)
)).filter((image) => image !== null);
images.sort(
(left, right) => (right.resolvedSize ?? 0) - (left.resolvedSize ?? 0)
);
return {
url,
images,
stats: {
// `found` counts all filtered candidates, not only those that were resolved.
found: candidates.length,
resolved,
failed,
returned: images.length,
durationMs: Date.now() - start
},
metadata: {
...htmlResult.metadata ?? {},
...fallbackMetadata ?? {}
}
};
}
export {
applyCuratedPattern,
matchCuratedPattern,
applyPattern,
createLimiter,
httpLimiter,
validateImageUrl,
generateProbeCandidates,
probeForSource,
resolveImageUrl,
extractImageUrlsFromRaw,
extractImageCandidatesFromHtml,
extractImageCandidates,
createDefaultHtmlFetcher,
defaultHtmlFetcher,
scrapePage
};
+79
-22

@@ -41,17 +41,27 @@ {

"fastly_io": {
"domain": ".io/",
"description": "Fastly Image Optimizer with path transforms",
"extractSource": {
"pattern": "^(https?://[^/]+/[^?]+?)(?:/(?:fit|resize|crop)-in/[^?]+)?(?:\\?.*)?$",
"replacement": "$1"
},
"domain": "fastly.io",
"description": "Fastly Image Optimizer with query parameter transforms",
"stripParams": [
"[?&]width=\\d+",
"[?&]height=\\d+",
"[?&]quality=\\d+",
"[?&]fit=[^&#]+",
"[?&]crop=[^&#]+",
"[?&]dpr=[\\d.]+",
"[?&]format=[^&#]+",
"[?&]bg-color=[^&#]+"
],
"confidence": "medium",
"examples": [
"https://images.example.io/image.jpg/resize-in/800x600?quality=80"
]
"examples": ["https://www.fastly.io/image.jpg?width=300"]
},
"shopify": {
"domain": "cdn.shopify.com",
"domain": "/cdn/shop/",
"domains": ["cdn.shopify.com", "/cdn/shop/"],
"description": "Shopify CDN",
"stripParams": ["v=\\d+"],
"stripParams": [
"[?&]v=[^&#]+",
"[?&]width=\\d+",
"[?&]originalWidth=\\d+",
"[?&]originalHeight=\\d+"
],
"stripSuffixes": [

@@ -68,3 +78,3 @@ "_small",

"examples": [
"https://cdn.shopify.com/s/files/1/0001/2345/products/image_800x800.jpg?v=1234"
"https://polinas-potent-potions.myshopify.com/cdn/shop/files/science-beakers-blue-light-new_large.jpg?v=1683744744"
]

@@ -109,12 +119,12 @@ },

"squarespace": {
"domain": "squarespace-cdn.com",
"description": "Squarespace CDN",
"extractSource": {
"pattern": "^(https?://[^/]+/content/[^?]+)(?:\\?.*)?$",
"replacement": "$1"
},
"stripParams": ["[?&]format=\\w+"],
"domain": "squarespace.com/static/",
"domains": [
"images.squarespace-cdn.com/content/",
"squarespace.com/static/"
],
"description": "Squarespace image CDN",
"stripParams": ["[?&]format=\\w+", "[?&]content-type=[^&#]+"],
"confidence": "medium",
"examples": [
"https://images.squarespace-cdn.com/content/v1/abc/123/image.jpg?format=1500w"
"https://static1.squarespace.com/static/5134cbefe4b0c6fb04df8065/t/69b2d4e7b86fcf5aa700ae16/1773327591839/2025-homepage-thumbnail.png?format=1500w"
]

@@ -124,2 +134,3 @@ },

"domain": "images.ctfassets.net",
"domains": ["images.ctfassets.net", "images-www.contentful.com"],
"description": "Contentful CDN",

@@ -141,3 +152,8 @@ "stripParams": [

"description": "Prismic CDN",
"stripParams": ["[?&]w=\\d+", "[?&]h=\\d+", "[?&]q=\\d+", "[?&]auto=\\w+"],
"stripParams": [
"[?&]w=\\d+",
"[?&]h=\\d+",
"[?&]q=\\d+",
"[?&]auto=[\\w,]+"
],
"confidence": "high",

@@ -155,2 +171,43 @@ "examples": [

},
"imagekit": {
"domain": "ik.imagekit.io",
"description": "ImageKit URL transformations",
"extractSource": {
"pattern": "^(https?://ik\\.imagekit\\.io/[^/]+)/(?:tr:[^/]+/)(.+)$",
"replacement": "$1/$2"
},
"confidence": "high",
"examples": [
"https://ik.imagekit.io/ikmedia/tr:w-200/docs_images/examples/example_fashion_1.jpg"
]
},
"storyblok": {
"domain": "a.storyblok.com",
"description": "Storyblok Image Service",
"extractSource": {
"pattern": "^(https?://a\\.storyblok\\.com/f/\\d+/\\d+x\\d+/[^/]+/[^/?]+)(?:/m/[^?]+)?(?:\\?.*)?$",
"replacement": "$1"
},
"confidence": "high",
"examples": [
"https://a.storyblok.com/f/212319/3174x2381/f525b9e092/demo-sunflower.png/m/500x300/filters:quality(80):brightness(-10)"
]
},
"builder": {
"domain": "cdn.builder.io/api/v1/image/",
"domains": ["cdn.builder.io/api/v1/image/", "builder.io/api/v1/image/"],
"description": "Builder.io Image API",
"stripParams": [
"[?&]width=\\d+",
"[?&]height=\\d+",
"[?&]quality=\\d+",
"[?&]fit=[^&#]+",
"[?&]position=[^&#]+",
"[?&]format=[^&#]+"
],
"confidence": "high",
"examples": [
"https://cdn.builder.io/api/v1/image/assets%2FYJIGb4i01jvw0SRdL5Bt%2F869bfbaec9c64415ae68235d9b7b1425?width=500"
]
},
"generic": {

@@ -163,3 +220,3 @@ "domain": "*",

"[?&]height=\\d+",
"[?&]resize=\\d+",
"[?&]resize=\\d+(?:,\\d+)?",
"[?&]size=\\d+",

@@ -166,0 +223,0 @@ "[?&]quality=\\d+",

@@ -5,3 +5,3 @@ #!/usr/bin/env node

scrapePage
} from "./chunk-GBQQ74YZ.js";
} from "./chunk-QHQAB7M5.js";
import {

@@ -21,3 +21,3 @@ createFirecrawlImageFallback

name: "srcfull",
version: "2.0.0",
version: "2.0.1",
description: "Image extraction and source-resolution toolkit for high-quality web images.",

@@ -61,3 +61,4 @@ type: "module",

typecheck: "tsc --noEmit",
test: "vitest run"
test: "vitest run",
"test:live-patterns": "SRCFULL_LIVE_TESTS=1 vitest run test/live-curated-patterns.test.ts"
},

@@ -64,0 +65,0 @@ dependencies: {

@@ -17,2 +17,3 @@ import { E as ExtractImageOptions, c as ImageCandidate, V as ValidateImageUrlOptions, d as ValidationResult, e as ResolveImageOptions, f as ResolveResult, g as DefaultHtmlFetcherOptions, h as HtmlFetchResult, S as ScrapePageOptions, i as ScrapePageResult } from './types-DA2o6fnF.js';

domain: string;
domains?: string[];
description?: string;

@@ -27,2 +28,3 @@ extractSource?: {

};
declare function applyCuratedPattern(url: string, patternName: string): string | null;
declare function matchCuratedPattern(url: string): string | null;

@@ -57,2 +59,2 @@ declare function applyPattern(url: string, pattern: {

export { type CuratedPattern, DefaultHtmlFetcherOptions, ExtractImageOptions, HtmlFetchResult, ImageCandidate, type ProbeResult, type PublicUrlValidation, ResolveImageOptions, ResolveResult, ScrapePageOptions, ScrapePageResult, ValidateImageUrlOptions, ValidationResult, applyPattern, createDefaultHtmlFetcher, createLimiter, defaultHtmlFetcher, extractImageCandidates, extractImageCandidatesFromHtml, extractImageUrlsFromRaw, generateProbeCandidates, httpLimiter, matchCuratedPattern, probeForSource, resolveImageUrl, scrapePage, validateImageUrl, validatePublicUrl };
export { type CuratedPattern, DefaultHtmlFetcherOptions, ExtractImageOptions, HtmlFetchResult, ImageCandidate, type ProbeResult, type PublicUrlValidation, ResolveImageOptions, ResolveResult, ScrapePageOptions, ScrapePageResult, ValidateImageUrlOptions, ValidationResult, applyCuratedPattern, applyPattern, createDefaultHtmlFetcher, createLimiter, defaultHtmlFetcher, extractImageCandidates, extractImageCandidatesFromHtml, extractImageUrlsFromRaw, generateProbeCandidates, httpLimiter, matchCuratedPattern, probeForSource, resolveImageUrl, scrapePage, validateImageUrl, validatePublicUrl };
import {
applyCuratedPattern,
applyPattern,

@@ -16,3 +17,3 @@ createDefaultHtmlFetcher,

validateImageUrl
} from "./chunk-GBQQ74YZ.js";
} from "./chunk-QHQAB7M5.js";
import {

@@ -36,2 +37,3 @@ createFirecrawlImageFallback

export {
applyCuratedPattern,
applyPattern,

@@ -38,0 +40,0 @@ createDefaultHtmlFetcher,

{
"name": "srcfull",
"version": "2.0.0",
"version": "2.0.1",
"description": "Image extraction and source-resolution toolkit for high-quality web images.",

@@ -42,3 +42,4 @@ "type": "module",

"typecheck": "tsc --noEmit",
"test": "vitest run"
"test": "vitest run",
"test:live-patterns": "SRCFULL_LIVE_TESTS=1 vitest run test/live-curated-patterns.test.ts"
},

@@ -45,0 +46,0 @@ "dependencies": {

@@ -134,4 +134,7 @@ # Srcfull

pnpm test
pnpm test:live-patterns
pnpm typecheck
pnpm build
```
`pnpm test:live-patterns` revalidates the researched real-world CDN fixtures in `test/fixtures/curated-patterns.json` against the network.
import {
RetryableStatusError,
emitDebug,
isRetryableRequestError,
shouldRetryStatus,
sleep,
validatePublicUrl
} from "./chunk-PUJK33LY.js";
// data/patterns.json
// Curated CDN rewrite rules, keyed by rule name.
//
// How the matcher in this file reads an entry:
//   - `domain`        substring that must occur anywhere in the URL
//                     (checked with String.prototype.includes, not a host match).
//   - `extractSource` { pattern, replacement } regex rewrite; tried first.
//   - `stripParams`   regex sources removed globally from the URL; tried second.
//   - `stripSuffixes` regex sources removed when directly followed by the file
//                     extension at the end of the URL; tried last.
// Entries are tried in declaration order and the first one that changes the
// URL wins, so reordering keys changes behavior. The "generic" entry is
// skipped by matchCuratedPattern.
// `description`, `confidence`, `notes` and `examples` are not read by the code
// visible in this part of the file.
var patterns_default = {
conde_nast: {
domain: "media.",
description: "Cond\xE9 Nast media platform (House & Garden, Vogue, GQ, Wired, Vanity Fair, etc.)",
extractSource: {
pattern: "^(https?://media\\.[^/]+/photos/[a-f0-9]{24})(?:/.*)?$",
replacement: "$1/master/w_2560,c_limit/image.jpg"
},
confidence: "high",
notes: "Raw ID returns 503; must use transformation params. master/w_2560 gives highest quality.",
examples: [
"https://media.houseandgarden.co.uk/photos/63e509b43404638ef031982b/1:1/w_1600%2Cc_limit/filename.jpg",
"https://media.vogue.com/photos/5f3e0c5e9c1a7b2d3c4e5f6a/4:3/w_800/image.jpg"
]
},
cloudinary_path: {
domain: "res.cloudinary.com",
description: "Cloudinary with path-based transformations",
extractSource: {
pattern: "^(https?://res\\.cloudinary\\.com/[^/]+/image/upload)/[^/]+/(.+)$",
replacement: "$1/$2"
},
confidence: "high",
examples: [
"https://res.cloudinary.com/demo/image/upload/w_400,h_300,c_fill/v1234/sample.jpg"
]
},
cloudflare_images: {
domain: "cdn-cgi/image",
description: "Cloudflare Image Resizing",
extractSource: {
pattern: "^https?://[^/]+/cdn-cgi/image/[^/]+/(.+)$",
replacement: "$1"
},
confidence: "high",
examples: [
"https://example.com/cdn-cgi/image/width=800,quality=80/https://example.com/original.jpg"
]
},
// NOTE(review): ".io/" is a substring test, so this entry is also considered
// for later ".io" hosts in this table (cdn.sanity.io, images.prismic.io).
fastly_io: {
domain: ".io/",
description: "Fastly Image Optimizer with path transforms",
extractSource: {
pattern: "^(https?://[^/]+/[^?]+?)(?:/(?:fit|resize|crop)-in/[^?]+)?(?:\\?.*)?$",
replacement: "$1"
},
confidence: "medium",
examples: [
"https://images.example.io/image.jpg/resize-in/800x600?quality=80"
]
},
shopify: {
domain: "cdn.shopify.com",
description: "Shopify CDN",
// Unlike the other entries this pattern has no leading [?&], so the
// separator is left behind and cleaned up afterwards by tryStripParams.
stripParams: ["v=\\d+"],
stripSuffixes: [
"_small",
"_medium",
"_large",
"_grande",
"_master",
"_\\d+x\\d*",
"_\\d*x\\d+"
],
confidence: "high",
examples: [
"https://cdn.shopify.com/s/files/1/0001/2345/products/image_800x800.jpg?v=1234"
]
},
sanity: {
domain: "cdn.sanity.io",
description: "Sanity.io CDN",
// Removes the entire query string.
stripParams: ["\\?.*"],
confidence: "high",
examples: [
"https://cdn.sanity.io/images/project/production/abc123-800x600.jpg?w=400"
]
},
imgix: {
domain: ".imgix.net",
description: "imgix image CDN",
stripParams: [
"[?&]w=\\d+",
"[?&]h=\\d+",
"[?&]fit=\\w+",
"[?&]auto=[\\w,]+",
"[?&]q=\\d+",
"[?&]fm=\\w+",
"[?&]dpr=[\\d.]+",
"[?&]crop=[\\w,]+"
],
confidence: "high",
examples: [
"https://assets.imgix.net/image.jpg?w=800&h=600&fit=crop&auto=format"
]
},
wordpress: {
domain: "wp-content/uploads",
description: "WordPress media uploads",
stripSuffixes: ["-\\d+x\\d+", "-scaled"],
confidence: "medium",
examples: [
"https://example.com/wp-content/uploads/2024/01/image-800x600.jpg"
]
},
squarespace: {
domain: "squarespace-cdn.com",
description: "Squarespace CDN",
extractSource: {
pattern: "^(https?://[^/]+/content/[^?]+)(?:\\?.*)?$",
replacement: "$1"
},
stripParams: ["[?&]format=\\w+"],
confidence: "medium",
examples: [
"https://images.squarespace-cdn.com/content/v1/abc/123/image.jpg?format=1500w"
]
},
contentful: {
domain: "images.ctfassets.net",
description: "Contentful CDN",
stripParams: [
"[?&]w=\\d+",
"[?&]h=\\d+",
"[?&]q=\\d+",
"[?&]fm=\\w+",
"[?&]fit=\\w+"
],
confidence: "high",
examples: [
"https://images.ctfassets.net/space/asset.jpg?w=800&h=600&fm=webp"
]
},
prismic: {
domain: "images.prismic.io",
description: "Prismic CDN",
// NOTE(review): `auto=\\w+` stops at a comma, unlike imgix's `[\\w,]+` above.
stripParams: ["[?&]w=\\d+", "[?&]h=\\d+", "[?&]q=\\d+", "[?&]auto=\\w+"],
confidence: "high",
examples: [
"https://images.prismic.io/project/image.jpg?w=800&h=600&auto=format"
]
},
bunny_cdn: {
domain: "b-cdn.net",
description: "Bunny CDN",
stripParams: ["[?&]width=\\d+", "[?&]height=\\d+", "[?&]quality=\\d+"],
confidence: "medium",
examples: ["https://example.b-cdn.net/image.jpg?width=800&height=600"]
},
// Skipped by name in matchCuratedPattern.
generic: {
domain: "*",
stripParams: [
"[?&]w=\\d+",
"[?&]h=\\d+",
"[?&]width=\\d+",
"[?&]height=\\d+",
"[?&]resize=\\d+",
"[?&]size=\\d+",
"[?&]quality=\\d+",
"[?&]q=\\d+"
],
confidence: "low",
description: "Common resizing parameters to try stripping"
}
};
// src/pattern-matcher.ts
// Rule table iterated by matchCuratedPattern.
var curatedPatterns = patterns_default;
// Clean-up regexes applied by tryStripParams after parameters were removed:
// "?&" left behind when a parameter right after "?" was removed without its separator.
var FIX_QUERY_PARAM_SEPARATORS_REGEX = /\?&/g;
// Runs of "&" left behind when a parameter was removed without its separator.
var FIX_DUPLICATE_AMPERSANDS_REGEX = /&&+/g;
// Dangling "?" / "&" at the very end of the URL.
var STRIP_TRAILING_QUERY_SEPARATORS_REGEX = /[?&]+$/g;
/**
 * Rewrite a URL with a curated `{ pattern, replacement }` regex rule.
 *
 * @param {string} url - URL to rewrite.
 * @param {{pattern: string, replacement: string}} extractSource - Regex source and replacement template.
 * @returns {string|null} The rewritten URL, or `null` when the pattern does not
 *   match or cannot be compiled.
 */
function tryExtractSource(url, extractSource) {
  try {
    const { pattern, replacement } = extractSource;
    const matcher = new RegExp(pattern);
    if (!matcher.test(url)) {
      return null;
    }
    return url.replace(matcher, replacement);
  } catch {
    // An invalid rule is treated the same as a rule that does not match.
    return null;
  }
}
/**
 * Remove resize-style query parameters from a URL.
 *
 * Each entry of `stripParams` is a regex source that is removed globally.
 * Afterwards the query separators are repaired so the result is still a
 * well-formed URL.
 *
 * @param {string} url - URL to clean.
 * @param {string[]} stripParams - Regex sources to delete from the URL.
 * @returns {string|null} The cleaned URL, or `null` when nothing changed.
 */
function tryStripParams(url, stripParams) {
  let cleanUrl = url;
  for (const param of stripParams) {
    try {
      cleanUrl = cleanUrl.replace(new RegExp(param, "g"), "");
    } catch {
      // Best effort: an invalid regex source is skipped, the others still apply.
    }
  }
  // Patterns of the form "[?&]name=..." swallow the leading "?" when they
  // remove the first parameter, leaving the next one attached with "&"
  // ("img.jpg?w=1&s=x" -> "img.jpg&s=x"). Restore the "?" at the position
  // where the original query started, provided the path before it is intact.
  const queryStart = url.indexOf("?");
  if (
    queryStart !== -1 &&
    cleanUrl.charAt(queryStart) === "&" &&
    cleanUrl.startsWith(url.slice(0, queryStart))
  ) {
    cleanUrl = `${cleanUrl.slice(0, queryStart)}?${cleanUrl.slice(queryStart + 1)}`;
  }
  cleanUrl = cleanUrl
    .replace(FIX_QUERY_PARAM_SEPARATORS_REGEX, "?")
    .replace(FIX_DUPLICATE_AMPERSANDS_REGEX, "&")
    .replace(STRIP_TRAILING_QUERY_SEPARATORS_REGEX, "");
  return cleanUrl === url ? null : cleanUrl;
}
/**
 * Remove size suffixes that sit directly before the file extension at the
 * end of a URL (for example `image_800x800.jpg` -> `image.jpg`).
 *
 * @param {string} url - URL to clean.
 * @param {string[]} stripSuffixes - Regex sources, each tried in order.
 * @returns {string|null} The cleaned URL, or `null` when nothing changed.
 */
function tryStripSuffixes(url, stripSuffixes) {
  const stripped = Array.from(stripSuffixes).reduce((current, suffix) => {
    try {
      // The extension is captured and put back; only the suffix is dropped.
      return current.replace(new RegExp(`${suffix}(\\.\\w+)$`), "$1");
    } catch {
      // An unusable suffix pattern is skipped.
      return current;
    }
  }, url);
  return stripped !== url ? stripped : null;
}
/**
 * Apply one curated rule to a URL.
 *
 * `extractSource` is a complete rewrite and wins outright when it matches.
 * Otherwise `stripParams` and `stripSuffixes` are applied one after the
 * other, so a URL carrying both a query parameter and a size suffix
 * (e.g. `image_800x800.jpg?v=1234`) has both removed. Previously the function
 * returned as soon as `stripParams` changed the URL, leaving the suffix.
 *
 * @param {string} url - URL to rewrite.
 * @param {object} pattern - Curated rule with optional `extractSource`,
 *   `stripParams` and `stripSuffixes`.
 * @returns {string|null} The rewritten URL, or `null` when the rule changed nothing.
 */
function tryApplyCuratedPattern(url, pattern) {
  if (pattern.extractSource) {
    const extracted = tryExtractSource(url, pattern.extractSource);
    if (extracted) {
      return extracted;
    }
  }
  let current = url;
  if (pattern.stripParams) {
    // `||` keeps the previous value for both "no change" (null) and an empty result.
    current = tryStripParams(current, pattern.stripParams) || current;
  }
  if (pattern.stripSuffixes) {
    current = tryStripSuffixes(current, pattern.stripSuffixes) || current;
  }
  return current === url ? null : current;
}
/**
 * Find the first curated rule that rewrites the given URL.
 *
 * Rules are visited in table order. The "generic" rule is never used here,
 * and a rule is only tried when its `domain` string occurs in the URL.
 *
 * @param {string} url - Image URL to rewrite.
 * @returns {string|null} The rewritten URL, or `null` when no rule applies.
 */
function matchCuratedPattern(url) {
  for (const name of Object.keys(curatedPatterns)) {
    const rule = curatedPatterns[name];
    if (name === "generic" || !url.includes(rule.domain)) {
      continue;
    }
    const rewritten = tryApplyCuratedPattern(url, rule);
    if (rewritten) {
      return rewritten;
    }
  }
  return null;
}
/**
 * Apply a learned pattern (`matchRegex` / `transform`) to a URL.
 *
 * @param {string} url - URL to rewrite.
 * @param {{matchRegex: string, transform: string}} pattern - Regex source and replacement template.
 * @returns {string|null} The rewritten URL, or `null` when the regex does not
 *   match or cannot be compiled.
 */
function applyPattern(url, pattern) {
  try {
    const { matchRegex, transform } = pattern;
    const matcher = new RegExp(matchRegex);
    if (!matcher.test(url)) {
      return null;
    }
    return url.replace(matcher, transform);
  } catch {
    // A broken stored pattern is treated as "no match".
    return null;
  }
}
// src/concurrency.ts
/**
 * Build a concurrency limiter.
 *
 * The returned `limit(fn)` queues `fn` and returns a promise that settles
 * with `fn`'s result. At most `concurrency` queued functions run at once;
 * the value is floored and anything below 1 (or not a number) becomes 1.
 *
 * @param {number} concurrency - Maximum number of functions in flight.
 * @returns {(fn: () => any) => Promise<any>} The limiting wrapper.
 */
function createLimiter(concurrency) {
  const maxRunning = Math.max(1, Math.floor(concurrency) || 1);
  const pending = [];
  let running = 0;

  // Start queued jobs until the limit is reached or the queue is empty.
  const pump = () => {
    while (running < maxRunning && pending.length > 0) {
      const job = pending.shift();
      if (!job) {
        continue;
      }
      void execute(job);
    }
  };

  // Run one job, settle its promise, then free the slot and refill.
  const execute = async (job) => {
    running += 1;
    try {
      const value = await job.fn();
      job.resolve(value);
    } catch (error) {
      job.reject(error);
    } finally {
      running -= 1;
      pump();
    }
  };

  return function limit(fn) {
    return new Promise((resolve, reject) => {
      pending.push({ fn, resolve, reject });
      pump();
    });
  };
}
// Module-wide limiter (5 at a time); validateImageUrl runs its requests through it.
var httpLimiter = createLimiter(5);
// src/validator.ts
// Per-attempt timeout for a validation request, in milliseconds.
var REQUEST_TIMEOUT_MS = 5e3;
// User-Agent header sent with validation requests.
var USER_AGENT = "Mozilla/5.0 (compatible; Srcfull/2.0)";
// Defaults used by requestImage when the caller gives no retry settings.
var DEFAULT_RETRY_COUNT = 1;
var DEFAULT_RETRY_DELAY_MS = 500;
/**
 * Read the total resource size in bytes from a response's headers.
 *
 * A `Content-Range` header is consulted first: on a ranged response (this
 * module's GET fallback sends `Range: bytes=0-0`) `Content-Length` is only
 * the length of the returned slice, while the total is the part after "/"
 * in `Content-Range`. Without `Content-Range`, `Content-Length` is the size.
 *
 * @param {{headers: {get(name: string): string|null}}} response - Fetch-style response.
 * @returns {number|undefined} Total size, or `undefined` when it is unknown
 *   (no usable header, or a `Content-Range` total of `*`).
 */
function parseSize(response) {
  const contentRange = response.headers.get("content-range");
  if (contentRange) {
    // e.g. "bytes 0-0/12345" -> 12345; "bytes 0-0/*" -> unknown.
    const total = Number.parseInt(contentRange.split("/")[1] ?? "", 10);
    return Number.isFinite(total) ? total : void 0;
  }
  const contentLength = response.headers.get("content-length");
  if (!contentLength) {
    return void 0;
  }
  const length = Number.parseInt(contentLength, 10);
  return Number.isFinite(length) ? length : void 0;
}
/**
 * Issue one HEAD or GET request for an image URL, retrying transient failures.
 *
 * @param {string} url - URL to request.
 * @param {string} method - "HEAD" or "GET"; GET adds a `Range: bytes=0-0` header.
 * @param {object} options - `retryCount`, `retryDelayMs`, `onDebug`.
 * @returns {Promise<Response>} The last response received (it may be a non-2xx one).
 * @throws Rethrows the request error when it is not retryable or no attempts remain.
 */
async function requestImage(url, method, options) {
// Retry settings are floored and clamped to be non-negative.
const retryCount = Math.max(
0,
Math.floor(options.retryCount ?? DEFAULT_RETRY_COUNT)
);
const retryDelayMs = Math.max(
0,
Math.floor(options.retryDelayMs ?? DEFAULT_RETRY_DELAY_MS)
);
// One initial attempt plus `retryCount` retries.
for (let attempt = 1; attempt <= retryCount + 1; attempt += 1) {
// Each attempt gets its own abort timer of REQUEST_TIMEOUT_MS.
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
try {
const response = await fetch(url, {
method,
headers: {
Accept: "image/*",
"User-Agent": USER_AGENT,
...method === "GET" ? { Range: "bytes=0-0" } : {}
},
signal: controller.signal
});
// Retry a retryable status only while attempts remain; on the last
// attempt the response is returned whatever its status.
if (attempt <= retryCount && shouldRetryStatus(response.status)) {
emitDebug(options.onDebug, {
type: "validate:retry",
message: `${method} returned ${response.status} for ${url}`,
url,
method,
status: response.status,
attempt
});
// The delay grows linearly with the attempt number.
await sleep(retryDelayMs * attempt);
continue;
}
return response;
} catch (error) {
if (!isRetryableRequestError(error) || attempt > retryCount) {
throw error;
}
emitDebug(options.onDebug, {
type: "validate:retry",
message: `${method} failed for ${url}`,
url,
method,
attempt,
error: error instanceof Error ? error.message : String(error)
});
await sleep(retryDelayMs * attempt);
} finally {
// Runs on return, continue and throw alike, so the timer never outlives its attempt.
clearTimeout(timeoutId);
}
}
// The final loop iteration always returns or throws, so this line only
// serves as a guard should the loop ever end without doing either.
throw new RetryableStatusError(503, `Unable to validate ${url}`);
}
/**
 * Check that a URL is publicly reachable and serves an image.
 *
 * HEAD is tried first, then GET; the first method that returns a 2xx
 * response with an `image/*` content type wins. Never throws for request
 * problems: every failure is reported through `onDebug` and returned as
 * `{ valid: false }`.
 *
 * @param {string} url - URL to validate.
 * @param {object} [options] - `onDebug`, `retryCount`, `retryDelayMs`.
 * @returns {Promise<{valid: boolean, contentType?: string, size?: number}>}
 */
async function validateImageUrl(url, options = {}) {
const publicUrl = validatePublicUrl(url);
const safeUrl = publicUrl.url?.href;
if (!publicUrl.valid || !safeUrl) {
emitDebug(options.onDebug, {
type: "validate:rejected",
message: publicUrl.error ?? `Rejected ${url}`,
url
});
return { valid: false };
}
// All network work goes through the shared module-level limiter.
return httpLimiter(async () => {
try {
for (const method of ["HEAD", "GET"]) {
const response = await requestImage(safeUrl, method, options);
if (!response.ok) {
emitDebug(options.onDebug, {
type: "validate:status",
message: `${method} returned ${response.status} for ${safeUrl}`,
url: safeUrl,
method,
status: response.status
});
// A failed HEAD falls through to GET.
continue;
}
const contentType = response.headers.get("content-type") ?? "";
if (!contentType.startsWith("image/")) {
emitDebug(options.onDebug, {
type: "validate:content_type",
message: `${method} returned non-image content for ${safeUrl}`,
url: safeUrl,
method,
metadata: {
contentType
}
});
continue;
}
emitDebug(options.onDebug, {
type: "validate:success",
message: `${method} validated ${safeUrl}`,
url: safeUrl,
method,
metadata: {
contentType,
size: parseSize(response)
}
});
return {
valid: true,
contentType,
size: parseSize(response)
};
}
// Neither method produced an acceptable image response.
emitDebug(options.onDebug, {
type: "validate:failed",
message: `Validation failed for ${safeUrl}`,
url: safeUrl
});
return { valid: false };
} catch (error) {
// Request errors (including ones rethrown by requestImage) become "invalid".
emitDebug(options.onDebug, {
type: "validate:error",
message: `Validation threw for ${safeUrl}`,
url: safeUrl,
error: error instanceof Error ? error.message : String(error)
});
return { valid: false };
}
});
}
// src/prober.ts
// Matches a whole path segment of the form "<digits>:<digits>" (e.g. "1:1", "4:3").
var ASPECT_RATIO_SEGMENT_REGEX = /^\d+:\d+$/;
// Query parameter names deleted by stripResizeParamsCandidate.
var RESIZE_PARAMS = [
"w",
"h",
"width",
"height",
"size",
"resize",
"q",
"quality",
"fit",
"crop",
"auto",
"fm",
"format",
"dpr",
"scale",
"blur",
"sharp"
];
// Path endings treated as size suffixes by stripSizeSuffixCandidates; group 1
// captures the file extension so it can be kept.
var SIZE_SUFFIXES = [
/_\d+x\d+(\.\w+)$/,
/-\d+x\d+(\.\w+)$/,
/_(?:small|medium|large|thumb|thumbnail)(\.\w+)$/i,
/-(?:small|medium|large|thumb|thumbnail)(\.\w+)$/i
];
// Segment names substituted for aspect-ratio path segments by pathVariantCandidates.
var PATH_VARIANTS = ["master", "original", "full", "source", "raw"];
/**
 * Append `url` to `candidates` unless it is empty/missing or already listed.
 *
 * @param {string[]} candidates - List that is extended in place.
 * @param {string|null|undefined} url - Candidate URL.
 */
function pushUnique(candidates, url) {
  if (!url) {
    return;
  }
  if (candidates.includes(url)) {
    return;
  }
  candidates.push(url);
}
/**
 * Append every URL from `urls` to `candidates`, skipping empties and duplicates.
 *
 * @param {string[]} candidates - List that is extended in place.
 * @param {Iterable<string|null|undefined>} urls - URLs to add, in order.
 */
function pushManyUnique(candidates, urls) {
  Array.from(urls).forEach((entry) => {
    pushUnique(candidates, entry);
  });
}
/**
 * Build a probe candidate with all known resize parameters removed.
 *
 * Returns `null` when the URL carries none of `RESIZE_PARAMS`. The previous
 * implementation compared the re-serialised `href` against the raw input, so
 * a URL that merely was not in normalised form (e.g. `https://example.com`
 * -> `https://example.com/`) produced a pointless candidate even though
 * nothing had been stripped.
 *
 * @param {string} originalUrl - Absolute URL to derive the candidate from.
 * @returns {string|null} The stripped URL, or `null` when nothing was removed.
 * @throws {TypeError} When `originalUrl` is not a valid absolute URL.
 */
function stripResizeParamsCandidate(originalUrl) {
  const strippedUrl = new URL(originalUrl);
  let removedAny = false;
  for (const param of RESIZE_PARAMS) {
    if (strippedUrl.searchParams.has(param)) {
      strippedUrl.searchParams.delete(param);
      removedAny = true;
    }
  }
  if (!removedAny) {
    return null;
  }
  return strippedUrl.href === originalUrl ? null : strippedUrl.href;
}
/**
 * Build probe candidates that request a 2560-wide rendition.
 *
 * For each width parameter present (`w`, then `width`) a copy of the URL is
 * produced with that parameter set to "2560" and the matching height
 * parameter (`h` / `height`) removed.
 *
 * @param {URL} urlObj - Parsed source URL (not modified).
 * @returns {string[]} Zero, one or two candidate URLs.
 */
function largerDimensionCandidates(urlObj) {
  const dimensionPairs = [
    ["w", "h"],
    ["width", "height"],
  ];
  return dimensionPairs
    .filter(([widthKey]) => urlObj.searchParams.has(widthKey))
    .map(([widthKey, heightKey]) => {
      const enlarged = new URL(urlObj.href);
      enlarged.searchParams.set(widthKey, "2560");
      enlarged.searchParams.delete(heightKey);
      return enlarged.href;
    });
}
/**
 * Build a probe candidate consisting of origin and path only.
 *
 * @param {URL} urlObj - Parsed source URL.
 * @param {string} originalUrl - The URL string the candidate is compared against.
 * @returns {string|null} Origin + pathname, or `null` when that equals `originalUrl`.
 */
function noQueryCandidate(urlObj, originalUrl) {
  const { origin, pathname } = urlObj;
  const bare = `${origin}${pathname}`;
  if (bare === originalUrl) {
    return null;
  }
  return bare;
}
/**
 * Build probe candidates with a size suffix removed from the file name.
 *
 * One candidate is produced per entry of `SIZE_SUFFIXES` that matches the
 * path; the file extension and the query string are kept.
 *
 * @param {URL} urlObj - Parsed source URL.
 * @returns {string[]} Candidate URLs, in `SIZE_SUFFIXES` order.
 */
function stripSizeSuffixCandidates(urlObj) {
  const { origin, pathname, search } = urlObj;
  return SIZE_SUFFIXES.flatMap((suffixPattern) => {
    const hit = pathname.match(suffixPattern);
    if (!hit) {
      return [];
    }
    // hit[1] is the captured extension, which replaces "suffix + extension".
    const trimmedPath = pathname.replace(suffixPattern, hit[1]);
    return [origin + trimmedPath + search];
  });
}
/**
 * Build probe candidates by replacing aspect-ratio path segments
 * (such as "1:1") with each name in `PATH_VARIANTS`.
 *
 * @param {URL} urlObj - Parsed source URL.
 * @returns {string[]} One candidate per variant that changes the path;
 *   empty when the path has no aspect-ratio segment.
 */
function pathVariantCandidates(urlObj) {
  const segments = urlObj.pathname.split("/");
  const unchangedPath = segments.join("/");
  const results = [];
  PATH_VARIANTS.forEach((variant) => {
    const swappedPath = segments
      .map((segment) => (ASPECT_RATIO_SEGMENT_REGEX.test(segment) ? variant : segment))
      .join("/");
    if (swappedPath !== unchangedPath) {
      results.push(urlObj.origin + swappedPath + urlObj.search);
    }
  });
  return results;
}
/**
 * List alternative URLs that might serve a larger version of an image.
 *
 * Candidates come, in this order, from: stripping resize parameters, asking
 * for a larger width, dropping the query string, removing size suffixes, and
 * swapping aspect-ratio path segments. Duplicates and empty results are
 * dropped while preserving first-seen order.
 *
 * @param {string} url - Image URL to derive candidates from.
 * @returns {string[]} Candidate URLs; empty when `url` cannot be parsed or
 *   any generator throws.
 */
function generateProbeCandidates(url) {
  try {
    const parsed = new URL(url);
    const proposals = [
      stripResizeParamsCandidate(url),
      ...largerDimensionCandidates(parsed),
      noQueryCandidate(parsed, url),
      ...stripSizeSuffixCandidates(parsed),
      ...pathVariantCandidates(parsed),
    ];
    const unique = [];
    for (const proposal of proposals) {
      pushUnique(unique, proposal);
    }
    return unique;
  } catch {
    return [];
  }
}
/**
 * Probe generated candidate URLs one after another and keep the largest
 * valid one.
 *
 * @param {string} originalUrl - URL whose alternatives are probed.
 * @param {number} [originalSize=0] - Size a candidate has to exceed.
 * @param {Function} [validate=validateImageUrl] - Async validator returning `{ valid, size }`.
 * @returns {Promise<{url: string, size: number, method: "probed"}|null>}
 *   The best candidate, or `null` when none beats the original.
 */
async function probeForSource(originalUrl, originalSize = 0, validate = validateImageUrl) {
  const winner = { url: originalUrl, size: originalSize };
  for (const candidate of generateProbeCandidates(originalUrl)) {
    const { valid, size } = await validate(candidate);
    const isImprovement = valid && size && size > winner.size;
    if (isImprovement) {
      winner.url = candidate;
      winner.size = size;
    }
  }
  if (winner.url === originalUrl) {
    return null;
  }
  return { url: winner.url, size: winner.size, method: "probed" };
}
// src/resolve.ts
/**
 * Format how much larger the resolved image is than the original.
 *
 * @param {number|undefined} original - Original size.
 * @param {number|undefined} resolved - Resolved size.
 * @returns {string|undefined} Ratio with one decimal and an "x" suffix
 *   (e.g. "2.5x"), or `undefined` when either size is missing or zero.
 */
function calculateSizeIncrease(original, resolved) {
  if (!original || !resolved) {
    return void 0;
  }
  const ratio = resolved / original;
  return `${ratio.toFixed(1)}x`;
}
/**
 * Store a resolved URL in the optional cache (best effort).
 *
 * Does nothing, and emits no debug event, when no cache is configured;
 * previously a "cache:write" event was emitted even though nothing was
 * written. A failing `cache.set` is reported as "cache:write_failed" instead
 * of being thrown, in line with how cache reads and the pattern store are
 * handled in this module: a cache problem must not discard an otherwise
 * successful resolution.
 *
 * @param {string} original - URL that was resolved (cache key).
 * @param {string} resolved - Resolved URL (cache value).
 * @param {object} options - Resolve options; reads `cache` and `onDebug`.
 * @param {*} [patternId] - Identifier of the learned pattern that produced the result, if any.
 * @returns {Promise<void>}
 */
async function cacheResult(original, resolved, options, patternId) {
  const { cache } = options;
  if (!cache) {
    return;
  }
  try {
    await cache.set(original, resolved, patternId);
  } catch (error) {
    emitDebug(options.onDebug, {
      type: "cache:write_failed",
      message: `Failed to store cache entry for ${original}`,
      url: original,
      error: error instanceof Error ? error.message : String(error)
    });
    return;
  }
  emitDebug(options.onDebug, {
    type: "cache:write",
    message: `Stored cache entry for ${original}`,
    url: original,
    metadata: {
      resolved,
      patternId
    }
  });
}
/**
 * Record a successful probe in the optional pattern store (best effort).
 *
 * The stored regex is the original URL with regex metacharacters escaped and
 * anchored with `^...$`, saved under the original URL's hostname together
 * with the resolved URL. Any error (unparsable URL, failing store) is ignored.
 *
 * @param {string} original - URL that was probed.
 * @param {string} resolved - URL the probe settled on.
 * @param {object} options - Resolve options; reads `patternStore`.
 * @returns {Promise<void>}
 */
async function learnPattern(original, resolved, options) {
  const store = options.patternStore;
  if (!store) {
    return;
  }
  try {
    const { hostname } = new URL(original);
    const literal = original.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    await store.save(hostname, `^${literal}$`, resolved);
  } catch {
    // Learning is optional; failures must not affect resolution.
  }
}
/**
 * Resolve an image URL to the best source URL that can be found.
 *
 * Strategies are tried in this order and the first success is returned:
 *   1. cache lookup                     -> method "cached"
 *   2. curated CDN pattern              -> method "pattern"  (confidence 0.95)
 *   3. learned patterns from the store  -> method "learned"  (pattern's own confidence)
 *   4. probing generated candidates     -> method "probed"   (confidence 0.5)
 *   5. the original URL unchanged       -> method "fallback"
 *
 * @param {string} imageUrl - URL to resolve.
 * @param {object} [options] - `validate`, `cache`, `patternStore`, `onDebug`,
 *   `retryCount`, `retryDelayMs`.
 * @returns {Promise<object>} `{ original, resolved, method }` plus, depending
 *   on the strategy, `confidence` and `sizeIncrease`.
 */
async function resolveImageUrl(imageUrl, options = {}) {
// A caller-supplied validator wins; otherwise validateImageUrl is used with
// the caller's debug/retry settings.
const validate = options.validate ?? ((url) => validateImageUrl(url, {
onDebug: options.onDebug,
retryCount: options.retryCount,
retryDelayMs: options.retryDelayMs
}));
const original = imageUrl;
emitDebug(options.onDebug, {
type: "resolve:start",
message: `Resolving ${imageUrl}`,
url: imageUrl
});
// 1. Cache lookup. A failing cache read is ignored and resolution continues.
try {
const cached = await options.cache?.get(imageUrl);
if (cached) {
emitDebug(options.onDebug, {
type: "resolve:cached",
message: `Cache hit for ${imageUrl}`,
url: imageUrl,
metadata: {
resolved: cached
}
});
return { original, resolved: cached, method: "cached" };
}
} catch {
}
// The original is validated only to learn its size (0 when unknown); an
// invalid original does not stop resolution.
const originalValidation = await validate(imageUrl);
const originalSize = originalValidation.size ?? 0;
// 2. Curated pattern: accepted as soon as the rewritten URL validates.
const curated = matchCuratedPattern(imageUrl);
if (curated) {
emitDebug(options.onDebug, {
type: "pattern:curated",
message: `Curated pattern matched ${imageUrl}`,
url: imageUrl,
metadata: {
resolved: curated
}
});
const validation = await validate(curated);
if (validation.valid) {
await cacheResult(imageUrl, curated, options);
return {
original,
resolved: curated,
method: "pattern",
confidence: 0.95,
sizeIncrease: calculateSizeIncrease(originalSize, validation.size)
};
}
emitDebug(options.onDebug, {
type: "pattern:rejected",
message: `Curated pattern candidate was rejected for ${imageUrl}`,
url: imageUrl,
metadata: {
resolved: curated
}
});
}
// 3. Learned patterns for this hostname. Any error thrown in this section
// is swallowed and resolution moves on to probing.
if (options.patternStore) {
try {
const domain = new URL(imageUrl).hostname;
const patterns = await options.patternStore.findByDomain(domain);
emitDebug(options.onDebug, {
type: "pattern:loaded",
message: `Loaded ${patterns.length} learned patterns for ${domain}`,
url: imageUrl,
metadata: {
domain,
count: patterns.length
}
});
for (const pattern of patterns) {
const resolved = applyPattern(imageUrl, pattern);
if (!resolved) {
continue;
}
const validation = await validate(resolved);
if (!validation.valid) {
// incrementFailure is optional on the store; the domain stands in
// when the pattern has no id.
await options.patternStore.incrementFailure?.(
pattern.id ?? pattern.domain
);
emitDebug(options.onDebug, {
type: "pattern:rejected",
message: `Learned pattern candidate was rejected for ${imageUrl}`,
url: imageUrl,
metadata: {
patternId: pattern.id,
resolved
}
});
continue;
}
if (pattern.id !== void 0) {
await options.patternStore.incrementSuccess(pattern.id);
}
await cacheResult(imageUrl, resolved, options, pattern.id);
emitDebug(options.onDebug, {
type: "pattern:applied",
message: `Learned pattern resolved ${imageUrl}`,
url: imageUrl,
metadata: {
patternId: pattern.id,
resolved
}
});
return {
original,
resolved,
method: "learned",
confidence: pattern.confidence,
sizeIncrease: calculateSizeIncrease(originalSize, validation.size)
};
}
} catch {
}
}
// 4. Probing: accepted only when a candidate is larger than the original.
try {
const probeResult = await probeForSource(imageUrl, originalSize, validate);
if (probeResult) {
await learnPattern(imageUrl, probeResult.url, options);
await cacheResult(imageUrl, probeResult.url, options);
emitDebug(options.onDebug, {
type: "probe:resolved",
message: `Probe improved ${imageUrl}`,
url: imageUrl,
metadata: {
resolved: probeResult.url,
size: probeResult.size
}
});
return {
original,
resolved: probeResult.url,
method: "probed",
confidence: 0.5,
sizeIncrease: calculateSizeIncrease(originalSize, probeResult.size)
};
}
} catch (error) {
emitDebug(options.onDebug, {
type: "probe:failed",
message: `Probe failed for ${imageUrl}`,
url: imageUrl,
error: error instanceof Error ? error.message : String(error)
});
}
// 5. Nothing better was found: hand back the input URL.
emitDebug(options.onDebug, {
type: "resolve:fallback",
message: `Falling back to original URL for ${imageUrl}`,
url: imageUrl
});
return { original, resolved: imageUrl, method: "fallback" };
}
// src/extract.ts
import { load } from "cheerio";
// Absolute http(s) URL ending in a known image extension, optionally followed
// by a query string; used to scan raw HTML text in extractImageUrlsFromRaw.
var IMAGE_URL_REGEX = /https?:\/\/[^\s"'<>]+\.(jpg|jpeg|png|webp|gif|avif|jfif)(?:\?[^\s"'<>]*)?/gi;
// Raw-scan URLs that are not on the page's own domain are kept only when
// they match one of these CDN patterns.
var CDN_PATTERNS = [
/amazonaws\.com/,
/cloudfront\.net/,
/cloudinary\.com/,
/imgix\.net/,
/akamaihd\.net/,
/fastly\.net/,
/staticmedia\./,
/media\..*\.com/,
/cdn\.shopify\.com/,
/squarespace-cdn\.com/,
/ctfassets\.net/,
/sanity\.io/
];
// Raw-scan URLs matching any of these are dropped; the patterns are tested
// against the whole URL, not only the file name.
var EXCLUDED_PATTERNS = [
/favicon/i,
/\/icon/i,
/\/logo/i,
/tracking/i,
/pixel/i,
/1x1/i,
/social/i,
/facebook/i,
/twitter/i,
/instagram/i,
/linkedin/i,
/pinterest/i,
/youtube/i,
/flag/i,
/chat/i,
/badge/i,
/avatar/i,
/emoji/i,
/spinner/i,
/loading/i
];
// <img> attributes read as a single image URL.
var IMAGE_ATTRIBUTE_NAMES = [
"src",
"data-src",
"data-lazy-src",
"data-original",
"data-image",
"data-url"
];
// Attributes parsed as srcset lists on <img> and <picture><source>.
var SRCSET_ATTRIBUTE_NAMES = ["srcset", "data-srcset", "data-lazy-srcset"];
// <meta> elements whose `content` attribute is read as an image URL.
var META_IMAGE_SELECTORS = [
'meta[property="og:image"]',
'meta[property="og:image:url"]',
'meta[name="twitter:image"]',
'meta[name="twitter:image:src"]',
'meta[itemprop="image"]'
];
// <link> elements whose `href` attribute is read as an image URL.
var LINK_IMAGE_SELECTORS = [
'link[rel="image_src"]',
'link[rel="preload"][as="image"]'
];
/**
 * Turn a raw attribute value into an absolute URL without fragment.
 *
 * - Empty values and values starting with `data:` or `javascript:` are rejected.
 * - Protocol-relative values (`//host/...`) are resolved as https.
 * - Other values are resolved against `baseUrl` when one is given, and must
 *   be absolute otherwise.
 *
 * @param {string|null|undefined} rawUrl - Value taken from the markup.
 * @param {string} [baseUrl] - Page URL used to resolve relative values.
 * @returns {string|null} The normalised URL, or `null` when it is rejected or unparsable.
 */
function normalizeCandidateUrl(rawUrl, baseUrl) {
  if (!rawUrl) {
    return null;
  }
  const trimmed = rawUrl.trim();
  const isUnusable =
    trimmed.length === 0 ||
    trimmed.startsWith("data:") ||
    trimmed.startsWith("javascript:");
  if (isUnusable) {
    return null;
  }
  try {
    let parsed;
    if (trimmed.startsWith("//")) {
      parsed = new URL(`https:${trimmed}`);
    } else if (baseUrl) {
      parsed = new URL(trimmed, baseUrl);
    } else {
      parsed = new URL(trimmed);
    }
    parsed.hash = "";
    return parsed.href;
  } catch {
    return null;
  }
}
/**
 * Extract the image URLs from a `srcset` attribute value.
 *
 * Follows the candidate-splitting rules of the HTML `srcset` grammar instead
 * of splitting on every comma: a URL is a run of non-whitespace characters
 * and may itself contain commas (as in Cloudinary `w_400,h_300` or Condé Nast
 * `w_2560,c_limit` transformation paths). A candidate ends at the comma that
 * follows its descriptors, or at a comma trailing the URL directly.
 *
 * @param {string} srcset - Raw attribute value.
 * @returns {string[]} URLs in document order, descriptors removed.
 */
function parseSrcsetUrls(srcset) {
  const urls = [];
  const whitespace = /\s/;
  const length = srcset.length;
  let position = 0;
  while (position < length) {
    // Skip separators in front of the next candidate.
    while (position < length && (whitespace.test(srcset[position]) || srcset[position] === ",")) {
      position += 1;
    }
    if (position >= length) {
      break;
    }
    // The URL runs up to the next whitespace character.
    const start = position;
    while (position < length && !whitespace.test(srcset[position])) {
      position += 1;
    }
    let url = srcset.slice(start, position);
    if (url.endsWith(",")) {
      // "a.jpg, b.jpg": the trailing comma ends a candidate without descriptors.
      url = url.replace(/,+$/, "");
    } else {
      // Skip the descriptors ("2x", "800w", ...) up to the separating comma;
      // commas inside parentheses do not separate candidates.
      let depth = 0;
      while (position < length) {
        const char = srcset[position];
        position += 1;
        if (char === "(") {
          depth += 1;
        } else if (char === ")") {
          depth = Math.max(0, depth - 1);
        } else if (char === "," && depth === 0) {
          break;
        }
      }
    }
    if (url) {
      urls.push(url);
    }
  }
  return urls;
}
/**
 * Collect the targets of all `url(...)` references in an inline style string.
 *
 * @param {string|null|undefined} style - Value of a `style` attribute.
 * @returns {string[]} Trimmed, non-empty URL strings in order of appearance.
 */
function extractCssUrls(style) {
  if (!style) {
    return [];
  }
  const found = [];
  // Group 1 is the optional quote, group 2 the text between the quotes/parentheses.
  for (const reference of style.matchAll(/url\((['"]?)(.*?)\1\)/g)) {
    const target = reference[2]?.trim() ?? "";
    if (target) {
      found.push(target);
    }
  }
  return found;
}
/**
 * Append a candidate unless it is missing or its URL is already present.
 *
 * @param {Array<{url: string}>} candidates - List that is extended in place.
 * @param {{url: string}|null|undefined} candidate - Candidate to add.
 */
function pushCandidate(candidates, candidate) {
  if (candidate && candidates.every((entry) => entry.url !== candidate.url)) {
    candidates.push(candidate);
  }
}
/**
 * Build a candidate record from a raw URL value.
 *
 * @param {string|null|undefined} url - Raw value from the markup.
 * @param {string} source - Where the URL was found (e.g. "img", "background", "raw").
 * @param {string} [baseUrl] - Page URL used to resolve relative values.
 * @param {object} [extra={}] - Additional fields; `url` and `source` given here are overridden.
 * @returns {object|null} The candidate, or `null` when the URL cannot be normalised.
 */
function createCandidate(url, source, baseUrl, extra = {}) {
  const absoluteUrl = normalizeCandidateUrl(url, baseUrl);
  return absoluteUrl ? Object.assign({}, extra, { url: absoluteUrl, source }) : null;
}
/**
 * Scan raw HTML text for image URLs.
 *
 * A URL is kept when it is the first one seen for its query-less form, does
 * not match any `EXCLUDED_PATTERNS` entry, and either contains
 * `sourceDomain` or matches one of `CDN_PATTERNS`.
 *
 * @param {string} html - Raw page markup.
 * @param {string} [sourceDomain] - Domain of the page being scraped.
 * @returns {string[]} Accepted URLs in order of appearance.
 */
function extractImageUrlsFromRaw(html, sourceDomain) {
  const seenBases = new Set();
  return (html.match(IMAGE_URL_REGEX) ?? []).filter((candidate) => {
    const [base] = candidate.split("?");
    if (!base || seenBases.has(base)) {
      return false;
    }
    // The base is remembered even when the URL is rejected below.
    seenBases.add(base);
    if (EXCLUDED_PATTERNS.some((pattern) => pattern.test(candidate))) {
      return false;
    }
    if (sourceDomain && candidate.includes(sourceDomain)) {
      return true;
    }
    return CDN_PATTERNS.some((pattern) => pattern.test(candidate));
  });
}
/**
 * Parse HTML and collect image candidates from structured markup.
 *
 * Sources, in collection order: <img> attributes and srcsets, <picture>
 * <source> srcsets, inline `background-image` styles, image <meta> tags and
 * image <link> tags. A URL that was already collected is not added again, so
 * the first source to mention a URL determines its candidate record.
 *
 * @param {string} html - Page markup.
 * @param {string} [baseUrl] - Page URL used to resolve relative URLs.
 * @returns {object[]} Candidates of the form `{ url, source, ... }`.
 */
function extractImageCandidatesFromHtml(html, baseUrl) {
const $ = load(html);
const candidates = [];
$("img").each((_, element) => {
const node = $(element);
// A missing, non-numeric or zero width/height attribute becomes undefined.
const width = Number.parseInt(node.attr("width") || "0", 10) || void 0;
const height = Number.parseInt(node.attr("height") || "0", 10) || void 0;
const alt = node.attr("alt") ?? null;
// All normalised URLs from every srcset-style attribute of this <img>.
const srcsetCandidates = SRCSET_ATTRIBUTE_NAMES.flatMap(
(attribute) => parseSrcsetUrls(node.attr(attribute) ?? "").map((url) => normalizeCandidateUrl(url, baseUrl)).filter((url) => Boolean(url))
);
// Single-URL attributes (src, data-src, ...) come first ...
for (const attribute of IMAGE_ATTRIBUTE_NAMES) {
pushCandidate(
candidates,
createCandidate(node.attr(attribute), "img", baseUrl, {
width,
height,
alt,
srcset: srcsetCandidates.length > 0 ? srcsetCandidates : void 0
})
);
}
// ... then every srcset URL as its own candidate, sharing the <img>'s
// width/height/alt.
for (const srcsetUrl of srcsetCandidates) {
pushCandidate(candidates, {
url: srcsetUrl,
source: "img",
width,
height,
alt,
srcset: srcsetCandidates
});
}
});
$("picture source").each((_, element) => {
const urls = SRCSET_ATTRIBUTE_NAMES.flatMap(
(attribute) => parseSrcsetUrls($(element).attr(attribute) ?? "").map((url) => normalizeCandidateUrl(url, baseUrl)).filter((url) => Boolean(url))
);
for (const url of urls) {
pushCandidate(candidates, {
url,
source: "picture",
srcset: urls
});
}
});
// Only elements whose inline style mentions "background-image" are inspected.
$('[style*="background-image"]').each((_, element) => {
for (const url of extractCssUrls($(element).attr("style"))) {
pushCandidate(candidates, createCandidate(url, "background", baseUrl));
}
});
// Meta and link images are recorded with source "raw".
for (const selector of META_IMAGE_SELECTORS) {
$(selector).each((_, element) => {
pushCandidate(
candidates,
createCandidate($(element).attr("content"), "raw", baseUrl)
);
});
}
for (const selector of LINK_IMAGE_SELECTORS) {
$(selector).each((_, element) => {
pushCandidate(
candidates,
createCandidate($(element).attr("href"), "raw", baseUrl)
);
});
}
return candidates;
}
/**
 * Collect image candidates from a page, optionally adding raw-text matches
 * and ordering the result by reported size.
 *
 * @param {string} html - Page markup.
 * @param {object} [options] - `includeRaw`, `sortBySize`, `sourceDomain`,
 *   `baseUrl`, `validate` (defaults to `validateImageUrl`).
 * @returns {Promise<object[]>} Candidates. With `sortBySize`, candidates whose
 *   URL starts with "http" come first, largest reported size first (unknown
 *   size counts as 0), followed by all other candidates in original order.
 */
async function extractImageCandidates(html, options = {}) {
  const {
    includeRaw = false,
    sortBySize = false,
    sourceDomain,
    validate = validateImageUrl
  } = options;
  const collected = Array.from(extractImageCandidatesFromHtml(html, options.baseUrl));
  // Query-less URLs already covered by structured extraction.
  const knownBases = new Set(collected.map((entry) => entry.url.split("?")[0]));

  if (includeRaw) {
    for (const rawUrl of extractImageUrlsFromRaw(html, sourceDomain)) {
      const [base] = rawUrl.split("?");
      if (base && !knownBases.has(base)) {
        collected.push({ url: rawUrl, source: "raw" });
        knownBases.add(base);
      }
    }
  }

  if (!sortBySize) {
    return collected;
  }

  const isHttp = (entry) => entry.url.startsWith("http");
  const measured = await Promise.all(
    collected.filter(isHttp).map(async (entry) => {
      const { size } = await validate(entry.url);
      return { candidate: entry, size: size ?? 0 };
    })
  );
  measured.sort((a, b) => b.size - a.size);
  const unmeasured = collected.filter((entry) => !isHttp(entry));
  return [...measured.map(({ candidate }) => candidate), ...unmeasured];
}
// src/scrape.ts
// Default minimum image dimension used when the caller gives no `minSize`.
// NOTE(review): presumably pixels, compared against width/height attributes — confirm in filterMainImages.
var DEFAULT_MIN_SIZE = 200;
// Defaults for createDefaultHtmlFetcher: per-attempt timeout (ms), User-Agent
// header, number of retries and base retry delay (ms).
var DEFAULT_FETCH_TIMEOUT_MS = 1e4;
var DEFAULT_USER_AGENT = "Mozilla/5.0 (compatible; Srcfull/2.0)";
var DEFAULT_RETRY_COUNT2 = 1;
var DEFAULT_RETRY_DELAY_MS2 = 500;
// NOTE(review): not referenced in the part of the file visible here;
// presumably used to filter out logo/icon-like images — verify at the use site.
var LOGO_PATTERNS = [
/logo/i,
/icon/i,
/favicon/i,
/badge/i,
/sprite/i,
/thumbnail/i,
/avatar/i,
/social/i,
/button/i
];
/**
 * Creates an AbortController that aborts itself once `timeoutMs` has elapsed.
 *
 * @param {number} timeoutMs - Delay handed to setTimeout.
 * @returns {{ controller: AbortController, timeoutId: * }} The controller and
 *   the timer handle; the caller is responsible for calling clearTimeout on it.
 */
function createAbortController(timeoutMs) {
  const controller = new AbortController();
  const abortOnTimeout = () => {
    controller.abort();
  };
  return {
    controller,
    timeoutId: setTimeout(abortOnTimeout, timeoutMs)
  };
}
/**
 * Builds the default page fetcher: validates the URL, issues a GET via the
 * global `fetch` with a per-attempt timeout, and retries on retryable statuses,
 * timeouts and retryable request errors.
 *
 * @param {object} [options]
 * @param {number} [options.timeoutMs] - Per-attempt timeout; floored, at least 1,
 *   capped at 2^31-1. Non-numeric values fall back to DEFAULT_FETCH_TIMEOUT_MS.
 * @param {number} [options.retryCount] - Retries after the first attempt; floored,
 *   at least 0. Non-numeric values fall back to DEFAULT_RETRY_COUNT2.
 * @param {number} [options.retryDelayMs] - Base delay, multiplied by the attempt
 *   number before each retry. Non-numeric values fall back to DEFAULT_RETRY_DELAY_MS2.
 * @param {string} [options.userAgent] - User-Agent header (blank uses the default).
 * @param {string} [options.accept] - Accept header (blank uses the default).
 * @param {object} [options.headers] - Extra headers; these override Accept/User-Agent.
 * @param {boolean} [options.allowNonHtml] - Skip the content-type check.
 * @param {Function} [options.onDebug] - Receives fetch:retry / fetch:success events.
 * @returns {(url: string) => Promise<{ html: string, metadata: object }>}
 *   The returned function throws an Error when the URL is invalid, the response
 *   is not OK, the content type is not HTML, or the final attempt times out.
 */
function createDefaultHtmlFetcher(options = {}) {
  // setTimeout treats delays above 2^31-1 ms (and NaN) as 1 ms, so an
  // unclamped "Infinity" timeout would abort almost immediately.
  const MAX_TIMER_DELAY_MS = 2147483647;
  // Floors and clamps a numeric option; NaN (e.g. from Number("abc")) falls
  // back to the default instead of poisoning the loop bound or the timer.
  const toInteger = (value, fallback, min, max = Infinity) => {
    const floored = Math.floor(Number(value ?? fallback));
    const usable = Number.isNaN(floored) ? fallback : floored;
    return Math.min(max, Math.max(min, usable));
  };
  const timeoutMs = toInteger(
    options.timeoutMs,
    DEFAULT_FETCH_TIMEOUT_MS,
    1,
    MAX_TIMER_DELAY_MS
  );
  const retryCount = toInteger(options.retryCount, DEFAULT_RETRY_COUNT2, 0);
  const retryDelayMs = toInteger(
    options.retryDelayMs,
    DEFAULT_RETRY_DELAY_MS2,
    0
  );
  const userAgent = options.userAgent?.trim() || DEFAULT_USER_AGENT;
  const accept = options.accept?.trim() || "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
  // Releases a response we are not going to read so the underlying
  // connection is not held open until garbage collection.
  const discardBody = async (response) => {
    try {
      await response.body?.cancel();
    } catch {
      // Best-effort cleanup: a failure here must not mask the real outcome.
    }
  };
  return async (url) => {
    const validation = validatePublicUrl(url);
    if (!validation.valid || !validation.url) {
      throw new Error(validation.error ?? "Invalid page URL");
    }
    const pageUrl = validation.url.href;
    for (let attempt = 1; attempt <= retryCount + 1; attempt += 1) {
      const { controller, timeoutId } = createAbortController(timeoutMs);
      try {
        // NOTE(review): fetch follows redirects by default and redirect
        // targets are not passed through validatePublicUrl here — confirm
        // whether that is acceptable for the SSRF guard.
        const response = await fetch(pageUrl, {
          headers: {
            Accept: accept,
            "User-Agent": userAgent,
            ...options.headers ?? {}
          },
          signal: controller.signal
        });
        if (attempt <= retryCount && shouldRetryStatus(response.status)) {
          emitDebug(options.onDebug, {
            type: "fetch:retry",
            message: `Page fetch returned ${response.status} for ${pageUrl}`,
            url: pageUrl,
            status: response.status,
            attempt
          });
          await discardBody(response);
          await sleep(retryDelayMs * attempt);
          continue;
        }
        if (!response.ok) {
          await discardBody(response);
          throw new Error(`Failed to fetch page: ${response.status}`);
        }
        const contentType = response.headers.get("content-type") ?? "";
        if (!options.allowNonHtml && contentType && !/text\/html|application\/xhtml\+xml/i.test(contentType)) {
          await discardBody(response);
          throw new Error(`Expected HTML response but received ${contentType}`);
        }
        emitDebug(options.onDebug, {
          type: "fetch:success",
          message: `Fetched page HTML for ${pageUrl}`,
          url: pageUrl,
          attempt,
          metadata: {
            contentType,
            status: response.status
          }
        });
        // The body is read inside the try so the timeout also covers it.
        return {
          html: await response.text(),
          metadata: {
            fetcher: "default",
            status: response.status,
            contentType
          }
        };
      } catch (error) {
        if (error instanceof Error && error.name === "AbortError") {
          if (attempt <= retryCount) {
            emitDebug(options.onDebug, {
              type: "fetch:retry",
              message: `Page fetch timed out for ${pageUrl}`,
              url: pageUrl,
              attempt,
              error: error.message
            });
            await sleep(retryDelayMs * attempt);
            continue;
          }
          throw new Error(`Timed out fetching page after ${timeoutMs}ms`);
        }
        if (attempt <= retryCount && isRetryableRequestError(error)) {
          emitDebug(options.onDebug, {
            type: "fetch:retry",
            message: `Page fetch failed for ${pageUrl}`,
            url: pageUrl,
            attempt,
            error: error instanceof Error ? error.message : String(error)
          });
          await sleep(retryDelayMs * attempt);
          continue;
        }
        throw error;
      } finally {
        clearTimeout(timeoutId);
      }
    }
    // Defensive guard: the last attempt always returns or throws above.
    throw new RetryableStatusError(
      503,
      `Unable to fetch ${pageUrl}`
    );
  };
}
// Ready-made fetcher created with no option overrides, so every setting
// falls back to the defaults inside createDefaultHtmlFetcher.
var defaultHtmlFetcher = createDefaultHtmlFetcher();
/**
 * Keeps only candidates that look like main content images.
 *
 * A candidate is dropped when its URL is a `data:` URI, when its URL matches
 * any entry of LOGO_PATTERNS, or when a truthy `width`/`height` is below
 * `minSize`. Missing or zero dimensions never cause a rejection.
 *
 * @param {object[]} candidates - Entries with `url` and optional `width`/`height`.
 * @param {number} minSize - Lower bound applied to known dimensions.
 * @returns {object[]} A new array with the surviving candidates.
 */
function filterMainImages(candidates, minSize) {
  const isBelowMinimum = (dimension) => Boolean(dimension) && dimension < minSize;
  const matchesLogoPattern = (url) => LOGO_PATTERNS.some((pattern) => pattern.test(url));
  return candidates.filter((candidate) => {
    const { url } = candidate;
    const rejected = url.startsWith("data:")
      || matchesLogoPattern(url)
      || isBelowMinimum(candidate.width)
      || isBelowMinimum(candidate.height);
    return !rejected;
  });
}
/**
 * Looks up the size reported for an image URL.
 *
 * Uses `options.validate` when provided; otherwise calls validateImageUrl,
 * forwarding the debug hook and retry settings from `options`.
 *
 * @param {string} url - Image URL to probe.
 * @param {object} options - May carry `validate`, `onDebug`, `retryCount`, `retryDelayMs`.
 * @returns {Promise<*>} The `size` of the validation result, or null when it
 *   is null/undefined.
 */
async function getImageSize(url, options) {
  let validation;
  if (options.validate) {
    validation = await options.validate(url);
  } else {
    validation = await validateImageUrl(url, {
      onDebug: options.onDebug,
      retryCount: options.retryCount,
      retryDelayMs: options.retryDelayMs
    });
  }
  return validation.size ?? null;
}
/**
 * Scrapes a page for its main images and resolves each one to a source URL.
 *
 * Steps: validate the page URL, fetch the HTML, extract and filter candidates
 * (falling back to `options.imageFallback` when none survive), resolve up to
 * `maxImages` candidates with bounded concurrency, then sort the results by
 * resolved size, largest first.
 *
 * @param {string} url - Page URL; must pass validatePublicUrl.
 * @param {object} [options]
 * @param {Function} [options.fetchHtml] - Custom fetcher returning `{ html, metadata }`;
 *   defaults to createDefaultHtmlFetcher with the debug/retry options below.
 * @param {Function} [options.onDebug] - Receives debug events.
 * @param {number} [options.retryCount] - Forwarded to the default fetcher/resolver.
 * @param {number} [options.retryDelayMs] - Forwarded to the default fetcher/resolver.
 * @param {Function} [options.validate] - Custom size probe.
 * @param {number} [options.minSize] - Minimum dimension (default DEFAULT_MIN_SIZE).
 * @param {Function} [options.imageFallback] - Async `(url) => { images, metadata }`
 *   consulted only when extraction yields nothing.
 * @param {Function} [options.resolve] - Custom resolver (default resolveImageUrl).
 * @param {number} [options.maxImages=20] - Maximum number of candidates resolved.
 * @param {number} [options.resolveConcurrency=5] - Passed to createLimiter.
 * @returns {Promise<{ url: string, images: object[], stats: object, metadata: object }>}
 * @throws {Error} When the URL is invalid or no candidates can be extracted;
 *   errors from the fetcher also propagate.
 */
async function scrapePage(url, options = {}) {
  const start = Date.now();
  const validation = validatePublicUrl(url);
  if (!validation.valid) {
    // Same fallback message the default fetcher uses for an invalid URL.
    throw new Error(validation.error ?? "Invalid page URL");
  }
  const minSize = options.minSize ?? DEFAULT_MIN_SIZE;
  const fetchHtml = options.fetchHtml ?? createDefaultHtmlFetcher({
    onDebug: options.onDebug,
    retryCount: options.retryCount,
    retryDelayMs: options.retryDelayMs
  });
  const htmlResult = await fetchHtml(url);
  const sourceDomain = new URL(url).hostname.replace(/^www\./, "");
  let candidates = filterMainImages(
    await extractImageCandidates(htmlResult.html, {
      includeRaw: true,
      sortBySize: true,
      baseUrl: url,
      sourceDomain,
      validate: options.validate
    }),
    minSize
  );
  let fallbackMetadata;
  if (candidates.length === 0 && options.imageFallback) {
    const fallback = await options.imageFallback(url);
    // A fallback that yields nothing should surface as "Failed to extract
    // images" below rather than as a TypeError on a missing property.
    candidates = filterMainImages(fallback?.images ?? [], minSize);
    fallbackMetadata = fallback?.metadata;
  }
  if (candidates.length === 0) {
    throw new Error("Failed to extract images");
  }
  emitDebug(options.onDebug, {
    type: "scrape:candidates",
    message: `Collected ${candidates.length} candidates for ${url}`,
    url,
    metadata: {
      sourceDomain
    }
  });
  const resolve = options.resolve ?? ((imageUrl) => resolveImageUrl(imageUrl, {
    onDebug: options.onDebug,
    retryCount: options.retryCount,
    retryDelayMs: options.retryDelayMs
  }));
  const toResolve = candidates.slice(0, options.maxImages ?? 20);
  const limit = createLimiter(options.resolveConcurrency ?? 5);
  let resolved = 0;
  let failed = 0;
  const images = (await Promise.all(
    toResolve.map(
      (candidate) => limit(async () => {
        try {
          const result = await resolve(candidate.url);
          const resolvedSize = await getImageSize(result.resolved, options);
          // Count only after the size lookup succeeded; otherwise an image
          // dropped by the catch below would be tallied as both resolved
          // and failed.
          if (result.method !== "fallback") {
            resolved += 1;
          }
          return {
            original: result.original,
            resolved: result.resolved,
            originalSize: null,
            resolvedSize,
            sizeIncrease: result.sizeIncrease ?? null,
            alt: candidate.alt ?? null,
            method: result.method
          };
        } catch (error) {
          failed += 1;
          emitDebug(options.onDebug, {
            type: "scrape:resolve_failed",
            message: `Failed to resolve ${candidate.url}`,
            url: candidate.url,
            error: error instanceof Error ? error.message : String(error)
          });
          return null;
        }
      })
    )
  )).filter((image) => image !== null);
  images.sort(
    (left, right) => (right.resolvedSize ?? 0) - (left.resolvedSize ?? 0)
  );
  return {
    url,
    images,
    stats: {
      found: candidates.length,
      resolved,
      failed,
      returned: images.length,
      durationMs: Date.now() - start
    },
    // Fallback metadata wins over fetcher metadata on key collisions.
    metadata: {
      ...htmlResult.metadata ?? {},
      ...fallbackMetadata ?? {}
    }
  };
}
export {
matchCuratedPattern,
applyPattern,
createLimiter,
httpLimiter,
validateImageUrl,
generateProbeCandidates,
probeForSource,
resolveImageUrl,
extractImageUrlsFromRaw,
extractImageCandidatesFromHtml,
extractImageCandidates,
createDefaultHtmlFetcher,
defaultHtmlFetcher,
scrapePage
};