Latest Threat Research: SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains. Details
Socket
Book a Demo | Install | Sign in
Socket

@sanity/export

Package Overview
Dependencies
Maintainers
114
Versions
1068
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@sanity/export - npm Package Compare versions

Comparing version
6.0.3
to
6.0.4
+109
bin/detect-corrupt.js
#!/usr/bin/env node
/**
* CLI tool to detect potentially corrupted export files caused by UTF-8
* multi-byte characters being split across chunk boundaries during streaming.
*
* Usage:
* npx @sanity/export detect-corrupt <file.ndjson|file.tar.gz|directory>
*/
import {existsSync} from 'node:fs'
import {detectCorruption} from '../dist/detectCorruption.js'
const REPLACEMENT_CHAR_DISPLAY = '�'
function printUsage() {
console.log(`
Usage: detect-corrupt <file.ndjson|file.tar.gz|directory>
Detects potentially corrupted export files caused by UTF-8 multi-byte
characters being split across chunk boundaries during streaming.
The corruption manifests as U+FFFD replacement characters (${REPLACEMENT_CHAR_DISPLAY}) appearing
where valid multi-byte characters should be.
Supported inputs:
- .tar.gz or .tgz archive (scans data.ndjson and assets.json inside)
- .ndjson file
- Directory containing data.ndjson and/or assets.json
Examples:
npx @sanity/export detect-corrupt export.tar.gz
npx @sanity/export detect-corrupt data.ndjson
npx @sanity/export detect-corrupt ./my-export-folder
`)
}
function printReport(filename, corruptions) {
console.log(`\n ${filename}:`)
// Limit output to avoid overwhelming terminal
const maxToShow = 10
const shown = corruptions.slice(0, maxToShow)
for (const c of shown) {
console.log(` Line ${c.line}, col ${c.column}: ${c.count} replacement char(s)`)
// Escape the context for display
const displayContext = c.context
.replace(/\n/g, '\\n')
.replace(/\r/g, '\\r')
.replace(/\t/g, '\\t')
console.log(` Context: "...${displayContext}..."`)
}
if (corruptions.length > maxToShow) {
console.log(` ... and ${corruptions.length - maxToShow} more occurrences`)
}
}
/**
 * CLI entry point: scans the path given as the first argument and prints a
 * report of any U+FFFD replacement characters found.
 *
 * Exit codes: 0 when help is shown or no corruption is found; 1 when the path
 * does not exist, corruption is detected, or scanning fails.
 */
async function main() {
  const args = process.argv.slice(2)

  // The usage text documents `npx @sanity/export detect-corrupt <path>`. npm exec
  // runs the package's single bin and forwards every remaining word as an
  // argument, so that invocation delivers a literal "detect-corrupt" ahead of the
  // path. Drop it, unless something with that exact name really exists on disk,
  // in which case it is treated as the path to scan, as before.
  if (args[0] === 'detect-corrupt' && !existsSync(args[0])) {
    args.shift()
  }

  if (args.length === 0 || args.includes('--help') || args.includes('-h')) {
    printUsage()
    process.exit(0)
  }

  const filePath = args[0]
  if (!existsSync(filePath)) {
    console.error(`Error: File not found: ${filePath}`)
    process.exit(1)
  }

  console.log(`Scanning ${filePath} for UTF-8 corruption...`)
  try {
    const result = await detectCorruption(filePath)

    // Show which files were scanned
    if (result.scannedFiles.length > 0) {
      console.log(`\nScanned files:`)
      for (const file of result.scannedFiles) {
        console.log(` - ${file}`)
      }
    }

    if (!result.corrupted) {
      console.log('\n✓ No corruption detected')
      process.exit(0)
    }

    console.log(`\n✗ Found potential corruption in ${result.files.size} file(s):`)
    for (const [filename, corruptions] of result.files) {
      printReport(filename, corruptions)
    }
    console.log(`\nTotal: ${result.totalCorruptedLines} line(s) with replacement characters`)
    console.log('\nNote: U+FFFD replacement characters indicate where multi-byte')
    console.log('UTF-8 sequences were corrupted during export streaming.')
    // A non-zero exit lets scripts and CI detect a corrupted export.
    process.exit(1)
  } catch (err) {
    console.error('Error:', err instanceof Error ? err.message : String(err))
    process.exit(1)
  }
}
main()
/**
* Describes the U+FFFD replacement characters found on one line of a scanned file.
* One entry is produced per affected line, not per replacement character.
* @public
*/
export interface CorruptionInfo {
/** Line number within the scanned file (1-indexed) */
line: number;
/** 1-indexed position of the first replacement char on the line, counted in UTF-16 code units */
column: number;
/** Excerpt of the line around the first replacement char: up to 20 code units before it and up to 30 from it onward */
context: string;
/** Total number of replacement chars on this line */
count: number;
}
/**
* Aggregated outcome of scanning a file, directory, or archive for corruption
* @public
*/
export interface ScanResult {
/** True when at least one scanned file contains a replacement character */
corrupted: boolean;
/**
* Map of filename to the corrupted lines found in it. Files without corruption
* have no entry. Keys are file paths on disk, or entry names for tar.gz archives.
*/
files: Map<string, CorruptionInfo[]>;
/** Total number of corrupted lines (not individual replacement chars) across all files */
totalCorruptedLines: number;
/** Every file that was scanned, whether or not corruption was found in it */
scannedFiles: string[];
}
/**
* Scans a single NDJSON file for UTF-8 corruption.
*
* The file is read as UTF-8 text and checked line by line for U+FFFD
* replacement characters.
*
* @param filePath - Path to the ndjson file
* @returns Scan result whose `scannedFiles` contains only `filePath`
* @public
*/
export declare function scanNdjsonFile(filePath: string): Promise<ScanResult>;
/**
* Scans a gzipped tar archive for UTF-8 corruption in the export's document
* and asset metadata files.
*
* Entries are matched by base name, so they are found at any depth inside the
* archive. All other entries are skipped without being inspected.
*
* @param filePath - Path to the tar.gz file
* @returns Scan result keyed by the entry names inside the archive
* @public
*/
export declare function scanTarGz(filePath: string): Promise<ScanResult>;
/**
* Scans a directory for UTF-8 corruption in data.ndjson and assets.json files.
*
* Only files located directly inside `dirPath` are considered. The returned
* promise rejects when neither file exists there.
*
* @param dirPath - Path to the directory
* @returns Combined scan result for every file that was found
* @public
*/
export declare function scanDirectory(dirPath: string): Promise<ScanResult>;
/**
* Detects UTF-8 corruption in an export file (ndjson, tar.gz, or directory)
*
* The corruption manifests as U+FFFD replacement characters appearing
* where valid multi-byte characters (CJK, emoji, etc.) should be.
*
* The scanner is chosen from the path: a directory is scanned with
* `scanDirectory`, a path ending in `.tar.gz` or `.tgz` with `scanTarGz`,
* and anything else is treated as an NDJSON file.
*
* @param filePath - Path to the file or directory to scan
* @returns Scan result with corruption information
* @public
*/
export declare function detectCorruption(filePath: string): Promise<ScanResult>;
//# sourceMappingURL=detectCorruption.d.ts.map
{"version":3,"file":"detectCorruption.d.ts","sourceRoot":"","sources":["../src/detectCorruption.ts"],"names":[],"mappings":"AAWA;;;GAGG;AACH,MAAM,WAAW,cAAc;IAC7B,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAA;IACZ,gDAAgD;IAChD,MAAM,EAAE,MAAM,CAAA;IACd,mCAAmC;IACnC,OAAO,EAAE,MAAM,CAAA;IACf,+CAA+C;IAC/C,KAAK,EAAE,MAAM,CAAA;CACd;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IACzB,sCAAsC;IACtC,SAAS,EAAE,OAAO,CAAA;IAClB,qFAAqF;IACrF,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAA;IACpC,uDAAuD;IACvD,mBAAmB,EAAE,MAAM,CAAA;IAC3B,sCAAsC;IACtC,YAAY,EAAE,MAAM,EAAE,CAAA;CACvB;AAmDD;;;;;;GAMG;AACH,wBAAsB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAe1E;AAED;;;;;;GAMG;AACH,wBAAsB,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAsErE;AAED;;;;;;GAMG;AACH,wBAAsB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAoCxE;AAED;;;;;;;;;GASG;AACH,wBAAsB,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAS5E"}
import { createReadStream, existsSync, statSync } from 'node:fs';
import { basename, join } from 'node:path';
import { createInterface } from 'node:readline';
import { createGunzip } from 'node:zlib';
import tarStream from 'tar-stream';
// The decoder substitutes U+FFFD for any byte sequence that is not valid UTF-8,
// so its presence marks a spot where the original character was lost.
const REPLACEMENT_CHAR = '\uFFFD';
/**
 * Inspects one line of text for U+FFFD replacement characters.
 *
 * @param line - Text of the line, without its line terminator
 * @param lineNumber - Number to report for this line
 * @returns Details of the corruption, or null when the line is clean
 */
function scanLine(line, lineNumber) {
    const firstHit = line.indexOf(REPLACEMENT_CHAR);
    if (firstHit < 0) {
        return null;
    }
    // Splitting on the marker yields one more piece than there are markers.
    const count = line.split(REPLACEMENT_CHAR).length - 1;
    // Excerpt: 20 code units of lead-in, then 30 starting at the first marker.
    const excerptFrom = Math.max(0, firstHit - 20);
    const excerptTo = Math.min(line.length, firstHit + 30);
    return {
        line: lineNumber,
        column: firstHit + 1,
        context: line.slice(excerptFrom, excerptTo),
        count,
    };
}
/**
 * Reads a text stream line by line and collects every line that contains a
 * replacement character.
 *
 * @param stream - Readable stream, expected to yield UTF-8 text
 * @returns One entry per corrupted line, in the order encountered
 */
async function scanStream(stream) {
    const reader = createInterface({
        crlfDelay: Infinity,
        input: stream,
    });
    const found = [];
    // Line numbers reported to scanLine start at 1.
    let currentLine = 1;
    for await (const text of reader) {
        const hit = scanLine(text, currentLine);
        currentLine += 1;
        if (hit) {
            found.push(hit);
        }
    }
    return found;
}
/**
 * Scans a single NDJSON file for UTF-8 corruption.
 *
 * The file is opened as a UTF-8 text stream and checked line by line. The
 * `files` map of the result has an entry for the file only when corruption
 * was found in it.
 *
 * @param filePath - Path to the ndjson file
 * @returns Scan result with corruption information
 * @public
 */
export async function scanNdjsonFile(filePath) {
    const corruptions = await scanStream(createReadStream(filePath, { encoding: 'utf8' }));
    const hasCorruption = corruptions.length > 0;
    return {
        corrupted: hasCorruption,
        files: new Map(hasCorruption ? [[filePath, corruptions]] : []),
        totalCorruptedLines: corruptions.length,
        scannedFiles: [filePath],
    };
}
/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Entries are matched by base name, so they are found at any depth inside the
 * archive. Each matching entry is buffered fully in memory and decoded in one
 * go before it is split into lines. All other entries are drained and skipped.
 *
 * The returned promise rejects when the file cannot be read, is not valid
 * gzip data, or is not a valid tar archive.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information, keyed by archive entry name
 * @public
 */
export async function scanTarGz(filePath) {
    const extract = tarStream.extract();
    const results = new Map();
    const scannedFiles = [];
    // Must match the names scanDirectory looks for: the asset manifest of an
    // export is "assets.json" (plural).
    const targetFiles = ['data.ndjson', 'assets.json'];
    return new Promise((resolve, reject) => {
        const source = createReadStream(filePath);
        const gunzip = createGunzip();
        // pipe() neither forwards errors nor tears down the upstream streams, so
        // every stage reports here and the file handle is released on failure.
        const fail = (err) => {
            source.destroy();
            gunzip.destroy();
            reject(err);
        };
        extract.on('entry', (header, stream, next) => {
            const fileBasename = basename(header.name);
            if (!targetFiles.includes(fileBasename)) {
                // Skip this entry; it still has to be drained before the next one arrives.
                stream.on('end', next);
                stream.resume();
                return;
            }
            scannedFiles.push(header.name);
            const chunks = [];
            stream.on('data', (chunk) => {
                chunks.push(chunk);
            });
            stream.on('end', () => {
                // Decode the whole entry at once so multi-byte characters that
                // straddle chunk boundaries are not mistaken for corruption.
                const content = Buffer.concat(chunks).toString('utf8');
                const lines = content.split(/\r?\n/);
                const corruptions = [];
                for (let i = 0; i < lines.length; i++) {
                    const line = lines[i];
                    if (line !== undefined && line.length > 0) {
                        const corruption = scanLine(line, i + 1);
                        if (corruption) {
                            corruptions.push(corruption);
                        }
                    }
                }
                if (corruptions.length > 0) {
                    results.set(header.name, corruptions);
                }
                next();
            });
            stream.on('error', fail);
        });
        extract.on('finish', () => {
            let totalCorruptedLines = 0;
            for (const corruptions of results.values()) {
                totalCorruptedLines += corruptions.length;
            }
            resolve({
                corrupted: results.size > 0,
                files: results,
                totalCorruptedLines,
                scannedFiles,
            });
        });
        extract.on('error', fail);
        gunzip.on('error', fail);
        // Without this listener a missing or unreadable file raises an uncaught
        // 'error' event and the promise never settles.
        source.on('error', fail);
        source.pipe(gunzip).pipe(extract);
    });
}
/**
 * Scans a directory for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Only files located directly inside the directory are considered. They are
 * scanned one after another and their results are merged.
 *
 * @param dirPath - Path to the directory
 * @returns Scan result with corruption information
 * @throws Error when the directory contains neither of the two files
 * @public
 */
export async function scanDirectory(dirPath) {
    const foundFiles = ['data.ndjson', 'assets.json']
        .map((filename) => join(dirPath, filename))
        .filter((candidate) => existsSync(candidate));
    if (foundFiles.length === 0) {
        throw new Error(`No data.ndjson or assets.json found in directory: ${dirPath}`);
    }
    const results = new Map();
    const scannedFiles = [];
    let totalCorruptedLines = 0;
    for (const filePath of foundFiles) {
        const { files, scannedFiles: scanned } = await scanNdjsonFile(filePath);
        scannedFiles.push(...scanned);
        files.forEach((corruptions, file) => {
            results.set(file, corruptions);
            totalCorruptedLines += corruptions.length;
        });
    }
    return {
        corrupted: results.size > 0,
        files: results,
        totalCorruptedLines,
        scannedFiles,
    };
}
/**
 * Detects UTF-8 corruption in an export file (ndjson, tar.gz, or directory)
 *
 * The corruption manifests as U+FFFD replacement characters appearing
 * where valid multi-byte characters (CJK, emoji, etc.) should be.
 *
 * The scanner is picked from the path: directories go to scanDirectory, paths
 * ending in .tar.gz or .tgz go to scanTarGz, and everything else is read as
 * an NDJSON file.
 *
 * @param filePath - Path to the file or directory to scan
 * @returns Scan result with corruption information
 * @public
 */
export async function detectCorruption(filePath) {
    if (statSync(filePath).isDirectory()) {
        return scanDirectory(filePath);
    }
    if (filePath.endsWith('.tar.gz') || filePath.endsWith('.tgz')) {
        return scanTarGz(filePath);
    }
    return scanNdjsonFile(filePath);
}
//# sourceMappingURL=detectCorruption.js.map
{"version":3,"file":"detectCorruption.js","sourceRoot":"","sources":["../src/detectCorruption.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAC,MAAM,SAAS,CAAA;AAC9D,OAAO,EAAC,QAAQ,EAAE,IAAI,EAAC,MAAM,WAAW,CAAA;AACxC,OAAO,EAAC,eAAe,EAAC,MAAM,eAAe,CAAA;AAE7C,OAAO,EAAC,YAAY,EAAC,MAAM,WAAW,CAAA;AAEtC,OAAO,SAAS,MAAM,YAAY,CAAA;AAElC,kFAAkF;AAClF,MAAM,gBAAgB,GAAG,QAAQ,CAAA;AAgCjC;;GAEG;AACH,SAAS,QAAQ,CAAC,IAAY,EAAE,UAAkB;IAChD,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAA;IAC5C,IAAI,KAAK,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IAE7B,6CAA6C;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QACxB,IAAI,IAAI,KAAK,gBAAgB;YAAE,KAAK,EAAE,CAAA;IACxC,CAAC;IAED,wCAAwC;IACxC,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,GAAG,EAAE,CAAC,CAAA;IAC5C,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,KAAK,GAAG,EAAE,CAAC,CAAA;IACpD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,UAAU,CAAC,CAAA;IAEpD,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,MAAM,EAAE,KAAK,GAAG,CAAC;QACjB,OAAO;QACP,KAAK;KACN,CAAA;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,UAAU,CAAC,MAAgB;IACxC,MAAM,WAAW,GAAqB,EAAE,CAAA;IACxC,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,MAAM,EAAE,GAAG,eAAe,CAAC;QACzB,KAAK,EAAE,MAAM;QACb,SAAS,EAAE,QAAQ;KACpB,CAAC,CAAA;IAEF,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;QAC5B,UAAU,EAAE,CAAA;QACZ,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,EAAE,UAAU,CAAC,CAAA;QAC7C,IAAI,UAAU,EAAE,CAAC;YACf,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAA;AACpB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,QAAgB;IACnD,MAAM,MAAM,GAAG,gBAAgB,CAAC,QAAQ,EAAE,EAAC,QAAQ,EAAE,MAAM,EAAC,CAAC,CAAA;IAC7D,MAAM,WAAW,GAAG,MAAM,UAAU,CAAC,MAAM,CAAC,CAAA;IAE5C,MAAM,KAAK,GAAG,IAAI,GAAG,EAA4B,CAAA;IACjD,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAA;IAClC,CAAC;IAED,OAAO;QACL,SAAS,EAAE,WAAW,CAAC,MAAM,GAAG,CAAC;QACjC,KAAK;QACL,mBAAmB,EAAE,WAAW,CAAC,MAAM;QACvC,YAAY,EAAE,CAAC,QAAQ,CAAC;KACzB,CAAA;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,
CAAC,QAAgB;IAC9C,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,EAAE,CAAA;IAEnC,MAAM,OAAO,GAAG,IAAI,GAAG,EAA4B,CAAA;IACnD,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,MAAM,WAAW,GAAG,CAAC,aAAa,EAAE,YAAY,CAAC,CAAA;IAEjD,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE;YAC3C,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YAE1C,IAAI,WAAW,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;gBACvC,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;gBAC9B,MAAM,MAAM,GAAa,EAAE,CAAA;gBAE3B,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;oBAClC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;gBACpB,CAAC,CAAC,CAAA;gBAEF,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE;oBACpB,2CAA2C;oBAC3C,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;oBACtD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAA;oBACpC,MAAM,WAAW,GAAqB,EAAE,CAAA;oBAExC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;wBACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;wBACrB,IAAI,IAAI,KAAK,SAAS,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;4BAC1C,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAA;4BACxC,IAAI,UAAU,EAAE,CAAC;gCACf,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;4BAC9B,CAAC;wBACH,CAAC;oBACH,CAAC;oBAED,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBAC3B,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,WAAW,CAAC,CAAA;oBACvC,CAAC;oBACD,IAAI,EAAE,CAAA;gBACR,CAAC,CAAC,CAAA;gBAEF,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;YAC5B,CAAC;iBAAM,CAAC;gBACN,kBAAkB;gBAClB,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,IAAI,CAAC,CAAA;gBACtB,MAAM,CAAC,MAAM,EAAE,CAAA;YACjB,CAAC;QACH,CAAC,CAAC,CAAA;QAEF,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE;YACxB,IAAI,mBAAmB,GAAG,CAAC,CAAA;YAC3B,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;gBAC3C,mBAAmB,IAAI,WAAW,CAAC,MAAM,CAAA;YAC3C,CAAC;YAED,OAAO,CAAC;gBACN,SAAS,EAAE,OAAO,CAAC,IAAI,GAAG,CAAC;gBAC3B,KAAK,EAAE,OAAO;gBACd,mBAAmB;gBACnB,YAAY;aACb,CAAC,CAAA;QACJ,CAAC,CAAC,CAAA;QAEF,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QAE3B,MAAM,MAAM,GAAG,YAAY,EAAE,CAAA;QA
C7B,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QAE1B,gBAAgB,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;AACJ,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,OAAe;IACjD,MAAM,WAAW,GAAG,CAAC,aAAa,EAAE,aAAa,CAAC,CAAA;IAClD,MAAM,UAAU,GAAa,EAAE,CAAA;IAE/B,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;QACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAA;QACxC,IAAI,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YACzB,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;QAC3B,CAAC;IACH,CAAC;IAED,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,KAAK,CACb,qDAAqD,OAAO,EAAE,CAC/D,CAAA;IACH,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,GAAG,EAA4B,CAAA;IACnD,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,IAAI,mBAAmB,GAAG,CAAC,CAAA;IAE3B,KAAK,MAAM,QAAQ,IAAI,UAAU,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,CAAA;QAC7C,YAAY,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,YAAY,CAAC,CAAA;QACzC,KAAK,MAAM,CAAC,IAAI,EAAE,WAAW,CAAC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YAC/C,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,WAAW,CAAC,CAAA;YAC9B,mBAAmB,IAAI,WAAW,CAAC,MAAM,CAAA;QAC3C,CAAC;IACH,CAAC;IAED,OAAO;QACL,SAAS,EAAE,OAAO,CAAC,IAAI,GAAG,CAAC;QAC3B,KAAK,EAAE,OAAO;QACd,mBAAmB;QACnB,YAAY;KACb,CAAA;AACH,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,QAAgB;IACrD,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAA;IAE/B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;QACvB,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAA;IAChC,CAAC;IAED,MAAM,MAAM,GAAG,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;IACxE,OAAO,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAA;AAChE,CAAC"}
import {createReadStream, existsSync, statSync} from 'node:fs'
import {basename, join} from 'node:path'
import {createInterface} from 'node:readline'
import type {Readable} from 'node:stream'
import {createGunzip} from 'node:zlib'
import tarStream from 'tar-stream'
// The decoder substitutes U+FFFD for any byte sequence that is not valid UTF-8,
// so its presence marks a spot where the original character was lost.
const REPLACEMENT_CHAR = '\u{FFFD}'
/**
* Describes the U+FFFD replacement characters found on one line of a scanned file.
* One entry is produced per affected line, not per replacement character.
* @public
*/
export interface CorruptionInfo {
/** Line number within the scanned file (1-indexed) */
line: number
/** 1-indexed position of the first replacement char on the line, counted in UTF-16 code units */
column: number
/** Excerpt of the line around the first replacement char: up to 20 code units before it and up to 30 from it onward */
context: string
/** Total number of replacement chars on this line */
count: number
}
/**
* Aggregated outcome of scanning a file, directory, or archive for corruption
* @public
*/
export interface ScanResult {
/** True when at least one scanned file contains a replacement character */
corrupted: boolean
/**
* Map of filename to the corrupted lines found in it. Files without corruption
* have no entry. Keys are file paths on disk, or entry names for tar.gz archives.
*/
files: Map<string, CorruptionInfo[]>
/** Total number of corrupted lines (not individual replacement chars) across all files */
totalCorruptedLines: number
/** Every file that was scanned, whether or not corruption was found in it */
scannedFiles: string[]
}
/**
 * Inspects one line of text for U+FFFD replacement characters.
 *
 * @param line - Text of the line, without its line terminator
 * @param lineNumber - Number to report for this line
 * @returns Details of the corruption, or null when the line is clean
 */
function scanLine(line: string, lineNumber: number): CorruptionInfo | null {
  const firstHit = line.indexOf(REPLACEMENT_CHAR)
  if (firstHit < 0) {
    return null
  }

  // Splitting on the marker yields one more piece than there are markers.
  const count = line.split(REPLACEMENT_CHAR).length - 1

  // Excerpt: 20 code units of lead-in, then 30 starting at the first marker.
  const excerptFrom = Math.max(0, firstHit - 20)
  const excerptTo = Math.min(line.length, firstHit + 30)

  return {
    line: lineNumber,
    column: firstHit + 1,
    context: line.slice(excerptFrom, excerptTo),
    count,
  }
}
/**
 * Reads a text stream line by line and collects every line that contains a
 * replacement character.
 *
 * @param stream - Readable stream, expected to yield UTF-8 text
 * @returns One entry per corrupted line, in the order encountered
 */
async function scanStream(stream: Readable): Promise<CorruptionInfo[]> {
  const reader = createInterface({
    crlfDelay: Infinity,
    input: stream,
  })
  const found: CorruptionInfo[] = []

  // Line numbers reported to scanLine start at 1.
  let currentLine = 1
  for await (const text of reader) {
    const hit = scanLine(text, currentLine)
    currentLine += 1
    if (hit) {
      found.push(hit)
    }
  }
  return found
}
/**
 * Scans a single NDJSON file for UTF-8 corruption.
 *
 * The file is opened as a UTF-8 text stream and checked line by line. The
 * `files` map of the result has an entry for the file only when corruption
 * was found in it.
 *
 * @param filePath - Path to the ndjson file
 * @returns Scan result with corruption information
 * @public
 */
export async function scanNdjsonFile(filePath: string): Promise<ScanResult> {
  const corruptions = await scanStream(createReadStream(filePath, {encoding: 'utf8'}))
  const hasCorruption = corruptions.length > 0
  const entries: Array<[string, CorruptionInfo[]]> = hasCorruption
    ? [[filePath, corruptions]]
    : []

  return {
    corrupted: hasCorruption,
    files: new Map(entries),
    totalCorruptedLines: corruptions.length,
    scannedFiles: [filePath],
  }
}
/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Entries are matched by base name, so they are found at any depth inside the
 * archive. Each matching entry is buffered fully in memory and decoded in one
 * go before it is split into lines. All other entries are drained and skipped.
 *
 * The returned promise rejects when the file cannot be read, is not valid
 * gzip data, or is not a valid tar archive.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information, keyed by archive entry name
 * @public
 */
export async function scanTarGz(filePath: string): Promise<ScanResult> {
  const extract = tarStream.extract()
  const results = new Map<string, CorruptionInfo[]>()
  const scannedFiles: string[] = []
  // Must match the names scanDirectory looks for: the asset manifest of an
  // export is "assets.json" (plural).
  const targetFiles = ['data.ndjson', 'assets.json']

  return new Promise((resolve, reject) => {
    const source = createReadStream(filePath)
    const gunzip = createGunzip()

    // pipe() neither forwards errors nor tears down the upstream streams, so
    // every stage reports here and the file handle is released on failure.
    const fail = (err: Error): void => {
      source.destroy()
      gunzip.destroy()
      reject(err)
    }

    extract.on('entry', (header, stream, next) => {
      const fileBasename = basename(header.name)
      if (!targetFiles.includes(fileBasename)) {
        // Skip this entry; it still has to be drained before the next one arrives.
        stream.on('end', next)
        stream.resume()
        return
      }

      scannedFiles.push(header.name)
      const chunks: Buffer[] = []
      stream.on('data', (chunk: Buffer) => {
        chunks.push(chunk)
      })
      stream.on('end', () => {
        // Decode the whole entry at once so multi-byte characters that
        // straddle chunk boundaries are not mistaken for corruption.
        const content = Buffer.concat(chunks).toString('utf8')
        const lines = content.split(/\r?\n/)
        const corruptions: CorruptionInfo[] = []
        for (let i = 0; i < lines.length; i++) {
          const line = lines[i]
          if (line !== undefined && line.length > 0) {
            const corruption = scanLine(line, i + 1)
            if (corruption) {
              corruptions.push(corruption)
            }
          }
        }
        if (corruptions.length > 0) {
          results.set(header.name, corruptions)
        }
        next()
      })
      stream.on('error', fail)
    })

    extract.on('finish', () => {
      let totalCorruptedLines = 0
      for (const corruptions of results.values()) {
        totalCorruptedLines += corruptions.length
      }
      resolve({
        corrupted: results.size > 0,
        files: results,
        totalCorruptedLines,
        scannedFiles,
      })
    })
    extract.on('error', fail)
    gunzip.on('error', fail)
    // Without this listener a missing or unreadable file raises an uncaught
    // 'error' event and the promise never settles.
    source.on('error', fail)

    source.pipe(gunzip).pipe(extract)
  })
}
/**
 * Scans a directory for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Only files located directly inside the directory are considered. They are
 * scanned one after another and their results are merged.
 *
 * @param dirPath - Path to the directory
 * @returns Scan result with corruption information
 * @throws Error when the directory contains neither of the two files
 * @public
 */
export async function scanDirectory(dirPath: string): Promise<ScanResult> {
  const foundFiles = ['data.ndjson', 'assets.json']
    .map((filename) => join(dirPath, filename))
    .filter((candidate) => existsSync(candidate))

  if (foundFiles.length === 0) {
    throw new Error(
      `No data.ndjson or assets.json found in directory: ${dirPath}`,
    )
  }

  const results = new Map<string, CorruptionInfo[]>()
  const scannedFiles: string[] = []
  let totalCorruptedLines = 0

  for (const filePath of foundFiles) {
    const {files, scannedFiles: scanned} = await scanNdjsonFile(filePath)
    scannedFiles.push(...scanned)
    files.forEach((corruptions, file) => {
      results.set(file, corruptions)
      totalCorruptedLines += corruptions.length
    })
  }

  return {
    corrupted: results.size > 0,
    files: results,
    totalCorruptedLines,
    scannedFiles,
  }
}
/**
 * Detects UTF-8 corruption in an export file (ndjson, tar.gz, or directory)
 *
 * The corruption manifests as U+FFFD replacement characters appearing
 * where valid multi-byte characters (CJK, emoji, etc.) should be.
 *
 * The scanner is picked from the path: directories go to scanDirectory, paths
 * ending in .tar.gz or .tgz go to scanTarGz, and everything else is read as
 * an NDJSON file.
 *
 * @param filePath - Path to the file or directory to scan
 * @returns Scan result with corruption information
 * @public
 */
export async function detectCorruption(filePath: string): Promise<ScanResult> {
  if (statSync(filePath).isDirectory()) {
    return scanDirectory(filePath)
  }
  if (filePath.endsWith('.tar.gz') || filePath.endsWith('.tgz')) {
    return scanTarGz(filePath)
  }
  return scanNdjsonFile(filePath)
}
+8
-2
{
"name": "@sanity/export",
"version": "6.0.3",
"version": "6.0.4",
"description": "Export Sanity documents and assets",

@@ -33,3 +33,7 @@ "keywords": [

"types": "./dist/index.d.ts",
"bin": {
"detect-corrupt": "./bin/detect-corrupt.js"
},
"files": [
"bin",
"dist",

@@ -51,3 +55,4 @@ "src"

"json-stream-stringify": "^3.1.6",
"p-queue": "^9.0.1"
"p-queue": "^9.0.1",
"tar-stream": "^3.1.7"
},

@@ -61,2 +66,3 @@ "devDependencies": {

"@types/node": "^20.19.0",
"@types/tar-stream": "^3.1.3",
"@vitest/coverage-v8": "^4.0.15",

@@ -63,0 +69,0 @@ "eslint": "^8.57.0",