@sanity/export
Advanced tools
| #!/usr/bin/env node | ||
| /** | ||
| * CLI tool to detect potentially corrupted export files caused by UTF-8 | ||
| * multi-byte characters being split across chunk boundaries during streaming. | ||
| * | ||
| * Usage: | ||
| * npx @sanity/export detect-corrupt <file.ndjson|file.tar.gz|directory> | ||
| */ | ||
| import {existsSync} from 'node:fs' | ||
| import {detectCorruption} from '../dist/detectCorruption.js' | ||
| const REPLACEMENT_CHAR_DISPLAY = '�' | ||
| /** | ||
| * Prints the command's help text to stdout with a single console.log call. | ||
| * | ||
| * The text covers the usage line, a short description of the corruption being | ||
| * detected, the supported input kinds and a few example invocations. | ||
| * REPLACEMENT_CHAR_DISPLAY is interpolated so the reader sees what a | ||
| * replacement character looks like. | ||
| */ | ||
| function printUsage() { | ||
| console.log(` | ||
| Usage: detect-corrupt <file.ndjson|file.tar.gz|directory> | ||
| Detects potentially corrupted export files caused by UTF-8 multi-byte | ||
| characters being split across chunk boundaries during streaming. | ||
| The corruption manifests as U+FFFD replacement characters (${REPLACEMENT_CHAR_DISPLAY}) appearing | ||
| where valid multi-byte characters should be. | ||
| Supported inputs: | ||
| - .tar.gz or .tgz archive (scans data.ndjson and assets.json inside) | ||
| - .ndjson file | ||
| - Directory containing data.ndjson and/or assets.json | ||
| Examples: | ||
| npx @sanity/export detect-corrupt export.tar.gz | ||
| npx @sanity/export detect-corrupt data.ndjson | ||
| npx @sanity/export detect-corrupt ./my-export-folder | ||
| `) | ||
| } | ||
/**
 * Writes the findings for one scanned file to stdout.
 *
 * A heading with the file name is printed first, followed by two lines per
 * finding (position/count, then an escaped context snippet). Only the first
 * ten findings are listed; if there are more, a final line states how many
 * were left out.
 *
 * @param filename - Name printed as the heading of this report section
 * @param corruptions - Findings for the file; each entry provides `line`,
 *   `column`, `count` and `context`
 */
function printReport(filename, corruptions) {
  // Cap the listing so a badly damaged file does not flood the terminal
  const displayLimit = 10

  console.log(`\n ${filename}:`)

  corruptions.slice(0, displayLimit).forEach((entry) => {
    console.log(` Line ${entry.line}, col ${entry.column}: ${entry.count} replacement char(s)`)

    // Make control characters visible so the snippet stays on a single line
    const withoutNewlines = entry.context.replace(/\n/g, '\\n')
    const withoutReturns = withoutNewlines.replace(/\r/g, '\\r')
    const printable = withoutReturns.replace(/\t/g, '\\t')
    console.log(` Context: "...${printable}..."`)
  })

  const omitted = corruptions.length - displayLimit
  if (omitted > 0) {
    console.log(` ... and ${omitted} more occurrences`)
  }
}
/**
 * CLI entry point.
 *
 * Reads the target path from the first command-line argument, runs
 * detectCorruption() on it and prints a report.
 *
 * Exit codes:
 * - 0 when help was shown (no arguments, `--help` or `-h`) or no corruption was found
 * - 1 when the path does not exist, corruption was found, or the scan threw
 */
async function main() {
  const cliArgs = process.argv.slice(2)
  const helpRequested = cliArgs.some((arg) => arg === '--help' || arg === '-h')
  if (cliArgs.length === 0 || helpRequested) {
    printUsage()
    process.exit(0)
  }

  const [filePath] = cliArgs
  if (!existsSync(filePath)) {
    console.error(`Error: File not found: ${filePath}`)
    process.exit(1)
  }

  console.log(`Scanning ${filePath} for UTF-8 corruption...`)

  try {
    const result = await detectCorruption(filePath)
    const {scannedFiles, files} = result

    // List what was actually inspected so an empty scan is noticeable
    if (scannedFiles.length > 0) {
      console.log(`\nScanned files:`)
      scannedFiles.forEach((file) => {
        console.log(` - ${file}`)
      })
    }

    if (!result.corrupted) {
      console.log('\n✓ No corruption detected')
      process.exit(0)
    }

    console.log(`\n✗ Found potential corruption in ${files.size} file(s):`)
    files.forEach((corruptions, filename) => {
      printReport(filename, corruptions)
    })

    console.log(`\nTotal: ${result.totalCorruptedLines} line(s) with replacement characters`)
    console.log('\nNote: U+FFFD replacement characters indicate where multi-byte')
    console.log('UTF-8 sequences were corrupted during export streaming.')
    process.exit(1)
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err)
    console.error('Error:', reason)
    process.exit(1)
  }
}

main()
| /** | ||
| * Information about corruption found on a specific line. | ||
| * | ||
| * One record is produced per line that contains at least one U+FFFD | ||
| * replacement character; lines without any produce no record. | ||
| * @public | ||
| */ | ||
| export interface CorruptionInfo { | ||
| /** Line number (1-indexed) */ | ||
| line: number; | ||
| /** Column position of first replacement char (string index of the first U+FFFD, plus one) */ | ||
| column: number; | ||
| /** Surrounding text for context: up to 20 characters before the first replacement char and up to 30 starting at it */ | ||
| context: string; | ||
| /** Number of replacement chars on this line */ | ||
| count: number; | ||
| } | ||
| /** | ||
| * Result of scanning a file for corruption | ||
| * @public | ||
| */ | ||
| export interface ScanResult { | ||
| /** Whether corruption was detected (true when at least one scanned file has a corrupted line) */ | ||
| corrupted: boolean; | ||
| /** | ||
| * Map of filename to corruption info (for tar.gz, multiple files may be scanned). | ||
| * Only files with at least one corrupted line get an entry; clean files are absent. | ||
| */ | ||
| files: Map<string, CorruptionInfo[]>; | ||
| /** Total number of corrupted lines across all files (counts lines, not individual replacement chars) */ | ||
| totalCorruptedLines: number; | ||
| /** List of files that were scanned, whether or not corruption was found in them */ | ||
| scannedFiles: string[]; | ||
| } | ||
| /** | ||
| * Scans an NDJSON file for UTF-8 corruption | ||
| * | ||
| * The file is read as a UTF-8 text stream and checked line by line, so it is | ||
| * never held in memory as a whole. | ||
| * | ||
| * @param filePath - Path to the ndjson file | ||
| * @returns Scan result with corruption information; `scannedFiles` always contains `filePath` | ||
| * @public | ||
| */ | ||
| export declare function scanNdjsonFile(filePath: string): Promise<ScanResult>; | ||
| /** | ||
| * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and asset.json files | ||
| * | ||
| * Archive entries are matched by base name, so they may live in any folder | ||
| * inside the archive. Each matching entry is read fully into memory before it | ||
| * is checked; all other entries are skipped. | ||
| * | ||
| * NOTE(review): the CLI help text and scanDirectory refer to `assets.json` | ||
| * (plural) - confirm the file name stated here matches the implementation. | ||
| * | ||
| * @param filePath - Path to the tar.gz file | ||
| * @returns Scan result with corruption information; keys and `scannedFiles` use the entry names from the archive | ||
| * @public | ||
| */ | ||
| export declare function scanTarGz(filePath: string): Promise<ScanResult>; | ||
| /** | ||
| * Scans a directory for UTF-8 corruption in data.ndjson and assets.json files | ||
| * | ||
| * Only files located directly in `dirPath` are considered; each one that | ||
| * exists is scanned with scanNdjsonFile and the results are merged. | ||
| * | ||
| * @param dirPath - Path to the directory | ||
| * @returns Scan result with corruption information | ||
| * @throws Error when the directory contains neither data.ndjson nor assets.json | ||
| * @public | ||
| */ | ||
| export declare function scanDirectory(dirPath: string): Promise<ScanResult>; | ||
| /** | ||
| * Detects UTF-8 corruption in an export file (ndjson, tar.gz, or directory) | ||
| * | ||
| * The corruption manifests as U+FFFD replacement characters appearing | ||
| * where valid multi-byte characters (CJK, emoji, etc.) should be. | ||
| * | ||
| * Dispatch rules: a directory is handed to scanDirectory, a path ending in | ||
| * `.tar.gz` or `.tgz` to scanTarGz, and anything else is treated as an NDJSON | ||
| * file and handed to scanNdjsonFile. | ||
| * | ||
| * @param filePath - Path to the file or directory to scan | ||
| * @returns Scan result with corruption information | ||
| * @public | ||
| */ | ||
| export declare function detectCorruption(filePath: string): Promise<ScanResult>; | ||
| //# sourceMappingURL=detectCorruption.d.ts.map |
| {"version":3,"file":"detectCorruption.d.ts","sourceRoot":"","sources":["../src/detectCorruption.ts"],"names":[],"mappings":"AAWA;;;GAGG;AACH,MAAM,WAAW,cAAc;IAC7B,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAA;IACZ,gDAAgD;IAChD,MAAM,EAAE,MAAM,CAAA;IACd,mCAAmC;IACnC,OAAO,EAAE,MAAM,CAAA;IACf,+CAA+C;IAC/C,KAAK,EAAE,MAAM,CAAA;CACd;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IACzB,sCAAsC;IACtC,SAAS,EAAE,OAAO,CAAA;IAClB,qFAAqF;IACrF,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAA;IACpC,uDAAuD;IACvD,mBAAmB,EAAE,MAAM,CAAA;IAC3B,sCAAsC;IACtC,YAAY,EAAE,MAAM,EAAE,CAAA;CACvB;AAmDD;;;;;;GAMG;AACH,wBAAsB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAe1E;AAED;;;;;;GAMG;AACH,wBAAsB,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAsErE;AAED;;;;;;GAMG;AACH,wBAAsB,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAoCxE;AAED;;;;;;;;;GASG;AACH,wBAAsB,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAS5E"} |
| import { createReadStream, existsSync, statSync } from 'node:fs'; | ||
| import { basename, join } from 'node:path'; | ||
| import { createInterface } from 'node:readline'; | ||
| import { createGunzip } from 'node:zlib'; | ||
| import tarStream from 'tar-stream'; | ||
// U+FFFD is what a UTF-8 decoder emits in place of bytes it cannot decode
const REPLACEMENT_CHAR = '\uFFFD';
/**
 * Checks one line of text for U+FFFD replacement characters.
 *
 * @param line - Text of the line to inspect
 * @param lineNumber - Number reported as `line` in the result
 * @returns `null` when the line contains no replacement character; otherwise
 *   the 1-based column of the first one, how many the line contains, and a
 *   snippet of up to 20 characters before / 30 characters from the first one
 */
function scanLine(line, lineNumber) {
    const firstHit = line.indexOf(REPLACEMENT_CHAR);
    if (firstHit < 0) {
        return null;
    }
    // Splitting on the marker yields exactly one more piece than there are markers
    const occurrences = line.split(REPLACEMENT_CHAR).length - 1;
    // slice() clamps the end index to the line length on its own
    const snippet = line.slice(Math.max(0, firstHit - 20), firstHit + 30);
    return {
        line: lineNumber,
        column: firstHit + 1,
        context: snippet,
        count: occurrences,
    };
}
/**
 * Reads a text stream line by line and collects a record for every line that
 * contains a replacement character.
 *
 * Lines are numbered from 1 in the order they are read.
 */
async function scanStream(stream) {
    const reader = createInterface({ input: stream, crlfDelay: Infinity });
    const found = [];
    let currentLine = 0;
    for await (const text of reader) {
        currentLine += 1;
        const hit = scanLine(text, currentLine);
        if (hit !== null) {
            found.push(hit);
        }
    }
    return found;
}
/**
 * Scans a single NDJSON file on disk for UTF-8 corruption.
 *
 * The file is opened as a UTF-8 text stream and checked line by line.
 *
 * @param filePath - Path to the ndjson file
 * @returns Scan result; `files` has an entry for `filePath` only when
 *   corruption was found, `scannedFiles` always lists it
 * @public
 */
export async function scanNdjsonFile(filePath) {
    const corruptions = await scanStream(createReadStream(filePath, { encoding: 'utf8' }));
    const corrupted = corruptions.length > 0;
    // Clean files are deliberately left out of the map
    const files = new Map(corrupted ? [[filePath, corruptions]] : []);
    return {
        corrupted,
        files,
        totalCorruptedLines: corruptions.length,
        scannedFiles: [filePath],
    };
}
/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Entries are matched by base name, so they may sit in any folder inside the
 * archive. Each matching entry is buffered completely and decoded in one go;
 * every other entry is drained and ignored.
 *
 * The returned promise rejects when reading the file, gunzipping, tar parsing
 * or decoding an entry fails.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information; map keys and
 *   `scannedFiles` use the entry names as stored in the archive
 * @public
 */
export async function scanTarGz(filePath) {
    const extract = tarStream.extract();
    const results = new Map();
    const scannedFiles = [];
    // Same names as the CLI help text and scanDirectory(); this list previously
    // said 'asset.json' (singular), which never matched the assets file.
    const targetFiles = ['data.ndjson', 'assets.json'];
    return new Promise((resolve, reject) => {
        const source = createReadStream(filePath);
        const gunzip = createGunzip();
        // pipe() does not forward errors or clean up, so release the file
        // handle ourselves and surface the failure through the promise
        const fail = (err) => {
            source.destroy();
            reject(err);
        };
        extract.on('entry', (header, stream, next) => {
            if (!targetFiles.includes(basename(header.name))) {
                // Not a file of interest: drain it so the extractor can move on
                stream.on('end', next);
                stream.resume();
                return;
            }
            scannedFiles.push(header.name);
            const chunks = [];
            stream.on('data', (chunk) => {
                chunks.push(chunk);
            });
            stream.on('error', fail);
            stream.on('end', () => {
                try {
                    // Decode only after joining all chunks, so a multi-byte character
                    // that straddles two chunks is not itself turned into U+FFFD
                    const content = Buffer.concat(chunks).toString('utf8');
                    const corruptions = [];
                    content.split(/\r?\n/).forEach((line, index) => {
                        const corruption = line.length > 0 ? scanLine(line, index + 1) : null;
                        if (corruption) {
                            corruptions.push(corruption);
                        }
                    });
                    if (corruptions.length > 0) {
                        results.set(header.name, corruptions);
                    }
                }
                catch (err) {
                    // A throw inside an event handler would otherwise bypass the promise
                    fail(err);
                    return;
                }
                next();
            });
        });
        extract.on('finish', () => {
            let totalCorruptedLines = 0;
            for (const corruptions of results.values()) {
                totalCorruptedLines += corruptions.length;
            }
            resolve({
                corrupted: results.size > 0,
                files: results,
                totalCorruptedLines,
                scannedFiles,
            });
        });
        extract.on('error', fail);
        gunzip.on('error', fail);
        // Without this listener a read failure is an unhandled 'error' event
        source.on('error', fail);
        source.pipe(gunzip).pipe(extract);
    });
}
/**
 * Scans an export directory for UTF-8 corruption.
 *
 * Looks for data.ndjson and assets.json directly inside `dirPath`, scans each
 * one that exists with scanNdjsonFile (one after the other) and merges the
 * individual results.
 *
 * @param dirPath - Path to the directory
 * @returns Scan result with corruption information
 * @throws Error when neither of the two files exists in the directory
 * @public
 */
export async function scanDirectory(dirPath) {
    const present = ['data.ndjson', 'assets.json']
        .map((name) => join(dirPath, name))
        .filter((candidate) => existsSync(candidate));
    if (present.length === 0) {
        throw new Error(`No data.ndjson or assets.json found in directory: ${dirPath}`);
    }
    const files = new Map();
    const scannedFiles = [];
    let totalCorruptedLines = 0;
    for (const target of present) {
        const scan = await scanNdjsonFile(target);
        scannedFiles.push(...scan.scannedFiles);
        scan.files.forEach((corruptions, file) => {
            files.set(file, corruptions);
            totalCorruptedLines += corruptions.length;
        });
    }
    return {
        corrupted: files.size > 0,
        files,
        totalCorruptedLines,
        scannedFiles,
    };
}
/**
 * Detects UTF-8 corruption in an export (ndjson file, tar.gz archive, or directory).
 *
 * Corruption shows up as U+FFFD replacement characters where valid multi-byte
 * characters (CJK, emoji, etc.) should be.
 *
 * The scanner is picked from the path: directories go to scanDirectory, paths
 * ending in `.tar.gz` or `.tgz` go to scanTarGz, and everything else is
 * treated as an NDJSON file.
 *
 * @param filePath - Path to the file or directory to scan
 * @returns Scan result with corruption information
 * @public
 */
export async function detectCorruption(filePath) {
    if (statSync(filePath).isDirectory()) {
        return scanDirectory(filePath);
    }
    const archiveSuffixes = ['.tar.gz', '.tgz'];
    if (archiveSuffixes.some((suffix) => filePath.endsWith(suffix))) {
        return scanTarGz(filePath);
    }
    return scanNdjsonFile(filePath);
}
| //# sourceMappingURL=detectCorruption.js.map |
| {"version":3,"file":"detectCorruption.js","sourceRoot":"","sources":["../src/detectCorruption.ts"],"names":[],"mappings":"AAAA,OAAO,EAAC,gBAAgB,EAAE,UAAU,EAAE,QAAQ,EAAC,MAAM,SAAS,CAAA;AAC9D,OAAO,EAAC,QAAQ,EAAE,IAAI,EAAC,MAAM,WAAW,CAAA;AACxC,OAAO,EAAC,eAAe,EAAC,MAAM,eAAe,CAAA;AAE7C,OAAO,EAAC,YAAY,EAAC,MAAM,WAAW,CAAA;AAEtC,OAAO,SAAS,MAAM,YAAY,CAAA;AAElC,kFAAkF;AAClF,MAAM,gBAAgB,GAAG,QAAQ,CAAA;AAgCjC;;GAEG;AACH,SAAS,QAAQ,CAAC,IAAY,EAAE,UAAkB;IAChD,MAAM,KAAK,GAAG,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAA;IAC5C,IAAI,KAAK,KAAK,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IAE7B,6CAA6C;IAC7C,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,IAAI,IAAI,IAAI,EAAE,CAAC;QACxB,IAAI,IAAI,KAAK,gBAAgB;YAAE,KAAK,EAAE,CAAA;IACxC,CAAC;IAED,wCAAwC;IACxC,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,GAAG,EAAE,CAAC,CAAA;IAC5C,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,KAAK,GAAG,EAAE,CAAC,CAAA;IACpD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,EAAE,UAAU,CAAC,CAAA;IAEpD,OAAO;QACL,IAAI,EAAE,UAAU;QAChB,MAAM,EAAE,KAAK,GAAG,CAAC;QACjB,OAAO;QACP,KAAK;KACN,CAAA;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,UAAU,CAAC,MAAgB;IACxC,MAAM,WAAW,GAAqB,EAAE,CAAA;IACxC,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,MAAM,EAAE,GAAG,eAAe,CAAC;QACzB,KAAK,EAAE,MAAM;QACb,SAAS,EAAE,QAAQ;KACpB,CAAC,CAAA;IAEF,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,EAAE,EAAE,CAAC;QAC5B,UAAU,EAAE,CAAA;QACZ,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,EAAE,UAAU,CAAC,CAAA;QAC7C,IAAI,UAAU,EAAE,CAAC;YACf,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QAC9B,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAA;AACpB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,QAAgB;IACnD,MAAM,MAAM,GAAG,gBAAgB,CAAC,QAAQ,EAAE,EAAC,QAAQ,EAAE,MAAM,EAAC,CAAC,CAAA;IAC7D,MAAM,WAAW,GAAG,MAAM,UAAU,CAAC,MAAM,CAAC,CAAA;IAE5C,MAAM,KAAK,GAAG,IAAI,GAAG,EAA4B,CAAA;IACjD,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,WAAW,CAAC,CAAA;IAClC,CAAC;IAED,OAAO;QACL,SAAS,EAAE,WAAW,CAAC,MAAM,GAAG,CAAC;QACjC,KAAK;QACL,mBAAmB,EAAE,WAAW,CAAC,MAAM;QACvC,YAAY,EAAE,CAAC,QAAQ,CAAC;KACzB,CAAA;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,SAA
S,CAAC,QAAgB;IAC9C,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,EAAE,CAAA;IAEnC,MAAM,OAAO,GAAG,IAAI,GAAG,EAA4B,CAAA;IACnD,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,MAAM,WAAW,GAAG,CAAC,aAAa,EAAE,YAAY,CAAC,CAAA;IAEjD,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,EAAE;YAC3C,MAAM,YAAY,GAAG,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YAE1C,IAAI,WAAW,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;gBACvC,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;gBAC9B,MAAM,MAAM,GAAa,EAAE,CAAA;gBAE3B,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAa,EAAE,EAAE;oBAClC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;gBACpB,CAAC,CAAC,CAAA;gBAEF,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE;oBACpB,2CAA2C;oBAC3C,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;oBACtD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAA;oBACpC,MAAM,WAAW,GAAqB,EAAE,CAAA;oBAExC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;wBACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;wBACrB,IAAI,IAAI,KAAK,SAAS,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;4BAC1C,MAAM,UAAU,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAA;4BACxC,IAAI,UAAU,EAAE,CAAC;gCACf,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;4BAC9B,CAAC;wBACH,CAAC;oBACH,CAAC;oBAED,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBAC3B,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,EAAE,WAAW,CAAC,CAAA;oBACvC,CAAC;oBACD,IAAI,EAAE,CAAA;gBACR,CAAC,CAAC,CAAA;gBAEF,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;YAC5B,CAAC;iBAAM,CAAC;gBACN,kBAAkB;gBAClB,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,IAAI,CAAC,CAAA;gBACtB,MAAM,CAAC,MAAM,EAAE,CAAA;YACjB,CAAC;QACH,CAAC,CAAC,CAAA;QAEF,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE;YACxB,IAAI,mBAAmB,GAAG,CAAC,CAAA;YAC3B,KAAK,MAAM,WAAW,IAAI,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;gBAC3C,mBAAmB,IAAI,WAAW,CAAC,MAAM,CAAA;YAC3C,CAAC;YAED,OAAO,CAAC;gBACN,SAAS,EAAE,OAAO,CAAC,IAAI,GAAG,CAAC;gBAC3B,KAAK,EAAE,OAAO;gBACd,mBAAmB;gBACnB,YAAY;aACb,CAAC,CAAA;QACJ,CAAC,CAAC,CAAA;QAEF,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QAE3B,MAAM,MAAM,GAAG,YAAY,EAAE,CAAA;
QAC7B,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,CAAC,CAAA;QAE1B,gBAAgB,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;AACJ,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,OAAe;IACjD,MAAM,WAAW,GAAG,CAAC,aAAa,EAAE,aAAa,CAAC,CAAA;IAClD,MAAM,UAAU,GAAa,EAAE,CAAA;IAE/B,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;QACnC,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAA;QACxC,IAAI,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YACzB,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;QAC3B,CAAC;IACH,CAAC;IAED,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,KAAK,CACb,qDAAqD,OAAO,EAAE,CAC/D,CAAA;IACH,CAAC;IAED,MAAM,OAAO,GAAG,IAAI,GAAG,EAA4B,CAAA;IACnD,MAAM,YAAY,GAAa,EAAE,CAAA;IACjC,IAAI,mBAAmB,GAAG,CAAC,CAAA;IAE3B,KAAK,MAAM,QAAQ,IAAI,UAAU,EAAE,CAAC;QAClC,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,CAAA;QAC7C,YAAY,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,YAAY,CAAC,CAAA;QACzC,KAAK,MAAM,CAAC,IAAI,EAAE,WAAW,CAAC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YAC/C,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,WAAW,CAAC,CAAA;YAC9B,mBAAmB,IAAI,WAAW,CAAC,MAAM,CAAA;QAC3C,CAAC;IACH,CAAC;IAED,OAAO;QACL,SAAS,EAAE,OAAO,CAAC,IAAI,GAAG,CAAC;QAC3B,KAAK,EAAE,OAAO;QACd,mBAAmB;QACnB,YAAY;KACb,CAAA;AACH,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,QAAgB;IACrD,MAAM,IAAI,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAA;IAE/B,IAAI,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC;QACvB,OAAO,aAAa,CAAC,QAAQ,CAAC,CAAA;IAChC,CAAC;IAED,MAAM,MAAM,GAAG,QAAQ,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAA;IACxE,OAAO,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,QAAQ,CAAC,CAAA;AAChE,CAAC"} |
| import {createReadStream, existsSync, statSync} from 'node:fs' | ||
| import {basename, join} from 'node:path' | ||
| import {createInterface} from 'node:readline' | ||
| import type {Readable} from 'node:stream' | ||
| import {createGunzip} from 'node:zlib' | ||
| import tarStream from 'tar-stream' | ||
| // U+FFFD replacement character - appears when invalid UTF-8 sequences are decoded | ||
| const REPLACEMENT_CHAR = '\uFFFD' | ||
| /** | ||
| * Information about corruption found on a specific line. | ||
| * | ||
| * One record is produced per line that contains at least one U+FFFD | ||
| * replacement character; lines without any produce no record. | ||
| * @public | ||
| */ | ||
| export interface CorruptionInfo { | ||
| /** Line number (1-indexed) */ | ||
| line: number | ||
| /** Column position of first replacement char (string index of the first U+FFFD, plus one) */ | ||
| column: number | ||
| /** Surrounding text for context: up to 20 characters before the first replacement char and up to 30 starting at it */ | ||
| context: string | ||
| /** Number of replacement chars on this line */ | ||
| count: number | ||
| } | ||
| /** | ||
| * Result of scanning a file for corruption | ||
| * @public | ||
| */ | ||
| export interface ScanResult { | ||
| /** Whether corruption was detected (true when at least one scanned file has a corrupted line) */ | ||
| corrupted: boolean | ||
| /** | ||
| * Map of filename to corruption info (for tar.gz, multiple files may be scanned). | ||
| * Only files with at least one corrupted line get an entry; clean files are absent. | ||
| */ | ||
| files: Map<string, CorruptionInfo[]> | ||
| /** Total number of corrupted lines across all files (counts lines, not individual replacement chars) */ | ||
| totalCorruptedLines: number | ||
| /** List of files that were scanned, whether or not corruption was found in them */ | ||
| scannedFiles: string[] | ||
| } | ||
/**
 * Checks one line of text for U+FFFD replacement characters.
 *
 * @param line - Text of the line to inspect
 * @param lineNumber - Number reported as `line` in the result
 * @returns `null` when the line contains no replacement character; otherwise
 *   the 1-based column of the first one, how many the line contains, and a
 *   snippet of up to 20 characters before / 30 characters from the first one
 */
function scanLine(line: string, lineNumber: number): CorruptionInfo | null {
  const firstHit = line.indexOf(REPLACEMENT_CHAR)
  if (firstHit < 0) {
    return null
  }

  // Splitting on the marker yields exactly one more piece than there are markers
  const occurrences = line.split(REPLACEMENT_CHAR).length - 1

  // slice() clamps the end index to the line length on its own
  const snippet = line.slice(Math.max(0, firstHit - 20), firstHit + 30)

  return {
    line: lineNumber,
    column: firstHit + 1,
    context: snippet,
    count: occurrences,
  }
}
/**
 * Reads a text stream line by line and collects a record for every line that
 * contains a replacement character.
 *
 * Lines are numbered from 1 in the order they are read.
 */
async function scanStream(stream: Readable): Promise<CorruptionInfo[]> {
  const reader = createInterface({input: stream, crlfDelay: Infinity})
  const found: CorruptionInfo[] = []
  let currentLine = 0

  for await (const text of reader) {
    currentLine += 1
    const hit = scanLine(text, currentLine)
    if (hit !== null) {
      found.push(hit)
    }
  }

  return found
}
/**
 * Scans a single NDJSON file on disk for UTF-8 corruption.
 *
 * The file is opened as a UTF-8 text stream and checked line by line.
 *
 * @param filePath - Path to the ndjson file
 * @returns Scan result; `files` has an entry for `filePath` only when
 *   corruption was found, `scannedFiles` always lists it
 * @public
 */
export async function scanNdjsonFile(filePath: string): Promise<ScanResult> {
  const corruptions = await scanStream(createReadStream(filePath, {encoding: 'utf8'}))
  const corrupted = corruptions.length > 0

  // Clean files are deliberately left out of the map
  const files = new Map<string, CorruptionInfo[]>()
  if (corrupted) {
    files.set(filePath, corruptions)
  }

  return {
    corrupted,
    files,
    totalCorruptedLines: corruptions.length,
    scannedFiles: [filePath],
  }
}
/**
 * Scans a tar.gz archive for UTF-8 corruption in data.ndjson and assets.json files
 *
 * Entries are matched by base name, so they may sit in any folder inside the
 * archive. Each matching entry is buffered completely and decoded in one go;
 * every other entry is drained and ignored.
 *
 * The returned promise rejects when reading the file, gunzipping, tar parsing
 * or decoding an entry fails.
 *
 * @param filePath - Path to the tar.gz file
 * @returns Scan result with corruption information; map keys and
 *   `scannedFiles` use the entry names as stored in the archive
 * @public
 */
export async function scanTarGz(filePath: string): Promise<ScanResult> {
  const extract = tarStream.extract()
  const results = new Map<string, CorruptionInfo[]>()
  const scannedFiles: string[] = []
  // Same names as the CLI help text and scanDirectory(); this list previously
  // said 'asset.json' (singular), which never matched the assets file.
  const targetFiles = ['data.ndjson', 'assets.json']

  return new Promise((resolve, reject) => {
    const source = createReadStream(filePath)
    const gunzip = createGunzip()

    // pipe() does not forward errors or clean up, so release the file
    // handle ourselves and surface the failure through the promise
    const fail = (err: unknown): void => {
      source.destroy()
      reject(err)
    }

    extract.on('entry', (header, stream, next) => {
      if (!targetFiles.includes(basename(header.name))) {
        // Not a file of interest: drain it so the extractor can move on
        stream.on('end', next)
        stream.resume()
        return
      }

      scannedFiles.push(header.name)
      const chunks: Buffer[] = []

      stream.on('data', (chunk: Buffer) => {
        chunks.push(chunk)
      })
      stream.on('error', fail)
      stream.on('end', () => {
        try {
          // Decode only after joining all chunks, so a multi-byte character
          // that straddles two chunks is not itself turned into U+FFFD
          const content = Buffer.concat(chunks).toString('utf8')
          const corruptions: CorruptionInfo[] = []

          content.split(/\r?\n/).forEach((line, index) => {
            const corruption = line.length > 0 ? scanLine(line, index + 1) : null
            if (corruption) {
              corruptions.push(corruption)
            }
          })

          if (corruptions.length > 0) {
            results.set(header.name, corruptions)
          }
        } catch (err) {
          // A throw inside an event handler would otherwise bypass the promise
          fail(err)
          return
        }
        next()
      })
    })

    extract.on('finish', () => {
      let totalCorruptedLines = 0
      for (const corruptions of results.values()) {
        totalCorruptedLines += corruptions.length
      }

      resolve({
        corrupted: results.size > 0,
        files: results,
        totalCorruptedLines,
        scannedFiles,
      })
    })

    extract.on('error', fail)
    gunzip.on('error', fail)
    // Without this listener a read failure is an unhandled 'error' event
    source.on('error', fail)

    source.pipe(gunzip).pipe(extract)
  })
}
/**
 * Scans an export directory for UTF-8 corruption.
 *
 * Looks for data.ndjson and assets.json directly inside `dirPath`, scans each
 * one that exists with scanNdjsonFile (one after the other) and merges the
 * individual results.
 *
 * @param dirPath - Path to the directory
 * @returns Scan result with corruption information
 * @throws Error when neither of the two files exists in the directory
 * @public
 */
export async function scanDirectory(dirPath: string): Promise<ScanResult> {
  const present = ['data.ndjson', 'assets.json']
    .map((name) => join(dirPath, name))
    .filter((candidate) => existsSync(candidate))

  if (present.length === 0) {
    throw new Error(`No data.ndjson or assets.json found in directory: ${dirPath}`)
  }

  const files = new Map<string, CorruptionInfo[]>()
  const scannedFiles: string[] = []
  let totalCorruptedLines = 0

  for (const target of present) {
    const scan = await scanNdjsonFile(target)
    scannedFiles.push(...scan.scannedFiles)
    scan.files.forEach((corruptions, file) => {
      files.set(file, corruptions)
      totalCorruptedLines += corruptions.length
    })
  }

  return {
    corrupted: files.size > 0,
    files,
    totalCorruptedLines,
    scannedFiles,
  }
}
/**
 * Detects UTF-8 corruption in an export (ndjson file, tar.gz archive, or directory).
 *
 * Corruption shows up as U+FFFD replacement characters where valid multi-byte
 * characters (CJK, emoji, etc.) should be.
 *
 * The scanner is picked from the path: directories go to scanDirectory, paths
 * ending in `.tar.gz` or `.tgz` go to scanTarGz, and everything else is
 * treated as an NDJSON file.
 *
 * @param filePath - Path to the file or directory to scan
 * @returns Scan result with corruption information
 * @public
 */
export async function detectCorruption(filePath: string): Promise<ScanResult> {
  if (statSync(filePath).isDirectory()) {
    return scanDirectory(filePath)
  }

  const archiveSuffixes = ['.tar.gz', '.tgz']
  if (archiveSuffixes.some((suffix) => filePath.endsWith(suffix))) {
    return scanTarGz(filePath)
  }

  return scanNdjsonFile(filePath)
}
+8
-2
| { | ||
| "name": "@sanity/export", | ||
| "version": "6.0.3", | ||
| "version": "6.0.4", | ||
| "description": "Export Sanity documents and assets", | ||
@@ -33,3 +33,7 @@ "keywords": [ | ||
| "types": "./dist/index.d.ts", | ||
| "bin": { | ||
| "detect-corrupt": "./bin/detect-corrupt.js" | ||
| }, | ||
| "files": [ | ||
| "bin", | ||
| "dist", | ||
@@ -51,3 +55,4 @@ "src" | ||
| "json-stream-stringify": "^3.1.6", | ||
| "p-queue": "^9.0.1" | ||
| "p-queue": "^9.0.1", | ||
| "tar-stream": "^3.1.7" | ||
| }, | ||
@@ -61,2 +66,3 @@ "devDependencies": { | ||
| "@types/node": "^20.19.0", | ||
| "@types/tar-stream": "^3.1.3", | ||
| "@vitest/coverage-v8": "^4.0.15", | ||
@@ -63,0 +69,0 @@ "eslint": "^8.57.0", |
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
215663
13.26%114
5.56%4049
16.05%6
20%18
5.88%+ Added