@marswave/coli
Advanced tools
@@ -6,3 +6,3 @@ import { Buffer } from 'node:buffer'; | ||
| import { convertToWav, readWave, runAsr, } from './asr.js'; | ||
| import { ensureModels, ensureVadModel } from './models.js'; | ||
| import { ensureModels, ensureVadModel, resolveAsrModelFiles, resolveVadModelFile, } from './models.js'; | ||
| import { streamAsr } from './stream-asr.js'; | ||
@@ -16,2 +16,3 @@ export function register(program) { | ||
| .option('--model <name>', 'Model to use: whisper, sensevoice', 'sensevoice') | ||
| .option('--model-path <path>', 'Path to a local model file or directory') | ||
| .option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto') | ||
@@ -27,3 +28,8 @@ .action(async (file, options) => { | ||
| } | ||
| await ensureModels([model]); | ||
| if (options.modelPath) { | ||
| resolveAsrModelFiles(model, options.modelPath); | ||
| } | ||
| else { | ||
| await ensureModels([model]); | ||
| } | ||
| const resolvedPath = path.resolve(file); | ||
@@ -45,2 +51,3 @@ const ext = path.extname(resolvedPath).toLowerCase(); | ||
| model, | ||
| modelPath: options.modelPath, | ||
| // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion | ||
@@ -61,2 +68,4 @@ language: options.language, | ||
| .option('--vad', 'Enable voice activity detection', false) | ||
| .option('--model-path <path>', 'Path to a local SenseVoice model file or directory') | ||
| .option('--vad-model-path <path>', 'Path to a local VAD model file') | ||
| .option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto') | ||
@@ -69,4 +78,17 @@ .option('--asr-interval-ms <ms>', 'Recognition interval in ms (ignored with --vad)', '1000') | ||
| } | ||
| await ensureModels(); | ||
| if (options.vad) { | ||
| if (options.modelPath) { | ||
| resolveAsrModelFiles('sensevoice', options.modelPath); | ||
| } | ||
| else { | ||
| await ensureModels(); | ||
| } | ||
| if (options.vadModelPath) { | ||
| if (options.vad) { | ||
| resolveVadModelFile(options.vadModelPath); | ||
| } | ||
| else { | ||
| throw new Error('Use --vad with --vad-model-path.'); | ||
| } | ||
| } | ||
| else if (options.vad) { | ||
| await ensureVadModel(); | ||
@@ -89,3 +111,4 @@ } | ||
| language: options.language, | ||
| vad: options.vad || undefined, | ||
| modelPath: options.modelPath, | ||
| vad: options.vad ? { modelPath: options.vadModelPath } : undefined, | ||
| asrIntervalMs: Number(options.asrIntervalMs), | ||
@@ -92,0 +115,0 @@ onResult(result) { |
@@ -1,3 +0,3 @@ | ||
| export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js'; | ||
| export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, type ModelName, } from './models.js'; | ||
| export { convertToWav, readWave, runAsr, type AsrOptions, type AudioData, type SenseVoiceLanguage, } from './asr.js'; | ||
| export { streamAsr, type AsrStreamResult, type StreamAsrOptions, type VadOptions, } from './stream-asr.js'; |
@@ -0,4 +1,4 @@ | ||
| import { type ModelName } from './models.js'; | ||
| export declare function readWave(filename: string): AudioData; | ||
| export declare function convertToWav(inputPath: string): Promise<string>; | ||
| type ModelName = 'whisper' | 'sensevoice'; | ||
| export type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue'; | ||
@@ -8,2 +8,3 @@ export type AsrOptions = { | ||
| model: ModelName; | ||
| modelPath?: string | undefined; | ||
| language?: SenseVoiceLanguage; | ||
@@ -16,2 +17,1 @@ }; | ||
| export declare function runAsr(input: string | AudioData, options: AsrOptions): Promise<void>; | ||
| export {}; |
@@ -8,3 +8,3 @@ import fs from 'node:fs'; | ||
| import { deprecationAsrFilePath } from '../deprecations.js'; | ||
| import { getModelPath, modelDisplayNames } from './models.js'; | ||
| import { modelDisplayNames, resolveAsrModelFiles, } from './models.js'; | ||
| const require = createRequire(import.meta.url); | ||
@@ -46,6 +46,5 @@ // Loaded lazily to avoid loading the native addon until needed | ||
| } | ||
| function createRecognizer(model, language) { | ||
| const modelDir = getModelPath(model); | ||
| function createRecognizer(modelFiles, language) { | ||
| const onnx = sherpaOnnx(); | ||
| if (model === 'whisper') { | ||
| if (modelFiles.model === 'whisper') { | ||
| return new onnx.OfflineRecognizer({ | ||
@@ -55,6 +54,6 @@ featConfig: { sampleRate: 16_000, featureDim: 80 }, | ||
| whisper: { | ||
| encoder: path.join(modelDir, 'tiny.en-encoder.int8.onnx'), | ||
| decoder: path.join(modelDir, 'tiny.en-decoder.int8.onnx'), | ||
| encoder: modelFiles.files.encoder, | ||
| decoder: modelFiles.files.decoder, | ||
| }, | ||
| tokens: path.join(modelDir, 'tiny.en-tokens.txt'), | ||
| tokens: modelFiles.files.tokens, | ||
| numThreads: 2, | ||
@@ -70,7 +69,7 @@ provider: 'cpu', | ||
| senseVoice: { | ||
| model: path.join(modelDir, 'model.int8.onnx'), | ||
| model: modelFiles.files.model, | ||
| useInverseTextNormalization: 1, | ||
| language: language ?? 'auto', | ||
| }, | ||
| tokens: path.join(modelDir, 'tokens.txt'), | ||
| tokens: modelFiles.files.tokens, | ||
| numThreads: 2, | ||
@@ -83,2 +82,3 @@ provider: 'cpu', | ||
| export async function runAsr(input, options) { | ||
| const modelFiles = resolveAsrModelFiles(options.model, options.modelPath); | ||
| let wave; | ||
@@ -107,3 +107,3 @@ let needsCleanup = false; | ||
| try { | ||
| const recognizer = createRecognizer(options.model, options.language); | ||
| const recognizer = createRecognizer(modelFiles, options.language); | ||
| const stream = recognizer.createStream(); | ||
@@ -110,0 +110,0 @@ stream.acceptWaveform({ sampleRate: wave.sampleRate, samples: wave.samples }); |
@@ -1,7 +0,32 @@ | ||
| type ModelName = 'whisper' | 'sensevoice'; | ||
| export type ModelName = 'whisper' | 'sensevoice'; | ||
| export type WhisperModelFiles = { | ||
| encoder: string; | ||
| decoder: string; | ||
| tokens: string; | ||
| }; | ||
| export type SenseVoiceModelFiles = { | ||
| model: string; | ||
| tokens: string; | ||
| }; | ||
| export type AsrModelFiles = { | ||
| model: 'whisper'; | ||
| files: WhisperModelFiles; | ||
| } | { | ||
| model: 'sensevoice'; | ||
| files: SenseVoiceModelFiles; | ||
| }; | ||
| export declare const modelDisplayNames: Record<ModelName, string>; | ||
| export declare function getModelPath(model: ModelName): string; | ||
| export declare function resolveAsrModelFiles(model: 'whisper', modelPath?: string): { | ||
| model: 'whisper'; | ||
| files: WhisperModelFiles; | ||
| }; | ||
| export declare function resolveAsrModelFiles(model: 'sensevoice', modelPath?: string): { | ||
| model: 'sensevoice'; | ||
| files: SenseVoiceModelFiles; | ||
| }; | ||
| export declare function resolveAsrModelFiles(model: ModelName, modelPath?: string): AsrModelFiles; | ||
| export declare function ensureModels(modelNames?: ModelName[]): Promise<void>; | ||
| export declare function getVadModelPath(): string; | ||
| export declare function resolveVadModelFile(modelPath?: string): string; | ||
| export declare function ensureVadModel(): Promise<void>; | ||
| export {}; |
@@ -59,2 +59,51 @@ import { createHash } from 'node:crypto'; | ||
| } | ||
/**
 * Throws unless `filePath` refers to an existing regular file.
 *
 * The path is probed with a single `statSync` call, so a file that disappears
 * part-way through the check is still reported with the labelled
 * "not found" message instead of a raw ENOENT from a second filesystem call.
 *
 * @param filePath Path to check.
 * @param label Human-readable name of the file, used as the error message prefix.
 * @throws {Error} `<label> not found: <filePath>` when the path cannot be stat'ed.
 * @throws {Error} `<label> must be a file: <filePath>` when the path exists but
 *   is not a regular file (for example a directory).
 */
function assertExistingFile(filePath, label) {
    let stats;
    try {
        stats = fs.statSync(filePath);
    }
    catch {
        // Any stat failure is reported as "not found", which is what the
        // previous existsSync() probe did for unreadable or missing paths.
        throw new Error(`${label} not found: ${filePath}`);
    }
    if (!stats.isFile()) {
        throw new Error(`${label} must be a file: ${filePath}`);
    }
}
/**
 * Works out which directory (and optionally which explicit file) a model
 * should be loaded from.
 *
 * - With no `modelPath` (undefined or empty), the default directory is returned as-is
 *   and the filesystem is not touched.
 * - When `modelPath` is a directory, that directory (made absolute) is returned.
 * - When `modelPath` is a file, its parent directory is returned together with
 *   the absolute file path in `filePath`.
 *
 * The path is probed with a single `statSync` call so that a path removed
 * part-way through the check still yields the "Model path not found" error
 * instead of a raw ENOENT from a second filesystem call.
 *
 * @param modelPath Optional user-supplied path to a model file or directory.
 * @param defaultDirectory Directory to use when `modelPath` is not given.
 * @returns `{ directory }` or `{ directory, filePath }` as described above.
 * @throws {Error} `Model path not found: <path>` when the path cannot be stat'ed.
 * @throws {Error} `Model path must be a file or directory: <path>` when the
 *   path is neither a regular file nor a directory.
 */
function resolveModelDirectory(modelPath, defaultDirectory) {
    if (!modelPath) {
        return { directory: defaultDirectory };
    }
    const resolvedPath = path.resolve(modelPath);
    let stats;
    try {
        stats = fs.statSync(resolvedPath);
    }
    catch {
        // Any stat failure is reported as "not found", matching what the
        // previous existsSync() probe did for unreadable or missing paths.
        throw new Error(`Model path not found: ${resolvedPath}`);
    }
    if (stats.isDirectory()) {
        return { directory: resolvedPath };
    }
    if (stats.isFile()) {
        return { directory: path.dirname(resolvedPath), filePath: resolvedPath };
    }
    throw new Error(`Model path must be a file or directory: ${resolvedPath}`);
}
/**
 * Resolves the on-disk files needed to load an ASR model and verifies that
 * each of them exists.
 *
 * The lookup directory comes from `resolveModelDirectory(modelPath,
 * getModelPath(model))`. For `'whisper'` the path must be a directory holding
 * the three `tiny.en-*` files. For any other model name the SenseVoice layout
 * is used: `modelPath` may point at the model file itself, in which case
 * `tokens.txt` is looked up next to it.
 *
 * @param model Model name; `'whisper'` selects the whisper layout.
 * @param modelPath Optional custom model file or directory.
 * @returns `{ model, files }`, where `files` holds the resolved file paths.
 * @throws {Error} When a custom whisper path is a file, or when any required
 *   file fails the `assertExistingFile` check.
 */
export function resolveAsrModelFiles(model, modelPath) {
    const location = resolveModelDirectory(modelPath, getModelPath(model));
    const inDirectory = (fileName) => path.join(location.directory, fileName);
    if (model !== 'whisper') {
        // SenseVoice: an explicit file path overrides the default model file name.
        const senseVoiceFiles = {
            model: location.filePath ?? inDirectory('model.int8.onnx'),
            tokens: inDirectory('tokens.txt'),
        };
        assertExistingFile(senseVoiceFiles.model, 'SenseVoice model');
        assertExistingFile(senseVoiceFiles.tokens, 'SenseVoice tokens file');
        return { model, files: senseVoiceFiles };
    }
    // Whisper needs several files, so a single-file path cannot describe it.
    if (location.filePath) {
        throw new Error('Custom whisper model path must be a directory containing tiny.en-encoder.int8.onnx, tiny.en-decoder.int8.onnx, and tiny.en-tokens.txt.');
    }
    const whisperFiles = {
        encoder: inDirectory('tiny.en-encoder.int8.onnx'),
        decoder: inDirectory('tiny.en-decoder.int8.onnx'),
        tokens: inDirectory('tiny.en-tokens.txt'),
    };
    const requiredFiles = [
        [whisperFiles.encoder, 'Whisper encoder model'],
        [whisperFiles.decoder, 'Whisper decoder model'],
        [whisperFiles.tokens, 'Whisper tokens file'],
    ];
    for (const [requiredPath, label] of requiredFiles) {
        assertExistingFile(requiredPath, label);
    }
    return { model, files: whisperFiles };
}
| async function getFileSha256(filePath) { | ||
@@ -202,2 +251,7 @@ const hash = createHash('sha256'); | ||
| } | ||
/**
 * Resolves the VAD model file to load and verifies that it exists.
 *
 * @param modelPath Optional custom VAD model file; when omitted (or empty) the
 *   path returned by `getVadModelPath()` is used.
 * @returns The path that passed the `assertExistingFile` check (absolute when
 *   `modelPath` was supplied).
 * @throws {Error} When the resolved path fails the `assertExistingFile` check.
 */
export function resolveVadModelFile(modelPath) {
    let vadModelFile;
    if (modelPath) {
        vadModelFile = path.resolve(modelPath);
    }
    else {
        vadModelFile = getVadModelPath();
    }
    assertExistingFile(vadModelFile, 'VAD model');
    return vadModelFile;
}
| export async function ensureVadModel() { | ||
@@ -204,0 +258,0 @@ const modelPath = getVadModelPath(); |
@@ -12,2 +12,3 @@ import type { SenseVoiceLanguage } from './asr.js'; | ||
| export type VadOptions = { | ||
| modelPath?: string | undefined; | ||
| threshold?: number; | ||
@@ -23,2 +24,3 @@ minSpeechDuration?: number; | ||
| language?: SenseVoiceLanguage; | ||
| modelPath?: string | undefined; | ||
| vad?: boolean | VadOptions; | ||
@@ -25,0 +27,0 @@ onResult: (result: AsrStreamResult) => void; |
| import { createRequire } from 'node:module'; | ||
| import path from 'node:path'; | ||
| import { getModelPath, getVadModelPath } from './models.js'; | ||
| import { resolveAsrModelFiles, resolveVadModelFile, } from './models.js'; | ||
| const require = createRequire(import.meta.url); | ||
@@ -13,4 +12,3 @@ let _sherpaOnnx; | ||
| const defaultAsrIntervalMs = 1000; | ||
| function createRecognizer(language) { | ||
| const modelDir = getModelPath('sensevoice'); | ||
| function createRecognizer(modelFiles, language) { | ||
| const onnx = sherpaOnnx(); | ||
@@ -21,7 +19,7 @@ return new onnx.OfflineRecognizer({ | ||
| senseVoice: { | ||
| model: path.join(modelDir, 'model.int8.onnx'), | ||
| model: modelFiles.model, | ||
| useInverseTextNormalization: 1, | ||
| language: language ?? 'auto', | ||
| }, | ||
| tokens: path.join(modelDir, 'tokens.txt'), | ||
| tokens: modelFiles.tokens, | ||
| numThreads: 2, | ||
@@ -53,5 +51,6 @@ provider: 'cpu', | ||
| const onnx = sherpaOnnx(); | ||
| const modelPath = resolveVadModelFile(vadOptions.modelPath); | ||
| return new onnx.Vad({ | ||
| sileroVad: { | ||
| model: getVadModelPath(), | ||
| model: modelPath, | ||
| threshold: vadOptions.threshold ?? 0.5, | ||
@@ -75,3 +74,4 @@ minSpeechDuration: vadOptions.minSpeechDuration ?? 0.25, | ||
| async function streamWithVad(audio, options, vadOptions) { | ||
| const recognizer = createRecognizer(options.language); | ||
| const modelFiles = resolveAsrModelFiles('sensevoice', options.modelPath); | ||
| const recognizer = createRecognizer(modelFiles.files, options.language); | ||
| const vad = createVad(vadOptions); | ||
@@ -110,3 +110,4 @@ const { windowSize } = vad.config.sileroVad; | ||
| const chunkInterval = (defaultSampleRate * intervalMs) / 1000; | ||
| const recognizer = createRecognizer(options.language); | ||
| const modelFiles = resolveAsrModelFiles('sensevoice', options.modelPath); | ||
| const recognizer = createRecognizer(modelFiles.files, options.language); | ||
| const buffers = []; | ||
@@ -113,0 +114,0 @@ let totalSamples = 0; |
+44
-20
@@ -21,2 +21,5 @@ # ASR (Automatic Speech Recognition) | ||
| # Use a local model file or directory without downloading | ||
| coli asr --model-path /path/to/sensevoice/model.int8.onnx recording.wav | ||
| # Specify language (sensevoice only) | ||
@@ -31,2 +34,3 @@ coli asr --language zh recording.wav | ||
| --model Model to use: whisper, sensevoice (default: sensevoice) | ||
| --model-path Path to a local model file or directory | ||
| --language Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto) | ||
@@ -49,2 +53,5 @@ ``` | ||
| # Use local ASR and VAD models without downloading | ||
| ffmpeg -f avfoundation -i :0 -ar 16000 -ac 1 -f s16le pipe:1 | coli asr-stream --model-path /path/to/sensevoice/model.int8.onnx --vad --vad-model-path /path/to/silero_vad.onnx | ||
| # From a file | ||
@@ -59,2 +66,4 @@ ffmpeg -i podcast.m4a -ar 16000 -ac 1 -f s16le pipe:1 | coli asr-stream --vad | ||
| --vad Enable voice activity detection | ||
| --model-path <path> Path to a local SenseVoice model file or directory | ||
| --vad-model-path <path> Path to a local VAD model file | ||
| --language <lang> Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto) | ||
@@ -122,2 +131,8 @@ --asr-interval-ms <ms> Recognition interval in ms (default: 1000, ignored with --vad) | ||
| // Custom model path: no download, fails if local files are missing | ||
| await runAsr( | ||
| {sampleRate: 16000, samples: myFloat32Array}, | ||
| {json: false, model: 'sensevoice', modelPath: '/path/to/model.int8.onnx'}, | ||
| ); | ||
| // Deprecated: file path input (requires ffmpeg for non-WAV formats) | ||
@@ -129,7 +144,8 @@ await runAsr('recording.m4a', {json: false, model: 'sensevoice'}); | ||
| | Property | Type | Description | | ||
| | ---------- | --------------------------- | --------------------------------------------------------------------------------------------------- | | ||
| | `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text | | ||
| | `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition | | ||
| | `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) | | ||
| | Property | Type | Description | | ||
| | ----------- | --------------------------- | ------------------------------------------------------------------------------------------------------- | | ||
| | `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text | | ||
| | `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition | | ||
| | `modelPath` | `string` | Path to a local model file or directory. Skips download and throws if required local files are missing. | | ||
| | `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) | | ||
@@ -178,2 +194,3 @@ ### `getModelPath(model)` | ||
| await streamAsr(audioSource, { | ||
| modelPath: '/path/to/model.int8.onnx', | ||
| onResult(result) { | ||
@@ -195,3 +212,8 @@ console.log(result.text, result.isFinal ? '(final)' : '(partial)'); | ||
| await streamAsr(audioSource, { | ||
| vad: {threshold: 0.4, minSilenceDuration: 0.3, maxSpeechDuration: 10}, | ||
| vad: { | ||
| modelPath: '/path/to/silero_vad.onnx', | ||
| threshold: 0.4, | ||
| minSilenceDuration: 0.3, | ||
| maxSpeechDuration: 10, | ||
| }, | ||
| onResult(result) { | ||
@@ -205,19 +227,21 @@ console.log(result.text); | ||
| | Property | Type | Description | | ||
| | --------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------- | | ||
| | `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result | | ||
| | `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) | | ||
| | `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) | | ||
| | `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD | | ||
| | `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object | | ||
| | Property | Type | Description | | ||
| | --------------- | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------ | | ||
| | `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result | | ||
| | `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) | | ||
| | `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) | | ||
| | `modelPath` | `string` | Path to a local SenseVoice model file or directory. Skips download and throws if required local files are missing. | | ||
| | `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD | | ||
| | `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object | | ||
| **VadOptions** | ||
| | Property | Type | Description | | ||
| | ---------------------- | --------- | ------------------------------------------------------------------ | | ||
| | `threshold` | `number` | Speech detection threshold (default: `0.5`) | | ||
| | `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) | | ||
| | `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) | | ||
| | `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) | | ||
| | `enableExternalBuffer` | `boolean` | Use external buffer for VAD speech segments (default: `undefined`) | | ||
| | Property | Type | Description | | ||
| | ---------------------- | --------- | --------------------------------------------------------------------- | | ||
| | `modelPath` | `string` | Path to a local VAD model file. Skips download and throws if missing. | | ||
| | `threshold` | `number` | Speech detection threshold (default: `0.5`) | | ||
| | `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) | | ||
| | `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) | | ||
| | `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) | | ||
| | `enableExternalBuffer` | `boolean` | Use external buffer for VAD speech segments (default: `undefined`) | | ||
@@ -224,0 +248,0 @@ **Result** |
+1
-1
| { | ||
| "name": "@marswave/coli", | ||
| "private": false, | ||
| "version": "0.0.19", | ||
| "version": "0.0.20", | ||
| "description": "A CLI for the Cola", | ||
@@ -6,0 +6,0 @@ "repository": "marswaveai/coli", |
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
66000
9.39%1107
10.48%