@marswave/coli
Advanced tools
@@ -13,2 +13,3 @@ import { Buffer } from 'node:buffer'; | ||
| .option('--model <name>', 'Model to use: whisper, sensevoice', 'sensevoice') | ||
| .option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto') | ||
| .action(async (file, options) => { | ||
@@ -19,4 +20,13 @@ const { model } = options; | ||
| } | ||
| const validLanguages = new Set(['auto', 'zh', 'en', 'ja', 'ko', 'yue']); | ||
| if (!validLanguages.has(options.language)) { | ||
| throw new Error(`Unknown language "${options.language}". Use one of: auto, zh, en, ja, ko, yue.`); | ||
| } | ||
| await ensureModels([model]); | ||
| await runAsr(file, { json: options.json, model }); | ||
| await runAsr(file, { | ||
| json: options.json, | ||
| model, | ||
| // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion | ||
| language: options.language, | ||
| }); | ||
| }); | ||
@@ -28,4 +38,9 @@ program | ||
| .option('--vad', 'Enable voice activity detection', false) | ||
| .option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto') | ||
| .option('--asr-interval-ms <ms>', 'Recognition interval in ms (ignored with --vad)', '1000') | ||
| .action(async (options) => { | ||
| const validLanguages = new Set(['auto', 'zh', 'en', 'ja', 'ko', 'yue']); | ||
| if (!validLanguages.has(options.language)) { | ||
| throw new Error(`Unknown language "${options.language}". Use one of: auto, zh, en, ja, ko, yue.`); | ||
| } | ||
| await ensureModels(); | ||
@@ -49,2 +64,4 @@ if (options.vad) { | ||
| await streamAsr(stdinAudio(), { | ||
| // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion | ||
| language: options.language, | ||
| vad: options.vad || undefined, | ||
@@ -51,0 +68,0 @@ asrIntervalMs: Number(options.asrIntervalMs), |
| export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js'; | ||
| export { runAsr, type AsrOptions, type SenseVoiceLanguage } from './asr.js'; | ||
| export { streamAsr, type AsrStreamResult, type StreamAsrOptions, type VadOptions, } from './stream-asr.js'; |
| export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js'; | ||
| export { runAsr } from './asr.js'; | ||
| export { streamAsr, } from './stream-asr.js'; |
| type ModelName = 'whisper' | 'sensevoice'; | ||
| export type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue'; | ||
| export type AsrOptions = { | ||
| json: boolean; | ||
| model: ModelName; | ||
| language?: SenseVoiceLanguage; | ||
| }; | ||
| export declare function runAsr(filePath: string, options: AsrOptions): Promise<void>; | ||
| export {}; |
@@ -41,3 +41,3 @@ import fs from 'node:fs'; | ||
| } | ||
| function createRecognizer(model) { | ||
| function createRecognizer(model, language) { | ||
| const modelDir = getModelPath(model); | ||
@@ -66,2 +66,3 @@ const onnx = sherpaOnnx(); | ||
| useInverseTextNormalization: 1, | ||
| language: language ?? 'auto', | ||
| }, | ||
@@ -92,3 +93,3 @@ tokens: path.join(modelDir, 'tokens.txt'), | ||
| const onnx = sherpaOnnx(); | ||
| const recognizer = createRecognizer(options.model); | ||
| const recognizer = createRecognizer(options.model, options.language); | ||
| const stream = recognizer.createStream(); | ||
@@ -95,0 +96,0 @@ const wave = onnx.readWave(wavPath); |
@@ -0,1 +1,2 @@ | ||
| import type { SenseVoiceLanguage } from './asr.js'; | ||
| export type AsrStreamResult = { | ||
@@ -20,2 +21,3 @@ text: string; | ||
| asrIntervalMs?: number; | ||
| language?: SenseVoiceLanguage; | ||
| vad?: boolean | VadOptions; | ||
@@ -22,0 +24,0 @@ onResult: (result: AsrStreamResult) => void; |
@@ -13,3 +13,3 @@ import { createRequire } from 'node:module'; | ||
| const defaultAsrIntervalMs = 1000; | ||
| function createRecognizer() { | ||
| function createRecognizer(language) { | ||
| const modelDir = getModelPath('sensevoice'); | ||
@@ -23,2 +23,3 @@ const onnx = sherpaOnnx(); | ||
| useInverseTextNormalization: 1, | ||
| language: language ?? 'auto', | ||
| }, | ||
@@ -73,3 +74,3 @@ tokens: path.join(modelDir, 'tokens.txt'), | ||
| async function streamWithVad(audio, options, vadOptions) { | ||
| const recognizer = createRecognizer(); | ||
| const recognizer = createRecognizer(options.language); | ||
| const vad = createVad(vadOptions); | ||
@@ -109,3 +110,3 @@ const { windowSize } = vad.config.sileroVad; | ||
| const chunkInterval = (defaultSampleRate * intervalMs) / 1000; | ||
| const recognizer = createRecognizer(); | ||
| const recognizer = createRecognizer(options.language); | ||
| const buffers = []; | ||
@@ -112,0 +113,0 @@ let totalSamples = 0; |
+32
-18
@@ -28,2 +28,5 @@ # ASR (Automatic Speech Recognition) | ||
| coli asr --model whisper recording.wav | ||
| # Specify language (sensevoice only) | ||
| coli asr --language zh recording.m4a | ||
| ``` | ||
@@ -36,2 +39,3 @@ | ||
| --model Model to use: whisper, sensevoice (default: sensevoice) | ||
| --language Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto) | ||
| ``` | ||
@@ -62,2 +66,3 @@ | ||
| --vad Enable voice activity detection | ||
| --language <lang> Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto) | ||
| --asr-interval-ms <ms> Recognition interval in ms (default: 1000, ignored with --vad) | ||
@@ -108,2 +113,9 @@ ``` | ||
| await runAsr('recording.m4a', {json: true, model: 'whisper'}); | ||
| // Force Chinese language (sensevoice only) | ||
| await runAsr('recording.m4a', { | ||
| json: false, | ||
| model: 'sensevoice', | ||
| language: 'zh', | ||
| }); | ||
| ``` | ||
@@ -113,6 +125,7 @@ | ||
| | Property | Type | Description | | ||
| | -------- | --------------------------- | ----------------------------------------------------------------------------- | | ||
| | `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text | | ||
| | `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition | | ||
| | Property | Type | Description | | ||
| | ---------- | --------------------------- | --------------------------------------------------------------------------------------------------- | | ||
| | `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text | | ||
| | `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition | | ||
| | `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) | | ||
@@ -186,17 +199,18 @@ ### `getModelPath(model)` | ||
| | Property | Type | Description | | ||
| | --------------- | ----------------------------------- | ---------------------------------------------------------------------------- | | ||
| | `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result | | ||
| | `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) | | ||
| | `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD | | ||
| | `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object | | ||
| | Property | Type | Description | | ||
| | --------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------- | | ||
| | `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result | | ||
| | `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) | | ||
| | `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) | | ||
| | `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD | | ||
| | `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object | | ||
| **VadOptions** | ||
| | Property | Type | Description | | ||
| | -------------------- | -------- | --------------------------------------------------- | | ||
| | `threshold` | `number` | Speech detection threshold (default: `0.5`) | | ||
| | `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) | | ||
| | `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) | | ||
| | `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) | | ||
| | Property | Type | Description | | ||
| | ---------------------- | --------- | ------------------------------------------------------------------ | | ||
| | `threshold` | `number` | Speech detection threshold (default: `0.5`) | | ||
| | `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) | | ||
| | `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) | | ||
| | `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) | | ||
| | `enableExternalBuffer` | `boolean` | Use external buffer for VAD speech segments (default: `undefined`) | | ||
@@ -229,4 +243,4 @@ | ||
| | Name | Model | Size | | ||
| | ------------ | ------------------------------------------------------------------- | ------ | | ||
| | Name | Model | Size | | ||
| | ------------ | -------------------------------------------------------------------- | ------- | | ||
| | `silero_vad` | [Silero VAD](https://github.com/snakers4/silero-vad) (k2-fsa export) | ~629 KB | | ||
@@ -233,0 +247,0 @@ |
+1
-1
| { | ||
| "name": "@marswave/coli", | ||
| "private": false, | ||
| "version": "0.0.14", | ||
| "version": "0.0.15", | ||
| "description": "A CLI for the Cola", | ||
@@ -6,0 +6,0 @@ "repository": "marswaveai/coli", |
URL strings
Supply chain risk: The package contains fragments of external URLs or IP addresses, which it may access at runtime.
Major refactor
Supply chain risk: The package has recently undergone a major refactor, which may make it unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
URL strings
Supply chain risk: The package contains fragments of external URLs or IP addresses, which it may access at runtime.
50883
5.09%814
3.17%2
-33.33%