🚀 Socket Launch Week Day 5: Introducing Repository Access Permissions and Custom Roles. Learn more
Sign In

@marswave/coli

Package Overview
Dependencies
Maintainers
5
Versions
18
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@marswave/coli - npm Package Compare versions

Comparing version
0.0.14
to
0.0.15
+18
-1
distribution/source/asr/_cli.js

@@ -13,2 +13,3 @@ import { Buffer } from 'node:buffer';

.option('--model <name>', 'Model to use: whisper, sensevoice', 'sensevoice')
.option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto')
.action(async (file, options) => {

@@ -19,4 +20,13 @@ const { model } = options;

}
const validLanguages = new Set(['auto', 'zh', 'en', 'ja', 'ko', 'yue']);
if (!validLanguages.has(options.language)) {
throw new Error(`Unknown language "${options.language}". Use one of: auto, zh, en, ja, ko, yue.`);
}
await ensureModels([model]);
await runAsr(file, { json: options.json, model });
await runAsr(file, {
json: options.json,
model,
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
language: options.language,
});
});

@@ -28,4 +38,9 @@ program

.option('--vad', 'Enable voice activity detection', false)
.option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto')
.option('--asr-interval-ms <ms>', 'Recognition interval in ms (ignored with --vad)', '1000')
.action(async (options) => {
const validLanguages = new Set(['auto', 'zh', 'en', 'ja', 'ko', 'yue']);
if (!validLanguages.has(options.language)) {
throw new Error(`Unknown language "${options.language}". Use one of: auto, zh, en, ja, ko, yue.`);
}
await ensureModels();

@@ -49,2 +64,4 @@ if (options.vad) {

await streamAsr(stdinAudio(), {
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
language: options.language,
vad: options.vad || undefined,

@@ -51,0 +68,0 @@ asrIntervalMs: Number(options.asrIntervalMs),

export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js';
export { runAsr, type AsrOptions, type SenseVoiceLanguage } from './asr.js';
export { streamAsr, type AsrStreamResult, type StreamAsrOptions, type VadOptions, } from './stream-asr.js';
export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js';
export { runAsr } from './asr.js';
export { streamAsr, } from './stream-asr.js';
/** Names of the recognition models that `AsrOptions.model` accepts. */
type ModelName = 'whisper' | 'sensevoice';
/**
 * Language codes accepted by the `language` option.
 * The README in this package describes it as a hint for the sensevoice model.
 */
export type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';
/** Options accepted by {@link runAsr}. */
export type AsrOptions = {
/** Output JSON instead of plain text (per the package README). */
json: boolean;
/** Which model to use for recognition. */
model: ModelName;
/**
 * Optional language hint. When omitted, the recognizer configuration
 * falls back to `'auto'` (`language ?? 'auto'` in the implementation).
 * NOTE(review): the README documents this as sensevoice-only; how the
 * whisper path treats it is not visible here - confirm.
 */
language?: SenseVoiceLanguage;
};
/**
 * Runs speech recognition on the audio file at `filePath`.
 *
 * @param filePath - Path to the audio file (README examples pass `.wav` and `.m4a` files).
 * @param options - Output format, model, and optional language hint.
 * @returns A promise that resolves with no value.
 *   NOTE(review): presumably the recognition result is printed rather than
 *   returned - confirm against the implementation.
 */
export declare function runAsr(filePath: string, options: AsrOptions): Promise<void>;
// Empty export keeps this declaration file a module even though `ModelName` is not exported.
export {};
+3
-2

@@ -41,3 +41,3 @@ import fs from 'node:fs';

}
function createRecognizer(model) {
function createRecognizer(model, language) {
const modelDir = getModelPath(model);

@@ -66,2 +66,3 @@ const onnx = sherpaOnnx();

useInverseTextNormalization: 1,
language: language ?? 'auto',
},

@@ -92,3 +93,3 @@ tokens: path.join(modelDir, 'tokens.txt'),

const onnx = sherpaOnnx();
const recognizer = createRecognizer(options.model);
const recognizer = createRecognizer(options.model, options.language);
const stream = recognizer.createStream();

@@ -95,0 +96,0 @@ const wave = onnx.readWave(wavPath);

@@ -0,1 +1,2 @@

import type { SenseVoiceLanguage } from './asr.js';
export type AsrStreamResult = {

@@ -20,2 +21,3 @@ text: string;

asrIntervalMs?: number;
language?: SenseVoiceLanguage;
vad?: boolean | VadOptions;

@@ -22,0 +24,0 @@ onResult: (result: AsrStreamResult) => void;

@@ -13,3 +13,3 @@ import { createRequire } from 'node:module';

const defaultAsrIntervalMs = 1000;
function createRecognizer() {
function createRecognizer(language) {
const modelDir = getModelPath('sensevoice');

@@ -23,2 +23,3 @@ const onnx = sherpaOnnx();

useInverseTextNormalization: 1,
language: language ?? 'auto',
},

@@ -73,3 +74,3 @@ tokens: path.join(modelDir, 'tokens.txt'),

async function streamWithVad(audio, options, vadOptions) {
const recognizer = createRecognizer();
const recognizer = createRecognizer(options.language);
const vad = createVad(vadOptions);

@@ -109,3 +110,3 @@ const { windowSize } = vad.config.sileroVad;

const chunkInterval = (defaultSampleRate * intervalMs) / 1000;
const recognizer = createRecognizer();
const recognizer = createRecognizer(options.language);
const buffers = [];

@@ -112,0 +113,0 @@ let totalSamples = 0;

@@ -28,2 +28,5 @@ # ASR (Automatic Speech Recognition)

coli asr --model whisper recording.wav
# Specify language (sensevoice only)
coli asr --language zh recording.m4a
```

@@ -36,2 +39,3 @@

--model Model to use: whisper, sensevoice (default: sensevoice)
--language Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto)
```

@@ -62,2 +66,3 @@

--vad Enable voice activity detection
--language <lang> Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto)
--asr-interval-ms <ms> Recognition interval in ms (default: 1000, ignored with --vad)

@@ -108,2 +113,9 @@ ```

await runAsr('recording.m4a', {json: true, model: 'whisper'});
// Force Chinese language (sensevoice only)
await runAsr('recording.m4a', {
json: false,
model: 'sensevoice',
language: 'zh',
});
```

@@ -113,6 +125,7 @@

| Property | Type | Description |
| -------- | --------------------------- | ----------------------------------------------------------------------------- |
| `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text |
| `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition |
| Property | Type | Description |
| ---------- | --------------------------- | --------------------------------------------------------------------------------------------------- |
| `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text |
| `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition |
| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |

@@ -186,17 +199,18 @@ ### `getModelPath(model)`

| Property | Type | Description |
| --------------- | ----------------------------------- | ---------------------------------------------------------------------------- |
| `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result |
| `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) |
| `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD |
| `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object |
| Property | Type | Description |
| --------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------- |
| `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result |
| `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) |
| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |
| `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD |
| `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object |
**VadOptions**
| Property | Type | Description |
| -------------------- | -------- | --------------------------------------------------- |
| `threshold` | `number` | Speech detection threshold (default: `0.5`) |
| `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) |
| `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) |
| `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) |
| Property | Type | Description |
| ---------------------- | --------- | ------------------------------------------------------------------ |
| `threshold` | `number` | Speech detection threshold (default: `0.5`) |
| `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) |
| `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) |
| `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) |
| `enableExternalBuffer` | `boolean` | Use external buffer for VAD speech segments (default: `undefined`) |

@@ -229,4 +243,4 @@

| Name | Model | Size |
| ------------ | ------------------------------------------------------------------- | ------ |
| Name | Model | Size |
| ------------ | -------------------------------------------------------------------- | ------- |
| `silero_vad` | [Silero VAD](https://github.com/snakers4/silero-vad) (k2-fsa export) | ~629 KB |

@@ -233,0 +247,0 @@

{
"name": "@marswave/coli",
"private": false,
"version": "0.0.14",
"version": "0.0.15",
"description": "A CLI for the Cola",

@@ -6,0 +6,0 @@ "repository": "marswaveai/coli",