🚀 Socket Launch Week Day 5: Introducing Repository Access Permissions and Custom Roles. Learn more
Sign In

@marswave/coli

Package Overview
Dependencies
Maintainers
6
Versions
18
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@marswave/coli - npm Package Compare versions

Comparing version
0.0.19
to
0.0.20
+28
-5
distribution/source/asr/_cli.js

@@ -6,3 +6,3 @@ import { Buffer } from 'node:buffer';

import { convertToWav, readWave, runAsr, } from './asr.js';
import { ensureModels, ensureVadModel } from './models.js';
import { ensureModels, ensureVadModel, resolveAsrModelFiles, resolveVadModelFile, } from './models.js';
import { streamAsr } from './stream-asr.js';

@@ -16,2 +16,3 @@ export function register(program) {

.option('--model <name>', 'Model to use: whisper, sensevoice', 'sensevoice')
.option('--model-path <path>', 'Path to a local model file or directory')
.option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto')

@@ -27,3 +28,8 @@ .action(async (file, options) => {

}
await ensureModels([model]);
if (options.modelPath) {
resolveAsrModelFiles(model, options.modelPath);
}
else {
await ensureModels([model]);
}
const resolvedPath = path.resolve(file);

@@ -45,2 +51,3 @@ const ext = path.extname(resolvedPath).toLowerCase();

model,
modelPath: options.modelPath,
// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion

@@ -61,2 +68,4 @@ language: options.language,

.option('--vad', 'Enable voice activity detection', false)
.option('--model-path <path>', 'Path to a local SenseVoice model file or directory')
.option('--vad-model-path <path>', 'Path to a local VAD model file')
.option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto')

@@ -69,4 +78,17 @@ .option('--asr-interval-ms <ms>', 'Recognition interval in ms (ignored with --vad)', '1000')

}
await ensureModels();
if (options.vad) {
if (options.modelPath) {
resolveAsrModelFiles('sensevoice', options.modelPath);
}
else {
await ensureModels();
}
if (options.vadModelPath) {
if (options.vad) {
resolveVadModelFile(options.vadModelPath);
}
else {
throw new Error('Use --vad with --vad-model-path.');
}
}
else if (options.vad) {
await ensureVadModel();

@@ -89,3 +111,4 @@ }

language: options.language,
vad: options.vad || undefined,
modelPath: options.modelPath,
vad: options.vad ? { modelPath: options.vadModelPath } : undefined,
asrIntervalMs: Number(options.asrIntervalMs),

@@ -92,0 +115,0 @@ onResult(result) {

+1
-1

@@ -1,3 +0,3 @@

export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js';
export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, type ModelName, } from './models.js';
export { convertToWav, readWave, runAsr, type AsrOptions, type AudioData, type SenseVoiceLanguage, } from './asr.js';
export { streamAsr, type AsrStreamResult, type StreamAsrOptions, type VadOptions, } from './stream-asr.js';

@@ -0,4 +1,4 @@

import { type ModelName } from './models.js';
export declare function readWave(filename: string): AudioData;
export declare function convertToWav(inputPath: string): Promise<string>;
type ModelName = 'whisper' | 'sensevoice';
export type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';

@@ -8,2 +8,3 @@ export type AsrOptions = {

model: ModelName;
modelPath?: string | undefined;
language?: SenseVoiceLanguage;

@@ -16,2 +17,1 @@ };

export declare function runAsr(input: string | AudioData, options: AsrOptions): Promise<void>;
export {};

@@ -8,3 +8,3 @@ import fs from 'node:fs';

import { deprecationAsrFilePath } from '../deprecations.js';
import { getModelPath, modelDisplayNames } from './models.js';
import { modelDisplayNames, resolveAsrModelFiles, } from './models.js';
const require = createRequire(import.meta.url);

@@ -46,6 +46,5 @@ // Loaded lazily to avoid loading the native addon until needed

}
function createRecognizer(model, language) {
const modelDir = getModelPath(model);
function createRecognizer(modelFiles, language) {
const onnx = sherpaOnnx();
if (model === 'whisper') {
if (modelFiles.model === 'whisper') {
return new onnx.OfflineRecognizer({

@@ -55,6 +54,6 @@ featConfig: { sampleRate: 16_000, featureDim: 80 },

whisper: {
encoder: path.join(modelDir, 'tiny.en-encoder.int8.onnx'),
decoder: path.join(modelDir, 'tiny.en-decoder.int8.onnx'),
encoder: modelFiles.files.encoder,
decoder: modelFiles.files.decoder,
},
tokens: path.join(modelDir, 'tiny.en-tokens.txt'),
tokens: modelFiles.files.tokens,
numThreads: 2,

@@ -70,7 +69,7 @@ provider: 'cpu',

senseVoice: {
model: path.join(modelDir, 'model.int8.onnx'),
model: modelFiles.files.model,
useInverseTextNormalization: 1,
language: language ?? 'auto',
},
tokens: path.join(modelDir, 'tokens.txt'),
tokens: modelFiles.files.tokens,
numThreads: 2,

@@ -83,2 +82,3 @@ provider: 'cpu',

export async function runAsr(input, options) {
const modelFiles = resolveAsrModelFiles(options.model, options.modelPath);
let wave;

@@ -107,3 +107,3 @@ let needsCleanup = false;

try {
const recognizer = createRecognizer(options.model, options.language);
const recognizer = createRecognizer(modelFiles, options.language);
const stream = recognizer.createStream();

@@ -110,0 +110,0 @@ stream.acceptWaveform({ sampleRate: wave.sampleRate, samples: wave.samples });

@@ -1,7 +0,32 @@

type ModelName = 'whisper' | 'sensevoice';
export type ModelName = 'whisper' | 'sensevoice';
/**
 * Paths to the individual files of a Whisper model, as returned by
 * `resolveAsrModelFiles` when called with `'whisper'`.
 */
export type WhisperModelFiles = {
/** Path to the encoder model file. */
encoder: string;
/** Path to the decoder model file. */
decoder: string;
/** Path to the tokens file. */
tokens: string;
};
/**
 * Paths to the individual files of a SenseVoice model, as returned by
 * `resolveAsrModelFiles` when called with `'sensevoice'`.
 */
export type SenseVoiceModelFiles = {
/** Path to the model file. */
model: string;
/** Path to the tokens file. */
tokens: string;
};
/**
 * Resolved model files for either supported ASR model.
 *
 * The `model` property is the discriminant: checking it against `'whisper'`
 * or `'sensevoice'` narrows `files` to the matching file set.
 */
export type AsrModelFiles = {
model: 'whisper';
files: WhisperModelFiles;
} | {
model: 'sensevoice';
files: SenseVoiceModelFiles;
};
export declare const modelDisplayNames: Record<ModelName, string>;
export declare function getModelPath(model: ModelName): string;
export declare function resolveAsrModelFiles(model: 'whisper', modelPath?: string): {
model: 'whisper';
files: WhisperModelFiles;
};
export declare function resolveAsrModelFiles(model: 'sensevoice', modelPath?: string): {
model: 'sensevoice';
files: SenseVoiceModelFiles;
};
export declare function resolveAsrModelFiles(model: ModelName, modelPath?: string): AsrModelFiles;
export declare function ensureModels(modelNames?: ModelName[]): Promise<void>;
export declare function getVadModelPath(): string;
export declare function resolveVadModelFile(modelPath?: string): string;
export declare function ensureVadModel(): Promise<void>;
export {};

@@ -59,2 +59,51 @@ import { createHash } from 'node:crypto';

}
/**
 * Ensure that `filePath` refers to an existing regular file.
 *
 * @param filePath Path to check.
 * @param label Human-readable name of the file, used as the error message prefix.
 * @throws {Error} `<label> not found: <filePath>` when the path does not exist.
 * @throws {Error} `<label> must be a file: <filePath>` when the path exists but is not a file.
 */
function assertExistingFile(filePath, label) {
    const isMissing = !fs.existsSync(filePath);
    if (isMissing) {
        throw new Error(`${label} not found: ${filePath}`);
    }
    const stats = fs.statSync(filePath);
    if (stats.isFile()) {
        return;
    }
    throw new Error(`${label} must be a file: ${filePath}`);
}
/**
 * Work out which directory model files should be read from.
 *
 * @param modelPath Optional user-supplied path to a model file or directory.
 * @param defaultDirectory Directory to use when `modelPath` is falsy.
 * @returns `{ directory }` when no path was given or the path is a directory;
 *   `{ directory, filePath }` when the path is a file, where `directory` is the
 *   file's parent directory and `filePath` is the absolute path to the file.
 * @throws {Error} When the path does not exist, or is neither a file nor a directory.
 */
function resolveModelDirectory(modelPath, defaultDirectory) {
    if (modelPath) {
        const absolutePath = path.resolve(modelPath);
        if (!fs.existsSync(absolutePath)) {
            throw new Error(`Model path not found: ${absolutePath}`);
        }
        const info = fs.statSync(absolutePath);
        if (info.isDirectory()) {
            return { directory: absolutePath };
        }
        if (!info.isFile()) {
            throw new Error(`Model path must be a file or directory: ${absolutePath}`);
        }
        // A file was given: sibling files are looked up next to it.
        return { directory: path.dirname(absolutePath), filePath: absolutePath };
    }
    return { directory: defaultDirectory };
}
/**
 * Resolve and validate the on-disk files for an ASR model.
 *
 * When `modelPath` is omitted, files are looked up in the directory returned
 * by `getModelPath(model)`. Every resolved file is checked with
 * `assertExistingFile` before returning.
 *
 * @param model Model name; `'whisper'` selects the Whisper file layout, any
 *   other value is handled as SenseVoice.
 * @param modelPath Optional path to a local model file or directory. For
 *   Whisper it must be a directory. For SenseVoice a file path is used as the
 *   model file, with `tokens.txt` read from the same directory.
 * @returns `{ model, files }` where `files` holds the resolved file paths.
 * @throws {Error} When a Whisper model path is a file, or any required file is
 *   missing or not a regular file.
 */
export function resolveAsrModelFiles(model, modelPath) {
    const { directory: baseDirectory, filePath: explicitModelFile } = resolveModelDirectory(modelPath, getModelPath(model));
    const inBaseDirectory = (fileName) => path.join(baseDirectory, fileName);
    if (model !== 'whisper') {
        const files = {
            model: explicitModelFile ?? inBaseDirectory('model.int8.onnx'),
            tokens: inBaseDirectory('tokens.txt'),
        };
        assertExistingFile(files.model, 'SenseVoice model');
        assertExistingFile(files.tokens, 'SenseVoice tokens file');
        return { model, files };
    }
    if (explicitModelFile) {
        throw new Error('Custom whisper model path must be a directory containing tiny.en-encoder.int8.onnx, tiny.en-decoder.int8.onnx, and tiny.en-tokens.txt.');
    }
    const files = {
        encoder: inBaseDirectory('tiny.en-encoder.int8.onnx'),
        decoder: inBaseDirectory('tiny.en-decoder.int8.onnx'),
        tokens: inBaseDirectory('tiny.en-tokens.txt'),
    };
    // Validate in a fixed order so the first missing file reported is stable.
    const requiredFiles = [
        [files.encoder, 'Whisper encoder model'],
        [files.decoder, 'Whisper decoder model'],
        [files.tokens, 'Whisper tokens file'],
    ];
    for (const [requiredPath, label] of requiredFiles) {
        assertExistingFile(requiredPath, label);
    }
    return { model, files };
}
async function getFileSha256(filePath) {

@@ -202,2 +251,7 @@ const hash = createHash('sha256');

}
/**
 * Resolve and validate the path to the VAD model file.
 *
 * @param modelPath Optional path to a local VAD model file. When falsy, the
 *   path returned by `getVadModelPath()` is used instead.
 * @returns The path to the VAD model file (absolute when `modelPath` was given).
 * @throws {Error} When the resolved path does not exist or is not a regular file.
 */
export function resolveVadModelFile(modelPath) {
    let vadModelFile;
    if (modelPath) {
        vadModelFile = path.resolve(modelPath);
    }
    else {
        vadModelFile = getVadModelPath();
    }
    assertExistingFile(vadModelFile, 'VAD model');
    return vadModelFile;
}
export async function ensureVadModel() {

@@ -204,0 +258,0 @@ const modelPath = getVadModelPath();

@@ -12,2 +12,3 @@ import type { SenseVoiceLanguage } from './asr.js';

export type VadOptions = {
modelPath?: string | undefined;
threshold?: number;

@@ -23,2 +24,3 @@ minSpeechDuration?: number;

language?: SenseVoiceLanguage;
modelPath?: string | undefined;
vad?: boolean | VadOptions;

@@ -25,0 +27,0 @@ onResult: (result: AsrStreamResult) => void;

import { createRequire } from 'node:module';
import path from 'node:path';
import { getModelPath, getVadModelPath } from './models.js';
import { resolveAsrModelFiles, resolveVadModelFile, } from './models.js';
const require = createRequire(import.meta.url);

@@ -13,4 +12,3 @@ let _sherpaOnnx;

const defaultAsrIntervalMs = 1000;
function createRecognizer(language) {
const modelDir = getModelPath('sensevoice');
function createRecognizer(modelFiles, language) {
const onnx = sherpaOnnx();

@@ -21,7 +19,7 @@ return new onnx.OfflineRecognizer({

senseVoice: {
model: path.join(modelDir, 'model.int8.onnx'),
model: modelFiles.model,
useInverseTextNormalization: 1,
language: language ?? 'auto',
},
tokens: path.join(modelDir, 'tokens.txt'),
tokens: modelFiles.tokens,
numThreads: 2,

@@ -53,5 +51,6 @@ provider: 'cpu',

const onnx = sherpaOnnx();
const modelPath = resolveVadModelFile(vadOptions.modelPath);
return new onnx.Vad({
sileroVad: {
model: getVadModelPath(),
model: modelPath,
threshold: vadOptions.threshold ?? 0.5,

@@ -75,3 +74,4 @@ minSpeechDuration: vadOptions.minSpeechDuration ?? 0.25,

async function streamWithVad(audio, options, vadOptions) {
const recognizer = createRecognizer(options.language);
const modelFiles = resolveAsrModelFiles('sensevoice', options.modelPath);
const recognizer = createRecognizer(modelFiles.files, options.language);
const vad = createVad(vadOptions);

@@ -110,3 +110,4 @@ const { windowSize } = vad.config.sileroVad;

const chunkInterval = (defaultSampleRate * intervalMs) / 1000;
const recognizer = createRecognizer(options.language);
const modelFiles = resolveAsrModelFiles('sensevoice', options.modelPath);
const recognizer = createRecognizer(modelFiles.files, options.language);
const buffers = [];

@@ -113,0 +114,0 @@ let totalSamples = 0;

@@ -21,2 +21,5 @@ # ASR (Automatic Speech Recognition)

# Use a local model file or directory without downloading
coli asr --model-path /path/to/sensevoice/model.int8.onnx recording.wav
# Specify language (sensevoice only)

@@ -31,2 +34,3 @@ coli asr --language zh recording.wav

--model Model to use: whisper, sensevoice (default: sensevoice)
--model-path Path to a local model file or directory
--language Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto)

@@ -49,2 +53,5 @@ ```

# Use local ASR and VAD models without downloading
ffmpeg -f avfoundation -i :0 -ar 16000 -ac 1 -f s16le pipe:1 | coli asr-stream --model-path /path/to/sensevoice/model.int8.onnx --vad --vad-model-path /path/to/silero_vad.onnx
# From a file

@@ -59,2 +66,4 @@ ffmpeg -i podcast.m4a -ar 16000 -ac 1 -f s16le pipe:1 | coli asr-stream --vad

--vad Enable voice activity detection
--model-path <path> Path to a local SenseVoice model file or directory
--vad-model-path <path> Path to a local VAD model file
--language <lang> Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto)

@@ -122,2 +131,8 @@ --asr-interval-ms <ms> Recognition interval in ms (default: 1000, ignored with --vad)

// Custom model path: no download, fails if local files are missing
await runAsr(
{sampleRate: 16000, samples: myFloat32Array},
{json: false, model: 'sensevoice', modelPath: '/path/to/model.int8.onnx'},
);
// Deprecated: file path input (requires ffmpeg for non-WAV formats)

@@ -129,7 +144,8 @@ await runAsr('recording.m4a', {json: false, model: 'sensevoice'});

| Property | Type | Description |
| ---------- | --------------------------- | --------------------------------------------------------------------------------------------------- |
| `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text |
| `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition |
| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |
| Property | Type | Description |
| ----------- | --------------------------- | ------------------------------------------------------------------------------------------------------- |
| `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text |
| `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition |
| `modelPath` | `string` | Path to a local model file or directory. Skips download and throws if required local files are missing. |
| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |

@@ -178,2 +194,3 @@ ### `getModelPath(model)`

await streamAsr(audioSource, {
modelPath: '/path/to/model.int8.onnx',
onResult(result) {

@@ -195,3 +212,8 @@ console.log(result.text, result.isFinal ? '(final)' : '(partial)');

await streamAsr(audioSource, {
vad: {threshold: 0.4, minSilenceDuration: 0.3, maxSpeechDuration: 10},
vad: {
modelPath: '/path/to/silero_vad.onnx',
threshold: 0.4,
minSilenceDuration: 0.3,
maxSpeechDuration: 10,
},
onResult(result) {

@@ -205,19 +227,21 @@ console.log(result.text);

| Property | Type | Description |
| --------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------- |
| `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result |
| `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) |
| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |
| `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD |
| `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object |
| Property | Type | Description |
| --------------- | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
| `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result |
| `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) |
| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |
| `modelPath` | `string` | Path to a local SenseVoice model file or directory. Skips download and throws if required local files are missing. |
| `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD |
| `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object |
**VadOptions**
| Property | Type | Description |
| ---------------------- | --------- | ------------------------------------------------------------------ |
| `threshold` | `number` | Speech detection threshold (default: `0.5`) |
| `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) |
| `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) |
| `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) |
| `enableExternalBuffer` | `boolean` | Use external buffer for VAD speech segments (default: `undefined`) |
| Property | Type | Description |
| ---------------------- | --------- | --------------------------------------------------------------------- |
| `modelPath` | `string` | Path to a local VAD model file. Skips download and throws if missing. |
| `threshold` | `number` | Speech detection threshold (default: `0.5`) |
| `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) |
| `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) |
| `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) |
| `enableExternalBuffer` | `boolean` | Use external buffer for VAD speech segments (default: `undefined`) |

@@ -224,0 +248,0 @@ **Result**

{
"name": "@marswave/coli",
"private": false,
"version": "0.0.19",
"version": "0.0.20",
"description": "A CLI for the Cola",

@@ -6,0 +6,0 @@ "repository": "marswaveai/coli",