@marswave/coli - npm Package Compare versions

+18

-1

distribution/source/asr/_cli.js

		@@ -13,2 +13,3 @@ import { Buffer } from 'node:buffer';
		.option('--model <name>', 'Model to use: whisper, sensevoice', 'sensevoice')
		.option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto')
		.action(async (file, options) => {
		@@ -19,4 +20,13 @@ const { model } = options;
		}
		const validLanguages = new Set(['auto', 'zh', 'en', 'ja', 'ko', 'yue']);
		if (!validLanguages.has(options.language)) {
		throw new Error(`Unknown language "${options.language}". Use one of: auto, zh, en, ja, ko, yue.`);
		}
		await ensureModels([model]);
		await runAsr(file, { json: options.json, model });
		await runAsr(file, {
		json: options.json,
		model,
		// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
		language: options.language,
		});
		});
		@@ -28,4 +38,9 @@ program
		.option('--vad', 'Enable voice activity detection', false)
		.option('--language <lang>', 'Language for sensevoice: auto, zh, en, ja, ko, yue', 'auto')
		.option('--asr-interval-ms <ms>', 'Recognition interval in ms (ignored with --vad)', '1000')
		.action(async (options) => {
		const validLanguages = new Set(['auto', 'zh', 'en', 'ja', 'ko', 'yue']);
		if (!validLanguages.has(options.language)) {
		throw new Error(`Unknown language "${options.language}". Use one of: auto, zh, en, ja, ko, yue.`);
		}
		await ensureModels();
		@@ -49,2 +64,4 @@ if (options.vad) {
		await streamAsr(stdinAudio(), {
		// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
		language: options.language,
		vad: options.vad || undefined,
		@@ -51,0 +68,0 @@ asrIntervalMs: Number(options.asrIntervalMs),

+1

-0

distribution/source/asr/_index.d.ts

		export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js';
		export { runAsr, type AsrOptions, type SenseVoiceLanguage } from './asr.js';
		export { streamAsr, type AsrStreamResult, type StreamAsrOptions, type VadOptions, } from './stream-asr.js';

+1

-0

distribution/source/asr/_index.js

		export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js';
		export { runAsr } from './asr.js';
		export { streamAsr, } from './stream-asr.js';

+2

-0

distribution/source/asr/asr.d.ts

		type ModelName = 'whisper' | 'sensevoice';
		export type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';
		export type AsrOptions = {
		json: boolean;
		model: ModelName;
		language?: SenseVoiceLanguage;
		};
		export declare function runAsr(filePath: string, options: AsrOptions): Promise<void>;
		export {};

+3

-2

distribution/source/asr/asr.js

		@@ -41,3 +41,3 @@ import fs from 'node:fs';
		}
		function createRecognizer(model) {
		function createRecognizer(model, language) {
		const modelDir = getModelPath(model);
		@@ -66,2 +66,3 @@ const onnx = sherpaOnnx();
		useInverseTextNormalization: 1,
		language: language ?? 'auto',
		},
		@@ -92,3 +93,3 @@ tokens: path.join(modelDir, 'tokens.txt'),
		const onnx = sherpaOnnx();
		const recognizer = createRecognizer(options.model);
		const recognizer = createRecognizer(options.model, options.language);
		const stream = recognizer.createStream();
		@@ -95,0 +96,0 @@ const wave = onnx.readWave(wavPath);

+2

-0

distribution/source/asr/stream-asr.d.ts

		@@ -0,1 +1,2 @@
		import type { SenseVoiceLanguage } from './asr.js';
		export type AsrStreamResult = {
		@@ -20,2 +21,3 @@ text: string;
		asrIntervalMs?: number;
		language?: SenseVoiceLanguage;
		vad?: boolean | VadOptions;
		@@ -22,0 +24,0 @@ onResult: (result: AsrStreamResult) => void;

+4

-3

distribution/source/asr/stream-asr.js

		@@ -13,3 +13,3 @@ import { createRequire } from 'node:module';
		const defaultAsrIntervalMs = 1000;
		function createRecognizer() {
		function createRecognizer(language) {
		const modelDir = getModelPath('sensevoice');
		@@ -23,2 +23,3 @@ const onnx = sherpaOnnx();
		useInverseTextNormalization: 1,
		language: language ?? 'auto',
		},
		@@ -73,3 +74,3 @@ tokens: path.join(modelDir, 'tokens.txt'),
		async function streamWithVad(audio, options, vadOptions) {
		const recognizer = createRecognizer();
		const recognizer = createRecognizer(options.language);
		const vad = createVad(vadOptions);
		@@ -109,3 +110,3 @@ const { windowSize } = vad.config.sileroVad;
		const chunkInterval = (defaultSampleRate * intervalMs) / 1000;
		const recognizer = createRecognizer();
		const recognizer = createRecognizer(options.language);
		const buffers = [];
		@@ -112,0 +113,0 @@ let totalSamples = 0;

+32

-18

docs/asr.md

		@@ -28,2 +28,5 @@ # ASR (Automatic Speech Recognition)
		coli asr --model whisper recording.wav

		# Specify language (sensevoice only)
		coli asr --language zh recording.m4a
		```
		@@ -36,2 +39,3 @@
		--model Model to use: whisper, sensevoice (default: sensevoice)
		--language Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto)
		```
		@@ -62,2 +66,3 @@
		--vad Enable voice activity detection
		--language <lang> Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto)
		--asr-interval-ms <ms> Recognition interval in ms (default: 1000, ignored with --vad)
		@@ -108,2 +113,9 @@ ```
		await runAsr('recording.m4a', {json: true, model: 'whisper'});

		// Force Chinese language (sensevoice only)
		await runAsr('recording.m4a', {
		json: false,
		model: 'sensevoice',
		language: 'zh',
		});
		```
		@@ -113,6 +125,7 @@

		| Property | Type | Description |
		| -------- | --------------------------- | ----------------------------------------------------------------------------- |
		| `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text |
		| `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition |
		| Property | Type | Description |
		| ---------- | --------------------------- | --------------------------------------------------------------------------------------------------- |
		| `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text |
		| `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition |
		| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |

		@@ -186,17 +199,18 @@ ### `getModelPath(model)`

		| Property | Type | Description |
		| --------------- | ----------------------------------- | ---------------------------------------------------------------------------- |
		| `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result |
		| `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) |
		| `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD |
		| `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object |
		| Property | Type | Description |
		| --------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------- |
		| `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result |
		| `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) |
		| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |
		| `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD |
		| `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object |

		VadOptions

		| Property | Type | Description |
		| -------------------- | -------- | --------------------------------------------------- |
		| `threshold` | `number` | Speech detection threshold (default: `0.5`) |
		| `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) |
		| `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) |
		| `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) |
		| Property | Type | Description |
		| ---------------------- | --------- | ------------------------------------------------------------------ |
		| `threshold` | `number` | Speech detection threshold (default: `0.5`) |
		| `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) |
		| `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) |
		| `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) |
		| `enableExternalBuffer` | `boolean` | Use external buffer for VAD speech segments (default: `undefined`) |
		@@ -229,4 +243,4 @@

		| Name | Model | Size |
		| ------------ | ------------------------------------------------------------------- | ------ |
		| Name | Model | Size |
		| ------------ | -------------------------------------------------------------------- | ------- |
		| `silero_vad` | [Silero VAD](https://github.com/snakers4/silero-vad) (k2-fsa export) | ~629 KB |
		@@ -233,0 +247,0 @@

+1

-1

package.json

		{
		"name": "@marswave/coli",
		"private": false,
		"version": "0.0.14",
		"version": "0.0.15",
		"description": "A CLI for the Cola",
		@@ -6,0 +6,0 @@ "repository": "marswaveai/coli",

		@@ -28,2 +28,5 @@ # ASR (Automatic Speech Recognition)
		coli asr --model whisper recording.wav

		# Specify language (sensevoice only)
		coli asr --language zh recording.m4a
		```
		@@ -36,2 +39,3 @@
		--model Model to use: whisper, sensevoice (default: sensevoice)
		--language Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto)
		```
		@@ -62,2 +66,3 @@
		--vad Enable voice activity detection
		--language <lang> Language for sensevoice: auto, zh, en, ja, ko, yue (default: auto)
		--asr-interval-ms <ms> Recognition interval in ms (default: 1000, ignored with --vad)
		@@ -108,2 +113,9 @@ ```
		await runAsr('recording.m4a', {json: true, model: 'whisper'});

		// Force Chinese language (sensevoice only)
		await runAsr('recording.m4a', {
		json: false,
		model: 'sensevoice',
		language: 'zh',
		});
		```
		@@ -113,6 +125,7 @@

		| Property | Type | Description |
		| -------- | --------------------------- | ----------------------------------------------------------------------------- |
		| `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text |
		| `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition |
		| Property | Type | Description |
		| ---------- | --------------------------- | --------------------------------------------------------------------------------------------------- |
		| `json` | `boolean` | Output JSON (with model name, tokens, timestamps, etc.) instead of plain text |
		| `model` | `'whisper' \| 'sensevoice'` | Which model to use for recognition |
		| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |

		@@ -186,17 +199,18 @@ ### `getModelPath(model)`

		| Property | Type | Description |
		| --------------- | ----------------------------------- | ---------------------------------------------------------------------------- |
		| `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result |
		| `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) |
		| `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD |
		| `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object |
		| Property | Type | Description |
		| --------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------- |
		| `onResult` | `(result: AsrStreamResult) => void` | Callback invoked with each recognition result |
		| `sampleRate` | `number` | Audio sample rate in Hz (default: `16000`) |
		| `language` | `SenseVoiceLanguage` | Language hint for sensevoice: `'auto'`, `'zh'`, `'en'`, `'ja'`, `'ko'`, `'yue'` (default: `'auto'`) |
		| `asrIntervalMs` | `number` | Recognition interval in milliseconds (default: `1000`). Ignored when using VAD |
		| `vad` | `boolean \| VadOptions` | Enable VAD. Pass `true` for defaults or a `VadOptions` object |

		VadOptions

		| Property | Type | Description |
		| -------------------- | -------- | --------------------------------------------------- |
		| `threshold` | `number` | Speech detection threshold (default: `0.5`) |
		| `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) |
		| `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) |
		| `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) |
		| Property | Type | Description |
		| ---------------------- | --------- | ------------------------------------------------------------------ |
		| `threshold` | `number` | Speech detection threshold (default: `0.5`) |
		| `minSpeechDuration` | `number` | Minimum speech duration in seconds (default: `0.25`) |
		| `minSilenceDuration` | `number` | Minimum silence to end a segment in seconds (default: `0.5`) |
		| `maxSpeechDuration` | `number` | Maximum speech segment duration in seconds (default: `15`) |
		| `enableExternalBuffer` | `boolean` | Use external buffer for VAD speech segments (default: `undefined`) |
		@@ -229,4 +243,4 @@

		| Name | Model | Size |
		| ------------ | ------------------------------------------------------------------- | ------ |
		| Name | Model | Size |
		| ------------ | -------------------------------------------------------------------- | ------- |
		| `silero_vad` | [Silero VAD](https://github.com/snakers4/silero-vad) (k2-fsa export) | ~629 KB |
		@@ -233,0 +247,0 @@

@marswave/coli - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics