@marswave/coli - npm Package Compare versions

+2

distribution/source/_api/constants.d.ts

		import type { SpeakerLanguage } from './types.js';
		/** Speaker ID to use for each supported language, keyed by language code. */
		export declare const defaultSpeaker: Record<SpeakerLanguage, string>;

+5

distribution/source/_api/constants.js

		/**
		 * Speaker ID to use for each supported language, keyed by language code.
		 * NOTE(review): the IDs are opaque strings here; presumably they are ListenHub `speakerId` values — confirm.
		 */
		export const defaultSpeaker = {
		en: 'chat-girl-105-cn',
		zh: 'leo-9328b6d2',
		ja: 'tianzhongdunzi-5d612542',
		};

+44

distribution/source/_api/listenhub-openapi.d.ts

		import { type KyInstance } from 'ky';
		import type { ApiResponse, SpeakerLanguage } from './types.js';
		export * from './constants.js';
		export type * from './types.js';
		/**
		 * Constructor options for `ListenHubApi`.
		 * `apiKey` is required; `baseUrl` is optional.
		 */
		export type ListenHubApiOptions = {
		apiKey: string;
		baseUrl?: string;
		};
		/**
		 * Client for the ListenHub OpenAPI (speaker listing and streaming TTS).
		 */
		export declare class ListenHubApi {
		/** The `ky` instance used for requests. */
		api: KyInstance;
		/**
		 * @param options - See `ListenHubApiOptions` (`apiKey`, optional `baseUrl`).
		 */
		constructor({ apiKey, baseUrl }: ListenHubApiOptions);
		/**
		 * Get a list of available speakers.
		 * @param options - The options for the speakers request.
		 * @param options.language - Optional. The language of the speakers to get, defaults to English.
		 * NOTE(review): the default is decided by the server, not by this client — confirm against the API docs.
		 * @returns A list of available speakers.
		 * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/speakers#list-available-speakers | List Available Speakers}
		 */
		getAvailableSpeakers(options?: {
		language?: SpeakerLanguage;
		}): Promise<ApiResponse<{
		items: Array<{
		name: string;
		speakerId: string;
		demoAudioUrl: string;
		gender: string;
		language: SpeakerLanguage;
		}>;
		}>>;
		/**
		 * Generate audio from text using the Streaming TTS API.
		 * @param options - The options for the TTS request.
		 * @param options.input - The text to generate audio from.
		 * @param options.voice - The `speakerId` to use for the TTS.
		 * @param options.model - Optional. The model to use for the TTS, defaults to `flowtts`.
		 * @returns A readable stream of the MP3 audio.
		 * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/flowspeech | Streaming TTS}
		 */
		tts(options: {
		input: string;
		voice: string;
		model?: string;
		}): Promise<ReadableStream<Uint8Array<ArrayBuffer>>>;
		}

+40

distribution/source/_api/listenhub-openapi.js

		import ky from 'ky';
		export * from './constants.js';
		/**
		 * Thin client around the ListenHub OpenAPI, built on a preconfigured `ky` instance.
		 */
		export class ListenHubApi {
			/** The `ky` instance carrying the prefix URL and the bearer-token header. */
			api;
			/**
			 * @param options - Client options.
			 * @param options.apiKey - API key sent as `Authorization: Bearer <apiKey>`.
			 * @param options.baseUrl - Optional prefix URL; falls back to the public endpoint when nullish.
			 */
			constructor({ apiKey, baseUrl }) {
				const prefixUrl = baseUrl ?? 'https://api.marswave.ai/openapi';
				const headers = {
					// eslint-disable-next-line @typescript-eslint/naming-convention
					Authorization: `Bearer ${apiKey}`,
				};
				this.api = ky.extend({ prefixUrl, headers });
			}
			/**
			 * Get a list of available speakers.
			 * @param options - The options for the speakers request; forwarded as query parameters.
			 * @param options.language - Optional. The language of the speakers to get.
			 * @returns The parsed JSON response.
			 * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/speakers#list-available-speakers | List Available Speakers}
			 */
			async getAvailableSpeakers(options) {
				const request = this.api.get('v1/speakers/list', { searchParams: options });
				return request.json();
			}
			/**
			 * Generate audio from text using the Streaming TTS API.
			 * @param options - The options for the TTS request; sent as the JSON body.
			 * @param options.input - The text to generate audio from.
			 * @param options.voice - The `speakerId` to use for the TTS.
			 * @param options.model - Optional. The model to use for the TTS.
			 * @returns The response body stream.
			 * @throws {Error} When the response has no body.
			 * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/flowspeech | Streaming TTS}
			 */
			async tts(options) {
				// eslint-disable-next-line @typescript-eslint/await-thenable
				const { body } = await this.api.post('v1/tts', { json: options });
				if (body) {
					return body;
				}
				throw new Error('Empty response body from TTS API');
			}
		}

+6

distribution/source/_api/types.d.ts

		/**
		 * Generic response envelope: a numeric `code`, a `message`, and the payload in `data`.
		 */
		export type ApiResponse<T> = {
			code: number;
			message: string;
			data: T;
		};
		/** Language codes supported for speakers. */
		export type SpeakerLanguage = 'en' | 'zh' | 'ja';

+1

distribution/source/_api/types.js

export {};

+2

distribution/source/asr/_cli.d.ts

		import type { Command } from 'commander';
		/**
		 * Register the speech recognition sub-commands on the given commander program.
		 * @param program - The commander `Command` to attach the sub-commands to.
		 */
		export declare function register(program: Command): void;

+58

distribution/source/asr/_cli.js

		import { Buffer } from 'node:buffer';
		import process from 'node:process';
		import { runAsr } from './asr.js';
		import { ensureModels, ensureVadModel } from './models.js';
		import { streamAsr } from './stream-asr.js';
		/**
		 * Register the `asr` and `asr-stream` sub-commands on a commander program.
		 *
		 * - `asr <file>` validates `--model`, makes sure that model is present, then transcribes the file.
		 * - `asr-stream` reads raw 16-bit little-endian PCM from stdin, converts it to float samples
		 *   and feeds them to `streamAsr`, printing each result as text or as a JSON line.
		 *
		 * @param program - The commander `Command` to attach the sub-commands to.
		 */
		export function register(program) {
			program
				.command('asr')
				.description('Transcribe an audio file using speech recognition')
				.argument('<file>', 'Audio file to transcribe')
				.option('-j, --json', 'Output result in JSON format', false)
				.option('--model <name>', 'Model to use: whisper, sensevoice', 'sensevoice')
				.action(async (file, options) => {
					const { model } = options;
					if (model !== 'whisper' && model !== 'sensevoice') {
						throw new Error(`Unknown model "${model}". Use "whisper" or "sensevoice".`);
					}
					await ensureModels([model]);
					await runAsr(file, { json: options.json, model });
				});
			program
				.command('asr-stream')
				.description('Stream speech recognition from stdin (expects 16kHz mono s16le PCM)')
				.option('-j, --json', 'Output each result as a JSON line', false)
				.option('--vad', 'Enable voice activity detection', false)
				.option('--asr-interval-ms <ms>', 'Recognition interval in ms (ignored with --vad)', '1000')
				.action(async (options) => {
					// Reject a non-numeric interval up front: NaN would otherwise make every
					// "interval elapsed" comparison false and silently disable interim results.
					const asrIntervalMs = Number(options.asrIntervalMs);
					if (Number.isNaN(asrIntervalMs)) {
						throw new TypeError(`Invalid --asr-interval-ms "${options.asrIntervalMs}". Expected a number of milliseconds.`);
					}
					await ensureModels();
					if (options.vad) {
						await ensureVadModel();
					}
					async function* stdinAudio() {
						// A stdin chunk may end in the middle of a 2-byte sample. The dangling
						// byte is carried over to the next chunk so samples stay aligned.
						let carry;
						// eslint-disable-next-line @typescript-eslint/await-thenable
						for await (const chunk of process.stdin) {
							// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
							let bytes = Buffer.from(chunk);
							if (carry) {
								bytes = Buffer.concat([carry, bytes]);
								carry = undefined;
							}
							const sampleCount = Math.floor(bytes.byteLength / 2);
							if (bytes.byteLength % 2 !== 0) {
								carry = Buffer.from(bytes.subarray(bytes.byteLength - 1));
							}
							if (sampleCount === 0) {
								continue;
							}
							const float32 = new Float32Array(sampleCount);
							for (let i = 0; i < sampleCount; i++) {
								// Explicit little-endian read; scale int16 into [-1, 1).
								float32[i] = bytes.readInt16LE(i * 2) / 32_768;
							}
							yield float32;
						}
					}
					await streamAsr(stdinAudio(), {
						vad: options.vad || undefined,
						asrIntervalMs,
						onResult(result) {
							if (options.json) {
								console.log(JSON.stringify(result));
							}
							else {
								console.log(result.text);
							}
						},
					});
				});
		}

+2

distribution/source/asr/_index.d.ts

		export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js';
		export { streamAsr, type AsrStreamResult, type StreamAsrOptions, type VadOptions, } from './stream-asr.js';

+2

distribution/source/asr/_index.js

		export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js';
		export { streamAsr, } from './stream-asr.js';

+7

distribution/source/asr/asr.d.ts

		/** Names of the speech recognition models accepted by `runAsr`. */
		type ModelName = 'whisper' | 'sensevoice';
		/** Options for `runAsr`: whether to print JSON, and which model to use. */
		export type AsrOptions = {
			json: boolean;
			model: ModelName;
		};
		/**
		 * Transcribe an audio file and print the result to stdout.
		 * @param filePath - Path of the audio file to transcribe.
		 * @param options - Output format and model selection.
		 */
		export declare function runAsr(filePath: string, options: AsrOptions): Promise<void>;
		export {};

+117

distribution/source/asr/asr.js

		import fs from 'node:fs';
		import { createRequire } from 'node:module';
		import os from 'node:os';
		import path from 'node:path';
		import { execa } from 'execa';
		import { getModelPath, modelDisplayNames } from './models.js';
		const require = createRequire(import.meta.url);
		// Cache for the native addon; it stays unset until the first call to `sherpaOnnx()`.
		let _sherpaOnnx;
		/**
		 * Return the `sherpa-onnx-node` module, requiring it on first use only.
		 */
		function sherpaOnnx() {
			if (_sherpaOnnx === undefined || _sherpaOnnx === null) {
				// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
				_sherpaOnnx = require('sherpa-onnx-node');
			}
			return _sherpaOnnx;
		}
		/**
		 * Convert an audio file to a 16 kHz mono 16-bit PCM WAV file in the OS temp directory using ffmpeg.
		 *
		 * @param inputPath - Path of the audio file to convert.
		 * @returns Path of the generated WAV file; the caller is responsible for deleting it.
		 * @throws {Error} When ffmpeg cannot be run or exits with a failure. The underlying
		 * execa error is attached as `cause` so the real reason stays inspectable.
		 */
		async function convertToWav(inputPath) {
			const outputPath = path.join(os.tmpdir(), `coli-${Date.now()}.wav`);
			try {
				// eslint-disable-next-line @typescript-eslint/await-thenable
				await execa('ffmpeg', [
					// Global options go before the input/output files; a trailing `-y`
					// is reported by ffmpeg as a trailing option.
					'-y',
					'-i',
					inputPath,
					'-ar',
					'16000',
					'-ac',
					'1',
					'-f',
					'wav',
					'-acodec',
					'pcm_s16le',
					outputPath,
				]);
			}
			catch (error) {
				// ffmpeg may have written a partial file before failing; do not leave it behind.
				fs.rmSync(outputPath, { force: true });
				throw new Error('Failed to convert audio file. Please make sure ffmpeg is installed.\n' +
					' brew install ffmpeg # macOS\n' +
					' sudo apt install ffmpeg # Debian/Ubuntu', { cause: error });
			}
			return outputPath;
		}
		/**
		 * Build an offline recognizer for the requested model.
		 *
		 * @param model - `'whisper'` selects the Whisper configuration; any other value
		 * selects the SenseVoice configuration.
		 * @returns A new `OfflineRecognizer` from `sherpa-onnx-node`.
		 */
		function createRecognizer(model) {
			const modelDir = getModelPath(model);
			const onnx = sherpaOnnx();
			const inModelDir = (fileName) => path.join(modelDir, fileName);
			// Settings shared by both model configurations.
			const runtime = { numThreads: 2, provider: 'cpu', debug: 0 };
			const modelConfig = model === 'whisper'
				? {
					whisper: {
						encoder: inModelDir('tiny.en-encoder.int8.onnx'),
						decoder: inModelDir('tiny.en-decoder.int8.onnx'),
					},
					tokens: inModelDir('tiny.en-tokens.txt'),
					...runtime,
				}
				: {
					senseVoice: {
						model: inModelDir('model.int8.onnx'),
						useInverseTextNormalization: 1,
					},
					tokens: inModelDir('tokens.txt'),
					...runtime,
				};
			return new onnx.OfflineRecognizer({
				featConfig: { sampleRate: 16_000, featureDim: 80 },
				modelConfig,
			});
		}
		/**
		 * Transcribe an audio file and print the result to stdout.
		 *
		 * Files without a `.wav` extension are first converted to a temporary WAV file,
		 * which is deleted again once recognition finishes or fails.
		 *
		 * @param filePath - Path of the audio file; resolved against the current directory.
		 * @param options - `json` selects JSON output, `model` selects the recognizer.
		 * @throws {Error} When the file does not exist.
		 */
		export async function runAsr(filePath, options) {
			const resolvedPath = path.resolve(filePath);
			if (!fs.existsSync(resolvedPath)) {
				throw new Error(`File not found: ${resolvedPath}`);
			}
			const isWav = path.extname(resolvedPath).toLowerCase() === '.wav';
			const wavPath = isWav ? resolvedPath : await convertToWav(resolvedPath);
			// Only a file created here by the conversion is removed afterwards.
			const needsCleanup = !isWav;
			try {
				const onnx = sherpaOnnx();
				const recognizer = createRecognizer(options.model);
				const stream = recognizer.createStream();
				const wave = onnx.readWave(wavPath);
				stream.acceptWaveform({ sampleRate: wave.sampleRate, samples: wave.samples });
				recognizer.decode(stream);
				const result = recognizer.getResult(stream);
				if (!options.json) {
					console.log(result.text.trim());
					return;
				}
				const report = {
					text: result.text.trim(),
					model: modelDisplayNames[options.model],
					// Empty values become undefined so JSON.stringify omits them.
					lang: result.lang || undefined,
					emotion: result.emotion || undefined,
					event: result.event || undefined,
					tokens: result.tokens,
					timestamps: result.timestamps,
					duration: wave.samples.length / wave.sampleRate,
				};
				console.log(JSON.stringify(report, null, 2));
			}
			finally {
				if (needsCleanup && fs.existsSync(wavPath)) {
					fs.unlinkSync(wavPath);
				}
			}
		}

+7

distribution/source/asr/models.d.ts

		/** Names of the downloadable speech recognition models. */
		type ModelName = 'whisper' | 'sensevoice';
		/** Display name for each model. */
		export declare const modelDisplayNames: Record<ModelName, string>;
		/** Return the directory of the given model inside the local models directory. */
		export declare function getModelPath(model: ModelName): string;
		/**
		 * Download every listed model that is not installed yet.
		 * @param modelNames - Models to ensure; defaults to `['sensevoice']`.
		 */
		export declare function ensureModels(modelNames?: ModelName[]): Promise<void>;
		/** Return the path of the VAD model file inside the local models directory. */
		export declare function getVadModelPath(): string;
		/** Download the VAD model file unless it already exists. */
		export declare function ensureVadModel(): Promise<void>;
		export {};

+118

distribution/source/asr/models.js

		import fs from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';
		import process from 'node:process';
		import { execa } from 'execa';
		// Directory that holds downloaded models: `<home>/.coli/models`.
		const modelsDirectory = path.join(os.homedir(), '.coli', 'models');
		// Per-model download metadata:
		// - dirName: directory of the model below `modelsDirectory`
		// - url: archive to download
		// - checkFile: file inside `dirName` whose presence marks the model as installed
		const models = {
		whisper: {
		dirName: 'sherpa-onnx-whisper-tiny.en',
		url: 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2',
		checkFile: 'tiny.en-encoder.int8.onnx',
		},
		sensevoice: {
		dirName: 'sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17',
		url: 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17.tar.bz2',
		checkFile: 'model.int8.onnx',
		},
		};
		/** Display name for each model key. */
		export const modelDisplayNames = {
		whisper: 'whisper-tiny.en',
		sensevoice: 'sensevoice-small',
		};
		/**
		 * Return the directory of a model inside the local models directory.
		 * @param model - Key of the model in the `models` table.
		 */
		export function getModelPath(model) {
			const { dirName } = models[model];
			return path.join(modelsDirectory, dirName);
		}
		/**
		 * Tell whether a model is installed, judged by the presence of its `checkFile`.
		 * @param entry - Model metadata with `dirName` and `checkFile`.
		 */
		function isModelInstalled(entry) {
			const { dirName, checkFile } = entry;
			const markerPath = path.join(modelsDirectory, dirName, checkFile);
			return fs.existsSync(markerPath);
		}
		/**
		 * Download a model archive into the models directory, extract it with `tar`,
		 * and remove the archive afterwards.
		 *
		 * The archive is removed even when the download or the extraction fails, so
		 * partial `.tar.bz2` files do not pile up in the models directory.
		 *
		 * @param entry - Model metadata with `dirName` and `url`.
		 * @throws {Error} When the HTTP response is not OK or has no body; errors from
		 * reading the stream, writing the file or running `tar` propagate unchanged.
		 */
		async function downloadModel(entry) {
			const { dirName, url } = entry;
			console.log(`Downloading ${dirName}...`);
			fs.mkdirSync(modelsDirectory, { recursive: true });
			const tarPath = path.join(modelsDirectory, `${dirName}.tar.bz2`);
			const response = await fetch(url, { redirect: 'follow' });
			if (!response.ok || !response.body) {
				throw new Error(`Failed to download model: ${response.statusText}`);
			}
			// 0 means "unknown size"; progress output is skipped in that case.
			const contentLength = Number(response.headers.get('content-length') ?? 0);
			const reader = response.body.getReader();
			try {
				const fileHandle = fs.openSync(tarPath, 'w');
				let downloaded = 0;
				try {
					for (;;) {
						const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
						if (done) {
							break;
						}
						fs.writeSync(fileHandle, value);
						downloaded += value.length;
						if (contentLength > 0) {
							const percent = ((downloaded / contentLength) * 100).toFixed(1);
							const mb = (downloaded / (1024 * 1024)).toFixed(1);
							const totalMb = (contentLength / (1024 * 1024)).toFixed(1);
							// `\r` rewrites the same terminal line on every update.
							process.stdout.write(`\r ${mb} MB / ${totalMb} MB (${percent}%)`);
						}
					}
				}
				finally {
					fs.closeSync(fileHandle);
				}
				process.stdout.write('\n');
				console.log(' Extracting...');
				// eslint-disable-next-line @typescript-eslint/await-thenable
				await execa('tar', ['xjf', tarPath, '-C', modelsDirectory]);
			}
			finally {
				fs.rmSync(tarPath, { force: true });
			}
			console.log(` ${dirName} ready.\n`);
		}
		/**
		 * Download every listed model that is not installed yet, one after another.
		 *
		 * @param modelNames - Keys of the `models` table; defaults to `['sensevoice']`.
		 * Duplicate names are processed once.
		 * @throws {Error} When a name is not a known model. This is checked for all names
		 * before any download starts.
		 */
		export async function ensureModels(modelNames = ['sensevoice']) {
			const pending = [];
			for (const name of new Set(modelNames)) {
				// Own-property check: an unknown name would otherwise surface as an
				// opaque "cannot read properties of undefined" error further down.
				if (!Object.hasOwn(models, name)) {
					throw new Error(`Unknown model "${name}". Available models: ${Object.keys(models).join(', ')}.`);
				}
				const entry = models[name];
				if (!isModelInstalled(entry)) {
					pending.push(entry);
				}
			}
			for (const entry of pending) {
				await downloadModel(entry); // eslint-disable-line no-await-in-loop
			}
		}
		// File name and download URL of the VAD model.
		const vadModelFile = 'silero_vad.onnx';
		const vadModelUrl = 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx';
		/**
		 * Return the path of the VAD model file inside the local models directory.
		 */
		export function getVadModelPath() {
			const vadModelPath = path.join(modelsDirectory, vadModelFile);
			return vadModelPath;
		}
		/**
		 * Download the VAD model file unless it already exists.
		 *
		 * The data is written to `<model path>.download` and renamed to the final path
		 * only after the whole body has been written. An interrupted download therefore
		 * never leaves a truncated file at the final path, where the existence check
		 * above would treat it as installed.
		 *
		 * @throws {Error} When the HTTP response is not OK or has no body; stream and
		 * file-system errors propagate unchanged after the partial file is removed.
		 */
		export async function ensureVadModel() {
			const modelPath = getVadModelPath();
			if (fs.existsSync(modelPath)) {
				return;
			}
			console.log(`Downloading ${vadModelFile}...`);
			fs.mkdirSync(modelsDirectory, { recursive: true });
			const response = await fetch(vadModelUrl, { redirect: 'follow' });
			if (!response.ok || !response.body) {
				throw new Error(`Failed to download VAD model: ${response.statusText}`);
			}
			// 0 means "unknown size"; progress output is skipped in that case.
			const contentLength = Number(response.headers.get('content-length') ?? 0);
			const reader = response.body.getReader();
			const partialPath = `${modelPath}.download`;
			try {
				const fileHandle = fs.openSync(partialPath, 'w');
				let downloaded = 0;
				try {
					for (;;) {
						const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
						if (done) {
							break;
						}
						fs.writeSync(fileHandle, value);
						downloaded += value.length;
						if (contentLength > 0) {
							const percent = ((downloaded / contentLength) * 100).toFixed(1);
							const kb = (downloaded / 1024).toFixed(0);
							const totalKb = (contentLength / 1024).toFixed(0);
							process.stdout.write(`\r ${kb} KB / ${totalKb} KB (${percent}%)`);
						}
					}
				}
				finally {
					fs.closeSync(fileHandle);
				}
				fs.renameSync(partialPath, modelPath);
			}
			catch (error) {
				fs.rmSync(partialPath, { force: true });
				throw error;
			}
			process.stdout.write('\n');
			console.log(` ${vadModelFile} ready.\n`);
		}

+23

distribution/source/asr/stream-asr.d.ts

		/**
		 * A recognition result passed to `onResult`.
		 * `isFinal` is `false` for interim results and `true` for final ones.
		 */
		export type AsrStreamResult = {
			text: string;
			lang: string;
			emotion: string;
			event: string;
			tokens: string[];
			timestamps: number[];
			isFinal: boolean;
		};
		/**
		 * Tuning options for voice activity detection. All fields are optional.
		 * NOTE(review): the duration fields are presumably in seconds — confirm against sherpa-onnx.
		 */
		export type VadOptions = {
			threshold?: number;
			minSpeechDuration?: number;
			minSilenceDuration?: number;
			maxSpeechDuration?: number;
			enableExternalBuffer?: boolean;
		};
		/**
		 * Options for `streamAsr`.
		 * `vad` enables voice activity detection, either with defaults (`true`) or with custom `VadOptions`.
		 */
		export type StreamAsrOptions = {
			sampleRate?: number;
			asrIntervalMs?: number;
			vad?: boolean | VadOptions;
			onResult: (result: AsrStreamResult) => void;
		};
		/**
		 * Run speech recognition over a stream of audio chunks, reporting results through `options.onResult`.
		 * @param audio - Async iterable of float sample chunks.
		 * @param options - Streaming options and the result callback.
		 */
		export declare function streamAsr(audio: AsyncIterable<Float32Array>, options: StreamAsrOptions): Promise<void>;

+138

distribution/source/asr/stream-asr.js

		import { createRequire } from 'node:module';
		import path from 'node:path';
		import { getModelPath, getVadModelPath } from './models.js';
		const require = createRequire(import.meta.url);
		// Cache for the native addon; it stays unset until the first call to `sherpaOnnx()`.
		let _sherpaOnnx;
		/**
		 * Return the `sherpa-onnx-node` module, requiring it on first use only.
		 */
		function sherpaOnnx() {
			if (_sherpaOnnx === undefined || _sherpaOnnx === null) {
				// eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
				_sherpaOnnx = require('sherpa-onnx-node');
			}
			return _sherpaOnnx;
		}
		// Sample rate handed to the recognizer and VAD, and the default interval between interim recognitions.
		const defaultSampleRate = 16_000;
		const defaultAsrIntervalMs = 1000;
		/**
		 * Build an offline recognizer configured for the SenseVoice model.
		 * @returns A new `OfflineRecognizer` from `sherpa-onnx-node`.
		 */
		function createRecognizer() {
			const modelDir = getModelPath('sensevoice');
			const onnx = sherpaOnnx();
			const featConfig = { sampleRate: defaultSampleRate, featureDim: 80 };
			const modelConfig = {
				senseVoice: {
					model: path.join(modelDir, 'model.int8.onnx'),
					useInverseTextNormalization: 1,
				},
				tokens: path.join(modelDir, 'tokens.txt'),
				numThreads: 2,
				provider: 'cpu',
				debug: 0,
			};
			return new onnx.OfflineRecognizer({ featConfig, modelConfig });
		}
		/**
		 * Run one recognition pass over a block of samples.
		 * @param recognizer - Recognizer to create the stream on and decode with.
		 * @param samples - Samples to recognize; handed over with `defaultSampleRate`.
		 * @returns The recognizer's result for the stream.
		 */
		function recognize(recognizer, samples) {
			const stream = recognizer.createStream();
			const waveform = { sampleRate: defaultSampleRate, samples };
			stream.acceptWaveform(waveform);
			recognizer.decode(stream);
			return recognizer.getResult(stream);
		}
		/**
		 * Concatenate sample chunks into one `Float32Array` of `totalLength` elements.
		 * A single chunk is returned as-is, without copying.
		 * @param buffers - Chunks in playback order.
		 * @param totalLength - Length of the merged array.
		 */
		function mergeBuffers(buffers, totalLength) {
			const [only] = buffers;
			if (buffers.length === 1 && only) {
				return only;
			}
			const merged = new Float32Array(totalLength);
			buffers.reduce((offset, part) => {
				merged.set(part, offset);
				return offset + part.length;
			}, 0);
			return merged;
		}
		/**
		 * Build a Silero VAD instance, filling unset options with defaults.
		 * @param vadOptions - Optional overrides for threshold and speech/silence durations.
		 * @returns A new `Vad` from `sherpa-onnx-node`.
		 */
		function createVad(vadOptions) {
			const { Vad } = sherpaOnnx();
			const sileroVad = {
				model: getVadModelPath(),
				threshold: vadOptions.threshold ?? 0.5,
				minSpeechDuration: vadOptions.minSpeechDuration ?? 0.25,
				minSilenceDuration: vadOptions.minSilenceDuration ?? 0.5,
				maxSpeechDuration: vadOptions.maxSpeechDuration ?? 15,
				windowSize: 512,
			};
			const config = {
				sileroVad,
				sampleRate: defaultSampleRate,
				debug: 0,
				numThreads: 1,
			};
			// NOTE(review): the second argument is presumably the VAD buffer size in seconds — confirm.
			return new Vad(config, 60);
		}
		function emitResult(result, isFinal, onResult) {
		const text = result.text.trim();
		if (text) {
		onResult({ ...result, text, isFinal });
		}
		}
		/**
		 * Recognize speech segment by segment, using the VAD to cut the audio.
		 *
		 * Incoming chunks are re-sliced into windows of the VAD's `windowSize`; every
		 * segment the VAD produces is recognized and reported as a final result.
		 *
		 * @param audio - Async iterable of float sample chunks.
		 * @param options - Streaming options; only `onResult` is read here.
		 * @param vadOptions - VAD overrides; also supplies `enableExternalBuffer` for `vad.front`.
		 */
		async function streamWithVad(audio, options, vadOptions) {
		const recognizer = createRecognizer();
		const vad = createVad(vadOptions);
		const { windowSize } = vad.config.sileroVad;
		// Samples received but not yet fed to the VAD (always fewer than one window after the inner loop).
		let pending = new Float32Array(0);
		// Recognize and report every segment currently queued in the VAD.
		function drainSegments() {
		while (!vad.isEmpty()) {
		const segment = vad.front(vadOptions.enableExternalBuffer);
		vad.pop();
		emitResult(recognize(recognizer, segment.samples), true, options.onResult);
		}
		}
		// eslint-disable-next-line @typescript-eslint/await-thenable
		for await (const chunk of audio) {
		// Append the new chunk to the leftover samples from previous chunks.
		const combined = new Float32Array(pending.length + chunk.length);
		combined.set(pending);
		combined.set(chunk, pending.length);
		pending = combined;
		// Feed the VAD exactly one window at a time and drain after each window.
		while (pending.length >= windowSize) {
		vad.acceptWaveform(pending.subarray(0, windowSize));
		pending = pending.subarray(windowSize);
		drainSegments();
		}
		}
		// Input ended: zero-pad the remaining samples up to a full window.
		if (pending.length > 0) {
		const padded = new Float32Array(windowSize);
		padded.set(pending);
		vad.acceptWaveform(padded);
		}
		vad.flush();
		drainSegments();
		}
		/**
		 * Recognize the accumulated audio at a fixed interval and once more at the end.
		 *
		 * Every time at least one interval's worth of new samples has arrived, all audio
		 * received so far is recognized again and reported as an interim result if its
		 * text changed. After the input ends, the whole audio is recognized one last
		 * time and reported as final.
		 *
		 * @param audio - Async iterable of float sample chunks.
		 * @param options - `sampleRate` and `asrIntervalMs` size the interval; `onResult` receives results.
		 */
		async function streamWithInterval(audio, options) {
			const inputSampleRate = options.sampleRate ?? defaultSampleRate;
			const requestedIntervalMs = options.asrIntervalMs ?? defaultAsrIntervalMs;
			// A NaN interval (e.g. from Number('abc')) would make the comparison below
			// always false and silently disable interim results; use the default instead.
			const intervalMs = Number.isNaN(requestedIntervalMs) ? defaultAsrIntervalMs : requestedIntervalMs;
			const chunkInterval = (defaultSampleRate * intervalMs) / 1000;
			// Loop-invariant: number of input samples that make up one interval.
			const samplesForInterval = (chunkInterval * inputSampleRate) / defaultSampleRate;
			const recognizer = createRecognizer();
			const buffers = [];
			let totalSamples = 0;
			let lastRecognizedAt = 0;
			let lastText = '';
			// NOTE(review): `inputSampleRate` only sizes the interval; `recognize` still hands
			// the samples over labelled with `defaultSampleRate` — confirm this is intended.
			// eslint-disable-next-line @typescript-eslint/await-thenable
			for await (const chunk of audio) {
				buffers.push(chunk);
				totalSamples += chunk.length;
				if (totalSamples - lastRecognizedAt >= samplesForInterval) {
					lastRecognizedAt = totalSamples;
					const merged = mergeBuffers(buffers, totalSamples);
					const result = recognize(recognizer, merged);
					const text = result.text.trim();
					// Report only non-empty text that differs from the last interim result.
					if (text && text !== lastText) {
						lastText = text;
						options.onResult({ ...result, text, isFinal: false });
					}
				}
			}
			const merged = mergeBuffers(buffers, totalSamples);
			if (merged.length > 0) {
				emitResult(recognize(recognizer, merged), true, options.onResult);
			}
		}
		/**
		 * Run streaming speech recognition, with or without voice activity detection.
		 *
		 * @param audio - Async iterable of float sample chunks.
		 * @param options - A truthy `vad` selects VAD mode (an object supplies VAD overrides,
		 * anything else uses defaults); otherwise interval mode is used.
		 */
		export async function streamAsr(audio, options) {
			if (!options.vad) {
				return streamWithInterval(audio, options);
			}
			const vadOptions = typeof options.vad === 'object' ? options.vad : {};
			return streamWithVad(audio, options, vadOptions);
		}

+2

distribution/source/cli.d.ts

		#!/usr/bin/env node
		export {};

+11

distribution/source/cli.js

		#!/usr/bin/env node
		import { Command } from 'commander';
		import { register as registerAsr } from './asr/_cli.js';
		import { register as registerCloudTts } from './cloud-tts/_cli.js';
		import { register as registerTts } from './tts/_cli.js';
		// Build the `coli` program and attach every sub-command.
		const program = new Command();
		program.name('coli').description('Core CLI for Cola');
		registerAsr(program);
		registerTts(program);
		registerCloudTts(program);
		// All registered actions are async; Commander documents `parseAsync()` for that
		// case so the parse call waits for the action (and its rejection) to settle.
		await program.parseAsync();

+2

distribution/source/cloud-tts/_cli.d.ts

		import type { Command } from 'commander';
		/**
		 * Register the `cloud-tts` sub-command on the given commander program.
		 * @param program - The commander `Command` to attach the sub-command to.
		 */
		export declare function register(program: Command): void;

+64

distribution/source/cloud-tts/_cli.js

		import process from 'node:process';
		import { defaultSpeaker } from '../_api/constants.js';
		import { listSpeakers, runCloudTts } from './cloud-tts.js';
		/**
		 * Resolve the ListenHub API key from the CLI options or the environment.
		 * @param options - Parsed CLI options; `apiKey` wins over `COLI_LISTENHUB_API_KEY` unless it is nullish.
		 * @returns The API key.
		 * @throws {Error} When the resolved key is missing or empty.
		 */
		function getApiKey(options) {
			const key = options.apiKey ?? process.env['COLI_LISTENHUB_API_KEY'];
			if (key) {
				return key;
			}
			throw new Error('API key required. Use --api-key or set COLI_LISTENHUB_API_KEY environment variable. Get an API key from https://listenhub.ai/settings/api-keys');
		}
		/**
		 * Resolve the optional base URL from the CLI options or the environment.
		 *
		 * Empty strings count as "not set": an empty `--base-url` or an exported but
		 * empty `COLI_TTS_BASE_URL` would otherwise be passed on as `''` and replace
		 * the API client's default URL.
		 *
		 * @param options - Parsed CLI options; a non-empty `baseUrl` wins over `COLI_TTS_BASE_URL`.
		 * @returns The base URL, or `undefined` when none is configured.
		 */
		function getBaseUrl(options) {
			return options.baseUrl || process.env['COLI_TTS_BASE_URL'] || undefined;
		}
		/**
		 * Register the `cloud-tts` sub-command on a commander program.
		 *
		 * With `--list-speakers` the command prints the available speakers (tab-separated
		 * or JSON). Otherwise it synthesizes `text` with the voice given by `--voice`, or
		 * with the default speaker of `--language` when no voice is given.
		 *
		 * @param program - The commander `Command` to attach the sub-command to.
		 */
		export function register(program) {
			program
				.command('cloud-tts')
				.description('Generate speech using ListenHub OpenAPI')
				.argument('[text]', 'Text to synthesize')
				.option('--api-key <key>', 'ListenHub API key (or set COLI_LISTENHUB_API_KEY environment variable)')
				.option('--voice <id>', 'Speaker ID to use')
				.option('--model <name>', 'Model to use (default: flowtts)')
				.option('--base-url <url>', 'Base URL for TTS API (or set COLI_TTS_BASE_URL environment variable)')
				.option('-o, --output <file>', 'Save audio to file')
				.option('--list-speakers', 'List available speakers')
				.option('--language <lang>', 'Speaker language (en, zh, ja)')
				.option('-j, --json', 'Output in JSON format (use with --list-speakers)')
				.action(async (text, options) => {
					if (options.listSpeakers) {
						const apiKey = getApiKey(options);
						const baseUrl = getBaseUrl(options);
						const speakers = await listSpeakers({
							apiKey,
							baseUrl,
							language: options.language,
						});
						if (options.json) {
							console.log(JSON.stringify(speakers, null, 2));
						}
						else {
							for (const speaker of speakers) {
								console.log(`${speaker.name}\t${speaker.speakerId}\t${speaker.gender}\t${speaker.language}`);
							}
						}
						return;
					}
					if (!text) {
						throw new Error('Please provide text to synthesize.');
					}
					let { voice } = options;
					if ((voice === undefined || voice === null) && options.language) {
						// Own-property check: a plain index lookup would also return inherited
						// members such as `constructor` for a bogus language value.
						if (!Object.hasOwn(defaultSpeaker, options.language)) {
							throw new Error(`Unknown language "${options.language}". Use one of: ${Object.keys(defaultSpeaker).join(', ')}, or pass --voice.`);
						}
						voice = defaultSpeaker[options.language];
					}
					if (!voice) {
						throw new Error('Please specify a speaker with --voice or a language with --language. Use --list-speakers to see available speakers.');
					}
					const apiKey = getApiKey(options);
					const baseUrl = getBaseUrl(options);
					await runCloudTts(text, {
						apiKey,
						baseUrl,
						voice,
						model: options.model,
						output: options.output,
					});
				});
		}

+1

distribution/source/cloud-tts/_index.d.ts

export { listSpeakers, runCloudTts, type CloudTtsOptions, type ListSpeakersOptions, } from './cloud-tts.js';

+1

distribution/source/cloud-tts/_index.js

export { listSpeakers, runCloudTts, } from './cloud-tts.js';

+21

distribution/source/cloud-tts/cloud-tts.d.ts

		import type { SpeakerLanguage } from '../_api/types.js';
		/**
		 * Options for `runCloudTts`.
		 * `apiKey` and `voice` are required; `model`, `output` and `baseUrl` are optional.
		 */
		export type CloudTtsOptions = {
		apiKey: string;
		voice: string;
		model?: string;
		output?: string;
		baseUrl?: string;
		};
		/**
		 * Options for `listSpeakers`.
		 * `apiKey` is required; `language` and `baseUrl` are optional.
		 */
		export type ListSpeakersOptions = {
		apiKey: string;
		language?: SpeakerLanguage;
		baseUrl?: string;
		};
		/**
		 * Fetch the list of available speakers.
		 * @param options - API credentials and an optional language filter.
		 * @returns The speaker entries of the API response.
		 */
		export declare function listSpeakers(options: ListSpeakersOptions): Promise<{
		name: string;
		speakerId: string;
		demoAudioUrl: string;
		gender: string;
		language: SpeakerLanguage;
		}[]>;
		/**
		 * Synthesize `text` and either save the audio to `options.output` or play it.
		 * @param text - Text to synthesize.
		 * @param options - API credentials, voice and output settings.
		 */
		export declare function runCloudTts(text: string, options: CloudTtsOptions): Promise<void>;

+54

distribution/source/cloud-tts/cloud-tts.js

		import { Buffer } from 'node:buffer';
		import fs from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';
		import { Writable } from 'node:stream';
		import { execa } from 'execa';
		import { ListenHubApi } from '../_api/listenhub-openapi.js';
		/**
		 * Fetch the list of available speakers.
		 * @param options - `apiKey`, optional `baseUrl`, and an optional `language` forwarded to the API.
		 * @returns The `data.items` array of the API response.
		 */
		export async function listSpeakers(options) {
			const { apiKey, baseUrl, language } = options;
			const api = new ListenHubApi({ apiKey, baseUrl });
			const { data } = await api.getAvailableSpeakers({ language });
			return data.items;
		}
		async function collectStream(stream) {
		const chunks = [];
		const reader = stream.getReader();
		for (;;) {
		const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
		if (done)
		break;
		chunks.push(value);
		}
		return Buffer.concat(chunks);
		}
		/**
		 * Synthesize `text` through the ListenHub TTS API.
		 *
		 * With `options.output` the audio stream is written to that file. Without it the
		 * audio is saved to a temporary MP3 file, played with `afplay`, and the file is
		 * removed afterwards.
		 *
		 * @param text - Text to synthesize.
		 * @param options - `apiKey`, `voice`, and optional `model`, `output`, `baseUrl`.
		 * @throws {Error} When playback is requested on a platform other than macOS. This
		 * is checked before the API is called, so no request is made for audio that
		 * cannot be played.
		 */
		export async function runCloudTts(text, options) {
			// `afplay` ships with macOS only; fail early with an actionable message.
			if (!options.output && os.platform() !== 'darwin') {
				throw new Error('Audio playback uses afplay, which is only available on macOS. Pass an output file to save the audio instead.');
			}
			const api = new ListenHubApi({
				apiKey: options.apiKey,
				baseUrl: options.baseUrl,
			});
			const stream = await api.tts({
				input: text,
				voice: options.voice,
				model: options.model,
			});
			if (options.output) {
				const fileStream = fs.createWriteStream(options.output);
				await stream.pipeTo(Writable.toWeb(fileStream));
				return;
			}
			const mp3Path = path.join(os.tmpdir(), `coli-cloud-tts-${Date.now()}.mp3`);
			const audio = await collectStream(stream);
			fs.writeFileSync(mp3Path, audio);
			try {
				// eslint-disable-next-line @typescript-eslint/await-thenable
				await execa('afplay', [mp3Path]);
			}
			finally {
				fs.unlinkSync(mp3Path);
			}
		}

+4

distribution/source/index.d.ts

		export * from './_api/listenhub-openapi.js';
		export * from './asr/_index.js';
		export * from './cloud-tts/_index.js';
		export * from './tts/_index.js';

+4

distribution/source/index.js

		export * from './_api/listenhub-openapi.js';
		export * from './asr/_index.js';
		export * from './cloud-tts/_index.js';
		export * from './tts/_index.js';

+2

distribution/source/tts/_cli.d.ts

		import type { Command } from 'commander';
		/**
		 * Register the `tts` sub-command on the given commander program.
		 * @param program - The commander `Command` to attach the sub-command to.
		 */
		export declare function register(program: Command): void;

+30

distribution/source/tts/_cli.js

		import { getVoices, runTts } from './tts.js';
		/**
		 * Register the `tts` sub-command on a commander program.
		 *
		 * With `--list-voices` the command prints the available voices (tab-separated or
		 * JSON); otherwise it passes `text` and the parsed options to `runTts`.
		 *
		 * @param program - The commander `Command` to attach the sub-command to.
		 */
		export function register(program) {
			program
				.command('tts')
				.description('Speak text using text-to-speech (macOS only)')
				.argument('[text]', 'Text to speak')
				.option('-v, --voice <name>', 'Voice to use, defaults to macOS system voice')
				.option('-r, --rate <wpm>', 'Speech rate in words per minute', Number)
				.option('-o, --output <file>', 'Save audio to file instead of speaking')
				.option('--list-voices', 'List available voices')
				.option('-j, --json', 'Output in JSON format (use with --list-voices)')
				.action(async (text, options) => {
					if (!options.listVoices) {
						if (!text) {
							throw new Error('Please provide text to speak.');
						}
						await runTts(text, options);
						return;
					}
					const voices = await getVoices();
					if (options.json) {
						console.log(JSON.stringify(voices, null, 2));
						return;
					}
					for (const voice of voices) {
						const columns = [voice.name, voice.languageCode, voice.example];
						console.log(`${columns[0]}\t${columns[1]}\t${columns[2]}`);
					}
				});
		}

+1

distribution/source/tts/_index.d.ts

export { getVoices, runTts, type TtsOptions } from './tts.js';

+1

distribution/source/tts/_index.js

export { getVoices, runTts } from './tts.js';

+8

distribution/source/tts/tts.d.ts

		import { type Voice } from 'mac-say';
		/**
		 * Options for `runTts`. All fields are optional.
		 */
		export type TtsOptions = {
		voice?: string;
		rate?: number;
		output?: string;
		};
		/** Return the voices reported by `mac-say`. */
		export declare function getVoices(): Promise<Voice[]>;
		/**
		 * Speak `text` through `mac-say`, or write the audio to `options.output` when given.
		 * @param text - Text to speak.
		 * @param options - Voice, rate and output file.
		 */
		export declare function runTts(text: string, options?: TtsOptions): Promise<void>;

+11

distribution/source/tts/tts.js

		import { getVoices as macGetVoices, say } from 'mac-say';
		/**
		 * Return the voices reported by `mac-say`.
		 */
		export async function getVoices() {
			const voices = await macGetVoices();
			return voices;
		}
		/**
		 * Speak `text` through `mac-say`.
		 * @param text - Text to speak.
		 * @param options - Optional `voice`, `rate`, and `output` (forwarded as `outputFile`).
		 */
		export async function runTts(text, options = {}) {
			const { voice, rate, output: outputFile } = options;
			await say(text, { voice, rate, outputFile });
		}

+10

-6

docs/cloud-tts.md

		@@ -8,2 +8,3 @@ # Cloud TTS
		- A ListenHub API key. Pass it via `--api-key` or set the `COLI_LISTENHUB_API_KEY` environment variable.
		- Optionally, a custom base URL via `--base-url` or the `COLI_TTS_BASE_URL` environment variable.

		@@ -39,2 +40,3 @@ ## CLI
		--model <name> Model to use (default: flowtts)
		--base-url <url> Base URL for TTS API (or set COLI_TTS_BASE_URL)
		-o, --output <file> Save audio to file
		@@ -70,2 +72,3 @@ --list-speakers List available speakers
		\| `language` \| `'en' \\| 'zh' \\| 'ja'` \| Filter speakers by language. Omit to list all. \|
		\| `baseUrl` \| `string` \| Custom base URL for TTS API (optional) \|

		@@ -89,7 +92,8 @@ ### `runCloudTts(text, options)`

		\| Property \| Type \| Description \|
		\| -------- \| -------- \| ---------------------------------------------- \|
		\| `apiKey` \| `string` \| ListenHub API key \|
		\| `voice` \| `string` \| Speaker ID (from `listSpeakers`) \|
		\| `model` \| `string` \| Model to use (optional, defaults to `flowtts`) \|
		\| `output` \| `string` \| Save to file instead of playing directly \|
		\| Property \| Type \| Description \|
		\| --------- \| -------- \| ---------------------------------------------- \|
		\| `apiKey` \| `string` \| ListenHub API key \|
		\| `voice` \| `string` \| Speaker ID (from `listSpeakers`) \|
		\| `model` \| `string` \| Model to use (optional, defaults to `flowtts`) \|
		\| `output` \| `string` \| Save to file instead of playing directly \|
		\| `baseUrl` \| `string` \| Custom base URL for TTS API (optional) \|

+6

-0

docs/listenhub-openapi.md

		@@ -11,2 +11,8 @@ # ListenHub OpenAPI
		const api = new ListenHubApi({apiKey: 'lh_sk_...'});

		// Or with a custom base URL
		const api = new ListenHubApi({
		apiKey: 'lh_sk_...',
		baseUrl: 'https://custom-api.example.com/openapi',
		});
		```
		@@ -13,0 +19,0 @@

+9

-8

package.json

		{
		"name": "@marswave/coli",
		"private": false,
		"version": "0.0.13",
		"version": "0.0.14",
		"description": "A CLI for the Cola",
		"repository": "marswaveai/coli",
		"type": "module",
		"bin": "distribution/cli.js",
		"exports": "./distribution/index.js",
		"types": "distribution",
		"bin": "distribution/source/cli.js",
		"exports": "./distribution/source/index.js",
		"types": "distribution/source",
		"files": [
		@@ -19,3 +19,4 @@ "distribution",
		"clean": "del-cli distribution",
		"build": "node --run clean && tsc && chmod +x distribution/cli.js",
		"dev": "node --run clean && tsc --watch",
		"build": "node --run clean && tsc && chmod +x distribution/source/cli.js",
		"pretest": "node --run build",
		@@ -29,3 +30,3 @@ "test": "xo"
		"mac-say": "^0.3.3",
		"sherpa-onnx-node": "^1.12.29"
		"sherpa-onnx-node": "^1.12.33"
		},
		@@ -36,5 +37,5 @@ "devDependencies": {
		"del-cli": "^7.0.0",
		"typescript": "^5.9.3",
		"xo": "^1.2.3"
		"typescript": "^6.0.2",
		"xo": "^2.0.2"
		}
		}

-2

distribution/_api/constants.d.ts

		import type { SpeakerLanguage } from './types.js';
		/** Speaker ID to use for each supported language, keyed by language code. */
		export declare const defaultSpeaker: Record<SpeakerLanguage, string>;

-5

distribution/_api/constants.js

		/**
		 * Speaker ID to use for each supported language, keyed by language code.
		 * NOTE(review): the IDs are opaque strings here; presumably they are ListenHub `speakerId` values — confirm.
		 */
		export const defaultSpeaker = {
		en: 'chat-girl-105-cn',
		zh: 'leo-9328b6d2',
		ja: 'tianzhongdunzi-5d612542',
		};

-43

distribution/_api/listenhub-openapi.d.ts

		import { type KyInstance } from 'ky';
		import type { ApiResponse, SpeakerLanguage } from './types.js';
		export * from './constants.js';
		export type * from './types.js';
		/**
		 * Constructor options for `ListenHubApi`; only `apiKey` is accepted.
		 */
		export type ListenHubApiOptions = {
		apiKey: string;
		};
		/**
		 * Client for the ListenHub OpenAPI (speaker listing and streaming TTS).
		 */
		export declare class ListenHubApi {
		/** The `ky` instance used for requests. */
		api: KyInstance;
		/**
		 * @param options - See `ListenHubApiOptions` (`apiKey`).
		 */
		constructor({ apiKey }: ListenHubApiOptions);
		/**
		 * Get a list of available speakers.
		 * @param options - The options for the speakers request.
		 * @param options.language - Optional. The language of the speakers to get, defaults to English.
		 * NOTE(review): the default is decided by the server, not by this client — confirm against the API docs.
		 * @returns A list of available speakers.
		 * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/speakers#list-available-speakers | List Available Speakers}
		 */
		getAvailableSpeakers(options?: {
		language?: SpeakerLanguage;
		}): Promise<ApiResponse<{
		items: Array<{
		name: string;
		speakerId: string;
		demoAudioUrl: string;
		gender: string;
		language: SpeakerLanguage;
		}>;
		}>>;
		/**
		 * Generate audio from text using the Streaming TTS API.
		 * @param options - The options for the TTS request.
		 * @param options.input - The text to generate audio from.
		 * @param options.voice - The `speakerId` to use for the TTS.
		 * @param options.model - Optional. The model to use for the TTS, defaults to `flowtts`.
		 * @returns A readable stream of the MP3 audio.
		 * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/flowspeech | Streaming TTS}
		 */
		tts(options: {
		input: string;
		voice: string;
		model?: string;
		}): Promise<ReadableStream<Uint8Array<ArrayBuffer>>>;
		}

-39

distribution/_api/listenhub-openapi.js

		import ky from 'ky';
		export * from './constants.js';
		/**
		 * Thin client around the ListenHub OpenAPI, built on a preconfigured `ky` instance.
		 */
		export class ListenHubApi {
			/** The `ky` instance carrying the prefix URL and the bearer-token header. */
			api;
			/**
			 * @param options - Client options.
			 * @param options.apiKey - API key sent as `Authorization: Bearer <apiKey>`.
			 */
			constructor({ apiKey }) {
				const headers = {
					// eslint-disable-next-line @typescript-eslint/naming-convention
					Authorization: `Bearer ${apiKey}`,
				};
				this.api = ky.extend({
					prefixUrl: 'https://api.marswave.ai/openapi',
					headers,
				});
			}
			/**
			 * Get a list of available speakers.
			 * @param options - The options for the speakers request; forwarded as query parameters.
			 * @param options.language - Optional. The language of the speakers to get.
			 * @returns The parsed JSON response.
			 * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/speakers#list-available-speakers | List Available Speakers}
			 */
			async getAvailableSpeakers(options) {
				const request = this.api.get('v1/speakers/list', { searchParams: options });
				return request.json();
			}
			/**
			 * Generate audio from text using the Streaming TTS API.
			 * @param options - The options for the TTS request; sent as the JSON body.
			 * @param options.input - The text to generate audio from.
			 * @param options.voice - The `speakerId` to use for the TTS.
			 * @param options.model - Optional. The model to use for the TTS.
			 * @returns The response body stream.
			 * @throws {Error} When the response has no body.
			 * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/flowspeech | Streaming TTS}
			 */
			async tts(options) {
				const { body } = await this.api.post('v1/tts', { json: options });
				if (body) {
					return body;
				}
				throw new Error('Empty response body from TTS API');
			}
		}

-6

distribution/_api/types.d.ts

		/**
		 * Generic response envelope: a numeric `code`, a `message`, and the payload in `data`.
		 */
		export type ApiResponse<T> = {
			code: number;
			message: string;
			data: T;
		};
		/** Language codes supported for speakers. */
		export type SpeakerLanguage = 'en' | 'zh' | 'ja';

-1

distribution/_api/types.js

export {};

-2

distribution/asr/_cli.d.ts

		import type { Command } from 'commander';
		/** Registers the `asr` and `asr-stream` subcommands on the given commander program. */
		export declare function register(program: Command): void;

-56

distribution/asr/_cli.js

		import { Buffer } from 'node:buffer';
		import process from 'node:process';
		import { runAsr } from './asr.js';
		import { ensureModels, ensureVadModel } from './models.js';
		import { streamAsr } from './stream-asr.js';
		export function register(program) {
		program
		.command('asr')
		.description('Transcribe an audio file using speech recognition')
		.argument('<file>', 'Audio file to transcribe')
		.option('-j, --json', 'Output result in JSON format', false)
		.option('--model <name>', 'Model to use: whisper, sensevoice', 'sensevoice')
		.action(async (file, options) => {
		const { model } = options;
		if (model !== 'whisper' && model !== 'sensevoice') {
		throw new Error(`Unknown model "${model}". Use "whisper" or "sensevoice".`);
		}
		await ensureModels([model]);
		await runAsr(file, { json: options.json, model });
		});
		program
		.command('asr-stream')
		.description('Stream speech recognition from stdin (expects 16kHz mono s16le PCM)')
		.option('-j, --json', 'Output each result as a JSON line', false)
		.option('--vad', 'Enable voice activity detection', false)
		.option('--asr-interval-ms <ms>', 'Recognition interval in ms (ignored with --vad)', '1000')
		.action(async (options) => {
		await ensureModels();
		if (options.vad) {
		await ensureVadModel();
		}
		async function* stdinAudio() {
		for await (const chunk of process.stdin) {
		const buf = Buffer.from(chunk);
		const pcm = new Int16Array(buf.buffer, buf.byteOffset, buf.byteLength / 2);
		const float32 = new Float32Array(pcm.length);
		for (const [i, sample] of pcm.entries()) {
		float32[i] = sample / 32_768;
		}
		yield float32;
		}
		}
		await streamAsr(stdinAudio(), {
		vad: options.vad \|\| undefined,
		asrIntervalMs: Number(options.asrIntervalMs),
		onResult(result) {
		if (options.json) {
		console.log(JSON.stringify(result));
		}
		else {
		console.log(result.text);
		}
		},
		});
		});
		}

-2

distribution/asr/_index.d.ts

		export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js';
		export { streamAsr, type AsrStreamResult, type StreamAsrOptions, type VadOptions, } from './stream-asr.js';

-2

distribution/asr/_index.js

		export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js';
		export { streamAsr, } from './stream-asr.js';

-7

distribution/asr/asr.d.ts

		/** Names of the supported recognition models. */
		type ModelName = 'whisper' | 'sensevoice';
		/** Options for {@link runAsr}. */
		export type AsrOptions = {
			/** Print the result as a JSON document instead of plain text. */
			json: boolean;
			/** Model used for recognition. */
			model: ModelName;
		};
		/**
		 * Transcribes the audio file at `filePath` and prints the result to stdout.
		 * @param filePath - Path of the audio file.
		 * @param options - Output format and model selection.
		 */
		export declare function runAsr(filePath: string, options: AsrOptions): Promise<void>;
		export {};

-115

distribution/asr/asr.js

		import fs from 'node:fs';
		import { createRequire } from 'node:module';
		import os from 'node:os';
		import path from 'node:path';
		import { execa } from 'execa';
		import { getModelPath, modelDisplayNames } from './models.js';
		const require = createRequire(import.meta.url);
		// Cache for the `sherpa-onnx-node` native addon; it stays unset until first use.
		let _sherpaOnnx;
		/**
		 * Returns the `sherpa-onnx-node` module, requiring it on the first call only.
		 */
		function sherpaOnnx() {
			if (_sherpaOnnx === undefined || _sherpaOnnx === null) {
				_sherpaOnnx = require('sherpa-onnx-node');
			}
			return _sherpaOnnx;
		}
		async function convertToWav(inputPath) {
		const outputPath = path.join(os.tmpdir(), `coli-${Date.now()}.wav`);
		try {
		await execa('ffmpeg', [
		'-i',
		inputPath,
		'-ar',
		'16000',
		'-ac',
		'1',
		'-f',
		'wav',
		'-acodec',
		'pcm_s16le',
		outputPath,
		'-y',
		]);
		}
		catch {
		throw new Error('Failed to convert audio file. Please make sure ffmpeg is installed.\n' +
		' brew install ffmpeg # macOS\n' +
		' sudo apt install ffmpeg # Debian/Ubuntu');
		}
		return outputPath;
		}
		function createRecognizer(model) {
		const modelDir = getModelPath(model);
		const onnx = sherpaOnnx();
		if (model === 'whisper') {
		return new onnx.OfflineRecognizer({
		featConfig: { sampleRate: 16_000, featureDim: 80 },
		modelConfig: {
		whisper: {
		encoder: path.join(modelDir, 'tiny.en-encoder.int8.onnx'),
		decoder: path.join(modelDir, 'tiny.en-decoder.int8.onnx'),
		},
		tokens: path.join(modelDir, 'tiny.en-tokens.txt'),
		numThreads: 2,
		provider: 'cpu',
		debug: 0,
		},
		});
		}
		return new onnx.OfflineRecognizer({
		featConfig: { sampleRate: 16_000, featureDim: 80 },
		modelConfig: {
		senseVoice: {
		model: path.join(modelDir, 'model.int8.onnx'),
		useInverseTextNormalization: 1,
		},
		tokens: path.join(modelDir, 'tokens.txt'),
		numThreads: 2,
		provider: 'cpu',
		debug: 0,
		},
		});
		}
		export async function runAsr(filePath, options) {
		const resolvedPath = path.resolve(filePath);
		if (!fs.existsSync(resolvedPath)) {
		throw new Error(`File not found: ${resolvedPath}`);
		}
		const ext = path.extname(resolvedPath).toLowerCase();
		let wavPath;
		let needsCleanup = false;
		if (ext === '.wav') {
		wavPath = resolvedPath;
		}
		else {
		wavPath = await convertToWav(resolvedPath);
		needsCleanup = true;
		}
		try {
		const onnx = sherpaOnnx();
		const recognizer = createRecognizer(options.model);
		const stream = recognizer.createStream();
		const wave = onnx.readWave(wavPath);
		stream.acceptWaveform({ sampleRate: wave.sampleRate, samples: wave.samples });
		recognizer.decode(stream);
		const result = recognizer.getResult(stream);
		if (options.json) {
		console.log(JSON.stringify({
		text: result.text.trim(),
		model: modelDisplayNames[options.model],
		lang: result.lang \|\| undefined,
		emotion: result.emotion \|\| undefined,
		event: result.event \|\| undefined,
		tokens: result.tokens,
		timestamps: result.timestamps,
		duration: wave.samples.length / wave.sampleRate,
		}, null, 2));
		}
		else {
		console.log(result.text.trim());
		}
		}
		finally {
		if (needsCleanup && fs.existsSync(wavPath)) {
		fs.unlinkSync(wavPath);
		}
		}
		}

-7

distribution/asr/models.d.ts

		/** Names of the downloadable recognition models. */
		type ModelName = 'whisper' | 'sensevoice';
		/** Human-readable name for each model. */
		export declare const modelDisplayNames: Record<ModelName, string>;
		/** Returns the directory of the given model inside the models directory. */
		export declare function getModelPath(model: ModelName): string;
		/**
		 * Downloads every listed model that is not installed yet.
		 * @param modelNames - Models to ensure; defaults to `['sensevoice']`.
		 */
		export declare function ensureModels(modelNames?: ModelName[]): Promise<void>;
		/** Returns the path of the VAD model file inside the models directory. */
		export declare function getVadModelPath(): string;
		/** Downloads the VAD model file unless it already exists. */
		export declare function ensureVadModel(): Promise<void>;
		export {};

-117

distribution/asr/models.js

		import fs from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';
		import process from 'node:process';
		import { execa } from 'execa';
		// Directory holding all downloaded models: `<home>/.coli/models`.
		const modelsDirectory = path.join(os.homedir(), '.coli', 'models');
		// Download catalogue, keyed by model name. For each entry:
		// - dirName: directory of the model below `modelsDirectory`
		// - url: location of the `.tar.bz2` archive to download
		// - checkFile: file inside `dirName` whose presence marks the model as installed
		const models = {
		whisper: {
		dirName: 'sherpa-onnx-whisper-tiny.en',
		url: 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2',
		checkFile: 'tiny.en-encoder.int8.onnx',
		},
		sensevoice: {
		dirName: 'sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17',
		url: 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17.tar.bz2',
		checkFile: 'model.int8.onnx',
		},
		};
		/** Human-readable name for each model, keyed by model name. */
		export const modelDisplayNames = {
		whisper: 'whisper-tiny.en',
		sensevoice: 'sensevoice-small',
		};
		export function getModelPath(model) {
		return path.join(modelsDirectory, models[model].dirName);
		}
		function isModelInstalled(entry) {
		const modelDir = path.join(modelsDirectory, entry.dirName);
		return fs.existsSync(path.join(modelDir, entry.checkFile));
		}
		async function downloadModel(entry) {
		const { dirName, url } = entry;
		console.log(`Downloading ${dirName}...`);
		fs.mkdirSync(modelsDirectory, { recursive: true });
		const tarPath = path.join(modelsDirectory, `${dirName}.tar.bz2`);
		const response = await fetch(url, { redirect: 'follow' });
		if (!response.ok \|\| !response.body) {
		throw new Error(`Failed to download model: ${response.statusText}`);
		}
		const contentLength = Number(response.headers.get('content-length') ?? 0);
		const reader = response.body.getReader();
		const fileHandle = fs.openSync(tarPath, 'w');
		let downloaded = 0;
		try {
		for (;;) {
		const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
		if (done) {
		break;
		}
		fs.writeSync(fileHandle, value);
		downloaded += value.length;
		if (contentLength > 0) {
		const percent = ((downloaded / contentLength) * 100).toFixed(1);
		const mb = (downloaded / (1024 * 1024)).toFixed(1);
		const totalMb = (contentLength / (1024 * 1024)).toFixed(1);
		process.stdout.write(`\r ${mb} MB / ${totalMb} MB (${percent}%)`);
		}
		}
		}
		finally {
		fs.closeSync(fileHandle);
		}
		process.stdout.write('\n');
		console.log(' Extracting...');
		await execa('tar', ['xjf', tarPath, '-C', modelsDirectory]);
		fs.unlinkSync(tarPath);
		console.log(` ${dirName} ready.\n`);
		}
		export async function ensureModels(modelNames = ['sensevoice']) {
		const pending = modelNames
		.map((name) => models[name])
		.filter((entry) => !isModelInstalled(entry));
		for (const entry of pending) {
		await downloadModel(entry); // eslint-disable-line no-await-in-loop
		}
		}
		// File name of the VAD model inside the models directory.
		const vadModelFile = 'silero_vad.onnx';
		// Download location of the VAD model; unlike the ASR models it is a single file, not an archive.
		const vadModelUrl = 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx';
		export function getVadModelPath() {
		return path.join(modelsDirectory, vadModelFile);
		}
		export async function ensureVadModel() {
		const modelPath = getVadModelPath();
		if (fs.existsSync(modelPath)) {
		return;
		}
		console.log(`Downloading ${vadModelFile}...`);
		fs.mkdirSync(modelsDirectory, { recursive: true });
		const response = await fetch(vadModelUrl, { redirect: 'follow' });
		if (!response.ok \|\| !response.body) {
		throw new Error(`Failed to download VAD model: ${response.statusText}`);
		}
		const contentLength = Number(response.headers.get('content-length') ?? 0);
		const reader = response.body.getReader();
		const fileHandle = fs.openSync(modelPath, 'w');
		let downloaded = 0;
		try {
		for (;;) {
		const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
		if (done) {
		break;
		}
		fs.writeSync(fileHandle, value);
		downloaded += value.length;
		if (contentLength > 0) {
		const percent = ((downloaded / contentLength) * 100).toFixed(1);
		const kb = (downloaded / 1024).toFixed(0);
		const totalKb = (contentLength / 1024).toFixed(0);
		process.stdout.write(`\r ${kb} KB / ${totalKb} KB (${percent}%)`);
		}
		}
		}
		finally {
		fs.closeSync(fileHandle);
		}
		process.stdout.write('\n');
		console.log(` ${vadModelFile} ready.\n`);
		}

-23

distribution/asr/stream-asr.d.ts

		/**
		 * One recognition result delivered to `onResult`.
		 * All fields except `isFinal` are forwarded from the recognizer result, with `text` trimmed.
		 */
		export type AsrStreamResult = {
		text: string;
		lang: string;
		emotion: string;
		event: string;
		tokens: string[];
		timestamps: number[];
		// `false` for interim results of interval mode; `true` for the closing result and for VAD segments.
		isFinal: boolean;
		};
		/**
		 * Tuning options for voice activity detection.
		 * Durations are presumably in seconds (TODO confirm against sherpa-onnx).
		 */
		export type VadOptions = {
		// Defaults to 0.5.
		threshold?: number;
		// Defaults to 0.25.
		minSpeechDuration?: number;
		// Defaults to 0.5.
		minSilenceDuration?: number;
		// Defaults to 15.
		maxSpeechDuration?: number;
		// Passed as the argument of `vad.front()` when a detected segment is read.
		enableExternalBuffer?: boolean;
		};
		/** Options for `streamAsr`. */
		export type StreamAsrOptions = {
			/** Sample rate used to size the recognition interval; defaults to 16000. */
			sampleRate?: number;
			/** Interval between interim recognitions in milliseconds; defaults to 1000. Not used in VAD mode. */
			asrIntervalMs?: number;
			/** `true` or an options object switches to VAD-segmented recognition. */
			vad?: boolean | VadOptions;
			/** Called for every non-empty recognition result. */
			onResult: (result: AsrStreamResult) => void;
		};
		/**
		 * Runs speech recognition over a stream of Float32 sample chunks and reports results through `options.onResult`.
		 * Uses VAD segmentation when `options.vad` is set, interval-based recognition otherwise.
		 */
		export declare function streamAsr(audio: AsyncIterable<Float32Array>, options: StreamAsrOptions): Promise<void>;

-135

distribution/asr/stream-asr.js

		import { createRequire } from 'node:module';
		import path from 'node:path';
		import { getModelPath, getVadModelPath } from './models.js';
		const require = createRequire(import.meta.url);
		// Holds the `sherpa-onnx-node` module once it has been required.
		let _sherpaOnnx;
		/**
		 * Returns the `sherpa-onnx-node` module, requiring it on the first call only.
		 */
		function sherpaOnnx() {
			if (_sherpaOnnx === undefined || _sherpaOnnx === null) {
				_sherpaOnnx = require('sherpa-onnx-node');
			}
			return _sherpaOnnx;
		}
		// Sample rate (Hz) configured on the recognizer and the VAD, and reported for every waveform.
		const defaultSampleRate = 16_000;
		// Interval between interim recognitions when the caller does not set `asrIntervalMs`.
		const defaultAsrIntervalMs = 1000;
		function createRecognizer() {
		const modelDir = getModelPath('sensevoice');
		const onnx = sherpaOnnx();
		return new onnx.OfflineRecognizer({
		featConfig: { sampleRate: defaultSampleRate, featureDim: 80 },
		modelConfig: {
		senseVoice: {
		model: path.join(modelDir, 'model.int8.onnx'),
		useInverseTextNormalization: 1,
		},
		tokens: path.join(modelDir, 'tokens.txt'),
		numThreads: 2,
		provider: 'cpu',
		debug: 0,
		},
		});
		}
		function recognize(recognizer, samples) {
		const stream = recognizer.createStream();
		stream.acceptWaveform({ sampleRate: defaultSampleRate, samples });
		recognizer.decode(stream);
		return recognizer.getResult(stream);
		}
		function mergeBuffers(buffers, totalLength) {
		if (buffers.length === 1 && buffers[0]) {
		return buffers[0];
		}
		const merged = new Float32Array(totalLength);
		let offset = 0;
		for (const buf of buffers) {
		merged.set(buf, offset);
		offset += buf.length;
		}
		return merged;
		}
		function createVad(vadOptions) {
		const onnx = sherpaOnnx();
		return new onnx.Vad({
		sileroVad: {
		model: getVadModelPath(),
		threshold: vadOptions.threshold ?? 0.5,
		minSpeechDuration: vadOptions.minSpeechDuration ?? 0.25,
		minSilenceDuration: vadOptions.minSilenceDuration ?? 0.5,
		maxSpeechDuration: vadOptions.maxSpeechDuration ?? 15,
		windowSize: 512,
		},
		sampleRate: defaultSampleRate,
		debug: 0,
		numThreads: 1,
		}, 60);
		}
		function emitResult(result, isFinal, onResult) {
		const text = result.text.trim();
		if (text) {
		onResult({ ...result, text, isFinal });
		}
		}
		async function streamWithVad(audio, options, vadOptions) {
		const recognizer = createRecognizer();
		const vad = createVad(vadOptions);
		const { windowSize } = vad.config.sileroVad;
		let pending = new Float32Array(0);
		function drainSegments() {
		while (!vad.isEmpty()) {
		const segment = vad.front(vadOptions.enableExternalBuffer);
		vad.pop();
		emitResult(recognize(recognizer, segment.samples), true, options.onResult);
		}
		}
		for await (const chunk of audio) {
		const combined = new Float32Array(pending.length + chunk.length);
		combined.set(pending);
		combined.set(chunk, pending.length);
		pending = combined;
		while (pending.length >= windowSize) {
		vad.acceptWaveform(pending.subarray(0, windowSize));
		pending = pending.subarray(windowSize);
		drainSegments();
		}
		}
		if (pending.length > 0) {
		const padded = new Float32Array(windowSize);
		padded.set(pending);
		vad.acceptWaveform(padded);
		}
		vad.flush();
		drainSegments();
		}
		async function streamWithInterval(audio, options) {
		const inputSampleRate = options.sampleRate ?? defaultSampleRate;
		const intervalMs = options.asrIntervalMs ?? defaultAsrIntervalMs;
		const chunkInterval = (defaultSampleRate * intervalMs) / 1000;
		const recognizer = createRecognizer();
		const buffers = [];
		let totalSamples = 0;
		let lastRecognizedAt = 0;
		let lastText = '';
		for await (const chunk of audio) {
		buffers.push(chunk);
		totalSamples += chunk.length;
		const samplesForInterval = (chunkInterval * inputSampleRate) / defaultSampleRate;
		if (totalSamples - lastRecognizedAt >= samplesForInterval) {
		lastRecognizedAt = totalSamples;
		const merged = mergeBuffers(buffers, totalSamples);
		const result = recognize(recognizer, merged);
		const text = result.text.trim();
		if (text && text !== lastText) {
		lastText = text;
		options.onResult({ ...result, text, isFinal: false });
		}
		}
		}
		const merged = mergeBuffers(buffers, totalSamples);
		if (merged.length > 0) {
		emitResult(recognize(recognizer, merged), true, options.onResult);
		}
		}
		export async function streamAsr(audio, options) {
		if (options.vad) {
		const vadOptions = typeof options.vad === 'object' ? options.vad : {};
		return streamWithVad(audio, options, vadOptions);
		}
		return streamWithInterval(audio, options);
		}

-2

distribution/cli.d.ts

		#!/usr/bin/env node
		export {};

-11

distribution/cli.js

		#!/usr/bin/env node
		import { Command } from 'commander';
		import { register as registerAsr } from './asr/_cli.js';
		import { register as registerCloudTts } from './cloud-tts/_cli.js';
		import { register as registerTts } from './tts/_cli.js';
		// Root command of the `coli` CLI.
		const program = new Command().name('coli').description('Core CLI for Cola');
		// Each feature module adds its own subcommands; the registration order is asr, tts, cloud-tts.
		for (const registerCommands of [registerAsr, registerTts, registerCloudTts]) {
			registerCommands(program);
		}
		// Parses the process arguments and dispatches to the matching subcommand.
		program.parse();

-2

distribution/cloud-tts/_cli.d.ts

		import type { Command } from 'commander';
		/** Registers the `cloud-tts` subcommand on the given commander program. */
		export declare function register(program: Command): void;

-56

distribution/cloud-tts/_cli.js

		import process from 'node:process';
		import { defaultSpeaker } from '../_api/constants.js';
		import { listSpeakers, runCloudTts } from './cloud-tts.js';
		function getApiKey(options) {
		const key = options.apiKey ?? process.env['COLI_LISTENHUB_API_KEY'];
		if (!key) {
		throw new Error('API key required. Use --api-key or set COLI_LISTENHUB_API_KEY environment variable. Get an API key from https://listenhub.ai/settings/api-keys');
		}
		return key;
		}
		export function register(program) {
		program
		.command('cloud-tts')
		.description('Generate speech using ListenHub OpenAPI')
		.argument('[text]', 'Text to synthesize')
		.option('--api-key <key>', 'ListenHub API key (or set COLI_LISTENHUB_API_KEY environment variable)')
		.option('--voice <id>', 'Speaker ID to use')
		.option('--model <name>', 'Model to use (default: flowtts)')
		.option('-o, --output <file>', 'Save audio to file')
		.option('--list-speakers', 'List available speakers')
		.option('--language <lang>', 'Speaker language (en, zh, ja)')
		.option('-j, --json', 'Output in JSON format (use with --list-speakers)')
		.action(async (text, options) => {
		if (options.listSpeakers) {
		const apiKey = getApiKey(options);
		const speakers = await listSpeakers({
		apiKey,
		language: options.language,
		});
		if (options.json) {
		console.log(JSON.stringify(speakers, null, 2));
		}
		else {
		for (const speaker of speakers) {
		console.log(`${speaker.name}\t${speaker.speakerId}\t${speaker.gender}\t${speaker.language}`);
		}
		}
		return;
		}
		if (!text) {
		throw new Error('Please provide text to synthesize.');
		}
		const voice = options.voice ??
		(options.language && defaultSpeaker[options.language]);
		if (!voice) {
		throw new Error('Please specify a speaker with --voice or a language with --language. Use --list-speakers to see available speakers.');
		}
		const apiKey = getApiKey(options);
		await runCloudTts(text, {
		apiKey,
		voice,
		model: options.model,
		output: options.output,
		});
		});
		}

-1

distribution/cloud-tts/_index.d.ts

export { listSpeakers, runCloudTts, type CloudTtsOptions, type ListSpeakersOptions, } from './cloud-tts.js';

-1

distribution/cloud-tts/_index.js

export { listSpeakers, runCloudTts, } from './cloud-tts.js';

-19

distribution/cloud-tts/cloud-tts.d.ts

		import type { SpeakerLanguage } from '../_api/types.js';
		/** Options for {@link runCloudTts}. */
		export type CloudTtsOptions = {
		// ListenHub API key.
		apiKey: string;
		// Speaker ID used for synthesis.
		voice: string;
		// Model forwarded to the TTS request.
		model?: string;
		// When set, the audio is written to this file instead of being played.
		output?: string;
		};
		/** Options for {@link listSpeakers}. */
		export type ListSpeakersOptions = {
		// ListenHub API key.
		apiKey: string;
		// Language filter forwarded to the speakers request.
		language?: SpeakerLanguage;
		};
		/**
		 * Fetches the available speakers and returns the `items` array of the response payload.
		 */
		export declare function listSpeakers(options: ListSpeakersOptions): Promise<{
		name: string;
		speakerId: string;
		demoAudioUrl: string;
		gender: string;
		language: SpeakerLanguage;
		}[]>;
		/**
		 * Synthesizes `text` through the TTS API, then saves the audio to `options.output` or plays it with `afplay`.
		 */
		export declare function runCloudTts(text: string, options: CloudTtsOptions): Promise<void>;

-47

distribution/cloud-tts/cloud-tts.js

		import { Buffer } from 'node:buffer';
		import fs from 'node:fs';
		import os from 'node:os';
		import path from 'node:path';
		import { Writable } from 'node:stream';
		import { execa } from 'execa';
		import { ListenHubApi } from '../_api/listenhub-openapi.js';
		export async function listSpeakers(options) {
		const api = new ListenHubApi({ apiKey: options.apiKey });
		const result = await api.getAvailableSpeakers({
		language: options.language,
		});
		return result.data.items;
		}
		async function collectStream(stream) {
		const chunks = [];
		const reader = stream.getReader();
		for (;;) {
		const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
		if (done)
		break;
		chunks.push(value);
		}
		return Buffer.concat(chunks);
		}
		export async function runCloudTts(text, options) {
		const api = new ListenHubApi({ apiKey: options.apiKey });
		const stream = await api.tts({
		input: text,
		voice: options.voice,
		model: options.model,
		});
		if (options.output) {
		const fileStream = fs.createWriteStream(options.output);
		await stream.pipeTo(Writable.toWeb(fileStream));
		return;
		}
		const mp3Path = path.join(os.tmpdir(), `coli-cloud-tts-${Date.now()}.mp3`);
		const audio = await collectStream(stream);
		fs.writeFileSync(mp3Path, audio);
		try {
		await execa('afplay', [mp3Path]);
		}
		finally {
		fs.unlinkSync(mp3Path);
		}
		}

-4

distribution/index.d.ts

		export * from './_api/listenhub-openapi.js';
		export * from './asr/_index.js';
		export * from './cloud-tts/_index.js';
		export * from './tts/_index.js';

-4

distribution/index.js

		export * from './_api/listenhub-openapi.js';
		export * from './asr/_index.js';
		export * from './cloud-tts/_index.js';
		export * from './tts/_index.js';

-2

distribution/tts/_cli.d.ts

		import type { Command } from 'commander';
		/** Registers the `tts` subcommand on the given commander program. */
		export declare function register(program: Command): void;

-30

distribution/tts/_cli.js

		import { getVoices, runTts } from './tts.js';
		export function register(program) {
		program
		.command('tts')
		.description('Speak text using text-to-speech (macOS only)')
		.argument('[text]', 'Text to speak')
		.option('-v, --voice <name>', 'Voice to use, defaults to macOS system voice')
		.option('-r, --rate <wpm>', 'Speech rate in words per minute', Number)
		.option('-o, --output <file>', 'Save audio to file instead of speaking')
		.option('--list-voices', 'List available voices')
		.option('-j, --json', 'Output in JSON format (use with --list-voices)')
		.action(async (text, options) => {
		if (options.listVoices) {
		const voices = await getVoices();
		if (options.json) {
		console.log(JSON.stringify(voices, null, 2));
		}
		else {
		for (const voice of voices) {
		console.log(`${voice.name}\t${voice.languageCode}\t${voice.example}`);
		}
		}
		return;
		}
		if (!text) {
		throw new Error('Please provide text to speak.');
		}
		await runTts(text, options);
		});
		}

-1

distribution/tts/_index.d.ts

export { getVoices, runTts, type TtsOptions } from './tts.js';

-1

distribution/tts/_index.js

export { getVoices, runTts } from './tts.js';

-8

distribution/tts/tts.d.ts

		import { type Voice } from 'mac-say';
		/** Options for {@link runTts}; each value is forwarded to `say` from `mac-say`. */
		export type TtsOptions = {
		voice?: string;
		rate?: number;
		// Forwarded to `say` as `outputFile`.
		output?: string;
		};
		/** Returns the voices reported by `mac-say`. */
		export declare function getVoices(): Promise<Voice[]>;
		/** Passes `text` to `say` from `mac-say` with the given voice, rate and output file. */
		export declare function runTts(text: string, options?: TtsOptions): Promise<void>;

-11

distribution/tts/tts.js

		import { getVoices as macGetVoices, say } from 'mac-say';
		export async function getVoices() {
		return macGetVoices();
		}
		export async function runTts(text, options = {}) {
		await say(text, {
		voice: options.voice,
		rate: options.rate,
		outputFile: options.output,
		});
		}

		@@ -8,2 +8,3 @@ # Cloud TTS
		- A ListenHub API key. Pass it via `--api-key` or set the `COLI_LISTENHUB_API_KEY` environment variable.
		- Optionally, a custom base URL via `--base-url` or the `COLI_TTS_BASE_URL` environment variable.

		@@ -39,2 +40,3 @@ ## CLI
		--model <name> Model to use (default: flowtts)
		--base-url <url> Base URL for TTS API (or set COLI_TTS_BASE_URL)
		-o, --output <file> Save audio to file
		@@ -70,2 +72,3 @@ --list-speakers List available speakers
		| `language` | `'en' \| 'zh' \| 'ja'` | Filter speakers by language. Omit to list all. |
		| `baseUrl` | `string` | Custom base URL for TTS API (optional) |

		@@ -89,7 +92,8 @@ ### `runCloudTts(text, options)`

		| Property | Type | Description |
		| -------- | -------- | ---------------------------------------------- |
		| `apiKey` | `string` | ListenHub API key |
		| `voice` | `string` | Speaker ID (from `listSpeakers`) |
		| `model` | `string` | Model to use (optional, defaults to `flowtts`) |
		| `output` | `string` | Save to file instead of playing directly |
		| Property | Type | Description |
		| --------- | -------- | ---------------------------------------------- |
		| `apiKey` | `string` | ListenHub API key |
		| `voice` | `string` | Speaker ID (from `listSpeakers`) |
		| `model` | `string` | Model to use (optional, defaults to `flowtts`) |
		| `output` | `string` | Save to file instead of playing directly |
		| `baseUrl` | `string` | Custom base URL for TTS API (optional) |

@marswave/coli - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes