@marswave/coli
Advanced tools
| import type { SpeakerLanguage } from './types.js'; | ||
/** Speaker ID keyed by every supported `SpeakerLanguage` code. */
| export declare const defaultSpeaker: Record<SpeakerLanguage, string>; |
/**
 * Maps each language code to a speaker ID.
 * The `cloud-tts` CLI falls back to this table when `--language` is given without `--voice`.
 */
| export const defaultSpeaker = { | ||
| en: 'chat-girl-105-cn', | ||
| zh: 'leo-9328b6d2', | ||
| ja: 'tianzhongdunzi-5d612542', | ||
| }; |
| import { type KyInstance } from 'ky'; | ||
| import type { ApiResponse, SpeakerLanguage } from './types.js'; | ||
| export * from './constants.js'; | ||
| export type * from './types.js'; | ||
/** Constructor options for `ListenHubApi`. */
| export type ListenHubApiOptions = { | ||
/** API key; the implementation sends it as an `Authorization: Bearer …` header. */
| apiKey: string; | ||
/** Optional prefix URL; the implementation falls back to `https://api.marswave.ai/openapi`. */
| baseUrl?: string; | ||
| }; | ||
/**
 * Client for the ListenHub OpenAPI: a configured `ky` instance plus two request
 * methods (speaker listing and streaming text-to-speech).
 */
| export declare class ListenHubApi { | ||
/** `ky` instance that both request methods go through. */
| api: KyInstance; | ||
/**
 * @param options - The API key and an optional base URL override.
 */
| constructor({ apiKey, baseUrl }: ListenHubApiOptions); | ||
| /** | ||
| * Get a list of available speakers. | ||
| * @param options - The options for the speakers request. | ||
| * @param options.language - Optional. The language of the speakers to get, defaults to English. | ||
| * @returns A list of available speakers. | ||
| * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/speakers#list-available-speakers|List Available Speakers} | ||
| */ | ||
| getAvailableSpeakers(options?: { | ||
| language?: SpeakerLanguage; | ||
| }): Promise<ApiResponse<{ | ||
| items: Array<{ | ||
| name: string; | ||
| speakerId: string; | ||
| demoAudioUrl: string; | ||
| gender: string; | ||
| language: SpeakerLanguage; | ||
| }>; | ||
| }>>; | ||
| /** | ||
| * Generate audio from text using the Streaming TTS API. | ||
| * @param options - The options for the TTS request. | ||
| * @param options.input - The text to generate audio from. | ||
| * @param options.voice - The `speakerId` to use for the TTS. | ||
| * @param options.model - Optional. The model to use for the TTS, defaults to `flowtts`. | ||
| * @returns A readable stream of the MP3 audio. | ||
| * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/flowspeech|Streaming TTS} | ||
| */ | ||
| tts(options: { | ||
| input: string; | ||
| voice: string; | ||
| model?: string; | ||
| }): Promise<ReadableStream<Uint8Array<ArrayBuffer>>>; | ||
| } |
| import ky from 'ky'; | ||
| export * from './constants.js'; | ||
/**
 * Thin client for the ListenHub OpenAPI built on a pre-configured `ky` instance.
 */
export class ListenHubApi {
    /** `ky` instance carrying the prefix URL and the bearer-token header. */
    api;

    /**
     * @param options - Client options.
     * @param options.apiKey - API key sent as `Authorization: Bearer <apiKey>`.
     * @param options.baseUrl - Optional prefix URL; when nullish the public endpoint is used.
     */
    constructor({ apiKey, baseUrl }) {
        const prefixUrl = baseUrl ?? 'https://api.marswave.ai/openapi';
        const headers = {
            // eslint-disable-next-line @typescript-eslint/naming-convention
            Authorization: `Bearer ${apiKey}`,
        };
        this.api = ky.extend({ prefixUrl, headers });
    }

    /**
     * Fetch the speaker catalogue (`GET v1/speakers/list`).
     * @param options - Forwarded verbatim as the query string; may be omitted.
     * @param options.language - Optional language filter. When omitted no filter is sent,
     * so whatever default the server applies is used.
     * @returns The parsed JSON response body.
     * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/speakers#list-available-speakers|List Available Speakers}
     */
    async getAvailableSpeakers(options) {
        const request = this.api.get('v1/speakers/list', { searchParams: options });
        return request.json();
    }

    /**
     * Synthesize speech (`POST v1/tts`) and hand back the raw response stream.
     * @param options - Sent as the JSON request body.
     * @param options.input - The text to generate audio from.
     * @param options.voice - The `speakerId` to use.
     * @param options.model - Optional model name.
     * @returns The response body stream (MP3 audio according to the API docs).
     * @throws Error when the response carries no body.
     * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/flowspeech|Streaming TTS}
     */
    async tts(options) {
        // eslint-disable-next-line @typescript-eslint/await-thenable
        const { body } = await this.api.post('v1/tts', { json: options });
        if (body) {
            return body;
        }
        throw new Error('Empty response body from TTS API');
    }
}
/** Generic response envelope: a numeric `code`, a `message`, and the payload in `data`. */
| export type ApiResponse<T> = { | ||
| code: number; | ||
| message: string; | ||
| data: T; | ||
| }; | ||
/** Language codes a speaker can be selected or filtered by. */
| export type SpeakerLanguage = 'en' | 'zh' | 'ja'; |
| export {}; |
| import type { Command } from 'commander'; | ||
/** Adds the `asr` and `asr-stream` subcommands to the given Commander program. */
| export declare function register(program: Command): void; |
| import { Buffer } from 'node:buffer'; | ||
| import process from 'node:process'; | ||
| import { runAsr } from './asr.js'; | ||
| import { ensureModels, ensureVadModel } from './models.js'; | ||
| import { streamAsr } from './stream-asr.js'; | ||
/**
 * Register the speech-recognition subcommands on a Commander program.
 *
 * - `asr <file>`: makes sure the chosen model is downloaded, then transcribes the file.
 * - `asr-stream`: reads raw 16 kHz mono s16le PCM from stdin and prints results as they arrive.
 *
 * @param program - The Commander program to extend.
 */
export function register(program) {
    program
        .command('asr')
        .description('Transcribe an audio file using speech recognition')
        .argument('<file>', 'Audio file to transcribe')
        .option('-j, --json', 'Output result in JSON format', false)
        .option('--model <name>', 'Model to use: whisper, sensevoice', 'sensevoice')
        .action(async (file, options) => {
            const { model } = options;
            if (model !== 'whisper' && model !== 'sensevoice') {
                throw new Error(`Unknown model "${model}". Use "whisper" or "sensevoice".`);
            }
            await ensureModels([model]);
            await runAsr(file, { json: options.json, model });
        });
    program
        .command('asr-stream')
        .description('Stream speech recognition from stdin (expects 16kHz mono s16le PCM)')
        .option('-j, --json', 'Output each result as a JSON line', false)
        .option('--vad', 'Enable voice activity detection', false)
        .option('--asr-interval-ms <ms>', 'Recognition interval in ms (ignored with --vad)', '1000')
        .action(async (options) => {
            const asrIntervalMs = Number(options.asrIntervalMs);
            // A non-numeric interval would become NaN and silently disable interim results,
            // so reject it up front (before any model download). The value is unused with --vad.
            if (!options.vad && !Number.isFinite(asrIntervalMs)) {
                throw new Error(`Invalid --asr-interval-ms "${options.asrIntervalMs}". Expected a number of milliseconds.`);
            }
            await ensureModels();
            if (options.vad) {
                await ensureVadModel();
            }
            async function* stdinAudio() {
                // stdin chunks may end in the middle of a 16-bit sample; the dangling byte is
                // carried into the next chunk so samples never get dropped or misaligned.
                let carry = Buffer.alloc(0);
                // eslint-disable-next-line @typescript-eslint/await-thenable
                for await (const chunk of process.stdin) {
                    const incoming = Buffer.from(chunk);
                    const bytes = carry.length > 0 ? Buffer.concat([carry, incoming]) : incoming;
                    const sampleCount = Math.floor(bytes.length / 2);
                    carry = Buffer.from(bytes.subarray(sampleCount * 2));
                    if (sampleCount === 0) {
                        continue;
                    }
                    const float32 = new Float32Array(sampleCount);
                    for (let i = 0; i < sampleCount; i++) {
                        // readInt16LE decodes little-endian explicitly and has no alignment requirement.
                        float32[i] = bytes.readInt16LE(i * 2) / 32_768;
                    }
                    yield float32;
                }
                // A single trailing byte at EOF cannot form a sample and is discarded.
            }
            await streamAsr(stdinAudio(), {
                vad: options.vad || undefined,
                asrIntervalMs,
                onResult(result) {
                    if (options.json) {
                        console.log(JSON.stringify(result));
                    }
                    else {
                        console.log(result.text);
                    }
                },
            });
        });
}
| export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js'; | ||
| export { streamAsr, type AsrStreamResult, type StreamAsrOptions, type VadOptions, } from './stream-asr.js'; |
| export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js'; | ||
| export { streamAsr, } from './stream-asr.js'; |
/** Names of the recognition models `runAsr` can use. */
| type ModelName = 'whisper' | 'sensevoice'; | ||
/** Options for `runAsr`. */
| export type AsrOptions = { | ||
/** When true, print a JSON object instead of the plain transcript. */
| json: boolean; | ||
/** Which recognition model to run. */
| model: ModelName; | ||
| }; | ||
/** Transcribes the audio file at `filePath` and prints the result to stdout. */
| export declare function runAsr(filePath: string, options: AsrOptions): Promise<void>; | ||
| export {}; |
| import fs from 'node:fs'; | ||
| import { createRequire } from 'node:module'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
| import { execa } from 'execa'; | ||
| import { getModelPath, modelDisplayNames } from './models.js'; | ||
| const require = createRequire(import.meta.url); | ||
// Cached module object; stays unset until the first call so the native addon
// is only loaded when recognition is actually requested.
let _sherpaOnnx;

/**
 * Return the `sherpa-onnx-node` module, requiring it on first use and caching it afterwards.
 */
function sherpaOnnx() {
    if (_sherpaOnnx === undefined || _sherpaOnnx === null) {
        // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
        _sherpaOnnx = require('sherpa-onnx-node');
    }
    return _sherpaOnnx;
}
/**
 * Transcode an audio file with ffmpeg into a 16 kHz, mono, 16-bit PCM WAV in the OS temp directory.
 *
 * @param inputPath - Path of the audio file to convert.
 * @returns Path of the generated WAV file; the caller is responsible for deleting it.
 * @throws Error when ffmpeg cannot be run or the conversion fails. The underlying
 * error is attached as `cause`.
 */
async function convertToWav(inputPath) {
    // A timestamp alone can collide when two conversions start in the same millisecond.
    const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
    const outputPath = path.join(os.tmpdir(), `coli-${uniqueSuffix}.wav`);
    try {
        // eslint-disable-next-line @typescript-eslint/await-thenable
        await execa('ffmpeg', [
            '-i',
            inputPath,
            '-ar',
            '16000',
            '-ac',
            '1',
            '-f',
            'wav',
            '-acodec',
            'pcm_s16le',
            outputPath,
            '-y',
        ]);
    }
    catch (error) {
        // ffmpeg may have written a partial file before failing; don't leave it behind.
        fs.rmSync(outputPath, { force: true });
        // Keep the friendly hint but preserve the real failure for debugging.
        throw new Error('Failed to convert audio file. Please make sure ffmpeg is installed.\n' +
            ' brew install ffmpeg # macOS\n' +
            ' sudo apt install ffmpeg # Debian/Ubuntu', { cause: error });
    }
    return outputPath;
}
/**
 * Build a sherpa-onnx `OfflineRecognizer` for the requested model.
 *
 * @param model - `'whisper'` selects the Whisper files; any other value uses the SenseVoice files.
 * @returns The configured recognizer instance.
 */
function createRecognizer(model) {
    const modelDir = getModelPath(model);
    const onnx = sherpaOnnx();
    const inModelDir = (fileName) => path.join(modelDir, fileName);
    // Only the model-specific section and the token file differ between the two models.
    const modelFiles = model === 'whisper'
        ? {
            whisper: {
                encoder: inModelDir('tiny.en-encoder.int8.onnx'),
                decoder: inModelDir('tiny.en-decoder.int8.onnx'),
            },
            tokens: inModelDir('tiny.en-tokens.txt'),
        }
        : {
            senseVoice: {
                model: inModelDir('model.int8.onnx'),
                useInverseTextNormalization: 1,
            },
            tokens: inModelDir('tokens.txt'),
        };
    const config = {
        featConfig: { sampleRate: 16_000, featureDim: 80 },
        modelConfig: {
            ...modelFiles,
            numThreads: 2,
            provider: 'cpu',
            debug: 0,
        },
    };
    return new onnx.OfflineRecognizer(config);
}
/**
 * Transcribe an audio file and print the transcript to stdout.
 *
 * Files whose extension is not `.wav` are first converted to a temporary WAV,
 * which is removed afterwards even if recognition fails.
 *
 * @param filePath - Path to the audio file (resolved against the current directory).
 * @param options - `model` selects the recognizer; `json` switches to a JSON report.
 * @throws Error when the file does not exist.
 */
export async function runAsr(filePath, options) {
    const resolvedPath = path.resolve(filePath);
    if (!fs.existsSync(resolvedPath)) {
        throw new Error(`File not found: ${resolvedPath}`);
    }
    const isWav = path.extname(resolvedPath).toLowerCase() === '.wav';
    const wavPath = isWav ? resolvedPath : await convertToWav(resolvedPath);
    // Only a WAV we created ourselves may be deleted.
    const needsCleanup = !isWav;
    try {
        const onnx = sherpaOnnx();
        const recognizer = createRecognizer(options.model);
        const stream = recognizer.createStream();
        const wave = onnx.readWave(wavPath);
        const { sampleRate, samples } = wave;
        stream.acceptWaveform({ sampleRate, samples });
        recognizer.decode(stream);
        const result = recognizer.getResult(stream);
        const text = result.text.trim();
        if (!options.json) {
            console.log(text);
            return;
        }
        const report = {
            text,
            model: modelDisplayNames[options.model],
            // Empty values become undefined so JSON.stringify omits them.
            lang: result.lang || undefined,
            emotion: result.emotion || undefined,
            event: result.event || undefined,
            tokens: result.tokens,
            timestamps: result.timestamps,
            duration: samples.length / sampleRate,
        };
        console.log(JSON.stringify(report, null, 2));
    }
    finally {
        if (needsCleanup && fs.existsSync(wavPath)) {
            fs.unlinkSync(wavPath);
        }
    }
}
/** Names of the downloadable recognition models. */
| type ModelName = 'whisper' | 'sensevoice'; | ||
/** Display name for each model. */
| export declare const modelDisplayNames: Record<ModelName, string>; | ||
/** Returns the directory a model is (or will be) extracted to. */
| export declare function getModelPath(model: ModelName): string; | ||
/** Downloads any of the named models that are not installed yet; the implementation defaults to `['sensevoice']`. */
| export declare function ensureModels(modelNames?: ModelName[]): Promise<void>; | ||
/** Returns the path of the VAD model file. */
| export declare function getVadModelPath(): string; | ||
/** Downloads the VAD model file if it is not present. */
| export declare function ensureVadModel(): Promise<void>; | ||
| export {}; |
| import fs from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
| import process from 'node:process'; | ||
| import { execa } from 'execa'; | ||
/** Directory under the user's home where models are stored (`<home>/.coli/models`). */
| const modelsDirectory = path.join(os.homedir(), '.coli', 'models'); | ||
/**
 * Download metadata per model:
 * - `dirName`: directory name under the models directory, also the archive base name;
 * - `url`: the `.tar.bz2` archive to fetch;
 * - `checkFile`: a file inside `dirName` whose presence marks the model as installed.
 */
| const models = { | ||
| whisper: { | ||
| dirName: 'sherpa-onnx-whisper-tiny.en', | ||
| url: 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2', | ||
| checkFile: 'tiny.en-encoder.int8.onnx', | ||
| }, | ||
| sensevoice: { | ||
| dirName: 'sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17', | ||
| url: 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17.tar.bz2', | ||
| checkFile: 'model.int8.onnx', | ||
| }, | ||
| }; | ||
/** Display names per model; `runAsr` reports them in the `model` field of its JSON output. */
| export const modelDisplayNames = { | ||
| whisper: 'whisper-tiny.en', | ||
| sensevoice: 'sensevoice-small', | ||
| }; | ||
/**
 * Resolve the directory a model is extracted into.
 *
 * @param model - Key of the `models` table.
 * @returns `<modelsDirectory>/<dirName>` for that model.
 */
export function getModelPath(model) {
    const { dirName } = models[model];
    return path.join(modelsDirectory, dirName);
}
/**
 * Tell whether a model has already been extracted, judged by the presence of its marker file.
 *
 * @param entry - A value of the `models` table.
 * @returns True when `<modelsDirectory>/<dirName>/<checkFile>` exists.
 */
function isModelInstalled(entry) {
    const markerFile = path.join(modelsDirectory, entry.dirName, entry.checkFile);
    return fs.existsSync(markerFile);
}
/**
 * Download a model archive into the models directory, extract it with `tar`, and
 * delete the archive afterwards.
 *
 * Progress is written to stdout when the server reports a content length.
 * The archive is removed even when the download or the extraction fails, so a
 * broken `.tar.bz2` is never left behind.
 *
 * @param entry - A value of the `models` table (`dirName`, `url`).
 * @throws Error when the HTTP response is not OK or has no body; errors from
 * writing the file or running `tar` propagate unchanged.
 */
async function downloadModel(entry) {
    const { dirName, url } = entry;
    console.log(`Downloading ${dirName}...`);
    fs.mkdirSync(modelsDirectory, { recursive: true });
    const tarPath = path.join(modelsDirectory, `${dirName}.tar.bz2`);
    const response = await fetch(url, { redirect: 'follow' });
    if (!response.ok || !response.body) {
        // statusText can be empty, so include the numeric status as well.
        throw new Error(`Failed to download model: ${`${response.status} ${response.statusText}`.trim()}`);
    }
    const contentLength = Number(response.headers.get('content-length') ?? 0);
    const reader = response.body.getReader();
    let downloaded = 0;
    try {
        const fileHandle = fs.openSync(tarPath, 'w');
        try {
            for (;;) {
                const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
                if (done) {
                    break;
                }
                fs.writeSync(fileHandle, value);
                downloaded += value.length;
                if (contentLength > 0) {
                    const percent = ((downloaded / contentLength) * 100).toFixed(1);
                    const mb = (downloaded / (1024 * 1024)).toFixed(1);
                    const totalMb = (contentLength / (1024 * 1024)).toFixed(1);
                    // "\r" rewrites the same terminal line on every chunk.
                    process.stdout.write(`\r ${mb} MB / ${totalMb} MB (${percent}%)`);
                }
            }
        }
        finally {
            fs.closeSync(fileHandle);
        }
        process.stdout.write('\n');
        console.log(' Extracting...');
        // eslint-disable-next-line @typescript-eslint/await-thenable
        await execa('tar', ['xjf', tarPath, '-C', modelsDirectory]);
    }
    finally {
        // The archive is only an intermediate artifact: drop it on success and on failure alike.
        fs.rmSync(tarPath, { force: true });
    }
    console.log(` ${dirName} ready.\n`);
}
/**
 * Make sure the named models are installed, downloading the missing ones one at a time.
 *
 * @param modelNames - Keys of the `models` table; defaults to `['sensevoice']`.
 */
export async function ensureModels(modelNames = ['sensevoice']) {
    // Decide what is missing for the whole list first, then download.
    const entries = modelNames.map((name) => models[name]);
    const missing = [];
    for (const entry of entries) {
        if (!isModelInstalled(entry)) {
            missing.push(entry);
        }
    }
    for (const entry of missing) {
        await downloadModel(entry); // eslint-disable-line no-await-in-loop
    }
}
// File name and download location of the Silero VAD model.
const vadModelFile = 'silero_vad.onnx';
const vadModelUrl = 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx';

/**
 * Resolve where the VAD model file lives inside the models directory.
 *
 * @returns `<modelsDirectory>/silero_vad.onnx`.
 */
export function getVadModelPath() {
    const vadModelPath = path.join(modelsDirectory, vadModelFile);
    return vadModelPath;
}
/**
 * Download the VAD model file unless it is already present.
 *
 * The download is streamed into a sibling temporary file and only renamed to the
 * final path once it has completed. Writing straight to the final path would let
 * an interrupted download leave a truncated file that the `existsSync` check
 * accepts on every later run.
 *
 * @throws Error when the HTTP response is not OK or has no body; file-system
 * errors propagate unchanged.
 */
export async function ensureVadModel() {
    const modelPath = getVadModelPath();
    if (fs.existsSync(modelPath)) {
        return;
    }
    console.log(`Downloading ${vadModelFile}...`);
    fs.mkdirSync(modelsDirectory, { recursive: true });
    const response = await fetch(vadModelUrl, { redirect: 'follow' });
    if (!response.ok || !response.body) {
        throw new Error(`Failed to download VAD model: ${response.statusText}`);
    }
    const contentLength = Number(response.headers.get('content-length') ?? 0);
    const reader = response.body.getReader();
    const partialPath = `${modelPath}.download`;
    let downloaded = 0;
    try {
        const fileHandle = fs.openSync(partialPath, 'w');
        try {
            for (;;) {
                const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
                if (done) {
                    break;
                }
                fs.writeSync(fileHandle, value);
                downloaded += value.length;
                if (contentLength > 0) {
                    const percent = ((downloaded / contentLength) * 100).toFixed(1);
                    const kb = (downloaded / 1024).toFixed(0);
                    const totalKb = (contentLength / 1024).toFixed(0);
                    // "\r" rewrites the same terminal line on every chunk.
                    process.stdout.write(`\r ${kb} KB / ${totalKb} KB (${percent}%)`);
                }
            }
        }
        finally {
            fs.closeSync(fileHandle);
        }
        process.stdout.write('\n');
        // Publish the file only after every byte has been written.
        fs.renameSync(partialPath, modelPath);
    }
    catch (error) {
        fs.rmSync(partialPath, { force: true });
        throw error;
    }
    console.log(` ${vadModelFile} ready.\n`);
}
/** One recognition result delivered to `StreamAsrOptions.onResult`. */
| export type AsrStreamResult = { | ||
/** Transcript; the implementation trims it and never emits an empty one. */
| text: string; | ||
| lang: string; | ||
| emotion: string; | ||
| event: string; | ||
| tokens: string[]; | ||
| timestamps: number[]; | ||
/** False for interim interval results; true for VAD segments and the end-of-stream result. */
| isFinal: boolean; | ||
| }; | ||
/** Tuning knobs forwarded to the Silero VAD configuration. */
| export type VadOptions = { | ||
/** Implementation default: 0.5. */
| threshold?: number; | ||
/** Implementation default: 0.25 (presumably seconds — confirm against sherpa-onnx docs). */
| minSpeechDuration?: number; | ||
/** Implementation default: 0.5 (presumably seconds — confirm). */
| minSilenceDuration?: number; | ||
/** Implementation default: 15 (presumably seconds — confirm). */
| maxSpeechDuration?: number; | ||
/** Passed as the argument of `vad.front()` when detected segments are read. */
| enableExternalBuffer?: boolean; | ||
| }; | ||
/** Options for `streamAsr`. */
| export type StreamAsrOptions = { | ||
/** Sample rate of the incoming audio; the implementation defaults to 16000. */
| sampleRate?: number; | ||
/** Interval between interim recognitions in ms (default 1000); only read when `vad` is off. */
| asrIntervalMs?: number; | ||
/** `true` or an options object switches to VAD-segmented recognition. */
| vad?: boolean | VadOptions; | ||
/** Called synchronously for every non-empty result. */
| onResult: (result: AsrStreamResult) => void; | ||
| }; | ||
/** Consumes an async stream of float samples and reports recognition results through `options.onResult`. */
| export declare function streamAsr(audio: AsyncIterable<Float32Array>, options: StreamAsrOptions): Promise<void>; |
| import { createRequire } from 'node:module'; | ||
| import path from 'node:path'; | ||
| import { getModelPath, getVadModelPath } from './models.js'; | ||
| const require = createRequire(import.meta.url); | ||
// Cached module object; unset until the first call so the native addon loads on demand.
let _sherpaOnnx;

/**
 * Return the `sherpa-onnx-node` module, requiring it on first use and caching it afterwards.
 */
function sherpaOnnx() {
    if (_sherpaOnnx === undefined || _sherpaOnnx === null) {
        // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion
        _sherpaOnnx = require('sherpa-onnx-node');
    }
    return _sherpaOnnx;
}
// Defaults shared by the streaming helpers below.
const defaultSampleRate = 16_000;
const defaultAsrIntervalMs = 1000;

/**
 * Build the SenseVoice `OfflineRecognizer` used for streaming recognition.
 *
 * @returns The configured recognizer instance.
 */
function createRecognizer() {
    const modelDir = getModelPath('sensevoice');
    const onnx = sherpaOnnx();
    const featConfig = { sampleRate: defaultSampleRate, featureDim: 80 };
    const senseVoice = {
        model: path.join(modelDir, 'model.int8.onnx'),
        useInverseTextNormalization: 1,
    };
    const modelConfig = {
        senseVoice,
        tokens: path.join(modelDir, 'tokens.txt'),
        numThreads: 2,
        provider: 'cpu',
        debug: 0,
    };
    return new onnx.OfflineRecognizer({ featConfig, modelConfig });
}
/**
 * Run one offline recognition pass over a block of samples.
 *
 * @param recognizer - Recognizer created by `createRecognizer`.
 * @param samples - Audio samples, labelled with the default sample rate.
 * @returns Whatever `recognizer.getResult` reports for the decoded stream.
 */
function recognize(recognizer, samples) {
    const stream = recognizer.createStream();
    const waveform = { sampleRate: defaultSampleRate, samples };
    stream.acceptWaveform(waveform);
    recognizer.decode(stream);
    const result = recognizer.getResult(stream);
    return result;
}
/**
 * Concatenate sample buffers into one `Float32Array`.
 *
 * @param buffers - Buffers to join, in order.
 * @param totalLength - Length of the merged array (the caller passes the sum of all lengths).
 * @returns The single input buffer itself when there is exactly one, otherwise a new array.
 */
function mergeBuffers(buffers, totalLength) {
    const [first] = buffers;
    // Nothing to copy when there is just one buffer: hand it back as is.
    if (buffers.length === 1 && first) {
        return first;
    }
    const merged = new Float32Array(totalLength);
    buffers.reduce((offset, part) => {
        merged.set(part, offset);
        return offset + part.length;
    }, 0);
    return merged;
}
/**
 * Build a sherpa-onnx Silero VAD instance.
 *
 * @param vadOptions - Optional overrides; nullish fields fall back to the defaults below.
 * @returns The configured `Vad` instance.
 */
function createVad(vadOptions) {
    const onnx = sherpaOnnx();
    const sileroVad = {
        model: getVadModelPath(),
        threshold: vadOptions.threshold ?? 0.5,
        minSpeechDuration: vadOptions.minSpeechDuration ?? 0.25,
        minSilenceDuration: vadOptions.minSilenceDuration ?? 0.5,
        maxSpeechDuration: vadOptions.maxSpeechDuration ?? 15,
        windowSize: 512,
    };
    const config = {
        sileroVad,
        sampleRate: defaultSampleRate,
        debug: 0,
        numThreads: 1,
    };
    // Second constructor argument is passed through unchanged
    // (presumably the internal buffer size in seconds — confirm against sherpa-onnx docs).
    return new onnx.Vad(config, 60);
}
| function emitResult(result, isFinal, onResult) { | ||
| const text = result.text.trim(); | ||
| if (text) { | ||
| onResult({ ...result, text, isFinal }); | ||
| } | ||
| } | ||
/**
 * VAD-driven recognition: feed the audio to the VAD in fixed-size windows and
 * recognize every speech segment it detects, reporting each as a final result.
 *
 * @param audio - Async iterable of `Float32Array` chunks.
 * @param options - Stream options; only `onResult` is read here.
 * @param vadOptions - VAD overrides (see `createVad`) plus `enableExternalBuffer`.
 */
async function streamWithVad(audio, options, vadOptions) {
    const recognizer = createRecognizer();
    const vad = createVad(vadOptions);
    const frameSize = vad.config.sileroVad.windowSize;
    // Samples that did not fill a whole window yet.
    let leftover = new Float32Array(0);
    const recognizeQueuedSegments = () => {
        for (; !vad.isEmpty();) {
            const segment = vad.front(vadOptions.enableExternalBuffer);
            vad.pop();
            emitResult(recognize(recognizer, segment.samples), true, options.onResult);
        }
    };
    // eslint-disable-next-line @typescript-eslint/await-thenable
    for await (const chunk of audio) {
        const joined = new Float32Array(leftover.length + chunk.length);
        joined.set(leftover, 0);
        joined.set(chunk, leftover.length);
        // Walk the joined buffer one full window at a time.
        let start = 0;
        for (; joined.length - start >= frameSize; start += frameSize) {
            vad.acceptWaveform(joined.subarray(start, start + frameSize));
            recognizeQueuedSegments();
        }
        leftover = joined.subarray(start);
    }
    if (leftover.length > 0) {
        // Zero-pad the tail so the VAD still receives a full window.
        const lastFrame = new Float32Array(frameSize);
        lastFrame.set(leftover);
        vad.acceptWaveform(lastFrame);
    }
    vad.flush();
    recognizeQueuedSegments();
}
/**
 * Interval-driven recognition: accumulate all audio and re-recognize the whole
 * buffer every `asrIntervalMs` of new input, reporting changed transcripts as
 * interim results and the end-of-stream transcript as final.
 *
 * Unlike the previous version, the audio is handed to the recognizer labelled
 * with the caller's `sampleRate` instead of always 16 kHz. Before, `sampleRate`
 * only affected the interval arithmetic, so non-16 kHz input was decoded at the
 * wrong rate.
 *
 * @param audio - Async iterable of `Float32Array` chunks.
 * @param options - `sampleRate` (default 16000), `asrIntervalMs` (default 1000), `onResult`.
 */
async function streamWithInterval(audio, options) {
    const inputSampleRate = options.sampleRate ?? defaultSampleRate;
    const intervalMs = options.asrIntervalMs ?? defaultAsrIntervalMs;
    // Number of new input samples that make up one recognition interval (loop-invariant).
    const samplesPerInterval = (inputSampleRate * intervalMs) / 1000;
    const recognizer = createRecognizer();
    // NOTE(review): relies on sherpa-onnx resampling when the stream rate differs from
    // the recognizer's feature rate — confirm against the sherpa-onnx documentation.
    const recognizeAtInputRate = (samples) => {
        const stream = recognizer.createStream();
        stream.acceptWaveform({ sampleRate: inputSampleRate, samples });
        recognizer.decode(stream);
        return recognizer.getResult(stream);
    };
    const buffers = [];
    let totalSamples = 0;
    let lastRecognizedAt = 0;
    let lastText = '';
    // eslint-disable-next-line @typescript-eslint/await-thenable
    for await (const chunk of audio) {
        buffers.push(chunk);
        totalSamples += chunk.length;
        if (totalSamples - lastRecognizedAt >= samplesPerInterval) {
            lastRecognizedAt = totalSamples;
            const merged = mergeBuffers(buffers, totalSamples);
            const result = recognizeAtInputRate(merged);
            const text = result.text.trim();
            // Only report interim results whose transcript actually changed.
            if (text && text !== lastText) {
                lastText = text;
                options.onResult({ ...result, text, isFinal: false });
            }
        }
    }
    const merged = mergeBuffers(buffers, totalSamples);
    if (merged.length > 0) {
        emitResult(recognizeAtInputRate(merged), true, options.onResult);
    }
}
/**
 * Entry point for streaming recognition: dispatches to the VAD-based or the
 * interval-based strategy depending on `options.vad`.
 *
 * @param audio - Async iterable of `Float32Array` chunks.
 * @param options - Stream options; a truthy `vad` selects VAD mode, and an object
 * value is used as the VAD options.
 */
export async function streamAsr(audio, options) {
    const { vad } = options;
    if (!vad) {
        return streamWithInterval(audio, options);
    }
    // `vad: true` means "VAD with defaults".
    const vadOptions = typeof vad === 'object' ? vad : {};
    return streamWithVad(audio, options, vadOptions);
}
| #!/usr/bin/env node | ||
| export {}; |
| #!/usr/bin/env node | ||
| import { Command } from 'commander'; | ||
| import { register as registerAsr } from './asr/_cli.js'; | ||
| import { register as registerCloudTts } from './cloud-tts/_cli.js'; | ||
| import { register as registerTts } from './tts/_cli.js'; | ||
// Build the root command and let each feature module attach its subcommands.
const program = new Command();
program.name('coli').description('Core CLI for Cola');
registerAsr(program);
registerTts(program);
registerCloudTts(program);
// Every subcommand uses an async action handler. Commander requires parseAsync()
// for those so the handler's promise is awaited and a rejection surfaces here
// instead of as an unhandled rejection.
await program.parseAsync();
| import type { Command } from 'commander'; | ||
/** Adds the `cloud-tts` subcommand to the given Commander program. */
| export declare function register(program: Command): void; |
| import process from 'node:process'; | ||
| import { defaultSpeaker } from '../_api/constants.js'; | ||
| import { listSpeakers, runCloudTts } from './cloud-tts.js'; | ||
/**
 * Resolve the ListenHub API key from the CLI option or the environment.
 *
 * @param options - Parsed CLI options; `apiKey` wins over `COLI_LISTENHUB_API_KEY`.
 * @returns The API key.
 * @throws Error when neither source provides a non-empty key.
 */
function getApiKey(options) {
    const key = options.apiKey ?? process.env['COLI_LISTENHUB_API_KEY'];
    if (key) {
        return key;
    }
    throw new Error('API key required. Use --api-key or set COLI_LISTENHUB_API_KEY environment variable. Get an API key from https://listenhub.ai/settings/api-keys');
}
/**
 * Resolve the optional API base URL from the CLI option or the environment.
 *
 * Empty strings count as "not set". An empty `COLI_TTS_BASE_URL` (or
 * `--base-url ""`) would otherwise be passed on as `''`, which is not nullish and
 * so would bypass the client's `??` default and leave requests without a host.
 *
 * @param options - Parsed CLI options; `baseUrl` wins over `COLI_TTS_BASE_URL`.
 * @returns The base URL, or `undefined` to use the client's default.
 */
function getBaseUrl(options) {
    return options.baseUrl || process.env['COLI_TTS_BASE_URL'] || undefined;
}
/**
 * Register the `cloud-tts` subcommand, which either lists the available speakers
 * (`--list-speakers`) or synthesizes the given text through the ListenHub API.
 *
 * @param program - The Commander program to extend.
 */
export function register(program) {
    const command = program
        .command('cloud-tts')
        .description('Generate speech using ListenHub OpenAPI')
        .argument('[text]', 'Text to synthesize')
        .option('--api-key <key>', 'ListenHub API key (or set COLI_LISTENHUB_API_KEY environment variable)')
        .option('--voice <id>', 'Speaker ID to use')
        .option('--model <name>', 'Model to use (default: flowtts)')
        .option('--base-url <url>', 'Base URL for TTS API (or set COLI_TTS_BASE_URL environment variable)')
        .option('-o, --output <file>', 'Save audio to file')
        .option('--list-speakers', 'List available speakers')
        .option('--language <lang>', 'Speaker language (en, zh, ja)')
        .option('-j, --json', 'Output in JSON format (use with --list-speakers)');
    command.action(async (text, options) => {
        // Resolved lazily so argument errors are reported before a missing API key.
        const connection = () => {
            const apiKey = getApiKey(options);
            const baseUrl = getBaseUrl(options);
            return { apiKey, baseUrl };
        };
        if (options.listSpeakers) {
            const speakers = await listSpeakers({
                ...connection(),
                language: options.language,
            });
            if (options.json) {
                console.log(JSON.stringify(speakers, null, 2));
                return;
            }
            for (const { name, speakerId, gender, language } of speakers) {
                console.log(`${name}\t${speakerId}\t${gender}\t${language}`);
            }
            return;
        }
        if (!text) {
            throw new Error('Please provide text to synthesize.');
        }
        // An explicit voice wins; otherwise fall back to the language's default speaker.
        const voice = options.voice ??
            (options.language && defaultSpeaker[options.language]);
        if (!voice) {
            throw new Error('Please specify a speaker with --voice or a language with --language. Use --list-speakers to see available speakers.');
        }
        await runCloudTts(text, {
            ...connection(),
            voice,
            model: options.model,
            output: options.output,
        });
    });
}
| export { listSpeakers, runCloudTts, type CloudTtsOptions, type ListSpeakersOptions, } from './cloud-tts.js'; |
| export { listSpeakers, runCloudTts, } from './cloud-tts.js'; |
| import type { SpeakerLanguage } from '../_api/types.js'; | ||
/** Options for `runCloudTts`. */
| export type CloudTtsOptions = { | ||
| apiKey: string; | ||
/** Speaker ID to synthesize with. */
| voice: string; | ||
| model?: string; | ||
/** When set, the audio is written to this file instead of being played. */
| output?: string; | ||
| baseUrl?: string; | ||
| }; | ||
/** Options for `listSpeakers`. */
| export type ListSpeakersOptions = { | ||
| apiKey: string; | ||
| language?: SpeakerLanguage; | ||
| baseUrl?: string; | ||
| }; | ||
/** Resolves to the `items` array of the speakers API response. */
| export declare function listSpeakers(options: ListSpeakersOptions): Promise<{ | ||
| name: string; | ||
| speakerId: string; | ||
| demoAudioUrl: string; | ||
| gender: string; | ||
| language: SpeakerLanguage; | ||
| }[]>; | ||
/** Synthesizes `text` and either saves the audio to `options.output` or plays it. */
| export declare function runCloudTts(text: string, options: CloudTtsOptions): Promise<void>; |
| import { Buffer } from 'node:buffer'; | ||
| import fs from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
| import { Writable } from 'node:stream'; | ||
| import { execa } from 'execa'; | ||
| import { ListenHubApi } from '../_api/listenhub-openapi.js'; | ||
/**
 * Fetch the speakers offered by the ListenHub API.
 *
 * @param options - `apiKey`, optional `baseUrl`, and an optional `language` filter.
 * @returns The `data.items` array of the API response.
 */
export async function listSpeakers(options) {
    const { apiKey, baseUrl, language } = options;
    const api = new ListenHubApi({ apiKey, baseUrl });
    const { data } = await api.getAvailableSpeakers({ language });
    return data.items;
}
/**
 * Read a web `ReadableStream` to the end and join all chunks into one `Buffer`.
 *
 * @param stream - Stream of byte chunks.
 * @returns A buffer holding every chunk in arrival order.
 */
async function collectStream(stream) {
    const reader = stream.getReader();
    const parts = [];
    let step = await reader.read();
    while (!step.done) {
        parts.push(step.value);
        step = await reader.read(); // eslint-disable-line no-await-in-loop
    }
    return Buffer.concat(parts);
}
/**
 * Synthesize text through the ListenHub TTS API.
 *
 * With `options.output` the audio stream is piped straight into that file.
 * Otherwise it is buffered into a temporary MP3, played with `afplay`, and the
 * temporary file is removed afterwards (even if playback fails).
 *
 * @param text - Text to synthesize.
 * @param options - `apiKey`, `voice`, optional `model`, `output` and `baseUrl`.
 */
export async function runCloudTts(text, options) {
    const { apiKey, baseUrl, voice, model, output } = options;
    const api = new ListenHubApi({ apiKey, baseUrl });
    const stream = await api.tts({ input: text, voice, model });
    if (output) {
        const destination = Writable.toWeb(fs.createWriteStream(output));
        await stream.pipeTo(destination);
        return;
    }
    const mp3Name = `coli-cloud-tts-${Date.now()}.mp3`;
    const mp3Path = path.join(os.tmpdir(), mp3Name);
    const audio = await collectStream(stream);
    fs.writeFileSync(mp3Path, audio);
    try {
        // eslint-disable-next-line @typescript-eslint/await-thenable
        await execa('afplay', [mp3Path]);
    }
    finally {
        fs.unlinkSync(mp3Path);
    }
}
| export * from './_api/listenhub-openapi.js'; | ||
| export * from './asr/_index.js'; | ||
| export * from './cloud-tts/_index.js'; | ||
| export * from './tts/_index.js'; |
| export * from './_api/listenhub-openapi.js'; | ||
| export * from './asr/_index.js'; | ||
| export * from './cloud-tts/_index.js'; | ||
| export * from './tts/_index.js'; |
| import type { Command } from 'commander'; | ||
/** Adds the `tts` subcommand to the given Commander program. */
| export declare function register(program: Command): void; |
| import { getVoices, runTts } from './tts.js'; | ||
/**
 * Register the `tts` subcommand, which either lists the available voices
 * (`--list-voices`) or speaks / saves the given text.
 *
 * @param program - The Commander program to extend.
 */
export function register(program) {
    const command = program
        .command('tts')
        .description('Speak text using text-to-speech (macOS only)')
        .argument('[text]', 'Text to speak')
        .option('-v, --voice <name>', 'Voice to use, defaults to macOS system voice')
        .option('-r, --rate <wpm>', 'Speech rate in words per minute', Number)
        .option('-o, --output <file>', 'Save audio to file instead of speaking')
        .option('--list-voices', 'List available voices')
        .option('-j, --json', 'Output in JSON format (use with --list-voices)');
    command.action(async (text, options) => {
        if (options.listVoices) {
            const voices = await getVoices();
            if (options.json) {
                console.log(JSON.stringify(voices, null, 2));
                return;
            }
            for (const { name, languageCode, example } of voices) {
                console.log(`${name}\t${languageCode}\t${example}`);
            }
            return;
        }
        if (!text) {
            throw new Error('Please provide text to speak.');
        }
        await runTts(text, options);
    });
}
| export { getVoices, runTts, type TtsOptions } from './tts.js'; |
| export { getVoices, runTts } from './tts.js'; |
| import { type Voice } from 'mac-say'; | ||
/** Options for `runTts`; each field is forwarded to `mac-say`. */
| export type TtsOptions = { | ||
| voice?: string; | ||
/** Speech rate; the CLI describes it as words per minute. */
| rate?: number; | ||
/** Forwarded to `mac-say` as `outputFile`. */
| output?: string; | ||
| }; | ||
/** Returns the voices reported by `mac-say`. */
| export declare function getVoices(): Promise<Voice[]>; | ||
/** Speaks `text` (or saves it to a file) through `mac-say`. */
| export declare function runTts(text: string, options?: TtsOptions): Promise<void>; |
| import { getVoices as macGetVoices, say } from 'mac-say'; | ||
/**
 * List the voices reported by `mac-say`.
 *
 * @returns Whatever `mac-say`'s `getVoices` resolves to.
 */
export async function getVoices() {
    const voices = await macGetVoices();
    return voices;
}
/**
 * Speak text through `mac-say`.
 *
 * @param text - Text to speak.
 * @param options - Optional `voice`, `rate`, and `output` (forwarded as `outputFile`).
 */
export async function runTts(text, options = {}) {
    const { voice, rate, output: outputFile } = options;
    await say(text, { voice, rate, outputFile });
}
+10
-6
@@ -8,2 +8,3 @@ # Cloud TTS | ||
| - A ListenHub API key. Pass it via `--api-key` or set the `COLI_LISTENHUB_API_KEY` environment variable. | ||
| - Optionally, a custom base URL via `--base-url` or the `COLI_TTS_BASE_URL` environment variable. | ||
@@ -39,2 +40,3 @@ ## CLI | ||
| --model <name> Model to use (default: flowtts) | ||
| --base-url <url> Base URL for TTS API (or set COLI_TTS_BASE_URL) | ||
| -o, --output <file> Save audio to file | ||
@@ -70,2 +72,3 @@ --list-speakers List available speakers | ||
| | `language` | `'en' \| 'zh' \| 'ja'` | Filter speakers by language. Omit to list all. | | ||
| | `baseUrl` | `string` | Custom base URL for TTS API (optional) | | ||
@@ -89,7 +92,8 @@ ### `runCloudTts(text, options)` | ||
| | Property | Type | Description | | ||
| | -------- | -------- | ---------------------------------------------- | | ||
| | `apiKey` | `string` | ListenHub API key | | ||
| | `voice` | `string` | Speaker ID (from `listSpeakers`) | | ||
| | `model` | `string` | Model to use (optional, defaults to `flowtts`) | | ||
| | `output` | `string` | Save to file instead of playing directly | | ||
| | Property | Type | Description | | ||
| | --------- | -------- | ---------------------------------------------- | | ||
| | `apiKey` | `string` | ListenHub API key | | ||
| | `voice` | `string` | Speaker ID (from `listSpeakers`) | | ||
| | `model` | `string` | Model to use (optional, defaults to `flowtts`) | | ||
| | `output` | `string` | Save to file instead of playing directly | | ||
| | `baseUrl` | `string` | Custom base URL for TTS API (optional) | |
@@ -11,2 +11,8 @@ # ListenHub OpenAPI | ||
| const api = new ListenHubApi({apiKey: 'lh_sk_...'}); | ||
| // Or with a custom base URL | ||
| const api = new ListenHubApi({ | ||
| apiKey: 'lh_sk_...', | ||
| baseUrl: 'https://custom-api.example.com/openapi', | ||
| }); | ||
| ``` | ||
@@ -13,0 +19,0 @@ |
+9
-8
| { | ||
| "name": "@marswave/coli", | ||
| "private": false, | ||
| "version": "0.0.13", | ||
| "version": "0.0.14", | ||
| "description": "A CLI for the Cola", | ||
| "repository": "marswaveai/coli", | ||
| "type": "module", | ||
| "bin": "distribution/cli.js", | ||
| "exports": "./distribution/index.js", | ||
| "types": "distribution", | ||
| "bin": "distribution/source/cli.js", | ||
| "exports": "./distribution/source/index.js", | ||
| "types": "distribution/source", | ||
| "files": [ | ||
@@ -19,3 +19,4 @@ "distribution", | ||
| "clean": "del-cli distribution", | ||
| "build": "node --run clean && tsc && chmod +x distribution/cli.js", | ||
| "dev": "node --run clean && tsc --watch", | ||
| "build": "node --run clean && tsc && chmod +x distribution/source/cli.js", | ||
| "pretest": "node --run build", | ||
@@ -29,3 +30,3 @@ "test": "xo" | ||
| "mac-say": "^0.3.3", | ||
| "sherpa-onnx-node": "^1.12.29" | ||
| "sherpa-onnx-node": "^1.12.33" | ||
| }, | ||
@@ -36,5 +37,5 @@ "devDependencies": { | ||
| "del-cli": "^7.0.0", | ||
| "typescript": "^5.9.3", | ||
| "xo": "^1.2.3" | ||
| "typescript": "^6.0.2", | ||
| "xo": "^2.0.2" | ||
| } | ||
| } |
| import type { SpeakerLanguage } from './types.js'; | ||
| export declare const defaultSpeaker: Record<SpeakerLanguage, string>; |
| export const defaultSpeaker = { | ||
| en: 'chat-girl-105-cn', | ||
| zh: 'leo-9328b6d2', | ||
| ja: 'tianzhongdunzi-5d612542', | ||
| }; |
| import { type KyInstance } from 'ky'; | ||
| import type { ApiResponse, SpeakerLanguage } from './types.js'; | ||
| export * from './constants.js'; | ||
| export type * from './types.js'; | ||
| export type ListenHubApiOptions = { | ||
| apiKey: string; | ||
| }; | ||
| export declare class ListenHubApi { | ||
| api: KyInstance; | ||
| constructor({ apiKey }: ListenHubApiOptions); | ||
| /** | ||
| * Get a list of available speakers. | ||
| * @param options - The options for the speakers request. | ||
| * @param options.language - Optional. The language of the speakers to get, defaults to English. | ||
| * @returns A list of available speakers. | ||
| * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/speakers#list-available-speakers|List Available Speakers} | ||
| */ | ||
| getAvailableSpeakers(options?: { | ||
| language?: SpeakerLanguage; | ||
| }): Promise<ApiResponse<{ | ||
| items: Array<{ | ||
| name: string; | ||
| speakerId: string; | ||
| demoAudioUrl: string; | ||
| gender: string; | ||
| language: SpeakerLanguage; | ||
| }>; | ||
| }>>; | ||
| /** | ||
| * Generate audio from text using the Streaming TTS API. | ||
| * @param options - The options for the TTS request. | ||
| * @param options.input - The text to generate audio from. | ||
| * @param options.voice - The `speakerId` to use for the TTS. | ||
| * @param options.model - Optional. The model to use for the TTS, defaults to `flowtts`. | ||
| * @returns A readable stream of the MP3 audio. | ||
| * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/flowspeech|Streaming TTS} | ||
| */ | ||
| tts(options: { | ||
| input: string; | ||
| voice: string; | ||
| model?: string; | ||
| }): Promise<ReadableStream<Uint8Array<ArrayBuffer>>>; | ||
| } |
| import ky from 'ky'; | ||
| export * from './constants.js'; | ||
/**
 * Small client for the ListenHub OpenAPI, built on a preconfigured `ky` instance.
 */
export class ListenHubApi {
    /** `ky` instance carrying the URL prefix and the bearer-token header. */
    api;
    /**
     * @param options - Client options.
     * @param options.apiKey - API key sent as a `Bearer` token in the `Authorization` header.
     * @param options.baseUrl - Optional. URL prefix for every request, defaults to `https://api.marswave.ai/openapi`.
     */
    constructor({ apiKey, baseUrl = 'https://api.marswave.ai/openapi' }) {
        this.api = ky.extend({
            prefixUrl: baseUrl,
            headers: {
                // eslint-disable-next-line @typescript-eslint/naming-convention
                Authorization: `Bearer ${apiKey}`,
            },
        });
    }
    /**
     * Get a list of available speakers.
     * @param options - The options for the speakers request; sent as query parameters.
     * @param options.language - Optional. The language of the speakers to get, defaults to English.
     * @returns The parsed JSON response containing the available speakers.
     * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/speakers#list-available-speakers|List Available Speakers}
     */
    async getAvailableSpeakers(options) {
        return this.api.get('v1/speakers/list', { searchParams: options }).json();
    }
    /**
     * Generate audio from text using the Streaming TTS API.
     * @param options - The options for the TTS request; sent as the JSON body.
     * @param options.input - The text to generate audio from.
     * @param options.voice - The `speakerId` to use for the TTS.
     * @param options.model - Optional. The model to use for the TTS, defaults to `flowtts`.
     * @returns A readable stream of the MP3 audio.
     * @throws {Error} When the response carries no body.
     * @see {@link https://listenhub.ai/docs/en/openapi/api-reference/flowspeech|Streaming TTS}
     */
    async tts(options) {
        const response = await this.api.post('v1/tts', { json: options });
        if (!response.body)
            throw new Error('Empty response body from TTS API');
        return response.body;
    }
}
| export type ApiResponse<T> = { | ||
| code: number; | ||
| message: string; | ||
| data: T; | ||
| }; | ||
| export type SpeakerLanguage = 'en' | 'zh' | 'ja'; |
| export {}; |
| import type { Command } from 'commander'; | ||
| export declare function register(program: Command): void; |
| import { Buffer } from 'node:buffer'; | ||
| import process from 'node:process'; | ||
| import { runAsr } from './asr.js'; | ||
| import { ensureModels, ensureVadModel } from './models.js'; | ||
| import { streamAsr } from './stream-asr.js'; | ||
| export function register(program) { | ||
| program | ||
| .command('asr') | ||
| .description('Transcribe an audio file using speech recognition') | ||
| .argument('<file>', 'Audio file to transcribe') | ||
| .option('-j, --json', 'Output result in JSON format', false) | ||
| .option('--model <name>', 'Model to use: whisper, sensevoice', 'sensevoice') | ||
| .action(async (file, options) => { | ||
| const { model } = options; | ||
| if (model !== 'whisper' && model !== 'sensevoice') { | ||
| throw new Error(`Unknown model "${model}". Use "whisper" or "sensevoice".`); | ||
| } | ||
| await ensureModels([model]); | ||
| await runAsr(file, { json: options.json, model }); | ||
| }); | ||
| program | ||
| .command('asr-stream') | ||
| .description('Stream speech recognition from stdin (expects 16kHz mono s16le PCM)') | ||
| .option('-j, --json', 'Output each result as a JSON line', false) | ||
| .option('--vad', 'Enable voice activity detection', false) | ||
| .option('--asr-interval-ms <ms>', 'Recognition interval in ms (ignored with --vad)', '1000') | ||
| .action(async (options) => { | ||
| await ensureModels(); | ||
| if (options.vad) { | ||
| await ensureVadModel(); | ||
| } | ||
| async function* stdinAudio() { | ||
| for await (const chunk of process.stdin) { | ||
| const buf = Buffer.from(chunk); | ||
| const pcm = new Int16Array(buf.buffer, buf.byteOffset, buf.byteLength / 2); | ||
| const float32 = new Float32Array(pcm.length); | ||
| for (const [i, sample] of pcm.entries()) { | ||
| float32[i] = sample / 32_768; | ||
| } | ||
| yield float32; | ||
| } | ||
| } | ||
| await streamAsr(stdinAudio(), { | ||
| vad: options.vad || undefined, | ||
| asrIntervalMs: Number(options.asrIntervalMs), | ||
| onResult(result) { | ||
| if (options.json) { | ||
| console.log(JSON.stringify(result)); | ||
| } | ||
| else { | ||
| console.log(result.text); | ||
| } | ||
| }, | ||
| }); | ||
| }); | ||
| } |
| export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js'; | ||
| export { streamAsr, type AsrStreamResult, type StreamAsrOptions, type VadOptions, } from './stream-asr.js'; |
| export { ensureModels, ensureVadModel, getModelPath, getVadModelPath, modelDisplayNames, } from './models.js'; | ||
| export { streamAsr, } from './stream-asr.js'; |
| type ModelName = 'whisper' | 'sensevoice'; | ||
| export type AsrOptions = { | ||
| json: boolean; | ||
| model: ModelName; | ||
| }; | ||
| export declare function runAsr(filePath: string, options: AsrOptions): Promise<void>; | ||
| export {}; |
| import fs from 'node:fs'; | ||
| import { createRequire } from 'node:module'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
| import { execa } from 'execa'; | ||
| import { getModelPath, modelDisplayNames } from './models.js'; | ||
const require = createRequire(import.meta.url);
// The native addon is only required on the first call and cached afterwards.
let _sherpaOnnx;
/**
 * Return the `sherpa-onnx-node` module, loading it on first use.
 */
function sherpaOnnx() {
    if (_sherpaOnnx === undefined || _sherpaOnnx === null) {
        _sherpaOnnx = require('sherpa-onnx-node');
    }
    return _sherpaOnnx;
}
/**
 * Convert an audio file to a 16 kHz mono 16-bit PCM WAV file via `ffmpeg`.
 * @param inputPath - Path of the audio file to convert.
 * @returns Path of the WAV file written to the OS temp directory; the caller
 * is responsible for deleting it.
 * @throws {Error} When `ffmpeg` cannot be run or exits with a failure; the
 * underlying error is attached as `cause`.
 */
async function convertToWav(inputPath) {
    // Timestamp alone can collide between conversions started in the same millisecond.
    const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
    const outputPath = path.join(os.tmpdir(), `coli-${uniqueSuffix}.wav`);
    try {
        await execa('ffmpeg', [
            '-i',
            inputPath,
            '-ar',
            '16000',
            '-ac',
            '1',
            '-f',
            'wav',
            '-acodec',
            'pcm_s16le',
            outputPath,
            '-y',
        ]);
    }
    catch (error) {
        // Do not leave a partially written file behind.
        fs.rmSync(outputPath, { force: true });
        const failure = new Error('Failed to convert audio file. Please make sure ffmpeg is installed.\n' +
            ' brew install ffmpeg # macOS\n' +
            ' sudo apt install ffmpeg # Debian/Ubuntu');
        // Keep the real reason (missing binary, unreadable input, ...) available for debugging.
        throw Object.assign(failure, { cause: error });
    }
    return outputPath;
}
/**
 * Build an offline recognizer for the requested model.
 * @param model - `'whisper'` selects the whisper files; any other value uses the sensevoice files.
 * @returns A new `OfflineRecognizer` from `sherpa-onnx-node`.
 */
function createRecognizer(model) {
    const modelDir = getModelPath(model);
    const onnx = sherpaOnnx();
    const inModelDir = (fileName) => path.join(modelDir, fileName);
    const runtime = { numThreads: 2, provider: 'cpu', debug: 0 };
    const modelConfig = model === 'whisper'
        ? {
            whisper: {
                encoder: inModelDir('tiny.en-encoder.int8.onnx'),
                decoder: inModelDir('tiny.en-decoder.int8.onnx'),
            },
            tokens: inModelDir('tiny.en-tokens.txt'),
            ...runtime,
        }
        : {
            senseVoice: {
                model: inModelDir('model.int8.onnx'),
                useInverseTextNormalization: 1,
            },
            tokens: inModelDir('tokens.txt'),
            ...runtime,
        };
    return new onnx.OfflineRecognizer({
        featConfig: { sampleRate: 16_000, featureDim: 80 },
        modelConfig,
    });
}
/**
 * Transcribe an audio file and print the result to stdout.
 *
 * Files whose extension is not `.wav` are first converted with `convertToWav`;
 * the converted file is removed afterwards.
 * @param filePath - Path of the audio file.
 * @param options - `model` selects the recognizer; `json` switches the output
 * from plain text to a pretty-printed JSON object.
 * @throws {Error} When the file does not exist.
 */
export async function runAsr(filePath, options) {
    const resolvedPath = path.resolve(filePath);
    if (!fs.existsSync(resolvedPath)) {
        throw new Error(`File not found: ${resolvedPath}`);
    }
    const isWav = path.extname(resolvedPath).toLowerCase() === '.wav';
    const wavPath = isWav ? resolvedPath : await convertToWav(resolvedPath);
    // Only a file created by the conversion is deleted, never the user's input.
    const needsCleanup = !isWav;
    try {
        const onnx = sherpaOnnx();
        const recognizer = createRecognizer(options.model);
        const stream = recognizer.createStream();
        const wave = onnx.readWave(wavPath);
        stream.acceptWaveform({ sampleRate: wave.sampleRate, samples: wave.samples });
        recognizer.decode(stream);
        const result = recognizer.getResult(stream);
        const text = result.text.trim();
        if (!options.json) {
            console.log(text);
            return;
        }
        const report = {
            text,
            model: modelDisplayNames[options.model],
            lang: result.lang || undefined,
            emotion: result.emotion || undefined,
            event: result.event || undefined,
            tokens: result.tokens,
            timestamps: result.timestamps,
            duration: wave.samples.length / wave.sampleRate,
        };
        console.log(JSON.stringify(report, null, 2));
    }
    finally {
        if (needsCleanup && fs.existsSync(wavPath)) {
            fs.unlinkSync(wavPath);
        }
    }
}
| type ModelName = 'whisper' | 'sensevoice'; | ||
| export declare const modelDisplayNames: Record<ModelName, string>; | ||
| export declare function getModelPath(model: ModelName): string; | ||
| export declare function ensureModels(modelNames?: ModelName[]): Promise<void>; | ||
| export declare function getVadModelPath(): string; | ||
| export declare function ensureVadModel(): Promise<void>; | ||
| export {}; |
| import fs from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
| import process from 'node:process'; | ||
| import { execa } from 'execa'; | ||
| const modelsDirectory = path.join(os.homedir(), '.coli', 'models'); | ||
| const models = { | ||
| whisper: { | ||
| dirName: 'sherpa-onnx-whisper-tiny.en', | ||
| url: 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2', | ||
| checkFile: 'tiny.en-encoder.int8.onnx', | ||
| }, | ||
| sensevoice: { | ||
| dirName: 'sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17', | ||
| url: 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2024-07-17.tar.bz2', | ||
| checkFile: 'model.int8.onnx', | ||
| }, | ||
| }; | ||
| export const modelDisplayNames = { | ||
| whisper: 'whisper-tiny.en', | ||
| sensevoice: 'sensevoice-small', | ||
| }; | ||
/**
 * Resolve the directory of a model inside the models directory.
 * @param model - Key of the model in the `models` table.
 * @returns The joined path of the models directory and the model's `dirName`.
 */
export function getModelPath(model) {
    const { dirName } = models[model];
    return path.join(modelsDirectory, dirName);
}
/**
 * Tell whether a model's marker file exists on disk.
 * @param entry - Model entry providing `dirName` and `checkFile`.
 * @returns `true` when `checkFile` exists inside the model's directory.
 */
function isModelInstalled(entry) {
    const { dirName, checkFile } = entry;
    const markerPath = path.join(modelsDirectory, dirName, checkFile);
    return fs.existsSync(markerPath);
}
/**
 * Download a model archive into the models directory and extract it with `tar`.
 *
 * Progress is written to stdout when the response announces a content length.
 * The downloaded archive is removed afterwards, whether the download and
 * extraction succeeded or failed.
 * @param entry - Model entry providing `dirName` and `url`.
 * @throws {Error} When the HTTP response is not OK or has no body; errors from
 * writing the file or from `tar` propagate unchanged.
 */
async function downloadModel(entry) {
    const { dirName, url } = entry;
    console.log(`Downloading ${dirName}...`);
    fs.mkdirSync(modelsDirectory, { recursive: true });
    const tarPath = path.join(modelsDirectory, `${dirName}.tar.bz2`);
    const response = await fetch(url, { redirect: 'follow' });
    if (!response.ok || !response.body) {
        throw new Error(`Failed to download model: ${response.statusText}`);
    }
    const contentLength = Number(response.headers.get('content-length') ?? 0);
    const reader = response.body.getReader();
    let downloaded = 0;
    try {
        const fileHandle = fs.openSync(tarPath, 'w');
        try {
            for (;;) {
                const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
                if (done) {
                    break;
                }
                fs.writeSync(fileHandle, value);
                downloaded += value.length;
                if (contentLength > 0) {
                    const percent = ((downloaded / contentLength) * 100).toFixed(1);
                    const mb = (downloaded / (1024 * 1024)).toFixed(1);
                    const totalMb = (contentLength / (1024 * 1024)).toFixed(1);
                    process.stdout.write(`\r ${mb} MB / ${totalMb} MB (${percent}%)`);
                }
            }
        }
        finally {
            fs.closeSync(fileHandle);
        }
        process.stdout.write('\n');
        console.log(' Extracting...');
        await execa('tar', ['xjf', tarPath, '-C', modelsDirectory]);
    }
    finally {
        // Also runs on failure so an incomplete archive does not accumulate on disk.
        fs.rmSync(tarPath, { force: true });
    }
    console.log(` ${dirName} ready.\n`);
}
/**
 * Download every requested model that is not installed yet, one after another.
 * @param modelNames - Keys of the models to ensure, defaults to `['sensevoice']`.
 */
export async function ensureModels(modelNames = ['sensevoice']) {
    // Decide what is missing up front, before any download starts.
    const missing = [];
    for (const name of modelNames) {
        const entry = models[name];
        if (!isModelInstalled(entry)) {
            missing.push(entry);
        }
    }
    for (const entry of missing) {
        await downloadModel(entry); // eslint-disable-line no-await-in-loop
    }
}
| const vadModelFile = 'silero_vad.onnx'; | ||
| const vadModelUrl = 'https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx'; | ||
| export function getVadModelPath() { | ||
| return path.join(modelsDirectory, vadModelFile); | ||
| } | ||
/**
 * Download the VAD model into the models directory unless it already exists.
 *
 * The download is written to a side file (`<model path>.download`) and renamed
 * to the final path only after the whole body was received. An interrupted
 * download therefore never leaves a truncated file at the final path, which
 * the existence check would otherwise treat as an installed model.
 * @throws {Error} When the HTTP response is not OK or has no body; errors from
 * reading the body or writing the file propagate unchanged.
 */
export async function ensureVadModel() {
    const modelPath = getVadModelPath();
    if (fs.existsSync(modelPath)) {
        return;
    }
    console.log(`Downloading ${vadModelFile}...`);
    fs.mkdirSync(modelsDirectory, { recursive: true });
    const response = await fetch(vadModelUrl, { redirect: 'follow' });
    if (!response.ok || !response.body) {
        throw new Error(`Failed to download VAD model: ${response.statusText}`);
    }
    const contentLength = Number(response.headers.get('content-length') ?? 0);
    const reader = response.body.getReader();
    const partialPath = `${modelPath}.download`;
    let downloaded = 0;
    try {
        const fileHandle = fs.openSync(partialPath, 'w');
        try {
            for (;;) {
                const { done, value } = await reader.read(); // eslint-disable-line no-await-in-loop
                if (done) {
                    break;
                }
                fs.writeSync(fileHandle, value);
                downloaded += value.length;
                if (contentLength > 0) {
                    const percent = ((downloaded / contentLength) * 100).toFixed(1);
                    const kb = (downloaded / 1024).toFixed(0);
                    const totalKb = (contentLength / 1024).toFixed(0);
                    process.stdout.write(`\r ${kb} KB / ${totalKb} KB (${percent}%)`);
                }
            }
        }
        finally {
            fs.closeSync(fileHandle);
        }
        // Publish the model only once it is complete.
        fs.renameSync(partialPath, modelPath);
    }
    catch (error) {
        fs.rmSync(partialPath, { force: true });
        throw error;
    }
    process.stdout.write('\n');
    console.log(` ${vadModelFile} ready.\n`);
}
| export type AsrStreamResult = { | ||
| text: string; | ||
| lang: string; | ||
| emotion: string; | ||
| event: string; | ||
| tokens: string[]; | ||
| timestamps: number[]; | ||
| isFinal: boolean; | ||
| }; | ||
| export type VadOptions = { | ||
| threshold?: number; | ||
| minSpeechDuration?: number; | ||
| minSilenceDuration?: number; | ||
| maxSpeechDuration?: number; | ||
| enableExternalBuffer?: boolean; | ||
| }; | ||
| export type StreamAsrOptions = { | ||
| sampleRate?: number; | ||
| asrIntervalMs?: number; | ||
| vad?: boolean | VadOptions; | ||
| onResult: (result: AsrStreamResult) => void; | ||
| }; | ||
| export declare function streamAsr(audio: AsyncIterable<Float32Array>, options: StreamAsrOptions): Promise<void>; |
| import { createRequire } from 'node:module'; | ||
| import path from 'node:path'; | ||
| import { getModelPath, getVadModelPath } from './models.js'; | ||
const require = createRequire(import.meta.url);
// Cache for the `sherpa-onnx-node` module; filled on the first call below.
let _sherpaOnnx;
/**
 * Return the `sherpa-onnx-node` module, requiring it on first use.
 */
function sherpaOnnx() {
    if (_sherpaOnnx === undefined || _sherpaOnnx === null) {
        _sherpaOnnx = require('sherpa-onnx-node');
    }
    return _sherpaOnnx;
}
| const defaultSampleRate = 16_000; | ||
| const defaultAsrIntervalMs = 1000; | ||
/**
 * Build an offline recognizer backed by the sensevoice model files.
 * @returns A new `OfflineRecognizer` from `sherpa-onnx-node`.
 */
function createRecognizer() {
    const modelDir = getModelPath('sensevoice');
    const onnx = sherpaOnnx();
    const featConfig = { sampleRate: defaultSampleRate, featureDim: 80 };
    const modelConfig = {
        senseVoice: {
            model: path.join(modelDir, 'model.int8.onnx'),
            useInverseTextNormalization: 1,
        },
        tokens: path.join(modelDir, 'tokens.txt'),
        numThreads: 2,
        provider: 'cpu',
        debug: 0,
    };
    return new onnx.OfflineRecognizer({ featConfig, modelConfig });
}
/**
 * Run one recognition pass over a block of samples.
 * @param recognizer - Recognizer used to create and decode the stream.
 * @param samples - Samples fed to the stream, labelled with `defaultSampleRate`.
 * @returns The recognizer's result for the stream.
 */
function recognize(recognizer, samples) {
    const stream = recognizer.createStream();
    const waveform = { sampleRate: defaultSampleRate, samples };
    stream.acceptWaveform(waveform);
    recognizer.decode(stream);
    const result = recognizer.getResult(stream);
    return result;
}
/**
 * Concatenate sample buffers into one `Float32Array`.
 * @param buffers - Buffers to concatenate, in order.
 * @param totalLength - Length of the merged array.
 * @returns The single buffer itself when exactly one is given (no copy),
 * otherwise a new array of `totalLength` holding all buffers back to back.
 */
function mergeBuffers(buffers, totalLength) {
    const [only] = buffers;
    if (buffers.length === 1 && only) {
        return only;
    }
    const merged = new Float32Array(totalLength);
    buffers.reduce((offset, part) => {
        merged.set(part, offset);
        return offset + part.length;
    }, 0);
    return merged;
}
/**
 * Build a `Vad` instance from `sherpa-onnx-node` using the silero model.
 * @param vadOptions - Optional overrides; missing (`null`/`undefined`) values
 * fall back to threshold 0.5, minSpeechDuration 0.25, minSilenceDuration 0.5
 * and maxSpeechDuration 15.
 * @returns The new `Vad` instance.
 */
function createVad(vadOptions) {
    const onnx = sherpaOnnx();
    const sileroVad = {
        model: getVadModelPath(),
        threshold: vadOptions.threshold ?? 0.5,
        minSpeechDuration: vadOptions.minSpeechDuration ?? 0.25,
        minSilenceDuration: vadOptions.minSilenceDuration ?? 0.5,
        maxSpeechDuration: vadOptions.maxSpeechDuration ?? 15,
        windowSize: 512,
    };
    const config = {
        sileroVad,
        sampleRate: defaultSampleRate,
        debug: 0,
        numThreads: 1,
    };
    // NOTE(review): the second constructor argument is presumably the buffer size in seconds — confirm against sherpa-onnx-node.
    return new onnx.Vad(config, 60);
}
| function emitResult(result, isFinal, onResult) { | ||
| const text = result.text.trim(); | ||
| if (text) { | ||
| onResult({ ...result, text, isFinal }); | ||
| } | ||
| } | ||
/**
 * Recognize speech segment by segment, using the VAD to cut the audio.
 *
 * Audio is fed to the VAD in frames of exactly `windowSize` samples; every
 * segment the VAD reports is recognized and emitted as a final result.
 * @param audio - Async iterable of sample chunks.
 * @param options - Stream options; `onResult` receives each result.
 * @param vadOptions - VAD overrides; `enableExternalBuffer` is passed to `vad.front`.
 */
async function streamWithVad(audio, options, vadOptions) {
    const recognizer = createRecognizer();
    const vad = createVad(vadOptions);
    const frameSize = vad.config.sileroVad.windowSize;
    const emitDetectedSegments = () => {
        for (; !vad.isEmpty();) {
            const segment = vad.front(vadOptions.enableExternalBuffer);
            vad.pop();
            emitResult(recognize(recognizer, segment.samples), true, options.onResult);
        }
    };
    // Samples that did not fill a whole frame yet.
    let leftover = new Float32Array(0);
    for await (const chunk of audio) {
        const joined = new Float32Array(leftover.length + chunk.length);
        joined.set(leftover);
        joined.set(chunk, leftover.length);
        let cursor = 0;
        while (joined.length - cursor >= frameSize) {
            vad.acceptWaveform(joined.subarray(cursor, cursor + frameSize));
            cursor += frameSize;
            emitDetectedSegments();
        }
        leftover = joined.subarray(cursor);
    }
    if (leftover.length > 0) {
        // Zero-pad the tail up to one full frame.
        const padded = new Float32Array(frameSize);
        padded.set(leftover);
        vad.acceptWaveform(padded);
    }
    vad.flush();
    emitDetectedSegments();
}
/**
 * Recognize a growing audio buffer at a fixed interval.
 *
 * All audio received so far is re-recognized each time at least one interval's
 * worth of new samples has arrived. Interim results are emitted with
 * `isFinal: false`, and only when the trimmed text is non-empty and differs
 * from the previous interim text. After the input ends, the whole audio is
 * recognized once more and emitted as the final result.
 * @param audio - Async iterable of sample chunks.
 * @param options - `sampleRate` (defaults to `defaultSampleRate`) and
 * `asrIntervalMs` (defaults to `defaultAsrIntervalMs`) set the interval in
 * samples; `onResult` receives each result.
 */
async function streamWithInterval(audio, options) {
    const inputSampleRate = options.sampleRate ?? defaultSampleRate;
    const intervalMs = options.asrIntervalMs ?? defaultAsrIntervalMs;
    const chunkInterval = (defaultSampleRate * intervalMs) / 1000;
    // Constant for the whole stream, so computed once instead of on every chunk.
    const samplesForInterval = (chunkInterval * inputSampleRate) / defaultSampleRate;
    // NOTE(review): `options.sampleRate` only affects the interval above; `recognize`
    // still labels the samples with `defaultSampleRate` — confirm this is intended.
    const recognizer = createRecognizer();
    const buffers = [];
    let totalSamples = 0;
    let lastRecognizedAt = 0;
    let lastText = '';
    for await (const chunk of audio) {
        buffers.push(chunk);
        totalSamples += chunk.length;
        if (totalSamples - lastRecognizedAt >= samplesForInterval) {
            lastRecognizedAt = totalSamples;
            const merged = mergeBuffers(buffers, totalSamples);
            const result = recognize(recognizer, merged);
            const text = result.text.trim();
            if (text && text !== lastText) {
                lastText = text;
                options.onResult({ ...result, text, isFinal: false });
            }
        }
    }
    const merged = mergeBuffers(buffers, totalSamples);
    if (merged.length > 0) {
        emitResult(recognize(recognizer, merged), true, options.onResult);
    }
}
/**
 * Run streaming recognition over an audio source.
 * @param audio - Async iterable of sample chunks.
 * @param options - Stream options. A truthy `vad` selects VAD-based
 * segmentation (an object value is used as the VAD options, anything else
 * means defaults); otherwise interval-based recognition is used.
 */
export async function streamAsr(audio, options) {
    const { vad } = options;
    if (!vad) {
        return streamWithInterval(audio, options);
    }
    const vadOptions = typeof vad === 'object' ? vad : {};
    return streamWithVad(audio, options, vadOptions);
}
| #!/usr/bin/env node | ||
| export {}; |
| #!/usr/bin/env node | ||
| import { Command } from 'commander'; | ||
| import { register as registerAsr } from './asr/_cli.js'; | ||
| import { register as registerCloudTts } from './cloud-tts/_cli.js'; | ||
| import { register as registerTts } from './tts/_cli.js'; | ||
| const program = new Command(); | ||
| program.name('coli').description('Core CLI for Cola'); | ||
| registerAsr(program); | ||
| registerTts(program); | ||
| registerCloudTts(program); | ||
| program.parse(); |
| import type { Command } from 'commander'; | ||
| export declare function register(program: Command): void; |
| import process from 'node:process'; | ||
| import { defaultSpeaker } from '../_api/constants.js'; | ||
| import { listSpeakers, runCloudTts } from './cloud-tts.js'; | ||
/**
 * Resolve the ListenHub API key.
 * @param options - Parsed CLI options; `apiKey` wins over the environment.
 * @returns `options.apiKey`, or the `COLI_LISTENHUB_API_KEY` environment
 * variable when `apiKey` is `null`/`undefined`.
 * @throws {Error} When the resolved key is missing or empty.
 */
function getApiKey(options) {
    const key = options.apiKey ?? process.env['COLI_LISTENHUB_API_KEY'];
    if (key) {
        return key;
    }
    throw new Error('API key required. Use --api-key or set COLI_LISTENHUB_API_KEY environment variable. Get an API key from https://listenhub.ai/settings/api-keys');
}
/**
 * Register the `cloud-tts` command on the given commander program.
 *
 * With `--list-speakers` the command prints the available speakers (as JSON
 * with `--json`, otherwise tab-separated) and stops. Otherwise it synthesizes
 * `text`, taking the voice from `--voice` or, failing that, from the default
 * speaker of `--language`.
 * @param program - The commander `Command` to extend.
 */
export function register(program) {
    const printSpeakers = async (options) => {
        const apiKey = getApiKey(options);
        const speakers = await listSpeakers({
            apiKey,
            language: options.language,
        });
        if (options.json) {
            console.log(JSON.stringify(speakers, null, 2));
            return;
        }
        for (const speaker of speakers) {
            console.log(`${speaker.name}\t${speaker.speakerId}\t${speaker.gender}\t${speaker.language}`);
        }
    };
    const handle = async (text, options) => {
        if (options.listSpeakers) {
            await printSpeakers(options);
            return;
        }
        if (!text) {
            throw new Error('Please provide text to synthesize.');
        }
        let voice = options.voice;
        if (voice === undefined || voice === null) {
            voice = options.language && defaultSpeaker[options.language];
        }
        if (!voice) {
            throw new Error('Please specify a speaker with --voice or a language with --language. Use --list-speakers to see available speakers.');
        }
        // The key is resolved only after the arguments were validated.
        const apiKey = getApiKey(options);
        await runCloudTts(text, {
            apiKey,
            voice,
            model: options.model,
            output: options.output,
        });
    };
    program
        .command('cloud-tts')
        .description('Generate speech using ListenHub OpenAPI')
        .argument('[text]', 'Text to synthesize')
        .option('--api-key <key>', 'ListenHub API key (or set COLI_LISTENHUB_API_KEY environment variable)')
        .option('--voice <id>', 'Speaker ID to use')
        .option('--model <name>', 'Model to use (default: flowtts)')
        .option('-o, --output <file>', 'Save audio to file')
        .option('--list-speakers', 'List available speakers')
        .option('--language <lang>', 'Speaker language (en, zh, ja)')
        .option('-j, --json', 'Output in JSON format (use with --list-speakers)')
        .action(handle);
}
| export { listSpeakers, runCloudTts, type CloudTtsOptions, type ListSpeakersOptions, } from './cloud-tts.js'; |
| export { listSpeakers, runCloudTts, } from './cloud-tts.js'; |
| import type { SpeakerLanguage } from '../_api/types.js'; | ||
| export type CloudTtsOptions = { | ||
| apiKey: string; | ||
| voice: string; | ||
| model?: string; | ||
| output?: string; | ||
| }; | ||
| export type ListSpeakersOptions = { | ||
| apiKey: string; | ||
| language?: SpeakerLanguage; | ||
| }; | ||
| export declare function listSpeakers(options: ListSpeakersOptions): Promise<{ | ||
| name: string; | ||
| speakerId: string; | ||
| demoAudioUrl: string; | ||
| gender: string; | ||
| language: SpeakerLanguage; | ||
| }[]>; | ||
| export declare function runCloudTts(text: string, options: CloudTtsOptions): Promise<void>; |
| import { Buffer } from 'node:buffer'; | ||
| import fs from 'node:fs'; | ||
| import os from 'node:os'; | ||
| import path from 'node:path'; | ||
| import { Writable } from 'node:stream'; | ||
| import { execa } from 'execa'; | ||
| import { ListenHubApi } from '../_api/listenhub-openapi.js'; | ||
/**
 * Fetch the available speakers from the ListenHub API.
 * @param options - `apiKey` for the client and an optional `language` filter.
 * @returns The `data.items` array of the API response.
 */
export async function listSpeakers(options) {
    const { apiKey, language } = options;
    const client = new ListenHubApi({ apiKey });
    const { data } = await client.getAvailableSpeakers({ language });
    return data.items;
}
/**
 * Read a web `ReadableStream` to the end and return its content as one buffer.
 * @param stream - Stream of byte chunks.
 * @returns A `Buffer` holding all chunks concatenated in order.
 */
async function collectStream(stream) {
    const reader = stream.getReader();
    const parts = [];
    let step = await reader.read();
    while (!step.done) {
        parts.push(step.value);
        step = await reader.read(); // eslint-disable-line no-await-in-loop
    }
    return Buffer.concat(parts);
}
/**
 * Synthesize `text` with the ListenHub TTS API and save or play the audio.
 *
 * With `options.output` the audio stream is piped into that file. Otherwise
 * the audio is written to a temporary MP3 file, played with `afplay`, and the
 * temporary file is deleted afterwards.
 * @param text - Text sent as the TTS `input`.
 * @param options - `apiKey`, `voice`, optional `model` and optional `output` path.
 */
export async function runCloudTts(text, options) {
    const { apiKey, voice, model, output } = options;
    const client = new ListenHubApi({ apiKey });
    const audioStream = await client.tts({ input: text, voice, model });
    if (output) {
        const target = Writable.toWeb(fs.createWriteStream(output));
        await audioStream.pipeTo(target);
        return;
    }
    const tempFile = path.join(os.tmpdir(), `coli-cloud-tts-${Date.now()}.mp3`);
    fs.writeFileSync(tempFile, await collectStream(audioStream));
    try {
        await execa('afplay', [tempFile]);
    }
    finally {
        fs.unlinkSync(tempFile);
    }
}
| export * from './_api/listenhub-openapi.js'; | ||
| export * from './asr/_index.js'; | ||
| export * from './cloud-tts/_index.js'; | ||
| export * from './tts/_index.js'; |
| export * from './_api/listenhub-openapi.js'; | ||
| export * from './asr/_index.js'; | ||
| export * from './cloud-tts/_index.js'; | ||
| export * from './tts/_index.js'; |
| import type { Command } from 'commander'; | ||
| export declare function register(program: Command): void; |
| import { getVoices, runTts } from './tts.js'; | ||
/**
 * Register the `tts` command on the given commander program.
 *
 * With `--list-voices` the command prints the available voices (as JSON with
 * `--json`, otherwise tab-separated) and stops. Otherwise it speaks `text`,
 * passing the parsed options straight to `runTts`.
 * @param program - The commander `Command` to extend.
 */
export function register(program) {
    const printVoices = async (asJson) => {
        const voices = await getVoices();
        if (asJson) {
            console.log(JSON.stringify(voices, null, 2));
            return;
        }
        for (const voice of voices) {
            console.log(`${voice.name}\t${voice.languageCode}\t${voice.example}`);
        }
    };
    const handle = async (text, options) => {
        if (options.listVoices) {
            await printVoices(options.json);
            return;
        }
        if (!text) {
            throw new Error('Please provide text to speak.');
        }
        await runTts(text, options);
    };
    program
        .command('tts')
        .description('Speak text using text-to-speech (macOS only)')
        .argument('[text]', 'Text to speak')
        .option('-v, --voice <name>', 'Voice to use, defaults to macOS system voice')
        .option('-r, --rate <wpm>', 'Speech rate in words per minute', Number)
        .option('-o, --output <file>', 'Save audio to file instead of speaking')
        .option('--list-voices', 'List available voices')
        .option('-j, --json', 'Output in JSON format (use with --list-voices)')
        .action(handle);
}
| export { getVoices, runTts, type TtsOptions } from './tts.js'; |
| export { getVoices, runTts } from './tts.js'; |
| import { type Voice } from 'mac-say'; | ||
| export type TtsOptions = { | ||
| voice?: string; | ||
| rate?: number; | ||
| output?: string; | ||
| }; | ||
| export declare function getVoices(): Promise<Voice[]>; | ||
| export declare function runTts(text: string, options?: TtsOptions): Promise<void>; |
| import { getVoices as macGetVoices, say } from 'mac-say'; | ||
/**
 * List the voices reported by the `mac-say` package.
 * @returns The value resolved by `mac-say`'s `getVoices`.
 */
export async function getVoices() {
    const voices = await macGetVoices();
    return voices;
}
/**
 * Speak `text` through `mac-say`.
 * @param text - The text handed to `say`.
 * @param options - Optional settings; `voice` and `rate` are forwarded as-is
 * and `output` is forwarded under the name `outputFile`.
 */
export async function runTts(text, options = {}) {
    const { voice, rate, output: outputFile } = options;
    await say(text, { voice, rate, outputFile });
}
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Environment variable access
Supply chain risk: Package accesses environment variables, which may be a sign of credential stuffing or data theft.
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
48419
3.87%789
3.54%4
33.33%3
50%Updated