@codexstar/pi-listen
Advanced tools
| /** | ||
| * First-run TTS onboarding helper. | ||
| * | ||
| * v7.0.0 ships the lightweight version: when a user enables TTS for the | ||
| * first time (and onboarding hasn't been completed), the orchestrator | ||
| * shows a single notify() with the smart-default recommendation and | ||
| * tells the user how to either accept it (run /voice-speak-test) or | ||
| * customize (run /voice-speak-models). | ||
| * | ||
| * Why not a multi-step picker overlay (the v7 plan's full vision): | ||
| * - The settings panel already exposes every knob with proper UX | ||
| * - A first-run popup that hijacks the editor on every initial enable | ||
| * is annoying for advanced users who already configured things via | ||
| * settings.json | ||
| * - The lightweight surface is honest: "here's the recommendation, | ||
| * here's where to change it" — and it composes with the rest of | ||
| * the v7 surface (Speak tab, /voice-speak-info) | ||
| * | ||
| * If field reports show users want a richer flow, we can swap this | ||
| * notify-based version for a `ctx.ui.custom()` overlay in v7.1 | ||
| * without changing any other code path. | ||
| */ | ||
| import type { ExtensionCommandContext, ExtensionContext } from "@mariozechner/pi-coding-agent"; | ||
| import type { VoiceConfig, VoiceSettingsScope } from "./config"; | ||
| import type { DeviceProfile } from "./device"; | ||
| import { recommendDefaultModel, isTtsModelInstalled, getTtsModel } from "./tts-local-models"; | ||
/** Either context flavor is accepted; the onboarding hint only touches `hasUI` and `ui.notify()`. */
type NotifyContext = ExtensionContext | ExtensionCommandContext;

/** Inputs for `maybeShowTtsOnboarding`. */
export interface OnboardTtsOpts {
  /** Context used to check whether a UI exists and to display the hint. */
  ctx: NotifyContext;
  /**
   * Voice config. Read for the "hint already shown" marker and the settings
   * scope; mutated (marker set) once the hint has been displayed.
   */
  config: VoiceConfig;
  /** Device profile; its `systemLocale` drives the model recommendation. */
  device: DeviceProfile;
  /** Forwarded unchanged as the third argument of `saveConfig`. */
  cwd: string;
  /** Persists the config after the marker has been set. */
  saveConfig: (config: VoiceConfig, scope: VoiceSettingsScope, cwd: string) => void;
}
/**
 * Show the first-run TTS onboarding hint, at most once per config.
 *
 * The hint is skipped when the context has no UI or when
 * `config.ttsOnboardingShown` is already set. Otherwise a single
 * `notify()` is emitted with the recommended model for the device locale
 * (falling back to "en" when the locale is unknown), its install status,
 * and the commands for trying, changing, diagnosing, or disabling TTS.
 *
 * Side effects when the hint is shown: `config.ttsOnboardingShown` is set
 * to `true` on the passed-in object and the config is persisted through
 * `saveConfig`, using the "project" scope only when `config.scope` is
 * "project" and "global" otherwise.
 *
 * @param opts Context, config, device profile, cwd, and persistence callback.
 * @returns `true` if the hint was shown by this call, `false` otherwise.
 */
export function maybeShowTtsOnboarding(opts: OnboardTtsOpts): boolean {
  const { ctx, config, device, cwd, saveConfig } = opts;
  if (!ctx.hasUI) return false;
  // `ttsOnboardingShown` is a declared VoiceConfig field — no cast needed.
  if (config.ttsOnboardingShown) return false;

  const recommendation = recommendDefaultModel(device.systemLocale ?? "en");

  // The catalog lookup may throw for an unknown id; in that case fall back
  // to printing the raw model id instead of the rich label.
  let recModel: ReturnType<typeof getTtsModel> | undefined;
  try {
    recModel = getTtsModel(recommendation.modelId);
  } catch {
    recModel = undefined;
  }
  const installed = isTtsModelInstalled(recommendation.modelId);

  const lines = [
    "TTS enabled — voice output for Pi.",
    "",
    ` ${recommendation.reason}`,
    "",
    recModel
      ? ` Recommended: ${recModel.name} (${recModel.size}, ${recModel.languages.join("/")})`
      : ` Recommended: ${recommendation.modelId}`,
    ` Status: ${installed ? "ready ✓" : `not installed — first speak downloads ${recModel?.size ?? "model"}`}`,
    "",
    " Try it: /voice-speak-test",
    " Pick another: /voice-speak-models (or /voice-settings → Speak tab)",
    " Diagnose: /voice-speak-info",
    " Disable: /voice-speak-toggle",
  ];
  if (recommendation.fallback) {
    lines.push("");
    lines.push(` Note: ${device.systemLocale ?? "your locale"} has no built-in voice — English fallback chosen.`);
  }
  ctx.ui.notify(lines.join("\n"), "info");

  // Mark the hint as shown (and persist) so subsequent enables are quiet.
  config.ttsOnboardingShown = true;
  saveConfig(config, config.scope === "project" ? "project" : "global", cwd);
  return true;
}
| /** | ||
| * Text preprocessing for TTS — strips formats that read aloud poorly, | ||
| * enforces length limits, and normalizes whitespace. | ||
| * | ||
| * Used by: | ||
| * - Auto-speak path (`speak.ts` → after_assistant_message): the agent's | ||
| * full response goes through `prepareForSpeech()` before synthesis. | ||
| * Critical because raw assistant output contains code fences, | ||
| * markdown links, ANSI escapes from prior tool output, and other | ||
| * forms that read as gibberish. | ||
| * - Manual `/voice-speak <text>` path: light normalization only — | ||
| * trim + collapse whitespace. Users typing explicit text don't want | ||
| * us second-guessing their input. | ||
| * | ||
| * Pure functions, no I/O, no global state. Easy to test against the | ||
| * regression cases locked in tests/tts-text-filter.test.ts. | ||
| * | ||
| * Design choices: | ||
| * - Code blocks are dropped entirely, not paraphrased. "function foo | ||
| * opens brace const x equals one closes brace" is worse than silence. | ||
| * Surface "[code block omitted]" once per response so users know | ||
| * content was skipped. | ||
| * - Markdown link syntax `[text](url)` collapses to `text` — URLs read | ||
| * as gibberish ("h-t-t-p-s-colon-slash-slash-...") and the link text | ||
| * is what the speaker meant. | ||
| * - ANSI escapes (color codes, cursor moves) are stripped — they leak | ||
| * in from quoted tool output and synthesize as noise. | ||
| * - Inline code spans (single backticks) are kept inline — "use the | ||
| * `useState` hook" reads naturally. Triple-backtick fences are the | ||
| * hard skip. | ||
| * - Length cap is enforced AFTER stripping, so a 5000-char response | ||
| * that's mostly code blocks may pass. | ||
| */ | ||
| // ─── Public API ─────────────────────────────────────────────────────────────── | ||
export interface PrepareForSpeechOpts {
  /**
   * Maximum characters in the output. If the cleaned text exceeds this,
   * `prepareForSpeech` returns a skipped result whose `reason` reports
   * the actual length and the cap. Defaults to 2000 when omitted or
   * `undefined`; pass `Infinity` to disable the cap.
   */
  maxChars?: number;
  /**
   * If true (default), each fenced code block is replaced by a
   * "[code block omitted]" placeholder. If false, fenced blocks are left
   * in the text and only the later generic steps apply to them.
   */
  stripCodeBlocks?: boolean;
  /**
   * If true (default), markdown links collapse to their link text and
   * image syntax is dropped. If false, link/image syntax is left in
   * place (raw URLs inside it are still replaced by the URL step).
   */
  collapseLinks?: boolean;
}

export interface PrepareForSpeechResult {
  /** True if the text was rejected (length cap, empty after stripping, etc). */
  skipped: boolean;
  /** Cleaned text ready for synthesis. Empty string when skipped. */
  text: string;
  /** Human-readable reason when skipped. */
  reason?: string;
  /** Diagnostic counts so callers can show "[N code blocks omitted]" hints. */
  stats: {
    codeBlocksRemoved: number;
    linksCollapsed: number;
    ansiEscapesRemoved: number;
    originalChars: number;
    finalChars: number;
  };
}

const DEFAULT_OPTS: Required<PrepareForSpeechOpts> = {
  maxChars: 2000,
  stripCodeBlocks: true,
  collapseLinks: true,
};

/**
 * Prepare assistant text for TTS synthesis by stripping forms that read
 * poorly aloud (ANSI escapes, fenced code, link/image syntax, HTML tags,
 * raw URLs, markdown markers) and enforcing a length cap on the result.
 *
 * @param input Raw assistant text. Non-string or empty input is skipped
 *   with reason "empty input".
 * @param opts Optional overrides; any option left `undefined` uses the
 *   default from `DEFAULT_OPTS`.
 * @returns The cleaned text plus diagnostic stats, or a skipped result
 *   (empty `text`, populated `reason`) when nothing speakable remains or
 *   the cleaned text exceeds `maxChars`.
 */
export function prepareForSpeech(input: string, opts: PrepareForSpeechOpts = {}): PrepareForSpeechResult {
  // Resolve per option with `??` rather than an object spread so that an
  // explicit `undefined` cannot override a default (which would silently
  // disable the length cap).
  const config: Required<PrepareForSpeechOpts> = {
    maxChars: opts.maxChars ?? DEFAULT_OPTS.maxChars,
    stripCodeBlocks: opts.stripCodeBlocks ?? DEFAULT_OPTS.stripCodeBlocks,
    collapseLinks: opts.collapseLinks ?? DEFAULT_OPTS.collapseLinks,
  };
  const stats = {
    codeBlocksRemoved: 0,
    linksCollapsed: 0,
    ansiEscapesRemoved: 0,
    originalChars: typeof input === "string" ? input.length : 0,
    finalChars: 0,
  };
  if (typeof input !== "string" || !input) {
    return { skipped: true, text: "", reason: "empty input", stats };
  }
  let text = input;

  // 1. Strip ANSI escape sequences. CSI patterns from tool output:
  //    - `\x1b[<digits>;<digits>m` (color/style)
  //    - `\x1b[<digits>;<digits>H` (cursor moves)
  //    - `\x1b]...\x07` (OSC sequences for window titles, hyperlinks)
  const ansiPattern = /\x1b\[[\d;?]*[A-Za-z]|\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)/g;
  const ansiMatches = text.match(ansiPattern);
  stats.ansiEscapesRemoved = ansiMatches?.length ?? 0;
  text = text.replace(ansiPattern, "");

  // 2. Drop fenced code blocks. Match ``` or ~~~ fences with an optional
  //    language tag. The body is matched lazily so each block ends at the
  //    first closing fence that starts a line.
  if (config.stripCodeBlocks) {
    const codeBlockPattern = /```[\w-]*\r?\n[\s\S]*?\r?\n```|~~~[\w-]*\r?\n[\s\S]*?\r?\n~~~/g;
    const codeMatches = text.match(codeBlockPattern);
    stats.codeBlocksRemoved = codeMatches?.length ?? 0;
    text = text.replace(codeBlockPattern, " [code block omitted] ");
  }

  // 3. Collapse markdown link syntax `[text](url)` → `text`.
  if (config.collapseLinks) {
    // Images: drop the entire `![alt](url)` form — alt text is usually
    // decorative ("a screenshot showing...") and rarely worth speaking.
    // Must run before the link rule, which would otherwise leave "!alt".
    text = text.replace(/!\[[^\]]*\]\([^)]+\)/g, "");
    // Regular links: keep the visible text only.
    text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, (_full, linkText: string) => {
      stats.linksCollapsed++;
      return linkText;
    });
  }

  // 4. Strip HTML tags that occasionally leak in from doc comments.
  //    Defensive — most assistant output is plain markdown.
  text = text.replace(/<\/?[a-zA-Z][^>]*>/g, " ");

  // 5. Strip raw URLs that aren't inside markdown link syntax. These
  //    read as gibberish aloud. Stop at whitespace; closing-paren is
  //    included as a stop char so URLs inside parenthetical asides
  //    like "(see https://x.dev/p)" don't swallow the closing `)`.
  text = text.replace(/https?:\/\/[^\s)]+/g, " [link omitted] ");

  // 6. Remove markdown emphasis markers, keeping the inner text.
  //    "**bold** text *italic* text" → "bold text italic text".
  //    Delimiters must hug the emphasized text (no whitespace just
  //    inside them), so a `*` list bullet or a spaced operator such as
  //    "2 * 3 * 4" is not mistaken for emphasis. Underscore delimiters
  //    must additionally not sit inside a word, so identifiers like
  //    `my_func_name` keep their underscores.
  text = text.replace(/\*\*(?=\S)([^*]+)(?<=\S)\*\*/g, "$1");
  text = text.replace(/(?<![\p{L}\p{N}])__(?=\S)([^_]+)(?<=\S)__(?![\p{L}\p{N}])/gu, "$1");
  text = text.replace(/\*(?=\S)([^*\n]+)(?<=\S)\*/g, "$1");
  text = text.replace(/(?<![\p{L}\p{N}_])_(?=\S)([^_\n]+)(?<=\S)_(?![\p{L}\p{N}_])/gu, "$1");

  // 7. Normalize headings — drop the `#` markers but keep the heading
  //    text as a sentence. "# Hello" → "Hello.".
  text = text.replace(/^#{1,6}\s+(.+?)$/gm, "$1.");

  // 8. Strip blockquote markers ("> quoted text" → "quoted text").
  text = text.replace(/^>\s+/gm, "");

  // 9. Strip horizontal rules.
  text = text.replace(/^[-*_]{3,}$/gm, "");

  // 10. Strip leading bullet markers from list items so "- foo" reads
  //     as "foo". Each item keeps its trailing newline, so items stay
  //     on separate lines rather than collapsing into one run-on.
  text = text.replace(/^[ \t]*[-*+][ \t]+/gm, "");

  // 11. Inline code spans: keep the inner text but drop backticks.
  //     "use `useState`" → "use useState".
  text = text.replace(/`([^`\n]+)`/g, "$1");

  // 12. Collapse whitespace. Runs of spaces/tabs become one space and
  //     runs of 3+ newlines become a paragraph break; single and double
  //     newlines are preserved as-is.
  text = text.replace(/[ \t]+/g, " ");
  text = text.replace(/\n{3,}/g, "\n\n");
  text = text.trim();

  stats.finalChars = text.length;
  if (!text) {
    return { skipped: true, text: "", reason: "empty after stripping", stats };
  }
  if (text.length > config.maxChars) {
    return {
      skipped: true,
      text: "",
      reason: `text length (${text.length}) exceeds maxChars (${config.maxChars})`,
      stats,
    };
  }
  return { skipped: false, text, stats };
}
/**
 * Minimal cleanup for the manual `/voice-speak <text>` path: every run of
 * spaces/tabs becomes a single space and the ends are trimmed. Newlines
 * inside the text, code, links, and ANSI sequences are left untouched —
 * the user typed exactly what they want spoken.
 *
 * @returns The normalized text, or "" when `input` is not a string.
 */
export function lightNormalize(input: string): string {
  if (typeof input === "string") {
    const collapsed = input.replace(/[ \t]+/g, " ");
    return collapsed.trim();
  }
  return "";
}
| // ─── BCP-47 normalization ───────────────────────────────────────────────────── | ||
/**
 * Canonicalize the casing of a BCP-47-ish language tag.
 *
 * Casing follows the registry-free recipe of RFC 5646 §2.1.1:
 * - the first subtag (primary language) is lowercase — `en`, `zh`
 * - a four-letter subtag is Title-case (script) — `Hant`, `Latn`
 * - a two-letter subtag is UPPERCASE (region) — `US`, `BR`; three-digit
 *   regions such as `419` pass through unchanged
 * - everything else (variants, singletons) is lowercase
 * - every subtag AFTER a singleton (a one-character subtag such as `u`
 *   or `x`) is lowercase regardless of length, so extension keys and
 *   private-use subtags are not mistaken for regions or scripts:
 *   `en-US-u-ca-gregory`, `en-CA-x-ca`, `az-Latn-x-latn`
 *
 * Underscores are accepted as separators (`en_US`, `zh_CN`) and empty
 * subtags are discarded. All non-empty subtags are preserved; only
 * casing and separators change.
 *
 * @param tag Language tag in any casing, `-` or `_` separated.
 * @returns The canonical tag, or "" for a non-string, empty, or
 *   separator-only input.
 */
export function normalizeBCP47(tag: string): string {
  if (typeof tag !== "string" || !tag) return "";
  const parts = tag.replace(/_/g, "-").split("-").filter(Boolean);
  if (parts.length === 0) return "";

  const out: string[] = [];
  // Set once a one-character subtag has been seen; from then on we are
  // inside an extension or private-use sequence.
  let afterSingleton = false;
  for (const [i, sub] of parts.entries()) {
    if (i === 0 || afterSingleton) {
      // Primary language, or anything following a singleton: lowercase.
      out.push(sub.toLowerCase());
    } else if (/^[A-Za-z]{4}$/.test(sub)) {
      // Script subtag: Title case (e.g. Hant, Latn, Cyrl)
      out.push(sub.charAt(0).toUpperCase() + sub.slice(1).toLowerCase());
    } else if (/^[A-Za-z]{2}$/.test(sub) || /^\d{3}$/.test(sub)) {
      // Region subtag: 2-letter alpha or 3-digit UN M.49 → uppercase
      out.push(sub.toUpperCase());
    } else {
      // Variant or singleton subtag: lowercase
      out.push(sub.toLowerCase());
    }
    if (sub.length === 1) afterSingleton = true;
  }
  return out.join("-");
}
/**
 * Return the primary language subtag of a tag after canonicalization:
 * `en-US` → `en`, `zh_CN` → `zh`. A tag without separators is returned
 * in canonical form as-is; an unusable tag yields "".
 */
export function baseLanguage(tag: string): string {
  const canonical = normalizeBCP47(tag);
  // Splitting on "-" always yields at least one element, and the first
  // one is everything before the first separator.
  const [primary = ""] = canonical.split("-");
  return primary;
}
@@ -84,2 +84,8 @@ import * as fs from "node:fs"; | ||
| ttsDeepgramStreaming?: boolean; | ||
| /** | ||
| * Set to true after `tts-onboarding.maybeShowTtsOnboarding()` has | ||
| * shown its first-run hint, so subsequent /voice-speak-toggle calls | ||
| * don't re-spam the same notification. New in v7.0.0. | ||
| */ | ||
| ttsOnboardingShown?: boolean; | ||
| } | ||
@@ -118,2 +124,3 @@ | ||
| ttsDeepgramStreaming: false, | ||
| ttsOnboardingShown: false, | ||
| onboarding: { | ||
@@ -204,2 +211,5 @@ completed: false, | ||
| : DEFAULT_CONFIG.ttsDeepgramStreaming, | ||
| ttsOnboardingShown: typeof rawVoice.ttsOnboardingShown === "boolean" | ||
| ? rawVoice.ttsOnboardingShown | ||
| : false, | ||
| onboarding: normalizeOnboarding(rawVoice.onboarding, fallbackCompleted), | ||
@@ -206,0 +216,0 @@ }; |
@@ -27,2 +27,12 @@ /** | ||
| import { getFreeDiskSpace, formatBytes, getModelsDir, scanHandyModels, importHandyModel } from "./model-download"; | ||
| import { | ||
| TTS_LOCAL_MODELS as TTS_LOCAL_MODELS_REF, | ||
| isTtsModelInstalled as TTS_INSTALLED_CHECK_REF, | ||
| type TtsLocalModelInfo, | ||
| type TtsVoice, | ||
| } from "./tts-local-models"; | ||
| import { | ||
| DEEPGRAM_TTS_VOICES, | ||
| filterDeepgramVoicesByLanguage, | ||
| } from "./tts-deepgram"; | ||
@@ -38,2 +48,3 @@ // ─── Types ──────────────────────────────────────────────────────────────────── | ||
| | { type: "speak-test" } | ||
| | { type: "tts-install"; modelId: string } | ||
| | undefined; | ||
@@ -89,3 +100,3 @@ | ||
| private row = 0; | ||
| private sub: "main" | "lang-picker" = "main"; | ||
| private sub: "main" | "lang-picker" | "tts-model-picker" | "tts-voice-picker" = "main"; | ||
@@ -107,2 +118,10 @@ // Models tab — grouped view | ||
| // TTS model sub-picker (Speak tab → Model row) | ||
| private ttsModelSearch = ""; | ||
| private ttsModelRow = 0; | ||
| // TTS voice sub-picker (Speak tab → Voice row) | ||
| private ttsVoiceSearch = ""; | ||
| private ttsVoiceRow = 0; | ||
| // Two-step delete on the Downloaded tab. When `x` is pressed, set the | ||
@@ -164,2 +183,10 @@ // pending modelId + expiry timestamp; a second `x` within DELETE_CONFIRM_MS | ||
| } | ||
| if (this.sub === "tts-model-picker") { | ||
| lines.push(...this.renderTtsModelPicker(w, iw).map(t)); | ||
| return lines; | ||
| } | ||
| if (this.sub === "tts-voice-picker") { | ||
| lines.push(...this.renderTtsVoicePicker(w, iw).map(t)); | ||
| return lines; | ||
| } | ||
@@ -194,2 +221,10 @@ // Tab content | ||
| } | ||
| if (this.sub === "tts-model-picker") { | ||
| this.handleTtsModelInput(data); | ||
| return; | ||
| } | ||
| if (this.sub === "tts-voice-picker") { | ||
| this.handleTtsVoiceInput(data); | ||
| return; | ||
| } | ||
@@ -516,8 +551,24 @@ const tabId = TAB_IDS[this.tab]!; | ||
| // Five rows mirroring the General tab pattern: | ||
| // Always-visible status line — single source of truth for the | ||
| // current TTS configuration so the user can scan it without | ||
| // reading every row. | ||
| const statusParts: string[] = []; | ||
| statusParts.push(isLocal ? "Local" : "Deepgram"); | ||
| statusParts.push(this.formatActiveModelOrVoice(config)); | ||
| statusParts.push(`${(config.ttsSpeed ?? 1.0).toFixed(2)}×`); | ||
| const lang = config.ttsLanguage || config.language || "en"; | ||
| statusParts.push(lang.toUpperCase()); | ||
| const statusBar = config.ttsEnabled | ||
| ? this.success("● ") + statusParts.join(this.dim(" · ")) | ||
| : this.dim("● disabled · ") + statusParts.join(this.dim(" · ")); | ||
| lines.push(` ${statusBar}`); | ||
| lines.push(""); | ||
| // Six rows: | ||
| // 0: Enabled toggle | ||
| // 1: Backend toggle | ||
| // 2: Voice (numeric sid for local; Deepgram model id otherwise) | ||
| // 3: Speed (cycles 0.5/1.0/1.25/1.5/2.0) | ||
| // 4: Test (synthesizes "The quick brown fox …") | ||
| // 2: Model picker (local) or read-only label (deepgram) | ||
| // 3: Voice picker (numeric sid for local; Aura voice id for deepgram) | ||
| // 4: Speed (cycles 0.5 / 0.75 / 1.0 / 1.25 / 1.5 / 2.0) | ||
| // 5: Test (synthesizes "The quick brown fox …") | ||
| const rows: { label: string; value: string; hint?: string }[] = [ | ||
@@ -537,12 +588,16 @@ { | ||
| { | ||
| label: "Voice", | ||
| label: "Model", | ||
| value: isLocal | ||
| ? `sid ${typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0}` | ||
| + ` (${config.ttsLocalModel ?? "kitten-nano-en-v0_2"})` | ||
| : (config.ttsDeepgramVoiceId ?? "aura-asteria-en"), | ||
| hint: "edit in settings.json", | ||
| ? this.formatLocalModelLabel(config.ttsLocalModel) | ||
| : this.dim("(deepgram backend — pick a voice instead)"), | ||
| hint: isLocal ? "pick model ›" : undefined, | ||
| }, | ||
| { | ||
| label: "Voice", | ||
| value: this.formatVoiceLabel(config), | ||
| hint: "pick voice ›", | ||
| }, | ||
| { | ||
| label: "Speed", | ||
| value: `${(config.ttsSpeed ?? 1.0).toFixed(2)}x`, | ||
| value: `${(config.ttsSpeed ?? 1.0).toFixed(2)}×`, | ||
| hint: "cycle", | ||
@@ -572,2 +627,38 @@ }, | ||
/**
 * Status-bar helper: one short string naming the active model and voice.
 *
 * Deepgram backend: the configured voice id (default "aura-asteria-en").
 * Local backend: "<model name> · <voice name>", degrading to the raw
 * model id and/or "sid N" when the catalog has no matching entry.
 */
private formatActiveModelOrVoice(config: VoiceConfig): string {
  if ((config.ttsBackend ?? "local") !== "local") {
    return config.ttsDeepgramVoiceId ?? "aura-asteria-en";
  }
  const modelId = config.ttsLocalModel ?? "kitten-nano-en-v0_2";
  const sid = typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0;
  // Keep the status compact — the full labels live in the rows below.
  const entry = TTS_LOCAL_MODELS_REF.find(m => m.id === modelId);
  const displayName = entry?.name ?? modelId;
  const matchedVoice = entry?.voices.find(v => v.sid === sid);
  const voicePart = matchedVoice ? matchedVoice.name : `sid ${sid}`;
  return `${displayName} · ${voicePart}`;
}
/**
 * Label for the Speak tab "Model" row: "<name> (<size>)" followed by an
 * installed check mark or a download notice. An id missing from the
 * catalog is shown verbatim; a missing id falls back to the default model.
 */
private formatLocalModelLabel(id: string | undefined): string {
  const modelId = id ?? "kitten-nano-en-v0_2";
  const entry = TTS_LOCAL_MODELS_REF.find(m => m.id === modelId);
  if (entry === undefined) return modelId;

  let suffix: string;
  if (TTS_INSTALLED_CHECK_REF(modelId)) {
    suffix = this.success(" ✓");
  } else {
    suffix = this.warning(" ⬇ download on select");
  }
  return `${entry.name} (${entry.size})${suffix}`;
}
/**
 * Label for the Speak tab "Voice" row. Deepgram: the configured voice id
 * (default "aura-asteria-en"). Local: "<voice name> (sid N)", or just
 * "sid N" when the model or voice is not found in the catalog.
 */
private formatVoiceLabel(config: VoiceConfig): string {
  if ((config.ttsBackend ?? "local") !== "local") {
    return config.ttsDeepgramVoiceId ?? "aura-asteria-en";
  }
  const modelId = config.ttsLocalModel ?? "kitten-nano-en-v0_2";
  const sid = typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0;
  const matchedVoice = TTS_LOCAL_MODELS_REF
    .find(m => m.id === modelId)
    ?.voices.find(v => v.sid === sid);
  if (matchedVoice) {
    return `${matchedVoice.name} (sid ${sid})`;
  }
  return `sid ${sid}`;
}
| // ─── Device tab ─────────────────────────────────────────────────────── | ||
@@ -777,8 +868,11 @@ | ||
| break; | ||
| case 2: // Voice — v6.0 ships read-only with edit-in-config hint | ||
| // In v6.1 this opens an inline picker; for now the | ||
| // recommended path is /voice-settings → close → edit | ||
| // settings.json directly. | ||
| case 2: // Model picker (local only — Deepgram has no model concept) | ||
| if ((config.ttsBackend ?? "local") === "local") { | ||
| this.openTtsModelPicker(); | ||
| } | ||
| break; | ||
| case 3: { // Speed cycle | ||
| case 3: // Voice picker | ||
| this.openTtsVoicePicker(); | ||
| break; | ||
| case 4: { // Speed cycle | ||
| const ladder = [0.75, 1.0, 1.25, 1.5, 2.0, 0.5]; | ||
@@ -791,3 +885,3 @@ const current = config.ttsSpeed ?? 1.0; | ||
| } | ||
| case 4: { // Test — emit a special panel-close action so the | ||
| case 5: { // Test — emit a special panel-close action so the | ||
| // caller (voice.ts:openSettingsPanel) can route it to | ||
@@ -868,2 +962,268 @@ // /voice-speak-test without us depending on the | ||
| // ─── TTS Model picker ────────────────────────────────────────────────── | ||
/**
 * Catalog entries matching the model picker's search box.
 *
 * An empty (or whitespace-only) query returns the catalog array itself.
 * Otherwise an entry matches when the lowercased query is a substring of
 * its name, id, notes, or language list. Recomputed on every call so the
 * list stays live with the search text.
 */
private getFilteredTtsModels(): TtsLocalModelInfo[] {
  const needle = this.ttsModelSearch.trim().toLowerCase();
  if (needle.length === 0) return TTS_LOCAL_MODELS_REF;
  return TTS_LOCAL_MODELS_REF.filter(m => {
    const haystack = `${m.name} ${m.id} ${m.notes} ${m.languages.join(" ")}`.toLowerCase();
    return haystack.includes(needle);
  });
}
/**
 * Enter the model sub-picker with an empty search and the cursor on the
 * currently configured model (first row when it is not in the catalog).
 */
private openTtsModelPicker(): void {
  const activeId = this.p.config.ttsLocalModel ?? "kitten-nano-en-v0_2";
  const position = TTS_LOCAL_MODELS_REF.findIndex(m => m.id === activeId);
  this.ttsModelSearch = "";
  // findIndex yields -1 when absent; clamp that to the first row.
  this.ttsModelRow = Math.max(position, 0);
  this.sub = "tts-model-picker";
}
/**
 * Render the model sub-picker: title, search line, a window of at most
 * 12 models around the cursor (the selected row also shows its notes),
 * an optional "showing a–b of n" line, and a key-hint footer.
 *
 * @param _w Unused.
 * @param iw Width used to size the name column (clamped to 18–28 chars).
 * @returns The rendered lines.
 */
private renderTtsModelPicker(_w: number, iw: number): string[] {
  const out: string[] = [];
  const activeId = this.p.config.ttsLocalModel ?? "kitten-nano-en-v0_2";
  const models = this.getFilteredTtsModels();

  // Header: title plus the live search text (or a dim placeholder).
  out.push(` ${this.bold("Pick TTS model")}`);
  const searchText = this.ttsModelSearch ? this.ttsModelSearch : this.dim("type to filter…");
  out.push(` ${this.dim("Search:")} ${searchText}`);
  out.push("");

  if (models.length === 0) {
    out.push(this.dim(" No matching models."));
    out.push("");
    out.push(this.dim(" esc back type to filter"));
    return out;
  }

  // Viewport window centered on the selection; pulled back toward the
  // start when the selection is close to the end of the list.
  const windowSize = 12;
  const count = models.length;
  const cursorRow = Math.min(this.ttsModelRow, count - 1);
  let first = Math.max(0, cursorRow - Math.floor(windowSize / 2));
  const last = Math.min(first + windowSize, count);
  if (last - first < windowSize) first = Math.max(0, last - windowSize);

  const nameColumn = Math.min(28, Math.max(18, iw - 32));
  for (const [offset, m] of models.slice(first, last).entries()) {
    const highlighted = first + offset === cursorRow;
    const isInstalled = TTS_INSTALLED_CHECK_REF(m.id);

    const marker = highlighted ? this.accent(" → ") : " ";
    const shownName = highlighted ? this.accent(m.name) : m.name;
    // Pad using the raw name length so styling codes don't skew columns.
    const padding = " ".repeat(Math.max(0, nameColumn - m.name.length));
    const sizeCol = this.dim(m.size.padStart(8));
    const langLabel = m.languages.length > 1 ? `${m.languages.length} langs` : m.languages[0]!;
    const langCol = this.dim(langLabel.padEnd(13));

    let statusCol: string;
    if (m.id === activeId) {
      statusCol = this.success("active");
    } else if (isInstalled) {
      statusCol = this.success("ready");
    } else {
      statusCol = this.warning("⬇ download");
    }

    out.push(`${marker}${shownName}${padding} ${sizeCol} ${langCol} ${statusCol}`);
    if (highlighted) {
      out.push(` ${this.dim(m.notes)}`);
    }
  }

  if (first > 0 || last < count) {
    out.push(this.dim(` showing ${first + 1}–${last} of ${count}`));
  }
  out.push("");

  // Footer: Enter either activates or downloads-then-activates.
  const highlightedModel = models[cursorRow];
  if (highlightedModel) {
    const enterHint = TTS_INSTALLED_CHECK_REF(highlightedModel.id)
      ? "activate"
      : `download (${highlightedModel.size}) + activate`;
    out.push(this.dim(` ↵ ${enterHint} esc back type to filter`));
  } else {
    out.push(this.dim(" esc back"));
  }
  return out;
}
/**
 * Key handling for the model sub-picker.
 *
 * - Escape: back to the main view.
 * - Up/Down: move the cursor, wrapping at either end of the filtered list.
 * - Enter: activate the highlighted model, reset the voice to that
 *   model's default sid, save, and return to the main view; when the
 *   model is not installed, additionally close the panel with a
 *   "tts-install" action.
 * - Backspace / printable ASCII: edit the search text and move the
 *   cursor back to the first row.
 */
private handleTtsModelInput(data: string): void {
  if (matchesKey(data, Key.escape)) {
    this.sub = "main";
    return;
  }

  const models = this.getFilteredTtsModels();
  const lastRow = models.length - 1;

  if (matchesKey(data, Key.up)) {
    if (models.length > 0) {
      if (this.ttsModelRow === 0) {
        this.ttsModelRow = lastRow;
      } else {
        this.ttsModelRow -= 1;
      }
    }
    return;
  }

  if (matchesKey(data, Key.down)) {
    if (models.length > 0) {
      if (this.ttsModelRow === lastRow) {
        this.ttsModelRow = 0;
      } else {
        this.ttsModelRow += 1;
      }
    }
    return;
  }

  if (matchesKey(data, Key.enter)) {
    const chosen = models[this.ttsModelRow];
    if (!chosen) return;
    this.p.config.ttsLocalModel = chosen.id;
    // Voices are per-model: switch to the new model's default voice.
    this.p.config.ttsLocalVoiceId = chosen.defaultSid;
    this.save();
    this.sub = "main";
    // A model that isn't installed is reported through the panel-close
    // callback so the caller can take care of installing it.
    if (!TTS_INSTALLED_CHECK_REF(chosen.id)) {
      this.onClose?.({ type: "tts-install", modelId: chosen.id });
    }
    return;
  }

  if (matchesKey(data, Key.backspace)) {
    this.ttsModelSearch = this.ttsModelSearch.slice(0, -1);
    this.ttsModelRow = 0;
    return;
  }

  // Single printable ASCII character: extend the search text.
  const isPrintable = data.length === 1 && data >= " " && data <= "~";
  if (isPrintable) {
    this.ttsModelSearch += data;
    this.ttsModelRow = 0;
  }
}
| // ─── TTS Voice picker ────────────────────────────────────────────────── | ||
/**
 * Voices selectable for the current backend, in a backend-neutral shape.
 *
 * Local: the voices of the configured model (numeric sid as `id`); empty
 * when the model id is not in the catalog.
 * Deepgram: voices filtered by the configured TTS language (falling back
 * to the general language, then "en"); the full voice list is used when
 * the filter matches nothing.
 */
private getCurrentVoiceCatalog(): { id: string | number; label: string; meta?: string }[] {
  const { config } = this.p;

  if ((config.ttsBackend ?? "local") !== "local") {
    // Deepgram: narrow the voices to the current language for relevance.
    const lang = config.ttsLanguage || config.language || "en";
    const byLanguage = filterDeepgramVoicesByLanguage(lang);
    const voices = byLanguage.length > 0 ? byLanguage : DEEPGRAM_TTS_VOICES;
    return voices.map(v => ({ id: v.id, label: v.name, meta: v.gender }));
  }

  const modelId = config.ttsLocalModel ?? "kitten-nano-en-v0_2";
  const entry = TTS_LOCAL_MODELS_REF.find(m => m.id === modelId);
  if (!entry) return [];
  return entry.voices.map((v: TtsVoice) => ({
    id: v.sid,
    label: v.name,
    meta: v.gender,
  }));
}
/**
 * Voices from the current catalog matching the voice search box. An
 * empty (or whitespace-only) query returns the catalog unfiltered;
 * otherwise the lowercased query must be a substring of the voice's
 * label, meta, or id.
 */
private getFilteredTtsVoices(): { id: string | number; label: string; meta?: string }[] {
  const catalog = this.getCurrentVoiceCatalog();
  const needle = this.ttsVoiceSearch.trim().toLowerCase();
  if (needle.length === 0) return catalog;
  return catalog.filter(v => {
    const haystack = `${v.label} ${v.meta ?? ""} ${v.id}`.toLowerCase();
    return haystack.includes(needle);
  });
}
/**
 * Enter the voice sub-picker with an empty search and the cursor on the
 * currently configured voice for the active backend (first row when it
 * is not in the catalog).
 */
private openTtsVoicePicker(): void {
  this.ttsVoiceSearch = "";
  const catalog = this.getCurrentVoiceCatalog();
  const { config } = this.p;

  let activeId: string | number;
  if ((config.ttsBackend ?? "local") === "local") {
    activeId = typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0;
  } else {
    activeId = config.ttsDeepgramVoiceId ?? "aura-asteria-en";
  }

  const position = catalog.findIndex(v => v.id === activeId);
  // findIndex yields -1 when absent; clamp that to the first row.
  this.ttsVoiceRow = Math.max(position, 0);
  this.sub = "tts-voice-picker";
}
/**
 * Render the voice sub-picker: backend-specific title, search line, a
 * window of at most 12 voices around the cursor (the configured voice is
 * check-marked), an optional "showing a–b of n" line, and a key-hint
 * footer.
 *
 * @param _w Unused.
 * @param _iw Unused.
 * @returns The rendered lines.
 */
private renderTtsVoicePicker(_w: number, _iw: number): string[] {
  const out: string[] = [];
  const voices = this.getFilteredTtsVoices();
  const { config } = this.p;
  const isLocal = (config.ttsBackend ?? "local") === "local";

  let activeId: string | number;
  if (isLocal) {
    activeId = typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0;
  } else {
    activeId = config.ttsDeepgramVoiceId ?? "aura-asteria-en";
  }

  // Header: title plus the live search text (or a dim placeholder).
  out.push(` ${this.bold(isLocal ? "Pick local voice" : "Pick Deepgram voice")}`);
  const searchText = this.ttsVoiceSearch ? this.ttsVoiceSearch : this.dim("type to filter…");
  out.push(` ${this.dim("Search:")} ${searchText}`);
  out.push("");

  if (voices.length === 0) {
    out.push(this.dim(" No matching voices."));
    out.push("");
    out.push(this.dim(" esc back type to filter"));
    return out;
  }

  // Viewport window centered on the selection; pulled back toward the
  // start when the selection is close to the end of the list.
  const windowSize = 12;
  const count = voices.length;
  const cursorRow = Math.min(this.ttsVoiceRow, count - 1);
  let first = Math.max(0, cursorRow - Math.floor(windowSize / 2));
  const last = Math.min(first + windowSize, count);
  if (last - first < windowSize) first = Math.max(0, last - windowSize);

  for (const [offset, v] of voices.slice(first, last).entries()) {
    const highlighted = first + offset === cursorRow;
    const marker = highlighted ? this.accent(" → ") : " ";
    // Local voices are identified by numeric sid, Deepgram by string id.
    const idText = typeof v.id === "number" ? `sid ${v.id}` : v.id;
    const shownLabel = highlighted ? this.accent(v.label) : v.label;
    const metaPart = v.meta ? this.dim(` (${v.meta})`) : "";
    const idPart = this.dim(` — ${idText}`);
    const checkPart = v.id === activeId ? this.success(" ✓") : "";
    out.push(`${marker}${shownLabel}${metaPart}${idPart}${checkPart}`);
  }

  if (first > 0 || last < count) {
    out.push(this.dim(` showing ${first + 1}–${last} of ${count}`));
  }
  out.push("");
  out.push(this.dim(" ↵ select esc back type to filter"));
  return out;
}
/**
 * Key handling for the voice sub-picker.
 *
 * - Escape: back to the main view.
 * - Up/Down: move the cursor, wrapping at either end of the filtered list.
 * - Enter: store the highlighted voice in the field matching the active
 *   backend (numeric sid for local, string id for Deepgram), save, and
 *   return to the main view.
 * - Backspace / printable ASCII: edit the search text and move the
 *   cursor back to the first row.
 */
private handleTtsVoiceInput(data: string): void {
  if (matchesKey(data, Key.escape)) {
    this.sub = "main";
    return;
  }

  const voices = this.getFilteredTtsVoices();
  const lastRow = voices.length - 1;

  if (matchesKey(data, Key.up)) {
    if (voices.length > 0) {
      if (this.ttsVoiceRow === 0) {
        this.ttsVoiceRow = lastRow;
      } else {
        this.ttsVoiceRow -= 1;
      }
    }
    return;
  }

  if (matchesKey(data, Key.down)) {
    if (voices.length > 0) {
      if (this.ttsVoiceRow === lastRow) {
        this.ttsVoiceRow = 0;
      } else {
        this.ttsVoiceRow += 1;
      }
    }
    return;
  }

  if (matchesKey(data, Key.enter)) {
    const chosen = voices[this.ttsVoiceRow];
    if (!chosen) return;
    const usingLocal = (this.p.config.ttsBackend ?? "local") === "local";
    // The id's runtime type must match the backend; a mismatch leaves
    // both voice fields untouched.
    if (usingLocal && typeof chosen.id === "number") {
      this.p.config.ttsLocalVoiceId = chosen.id;
    } else if (!usingLocal && typeof chosen.id === "string") {
      this.p.config.ttsDeepgramVoiceId = chosen.id;
    }
    this.save();
    this.sub = "main";
    return;
  }

  if (matchesKey(data, Key.backspace)) {
    this.ttsVoiceSearch = this.ttsVoiceSearch.slice(0, -1);
    this.ttsVoiceRow = 0;
    return;
  }

  // Single printable ASCII character: extend the search text.
  const isPrintable = data.length === 1 && data >= " " && data <= "~";
  if (isPrintable) {
    this.ttsVoiceSearch += data;
    this.ttsVoiceRow = 0;
  }
}
| // ─── Helpers ────────────────────────────────────────────────────────── | ||
@@ -880,3 +1240,3 @@ | ||
| } | ||
| case "speak": return 5; | ||
| case "speak": return 6; | ||
| case "device": return 0; | ||
@@ -883,0 +1243,0 @@ } |
@@ -252,2 +252,43 @@ /** | ||
| // ─── Warmup ────────────────────────────────────────────────────────────────── | ||
/**
 * Best-effort background warmup for local TTS.
 *
 * Loads the sherpa-onnx module via `loadSherpa()` and then awaits
 * `getOrCreateTts(model, modelDir)`, discarding the instance, so that the
 * first real `/voice-speak` does not pay the cold-start cost.
 *
 * Idempotence: this function adds no caching of its own. Repeated calls
 * are presumably cheap because `getOrCreateTts` reuses a cached or
 * in-flight instance — NOTE(review): confirm against `getOrCreateTts`.
 *
 * Cancellation: `opts.signal` is only consulted before the module load and
 * again before construction starts. Once `getOrCreateTts` has been called
 * it runs to completion regardless of the signal, and an abort during that
 * window still yields `true`.
 *
 * Errors: never rejects. Any thrown error is swallowed (nothing is logged
 * here) and reported as `false`; callers should expect the same failure to
 * resurface on the next real synthesis.
 *
 * @param model    Catalog entry to warm up.
 * @param modelDir Directory holding the installed model files.
 * @param opts     Optional abort signal.
 * @returns `true` when the module loaded and the TTS instance was created,
 *          `false` when cancelled early, when the module is unavailable,
 *          or on any error.
 */
export async function warmupTts(
	model: TtsLocalModelInfo,
	modelDir: string,
	opts: { signal?: AbortSignal } = {},
): Promise<boolean> {
	const cancelled = (): boolean => Boolean(opts.signal?.aborted);

	if (cancelled()) return false;

	try {
		const sherpaReady = await loadSherpa();
		if (!sherpaReady) return false;
		if (cancelled()) return false;
		await getOrCreateTts(model, modelDir);
	} catch {
		// Deliberate swallow: a backgrounded warmup must never surface as
		// an unhandled rejection.
		return false;
	}

	return true;
}
| // ─── Synthesis ─────────────────────────────────────────────────────────────── | ||
@@ -425,3 +466,3 @@ | ||
| const tokens = path.join(modelDir, "tokens.txt"); | ||
| const numThreads = getTtsThreads(); | ||
| const numThreads = getTtsThreads(model.sherpaSlot); | ||
@@ -497,7 +538,16 @@ switch (model.sherpaSlot) { | ||
| */ | ||
function getTtsThreads(
	slot: TtsLocalModelInfo["sherpaSlot"],
	cpuCount: number = os.cpus().length,
): number {
	// `slot` selects the per-model-family thread cap. `cpuCount` defaults to
	// the logical CPU count reported by the OS and can be overridden by a
	// caller that has its own thread budget.
	//
	// os.cpus() can report an empty list on some platforms; treat any
	// unusable count (0, negative, NaN) as a 2-CPU machine.
	const cpus = Number.isFinite(cpuCount) && cpuCount >= 1 ? Math.floor(cpuCount) : 2;

	// Small machines: stay well below the core count.
	if (cpus <= 2) return 1;
	if (cpus <= 4) return 2;

	// Per-slot tuning, mirroring the STT path's TRANSDUCER_MAX_THREADS=6
	// vs the Whisper-class cap of 4. Decisions per sherpa-onnx published
	// RTF curves and #2910 (CoreML regression for transformer graphs):
	//
	// - kitten (Kitten Nano TTS): small model, scales to 4 threads
	// - vits (Piper): single-speaker VITS, scales to 4
	// - kokoro (Kokoro v0.19/v1.0): larger transformer encoder, scales
	//   to ~6 P-cores on M-series
	const max = slot === "kokoro" ? 6 : 4;

	// Leave two cores free for the rest of the process.
	return Math.min(max, cpus - 2);
}
@@ -504,0 +554,0 @@ |
@@ -34,2 +34,3 @@ /** | ||
| import { spawn } from "node:child_process"; | ||
| import { createHash } from "node:crypto"; | ||
@@ -94,2 +95,11 @@ const TTS_RELEASE = "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models"; | ||
| archiveUrl: string; | ||
| /** | ||
| * Optional SHA-256 hex digest of the archive bytes for integrity | ||
| * verification. When set, ensureTtsModelInstalled() rejects a | ||
| * download whose computed hash differs. v7.0.0 ships catalog entries | ||
| * without hashes (we don't ship known-good values for sherpa-onnx | ||
| * releases yet); the verification pipeline runs anyway and produces | ||
| * a hash that can be pinned in v7.1+ to lock-in the bytes. | ||
| */ | ||
| archiveSha256?: string; | ||
| /** Sample rate (Hz) of generated audio. Drives WAV header on playback. */ | ||
@@ -287,2 +297,120 @@ sampleRate: number; | ||
| // ─── Smart default selection ───────────────────────────────────────────────── | ||
| /** | ||
| * Recommend an initial TTS model based on the user's system locale. | ||
| * | ||
| * Returns ONE catalog entry id — the recommendation, not an installation | ||
| * decision. The caller (onboarding picker, settings panel) presents this | ||
| * as a pre-highlighted suggestion with disclosure of size and language | ||
| * coverage. The user always confirms before download starts. | ||
| * | ||
| * Mapping rules: | ||
| * - English locale (en-*) → kitten-nano-en-v0_2 (smallest, 25 MB) | ||
| * - Single-language Piper match → that Piper voice (~20 MB each) | ||
| * - Multi-language locale that | ||
| * covers Kokoro → kokoro-int8-multi-lang-v1_0 (126 MB) | ||
| * - Locale with no coverage → kitten-nano-en-v0_2 + warn | ||
| * | ||
| * The single-Piper-match path is preferred over the multilingual Kokoro | ||
| * because Piper is 1/6 the size when only one language is needed. Kokoro | ||
| * is the right pick when the user reads multiple languages OR when no | ||
| * Piper voice exists for their locale. | ||
| */ | ||
/** Result of {@link recommendDefaultModel}: a suggestion, not an install decision. */
export interface SmartDefaultRecommendation {
	/** Catalog id of the suggested model. */
	modelId: string;
	/** Why this model was picked, surfaceable in onboarding UI. */
	reason: string;
	/** True iff no model in the catalog actually covers `locale`. */
	fallback: boolean;
}

/**
 * Per-language single-Piper-voice mapping — only languages where the
 * catalog has exactly one Piper voice for the language. Multi-region
 * languages (en, pt) are intentionally NOT here — those route through
 * either the en→kitten path or a dedicated branch in
 * `recommendDefaultModel`.
 *
 * Always look keys up with an own-property check: a plain index would also
 * resolve inherited names such as "constructor".
 */
const SINGLE_PIPER_BY_BASE_LANG: Readonly<Record<string, string>> = {
	es: "piper-es_ES-davefx-medium-int8",
	fr: "piper-fr_FR-siwis-medium-int8",
	de: "piper-de_DE-thorsten-medium-int8",
	hi: "piper-hi_IN-pratham-medium-int8",
	zh: "piper-zh_CN-chaowen-medium-int8",
	it: "piper-it_IT-paola-medium-int8",
	ru: "piper-ru_RU-denis-medium-int8",
	ar: "piper-ar_JO-kareem-medium-int8",
	tr: "piper-tr_TR-fahrettin-medium-int8",
	nl: "piper-nl_NL-pim-medium-int8",
};

/**
 * Recommend an initial TTS model for a system locale string.
 *
 * Accepts BCP-47 style (`fr-FR`) as well as POSIX style
 * (`fr_FR.UTF-8`, `de_DE@euro`) input; surrounding whitespace is ignored
 * and matching is case-insensitive.
 *
 * Resolution order:
 *  1. Missing / non-string / blank locale, or the POSIX pseudo-locales
 *     `C` and `POSIX` → `DEFAULT_TTS_MODEL`, `fallback: true`.
 *  2. `en` → `DEFAULT_TTS_MODEL`.
 *  3. `pt` → the Brazilian Piper voice; the reason says whether the locale
 *     carries a `BR` subtag ("exact match") or not ("closest available").
 *  4. A language with exactly one Piper voice → that voice.
 *  5. `ja` / `ko` → Kokoro multilingual.
 *  6. Anything else → `DEFAULT_TTS_MODEL`, `fallback: true`.
 *
 * @param systemLocale Locale string as reported by the environment.
 * @returns The suggested model id, a human-readable reason, and whether
 *          the suggestion is a fallback that does not cover the locale.
 */
export function recommendDefaultModel(systemLocale: string): SmartDefaultRecommendation {
	const locale = typeof systemLocale === "string" ? systemLocale.trim() : "";

	// Split on every separator used by BCP-47 and POSIX locale names so that
	// "en_US.UTF-8", "pt-BR" and "de_DE@euro" all yield a clean first subtag.
	const subtags = locale.toLowerCase().split(/[-_.@]/);
	const base = subtags[0] ?? "";

	// "C" / "POSIX" mean "no locale configured", not a language.
	if (locale === "" || base === "c" || base === "posix") {
		return {
			modelId: DEFAULT_TTS_MODEL,
			reason: "No system locale detected — defaulting to the smallest English model.",
			fallback: true,
		};
	}

	if (base === "en") {
		return {
			modelId: DEFAULT_TTS_MODEL,
			reason: `English locale detected — recommending ${DEFAULT_TTS_MODEL} (25 MB, 8 voices).`,
			fallback: false,
		};
	}

	// The catalog only has a Brazilian Portuguese Piper voice; use it for
	// every pt-* locale and disclose the regional gap in the reason.
	if (base === "pt") {
		const isBrazil = subtags.slice(1).includes("br");
		return {
			modelId: "piper-pt_BR-cadu-medium-int8",
			reason: `Portuguese locale detected — recommending Brazilian Portuguese voice (${
				isBrazil ? "exact match" : "closest available"
			}, 20 MB).`,
			fallback: false,
		};
	}

	// Own-property check keeps inherited keys ("constructor", "toString", …)
	// from being mistaken for catalog entries.
	const single = Object.prototype.hasOwnProperty.call(SINGLE_PIPER_BY_BASE_LANG, base)
		? SINGLE_PIPER_BY_BASE_LANG[base]
		: undefined;
	if (single) {
		return {
			modelId: single,
			reason: `${base.toUpperCase()} locale detected — recommending ${single} (~20 MB).`,
			fallback: false,
		};
	}

	// Languages covered only by Kokoro multilingual.
	if (base === "ja" || base === "ko") {
		return {
			modelId: "kokoro-int8-multi-lang-v1_0",
			reason: `${base.toUpperCase()} locale detected — recommending Kokoro multilingual (126 MB, ` +
				`covers en/zh/ja/ko/es/fr/hi/it/pt in one model).`,
			fallback: false,
		};
	}

	// No coverage — English default plus a message the caller can surface
	// verbatim.
	return {
		modelId: DEFAULT_TTS_MODEL,
		reason: `Locale ${locale} has no built-in TTS voice. Falling back to English (${DEFAULT_TTS_MODEL}). ` +
			`Browse /voice-settings → Speak tab → Models for the full catalog.`,
		fallback: true,
	};
}
| /** Look up a model by id; throws if unknown so callers fail loudly. */ | ||
@@ -415,2 +543,8 @@ export function getTtsModel(id: string): TtsLocalModelInfo { | ||
| export interface TtsInstallProgress { | ||
| /** | ||
| * - "download" — fetching archive bytes (with phase totals) | ||
| * - "verify" — hashing the downloaded archive (SHA-256) and checking it against the catalog digest when one is pinned | ||
| * - "extract" — running tar over the saved archive | ||
| * - "done" — install complete | ||
| */ | ||
| phase: "download" | "extract" | "verify" | "done"; | ||
@@ -422,9 +556,30 @@ bytes?: number; | ||
/**
 * What an install call hands back once the model is usable.
 *
 * The digest is exposed so that callers (and later catalog updates) can
 * pin known-good archive hashes.
 */
export interface TtsInstallResult {
	/** Final directory the model's files live in. */
	dir: string;
	/**
	 * SHA-256 hex digest of the downloaded archive. When the model was
	 * already installed nothing is downloaded, so this is the catalog's
	 * pinned digest, or an empty string if the catalog has none.
	 */
	archiveSha256: string;
}
| /** | ||
| * Download and extract `modelId` if not already installed. Idempotent — | ||
| * if already installed, resolves immediately. The download is streamed | ||
| * directly to `tar -xj` so we never buffer the whole archive in memory. | ||
| * if already installed, resolves immediately. | ||
| * | ||
| * The flow is download-to-disk-then-extract, not streaming-to-tar: | ||
| * 1. Resume-aware fetch → write archive bytes to | ||
| * `~/.pi/models/tts/<id>.partial.tar.bz2`. If the partial file | ||
| * exists from a prior interrupted run, send `Range: bytes=N-` and | ||
| * append. SHA-256 is computed across the full file by re-reading | ||
| * it once on completion (cheap — ~200ms for 126 MB on M-series). | ||
| * 2. If the catalog entry has `archiveSha256`, compare against the | ||
| * computed hash. Mismatch → reject + cleanup partial. | ||
| * 3. `tar -xj -f <archive> -C <stagingDir>` to extract. | ||
| * 4. Move staging contents to final `<modelDir>` via rename (atomic). | ||
| * 5. Delete the archive file. | ||
| * | ||
| * Errors: | ||
| * - "Network error: ..." on fetch failure | ||
| * - "Download failed: HTTP <status>" on non-2xx | ||
| * - "Download failed: HTTP <status>" on non-2xx (and not 206/200 retry) | ||
| * - "Network error: <message>" on fetch failure | ||
| * - "Archive integrity check failed: ..." on SHA-256 mismatch | ||
| * - "tar exited with code N" on extraction failure | ||
@@ -439,3 +594,3 @@ * - DOMException("AbortError") if signal fires | ||
| } = {}, | ||
| ): Promise<string> { | ||
| ): Promise<TtsInstallResult> { | ||
| const model = getTtsModel(modelId); | ||
@@ -446,97 +601,200 @@ const dir = getTtsModelDir(modelId); | ||
| opts.onProgress?.({ phase: "done" }); | ||
| return dir; | ||
| return { dir, archiveSha256: model.archiveSha256 ?? "" }; | ||
| } | ||
| if (opts.signal?.aborted) throw makeAbortErr(); | ||
| // Download to a temp file under tts/ then extract → atomic-ish: | ||
| // partial extracts go into a temp dir that we rename on success, so | ||
| // `isTtsModelInstalled` never sees a half-extracted state. | ||
| const ttsDir = getTtsModelsDir(); | ||
| fs.mkdirSync(ttsDir, { recursive: true }); | ||
| const stagingDir = `${dir}.staging-${process.pid}`; | ||
| fs.mkdirSync(stagingDir, { recursive: true }); | ||
| const archivePath = path.join(ttsDir, `${modelId}.partial.tar.bz2`); | ||
| try { | ||
| opts.onProgress?.({ phase: "download" }); | ||
| const res = await fetch(model.archiveUrl, { signal: opts.signal }); | ||
| if (!res.ok || !res.body) { | ||
| throw new Error(`Download failed: HTTP ${res.status} from ${model.archiveUrl}`); | ||
| // Phase 1 — download archive bytes (with resume). | ||
| await downloadArchive(model.archiveUrl, archivePath, opts); | ||
| if (opts.signal?.aborted) throw makeAbortErr(); | ||
| // Phase 2 — verify hash. | ||
| opts.onProgress?.({ phase: "verify" }); | ||
| const computedSha256 = await sha256OfFile(archivePath, opts.signal); | ||
| if (model.archiveSha256 && model.archiveSha256.toLowerCase() !== computedSha256.toLowerCase()) { | ||
| throw new Error( | ||
| `Archive integrity check failed for ${modelId}: ` + | ||
| `expected ${model.archiveSha256}, got ${computedSha256}. ` + | ||
| `Delete ${archivePath} and retry, or check for a corrupted upstream release.`, | ||
| ); | ||
| } | ||
| // Phase 3 — extract. | ||
| opts.onProgress?.({ phase: "extract", totalBytes: model.sizeBytes }); | ||
| // Stream the archive directly into `tar -xj -C <stagingDir>`. This | ||
| // works on macOS, Linux, and Windows 10+ (which ships bsdtar). | ||
| // Using stdin keeps the archive bytes off disk — important for | ||
| // a 126 MB Kokoro download. | ||
| const tar = spawn("tar", ["-xj", "-C", stagingDir], { | ||
| stdio: ["pipe", "ignore", "pipe"], | ||
| ...(opts.signal ? { signal: opts.signal } : {}), | ||
| }); | ||
| let tarStderr = ""; | ||
| tar.stderr?.on("data", (d: Buffer) => { | ||
| if (tarStderr.length < 1024) tarStderr += d.toString(); | ||
| }); | ||
| const stagingDir = `${dir}.staging-${process.pid}`; | ||
| fs.mkdirSync(stagingDir, { recursive: true }); | ||
| try { | ||
| await runTarExtract(archivePath, stagingDir, opts.signal); | ||
| const tarExit = new Promise<void>((resolve, reject) => { | ||
| tar.on("error", (err: NodeJS.ErrnoException) => { | ||
| if (err.name === "AbortError" || opts.signal?.aborted) reject(makeAbortErr()); | ||
| else reject(new Error(`tar failed to start: ${err.message}`)); | ||
| }); | ||
| tar.on("close", (code, sig) => { | ||
| if (code === 0) resolve(); | ||
| else if (opts.signal?.aborted) reject(makeAbortErr()); | ||
| else reject(new Error(`tar exited with code ${code}${sig ? ` (${sig})` : ""}: ${tarStderr.trim().slice(-200)}`)); | ||
| }); | ||
| }); | ||
| // Pipe response body → tar stdin. The web ReadableStream needs to | ||
| // be drained chunk-by-chunk; we write each chunk to the tar pipe | ||
| // and surface progress. | ||
| let bytesSeen = 0; | ||
| const reader = res.body.getReader(); | ||
| try { | ||
| while (true) { | ||
| if (opts.signal?.aborted) { | ||
| try { tar.kill("SIGTERM"); } catch {} | ||
| throw makeAbortErr(); | ||
| } | ||
| const { value, done } = await reader.read(); | ||
| if (done) break; | ||
| if (value) { | ||
| bytesSeen += value.byteLength; | ||
| opts.onProgress?.({ phase: "extract", bytes: bytesSeen, totalBytes: model.sizeBytes }); | ||
| // Write to tar's stdin — backpressure-respecting via the | ||
| // fact that tar.stdin is a Writable; if it's full this | ||
| // returns false but our small chunk size means it | ||
| // rarely matters. | ||
| tar.stdin?.write(Buffer.from(value)); | ||
| } | ||
| } | ||
| // Phase 4 — move into final location. The archive's top-level | ||
| // directory differs per model (e.g. | ||
| // `vits-piper-en_US-lessac-medium-int8/`). Flatten to | ||
| // `<modelDir>/tokens.txt` etc. | ||
| const stagingEntries = fs.readdirSync(stagingDir); | ||
| const innerDir = stagingEntries.length === 1 && fs.statSync(path.join(stagingDir, stagingEntries[0]!)).isDirectory() | ||
| ? path.join(stagingDir, stagingEntries[0]!) | ||
| : stagingDir; | ||
| // rename is atomic when innerDir and dir are on the same | ||
| // filesystem (~/.pi/models/tts/.staging is a sibling of dir). | ||
| fs.renameSync(innerDir, dir); | ||
| } finally { | ||
| try { reader.releaseLock(); } catch {} | ||
| try { fs.rmSync(stagingDir, { recursive: true, force: true }); } catch {} | ||
| } | ||
| tar.stdin?.end(); | ||
| await tarExit; | ||
| // The archive's top-level directory differs per model (e.g. | ||
| // `vits-piper-en_US-lessac-medium-int8/`). Move its contents | ||
| // up one level so we end up with a flat `<dir>/tokens.txt` etc. | ||
| opts.onProgress?.({ phase: "verify" }); | ||
| const stagingEntries = fs.readdirSync(stagingDir); | ||
| const innerDir = stagingEntries.length === 1 && fs.statSync(path.join(stagingDir, stagingEntries[0]!)).isDirectory() | ||
| ? path.join(stagingDir, stagingEntries[0]!) | ||
| : stagingDir; | ||
| fs.renameSync(innerDir, dir); | ||
| // Phase 5 — clean up the archive file. Successful install means we | ||
| // no longer need the partial; resume is moot. | ||
| try { fs.unlinkSync(archivePath); } catch {} | ||
| opts.onProgress?.({ phase: "done" }); | ||
| return dir; | ||
| return { dir, archiveSha256: computedSha256 }; | ||
| } catch (err) { | ||
| // Best-effort cleanup of partial state. | ||
| try { fs.rmSync(stagingDir, { recursive: true, force: true }); } catch {} | ||
| // On failure, leave the partial archive file in place so the | ||
| // next attempt can resume. But clean up the destination dir if | ||
| // extraction created it. | ||
| try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} | ||
| throw err; | ||
| } | ||
| } | ||
/**
 * Download `url` to `archivePath`, resuming a previous partial download
 * when possible.
 *
 * Resume protocol:
 *  - If `archivePath` already holds N > 0 bytes, the request carries
 *    `Range: bytes=N-`.
 *  - 206 Partial Content whose `Content-Range` starts at N (or omits the
 *    header) → the body is appended to the existing file.
 *  - 200 OK (or any other 2xx) → the server ignored the range; the file
 *    is overwritten from byte 0.
 *  - 416 Range Not Satisfiable, or a 206 that starts at a different
 *    offset → the partial file cannot be trusted or extended. It is
 *    deleted and the download restarts once from byte 0, so a stale or
 *    already-complete partial file can never wedge the installer.
 *
 * Progress: `opts.onProgress` receives `phase: "download"` events. `bytes`
 * includes what was already on disk; `totalBytes` is that amount plus the
 * response's `Content-Length` (0 when the header is missing or invalid).
 *
 * On failure the partial file is left in place so the next attempt can
 * resume from it.
 *
 * @throws Error("Network error: …") when the request itself fails.
 * @throws Error("Download failed: HTTP <status> …") on a non-2xx response.
 * @throws Error("Download failed: empty body …") when the response has no body.
 * @throws the abort error when `opts.signal` fires.
 * @throws the underlying stream error when writing to disk fails.
 */
async function downloadArchive(
	url: string,
	archivePath: string,
	opts: { signal?: AbortSignal; onProgress?: (info: TtsInstallProgress) => void },
): Promise<void> {
	let existingBytes = 0;
	try {
		existingBytes = fs.statSync(archivePath).size;
	} catch {
		// No partial file on disk — start from byte 0.
	}

	// Issue the GET, optionally resuming at `offset`. Transport failures are
	// normalized to "Network error: …"; aborts pass through untouched.
	const request = async (offset: number): Promise<Response> => {
		const headers: Record<string, string> = {};
		if (offset > 0) headers.Range = `bytes=${offset}-`;
		try {
			return await fetch(url, { signal: opts.signal, headers });
		} catch (err: unknown) {
			if ((err as { name?: unknown } | null)?.name === "AbortError") throw err;
			const message = err instanceof Error ? err.message : String(err);
			throw new Error(`Network error: ${message}`);
		}
	};

	let res = await request(existingBytes);

	if (existingBytes > 0) {
		// Content-Range looks like "bytes <start>-<end>/<total>".
		const rangeStart = /^bytes (\d+)-/.exec(res.headers.get("content-range") ?? "")?.[1];
		const wrongOffset =
			res.status === 206 && rangeStart !== undefined && Number(rangeStart) !== existingBytes;
		if (res.status === 416 || wrongOffset) {
			// The partial file is not a prefix the server can extend.
			// Throw it away and fetch the whole archive again.
			await res.body?.cancel().catch(() => {});
			fs.rmSync(archivePath, { force: true });
			existingBytes = 0;
			res = await request(0);
		}
	}

	if (!res.ok) {
		await res.body?.cancel().catch(() => {});
		throw new Error(`Download failed: HTTP ${res.status} from ${url}`);
	}
	if (!res.body) throw new Error(`Download failed: empty body from ${url}`);

	// Only a 206 answering our Range request may be appended; every other
	// success response carries the archive from byte 0.
	const appendMode = res.status === 206 && existingBytes > 0;
	if (!appendMode) existingBytes = 0;

	// Total = bytes already on disk + Content-Length of this response.
	const contentLength = Number.parseInt(res.headers.get("content-length") ?? "", 10);
	const totalBytes =
		existingBytes + (Number.isFinite(contentLength) && contentLength > 0 ? contentLength : 0);

	const sink = fs.createWriteStream(archivePath, { flags: appendMode ? "a" : "w" });

	// Keep an `error` listener attached for the stream's whole lifetime:
	// without one, a disk error (ENOSPC, EACCES, …) would be an unhandled
	// 'error' event and take the process down.
	const sinkState: { error?: Error } = {};
	sink.on("error", (err: Error) => {
		sinkState.error = err;
	});

	// Resolve on 'drain', reject on 'error' — whichever comes first — so a
	// failed write can never leave the loop waiting forever.
	const waitForDrain = (): Promise<void> =>
		new Promise<void>((resolve, reject) => {
			const onDrain = (): void => {
				sink.off("error", onError);
				resolve();
			};
			const onError = (err: Error): void => {
				sink.off("drain", onDrain);
				reject(err);
			};
			sink.once("drain", onDrain);
			sink.once("error", onError);
		});

	let bytesSeen = existingBytes;
	opts.onProgress?.({ phase: "download", bytes: bytesSeen, totalBytes });

	const reader = res.body.getReader();
	let failed = false;
	let failure: unknown;
	try {
		while (true) {
			if (opts.signal?.aborted) throw makeAbortErr();
			if (sinkState.error) throw sinkState.error;
			const { value, done } = await reader.read();
			if (done) break;
			if (!value) continue;
			bytesSeen += value.byteLength;
			// Honor backpressure: if write returns false, wait for `drain`.
			if (!sink.write(Buffer.from(value))) await waitForDrain();
			opts.onProgress?.({ phase: "download", bytes: bytesSeen, totalBytes });
		}
	} catch (err: unknown) {
		failed = true;
		failure = err;
		// Stop pulling bytes we are not going to store.
		await reader.cancel().catch(() => {});
	} finally {
		try { reader.releaseLock(); } catch {}
	}

	// Flush and close the file before reporting the outcome. A stream that
	// already errored is destroyed and needs no further closing.
	await new Promise<void>((resolve) => {
		if (sink.destroyed) {
			resolve();
			return;
		}
		sink.once("close", () => resolve());
		sink.end(() => resolve());
	});

	// The download-loop error wins; a failure during the final flush is
	// reported only when the loop itself succeeded.
	if (failed) throw failure;
	if (sinkState.error) throw sinkState.error;
}
/**
 * Compute the SHA-256 hex digest of `filePath`.
 *
 * The file is streamed through the hash, so memory use does not grow with
 * the archive size.
 *
 * @param filePath File to hash.
 * @param signal   Optional abort signal. A signal that is already aborted
 *                 rejects immediately without opening the file; an abort
 *                 while hashing destroys the read stream and rejects.
 * @returns Lowercase hex digest.
 * @throws the abort error from `makeAbortErr()` when `signal` is or becomes
 *         aborted; the underlying stream error when reading fails.
 */
async function sha256OfFile(filePath: string, signal?: AbortSignal): Promise<string> {
	// An `abort` listener never fires for a signal that has already been
	// aborted, so that case must be checked up front.
	if (signal?.aborted) throw makeAbortErr();

	const hash = createHash("sha256");
	const stream = fs.createReadStream(filePath);

	// Destroying with an error makes the async iteration below reject with
	// exactly that error.
	const onAbort = (): void => {
		stream.destroy(makeAbortErr());
	};
	signal?.addEventListener("abort", onAbort, { once: true });

	try {
		for await (const chunk of stream) {
			hash.update(chunk);
		}
	} finally {
		signal?.removeEventListener("abort", onAbort);
	}

	return hash.digest("hex");
}
/**
 * Extract a bzip2-compressed tar archive into `stagingDir` by running
 * `tar -xj -f <archivePath> -C <stagingDir>`.
 *
 * Resolves when tar exits with code 0. Rejects with:
 *  - the abort error when `signal` fired (spawn error or non-zero exit),
 *  - "tar failed to start: …" when the process could not be spawned,
 *  - "tar exited with code N …" otherwise, followed by the last 200
 *    characters of the captured stderr.
 */
async function runTarExtract(archivePath: string, stagingDir: string, signal?: AbortSignal): Promise<void> {
	const child = spawn("tar", ["-xj", "-f", archivePath, "-C", stagingDir], {
		stdio: ["ignore", "ignore", "pipe"],
		...(signal ? { signal } : {}),
	});

	// Stop accumulating stderr once roughly 1 KiB has been captured; only
	// the tail is ever shown.
	let captured = "";
	child.stderr?.on("data", (chunk: Buffer) => {
		if (captured.length >= 1024) return;
		captured += chunk.toString();
	});

	const wasAborted = (): boolean => Boolean(signal?.aborted);

	return new Promise<void>((resolve, reject) => {
		child.on("error", (err: NodeJS.ErrnoException) => {
			reject(
				err.name === "AbortError" || wasAborted()
					? makeAbortErr()
					: new Error(`tar failed to start: ${err.message}`),
			);
		});

		child.on("close", (code, sig) => {
			if (code === 0) {
				resolve();
				return;
			}
			if (wasAborted()) {
				reject(makeAbortErr());
				return;
			}
			const sigNote = sig ? ` (${sig})` : "";
			reject(new Error(`tar exited with code ${code}${sigNote}: ${captured.trim().slice(-200)}`));
		});
	});
}
| function makeAbortErr(): Error { | ||
@@ -543,0 +801,0 @@ if (typeof DOMException === "function") { |
+1
-1
| { | ||
| "name": "@codexstar/pi-listen", | ||
| "version": "6.0.0", | ||
| "version": "7.0.0", | ||
| "description": "Voice in + voice out for Pi CLI — hold-to-talk STT (Deepgram or 19 offline models) plus TTS (Kitten Nano, Piper, Kokoro, or Deepgram Aura)", | ||
@@ -5,0 +5,0 @@ "type": "module", |
+9
-8
@@ -15,10 +15,11 @@ [English](README.md) | [简体中文](README.zh-CN.md) | [日本語](README.ja.md) | [한국어](README.ko.md) | [Español](README.es.md) | [Français](README.fr.md) | [Português](README.pt-BR.md) | [हिन्दी](README.hi.md) | ||
| > **v6.0.0 — TTS! pi-listen now speaks back** — voice in + voice out in one | ||
| > extension. Hold-to-talk STT unchanged; new `/voice-speak <text>` synthesizes | ||
| > and plays via 12 local models (Kitten Nano 25 MB default, Piper per-language | ||
| > voices, Kokoro multilingual) or Deepgram Aura (cloud, same `DEEPGRAM_API_KEY` | ||
| > as STT). Region-strict language matching, sentence-aware chunking via | ||
| > `Intl.Segmenter` (no more breaking on `Dr. Smith` / `v2.0` / URLs), | ||
| > cross-platform audio playback with cooperative abort. Plus the cleaner | ||
| > v5.1 settings panel grouped by model family. [Full changelog →](CHANGELOG.md) | ||
| > **v7.0.0 — World-class TTS UX** — pick models from `/voice-settings` Speak | ||
| > tab (no more JSON editing), auto-download on selection with progress, voice | ||
| > picker for every backend, first-run onboarding with smart-default | ||
| > recommendation by your system locale, and `ttsAutoSpeak: true` finally | ||
| > works — auto-speaks the agent's responses with code-block stripping and | ||
| > rate limiting. Diagnostic command `/voice-speak-info` shows everything. | ||
| > Resume-on-interrupt downloads. Plus all v6 features (14 local models from | ||
| > 25 MB Kitten Nano up, Deepgram Aura cloud, region-strict language matching, | ||
| > sentence-aware chunking). [Full changelog →](CHANGELOG.md) | ||
@@ -25,0 +26,0 @@ --- |
Sorry, the diff of this file is too big to display
URL strings
Supply chain riskPackage contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
URL strings
Supply chain riskPackage contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
523776
9.79%28
7.69%9252
13.34%310
0.32%