@codexstar/pi-listen - npm Package Compare versions

Comparing version

6.0.0

7.0.0

+83

extensions/voice/tts-onboarding.ts

		/**
		* First-run TTS onboarding helper.
		*
		* v7.0.0 ships the lightweight version: when a user enables TTS for the
		* first time (and onboarding hasn't been completed), the orchestrator
		* shows a single notify() with the smart-default recommendation and
		* tells the user how to either accept it (run /voice-speak-test) or
		* customize (run /voice-speak-models).
		*
		* Why not a multi-step picker overlay (the v7 plan's full vision):
		* - The settings panel already exposes every knob with proper UX
		* - A first-run popup that hijacks the editor on every initial enable
		* is annoying for advanced users who already configured things via
		* settings.json
		* - The lightweight surface is honest: "here's the recommendation,
		* here's where to change it" — and it composes with the rest of
		* the v7 surface (Speak tab, /voice-speak-info)
		*
		* If field reports show users want a richer flow, we can swap this
		* notify-based version for a `ctx.ui.custom()` overlay in v7.1
		* without changing any other code path.
		*/

		import type { ExtensionCommandContext, ExtensionContext } from "@mariozechner/pi-coding-agent";
		import type { VoiceConfig, VoiceSettingsScope } from "./config";
		import type { DeviceProfile } from "./device";
		import { recommendDefaultModel, isTtsModelInstalled, getTtsModel } from "./tts-local-models";

		/** Either extension context flavor; the onboarding hint only reads `hasUI` and calls `ui.notify()` on it. */
		type NotifyContext = ExtensionContext | ExtensionCommandContext;

		/** Inputs for `maybeShowTtsOnboarding()`. */
		export interface OnboardTtsOpts {
		/** Context used to check `hasUI` and to emit the notification. */
		ctx: NotifyContext;
		/** Voice config; the onboarding marker is read from and written back to this object. */
		config: VoiceConfig;
		/** Device profile; its `systemLocale` drives the model recommendation. */
		device: DeviceProfile;
		/** Passed through unchanged as the third argument of `saveConfig`. */
		cwd: string;
		/** Persistence callback invoked once after the hint has been shown. */
		saveConfig: (config: VoiceConfig, scope: VoiceSettingsScope, cwd: string) => void;
		}

		/**
		 * Show the first-run TTS onboarding hint, at most once per config.
		 *
		 * The `ttsOnboardingShown` flag on the voice config is the "already
		 * shown" marker: when it is truthy the function is a no-op. After the
		 * hint is displayed the flag is set on `config` (the object is mutated)
		 * and persisted through `saveConfig`, using the "project" scope when
		 * `config.scope === "project"` and "global" otherwise.
		 *
		 * Nothing is shown (and nothing is saved) when the context has no UI.
		 *
		 * @param opts Context, config, device profile, cwd and save callback.
		 * @returns true if the hint was shown during this call, false otherwise.
		 */
		export function maybeShowTtsOnboarding(opts: OnboardTtsOpts): boolean {
			const { ctx, config, device, cwd, saveConfig } = opts;
			if (!ctx.hasUI) return false;
			if (config.ttsOnboardingShown) return false;

			const recommendation = recommendDefaultModel(device.systemLocale ?? "en");

			// Catalog lookup is best-effort: if it throws we fall back to
			// printing the bare model id instead of failing the whole hint.
			let recModel: ReturnType<typeof getTtsModel> | undefined;
			try {
				recModel = getTtsModel(recommendation.modelId);
			} catch {
				recModel = undefined;
			}
			const installed = isTtsModelInstalled(recommendation.modelId);

			const lines = [
				"TTS enabled — voice output for Pi.",
				"",
				` ${recommendation.reason}`,
				"",
				recModel
					? ` Recommended: ${recModel.name} (${recModel.size}, ${recModel.languages.join("/")})`
					: ` Recommended: ${recommendation.modelId}`,
				` Status: ${installed ? "ready ✓" : `not installed — first speak downloads ${recModel?.size ?? "model"}`}`,
				"",
				" Try it: /voice-speak-test",
				" Pick another: /voice-speak-models (or /voice-settings → Speak tab)",
				" Diagnose: /voice-speak-info",
				" Disable: /voice-speak-toggle",
			];
			if (recommendation.fallback) {
				lines.push("");
				lines.push(` Note: ${device.systemLocale ?? "your locale"} has no built-in voice — English fallback chosen.`);
			}

			ctx.ui.notify(lines.join("\n"), "info");

			// Mark the hint as shown (and persist) so subsequent enables are quiet.
			config.ttsOnboardingShown = true;
			saveConfig(config, config.scope === "project" ? "project" : "global", cwd);
			return true;
		}

+259

extensions/voice/tts-text-filter.ts

		/**
		* Text preprocessing for TTS — strips formats that read aloud poorly,
		* enforces length limits, and normalizes whitespace.
		*
		* Used by:
		* - Auto-speak path (`speak.ts` → after_assistant_message): the agent's
		* full response goes through `prepareForSpeech()` before synthesis.
		* Critical because raw assistant output contains code fences,
		* markdown links, ANSI escapes from prior tool output, and other
		* forms that read as gibberish.
		* - Manual `/voice-speak <text>` path: light normalization only —
		* trim + collapse whitespace. Users typing explicit text don't want
		* us second-guessing their input.
		*
		* Pure functions, no I/O, no global state. Easy to test against the
		* regression cases locked in tests/tts-text-filter.test.ts.
		*
		* Design choices:
		* - Code blocks are dropped entirely, not paraphrased. "function foo
		* opens brace const x equals one closes brace" is worse than silence.
		* Each dropped block is replaced by "[code block omitted]" so users
		* know content was skipped.
		* - Markdown link syntax `[text](url)` collapses to `text` — URLs read
		* as gibberish ("h-t-t-p-s-colon-slash-slash-...") and the link text
		* is what the speaker meant.
		* - ANSI escapes (color codes, cursor moves) are stripped — they leak
		* in from quoted tool output and synthesize as noise.
		* - Inline code spans (single backticks) are kept inline — "use the
		* `useState` hook" reads naturally. Triple-backtick fences are the
		* hard skip.
		* - Length cap is enforced AFTER stripping, so a 5000-char response
		* that's mostly code blocks may pass.
		*/

		// ─── Public API ───────────────────────────────────────────────────────────────

		export interface PrepareForSpeechOpts {
			/**
			 * Maximum characters in the output. If the cleaned text exceeds this,
			 * `prepareForSpeech` returns a skipped result whose `reason` reports
			 * both lengths. Defaults to 2000; pass `Infinity` to disable the cap.
			 */
			maxChars?: number;
			/**
			 * If true (default), each fenced code block is replaced by the
			 * placeholder " [code block omitted] ". If false, fenced blocks are
			 * left in the text untouched.
			 */
			stripCodeBlocks?: boolean;
			/**
			 * If true (default), `[text](url)` collapses to `text` and image
			 * syntax `![alt](url)` is removed. If false, the link syntax is left
			 * in place; note that the raw-URL step still replaces the URL itself
			 * with " [link omitted] ".
			 */
			collapseLinks?: boolean;
		}

		export interface PrepareForSpeechResult {
			/** True if the text was rejected (length cap, empty input, empty after stripping). */
			skipped: boolean;
			/** Cleaned text ready for synthesis. Empty string when skipped. */
			text: string;
			/** Human-readable reason when skipped. */
			reason?: string;
			/** Diagnostic counts so callers can show "[N code blocks omitted]" hints. */
			stats: {
				codeBlocksRemoved: number;
				linksCollapsed: number;
				ansiEscapesRemoved: number;
				originalChars: number;
				finalChars: number;
			};
		}

		const DEFAULT_OPTS: Required<PrepareForSpeechOpts> = {
			maxChars: 2000,
			stripCodeBlocks: true,
			collapseLinks: true,
		};

		/**
		 * Prepare assistant text for TTS synthesis.
		 *
		 * Applies, in order: ANSI escape removal, fenced-code-block removal,
		 * markdown link/image collapsing, HTML tag removal, raw URL removal,
		 * emphasis/heading/blockquote/rule/bullet/inline-code marker removal,
		 * and whitespace normalization. The length cap is checked last, on the
		 * cleaned text.
		 *
		 * @param input Raw text. Non-string or empty input yields a skipped result.
		 * @param opts  Optional overrides; an omitted or `undefined` field falls
		 *              back to the default (2000 chars, strip code, collapse links).
		 * @returns The cleaned text plus diagnostic stats, or a skipped result
		 *          with a `reason`.
		 */
		export function prepareForSpeech(input: string, opts: PrepareForSpeechOpts = {}): PrepareForSpeechResult {
			// Resolve per field with `??` so an explicit `undefined` cannot
			// silently disable a default (a plain spread would let it through).
			const maxChars = opts.maxChars ?? DEFAULT_OPTS.maxChars;
			const stripCodeBlocks = opts.stripCodeBlocks ?? DEFAULT_OPTS.stripCodeBlocks;
			const collapseLinks = opts.collapseLinks ?? DEFAULT_OPTS.collapseLinks;

			const stats = {
				codeBlocksRemoved: 0,
				linksCollapsed: 0,
				ansiEscapesRemoved: 0,
				originalChars: typeof input === "string" ? input.length : 0,
				finalChars: 0,
			};

			if (typeof input !== "string" || !input) {
				return { skipped: true, text: "", reason: "empty input", stats };
			}

			let text = input;

			// 1. Strip ANSI escape sequences from quoted tool output:
			//    - CSI: `\x1b[` + any run of digits / `;` / `?` + a final letter
			//      (colors, styles, cursor moves)
			//    - OSC: `\x1b]` ... terminated by BEL (`\x07`) or ST (`\x1b\`)
			//      (window titles, hyperlinks)
			const ansiPattern = /\x1b\[[\d;?]*[A-Za-z]|\x1b\][^\x07\x1b]*(?:\x07|\x1b\\)/g;
			stats.ansiEscapesRemoved = text.match(ansiPattern)?.length ?? 0;
			text = text.replace(ansiPattern, "");

			// 2. Drop fenced code blocks: ``` or ~~~ fences with an optional
			//    language tag. The body match is lazy so two separate blocks are
			//    not merged into one.
			if (stripCodeBlocks) {
				const codeBlockPattern = /```[\w-]*\r?\n[\s\S]*?\r?\n```|~~~[\w-]*\r?\n[\s\S]*?\r?\n~~~/g;
				stats.codeBlocksRemoved = text.match(codeBlockPattern)?.length ?? 0;
				text = text.replace(codeBlockPattern, " [code block omitted] ");
			}

			// 3. Collapse markdown links. Images go first so `![alt](url)` is
			//    removed whole instead of being collapsed to "!alt" by the link
			//    rule — alt text is usually decorative and rarely worth speaking.
			if (collapseLinks) {
				text = text.replace(/!\[[^\]]*\]\([^)]+\)/g, "");
				text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, (_full, linkText: string) => {
					stats.linksCollapsed++;
					return linkText;
				});
			}

			// 4. Strip HTML tags that occasionally leak in from doc comments.
			text = text.replace(/<\/?[a-zA-Z][^>]*>/g, " ");

			// 5. Strip raw URLs. Stop at whitespace or `)` so a URL inside a
			//    parenthetical aside like "(see https://x.dev/p)" does not
			//    swallow the closing paren.
			text = text.replace(/https?:\/\/[^\s)]+/g, " [link omitted] ");

			// 6. Drop markdown emphasis markers, keeping the inner text:
			//    "**bold** *italic*" → "bold italic". Double markers are handled
			//    before single ones so `**x**` is not half-consumed.
			text = text.replace(/\*\*([^*]+)\*\*/g, "$1");
			text = text.replace(/__([^_]+)__/g, "$1");
			text = text.replace(/\*([^*\n]+)\*/g, "$1");
			text = text.replace(/_([^_\n]+)_/g, "$1");

			// 7. Headings: drop the `#` markers and end the heading with a
			//    period so it reads as its own sentence. "# Hello" → "Hello.".
			text = text.replace(/^#{1,6}\s+(.+?)$/gm, "$1.");

			// 8. Strip blockquote markers ("> quoted text" → "quoted text").
			text = text.replace(/^>\s+/gm, "");

			// 9. Strip horizontal rules (a line of three or more -, * or _).
			text = text.replace(/^[-*_]{3,}$/gm, "");

			// 10. Strip leading bullet markers (-, *, +) with optional indent.
			//     Each item keeps its newline so items stay on separate lines.
			text = text.replace(/^[ \t]*[-*+][ \t]+/gm, "");

			// 11. Inline code spans: keep the inner text but drop the backticks.
			text = text.replace(/`([^`\n]+)`/g, "$1");

			// 12. Collapse runs of spaces/tabs to one space and cap blank-line
			//     runs at a single paragraph break; single newlines are kept.
			text = text.replace(/[ \t]+/g, " ");
			text = text.replace(/\n{3,}/g, "\n\n");
			text = text.trim();

			stats.finalChars = text.length;

			if (!text) {
				return { skipped: true, text: "", reason: "empty after stripping", stats };
			}

			if (text.length > maxChars) {
				return {
					skipped: true,
					text: "",
					reason: `text length (${text.length}) exceeds maxChars (${maxChars})`,
					stats,
				};
			}

			return { skipped: false, text, stats };
		}

		/**
		 * Minimal cleanup for the manual `/voice-speak <text>` path.
		 *
		 * Collapses each run of spaces/tabs into one space and trims both
		 * ends. Newlines inside the text, code, links and ANSI sequences are
		 * left exactly as typed.
		 *
		 * @param input Text the user typed; any non-string yields "".
		 * @returns The lightly normalized text.
		 */
		export function lightNormalize(input: string): string {
			if (typeof input === "string") {
				const collapsed = input.replace(/[ \t]+/g, " ");
				return collapsed.trim();
			}
			return "";
		}

		// ─── BCP-47 normalization ─────────────────────────────────────────────────────

		/**
		 * Canonicalize the casing of a BCP-47-ish language tag.
		 *
		 * Underscores are treated as hyphens and empty subtags are dropped.
		 * Casing follows the RFC 5646 §2.1.1 convention:
		 * - first subtag (primary language): lowercase — `en`, `zh`
		 * - 4-letter subtag (script): Title-case — `Hant`, `Latn`
		 * - 2-letter or 3-digit subtag (region): UPPERCASE — `US`, `BR`, `419`
		 * - anything else (variants etc.): lowercase — `rozaj`, `1996`
		 * - every subtag from a singleton onward (a one-character subtag such
		 *   as `u` or `x`, opening an extension / private-use section):
		 *   lowercase — `de-DE-u-co-phonebk`, not `de-DE-u-CO-phonebk`
		 *
		 * Accepted inputs include `en`, `en-US`, `en_US`, `EN-us`, `pt-br`,
		 * `zh_CN`, `zh-Hant-TW`, `sl-rozaj`. No subtag is ever dropped; only
		 * separators and casing change.
		 *
		 * @param tag Language tag in any casing, with `-` or `_` separators.
		 * @returns The canonically cased tag, or "" for empty / non-string input.
		 */
		export function normalizeBCP47(tag: string): string {
			if (typeof tag !== "string" || !tag) return "";
			const subtags = tag.replace(/_/g, "-").split("-").filter(Boolean);

			const out: string[] = [];
			// Once a singleton has been seen, script/region casing no longer
			// applies: extension and private-use subtags are all lowercase.
			let afterSingleton = false;
			for (const [index, sub] of subtags.entries()) {
				if (sub.length === 1) afterSingleton = true;

				if (index === 0 || afterSingleton) {
					out.push(sub.toLowerCase());
				} else if (/^[A-Za-z]{4}$/.test(sub)) {
					// Script subtag: Title case (e.g. Hant, Latn, Cyrl)
					out.push(sub.charAt(0).toUpperCase() + sub.slice(1).toLowerCase());
				} else if (/^[A-Za-z]{2}$/.test(sub) || /^\d{3}$/.test(sub)) {
					// Region subtag: 2-letter alpha or 3-digit UN M.49 → uppercase
					out.push(sub.toUpperCase());
				} else {
					// Variant subtag: lowercase
					out.push(sub.toLowerCase());
				}
			}
			return out.join("-");
		}

		/**
		 * Return the primary language subtag of a tag, after canonicalizing it
		 * with `normalizeBCP47`: `en-US` → `en`, `zh_Hant_TW` → `zh`. A tag
		 * without a hyphen is returned in its normalized form; empty input
		 * yields "".
		 */
		export function baseLanguage(tag: string): string {
			const [primary = ""] = normalizeBCP47(tag).split("-");
			return primary;
		}

+10

-0

extensions/voice/config.ts

		@@ -84,2 +84,8 @@ import * as fs from "node:fs";
		ttsDeepgramStreaming?: boolean;
		/**
		* Set to true after `tts-onboarding.maybeShowTtsOnboarding()` has
		* shown its first-run hint, so subsequent /voice-speak-toggle calls
		* don't re-spam the same notification. New in v7.0.0.
		*/
		ttsOnboardingShown?: boolean;
		}
		@@ -118,2 +124,3 @@
		ttsDeepgramStreaming: false,
		ttsOnboardingShown: false,
		onboarding: {
		@@ -204,2 +211,5 @@ completed: false,
		: DEFAULT_CONFIG.ttsDeepgramStreaming,
		ttsOnboardingShown: typeof rawVoice.ttsOnboardingShown === "boolean"
		? rawVoice.ttsOnboardingShown
		: false,
		onboarding: normalizeOnboarding(rawVoice.onboarding, fallbackCompleted),
		@@ -206,0 +216,0 @@ };

+378

-18

extensions/voice/settings-panel.ts

		@@ -27,2 +27,12 @@ /**
		import { getFreeDiskSpace, formatBytes, getModelsDir, scanHandyModels, importHandyModel } from "./model-download";
		import {
		TTS_LOCAL_MODELS as TTS_LOCAL_MODELS_REF,
		isTtsModelInstalled as TTS_INSTALLED_CHECK_REF,
		type TtsLocalModelInfo,
		type TtsVoice,
		} from "./tts-local-models";
		import {
		DEEPGRAM_TTS_VOICES,
		filterDeepgramVoicesByLanguage,
		} from "./tts-deepgram";

		@@ -38,2 +48,3 @@ // ─── Types ────────────────────────────────────────────────────────────────────
		\| { type: "speak-test" }
		\| { type: "tts-install"; modelId: string }
		\| undefined;
		@@ -89,3 +100,3 @@
		private row = 0;
		private sub: "main" \| "lang-picker" = "main";
		private sub: "main" \| "lang-picker" \| "tts-model-picker" \| "tts-voice-picker" = "main";

		@@ -107,2 +118,10 @@ // Models tab — grouped view

		// TTS model sub-picker (Speak tab → Model row)
		private ttsModelSearch = "";
		private ttsModelRow = 0;

		// TTS voice sub-picker (Speak tab → Voice row)
		private ttsVoiceSearch = "";
		private ttsVoiceRow = 0;

		// Two-step delete on the Downloaded tab. When `x` is pressed, set the
		@@ -164,2 +183,10 @@ // pending modelId + expiry timestamp; a second `x` within DELETE_CONFIRM_MS
		}
		if (this.sub === "tts-model-picker") {
		lines.push(...this.renderTtsModelPicker(w, iw).map(t));
		return lines;
		}
		if (this.sub === "tts-voice-picker") {
		lines.push(...this.renderTtsVoicePicker(w, iw).map(t));
		return lines;
		}

		@@ -194,2 +221,10 @@ // Tab content
		}
		if (this.sub === "tts-model-picker") {
		this.handleTtsModelInput(data);
		return;
		}
		if (this.sub === "tts-voice-picker") {
		this.handleTtsVoiceInput(data);
		return;
		}

		@@ -516,8 +551,24 @@ const tabId = TAB_IDS[this.tab]!;

		// Five rows mirroring the General tab pattern:
		// Always-visible status line — single source of truth for the
		// current TTS configuration so the user can scan it without
		// reading every row.
		const statusParts: string[] = [];
		statusParts.push(isLocal ? "Local" : "Deepgram");
		statusParts.push(this.formatActiveModelOrVoice(config));
		statusParts.push(`${(config.ttsSpeed ?? 1.0).toFixed(2)}×`);
		const lang = config.ttsLanguage \|\| config.language \|\| "en";
		statusParts.push(lang.toUpperCase());
		const statusBar = config.ttsEnabled
		? this.success("● ") + statusParts.join(this.dim(" · "))
		: this.dim("● disabled · ") + statusParts.join(this.dim(" · "));
		lines.push(` ${statusBar}`);
		lines.push("");

		// Six rows:
		// 0: Enabled toggle
		// 1: Backend toggle
		// 2: Voice (numeric sid for local; Deepgram model id otherwise)
		// 3: Speed (cycles 0.5/1.0/1.25/1.5/2.0)
		// 4: Test (synthesizes "The quick brown fox …")
		// 2: Model picker (local) or read-only label (deepgram)
		// 3: Voice picker (numeric sid for local; Aura voice id for deepgram)
		// 4: Speed (cycles 0.5 / 0.75 / 1.0 / 1.25 / 1.5 / 2.0)
		// 5: Test (synthesizes "The quick brown fox …")
		const rows: { label: string; value: string; hint?: string }[] = [
		@@ -537,12 +588,16 @@ {
		{
		label: "Voice",
		label: "Model",
		value: isLocal
		? `sid ${typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0}`
		+ ` (${config.ttsLocalModel ?? "kitten-nano-en-v0_2"})`
		: (config.ttsDeepgramVoiceId ?? "aura-asteria-en"),
		hint: "edit in settings.json",
		? this.formatLocalModelLabel(config.ttsLocalModel)
		: this.dim("(deepgram backend — pick a voice instead)"),
		hint: isLocal ? "pick model ›" : undefined,
		},
		{
		label: "Voice",
		value: this.formatVoiceLabel(config),
		hint: "pick voice ›",
		},
		{
		label: "Speed",
		value: `${(config.ttsSpeed ?? 1.0).toFixed(2)}x`,
		value: `${(config.ttsSpeed ?? 1.0).toFixed(2)}×`,
		hint: "cycle",
		@@ -572,2 +627,38 @@ },

		/**
		 * Status-bar helper: one short string naming the active model and voice.
		 *
		 * Local backend: "<model name> · <voice name>", falling back to the raw
		 * model id and/or "sid <n>" when the catalog has no matching entry.
		 * Any other backend: the configured Deepgram voice id (default
		 * "aura-asteria-en").
		 */
		private formatActiveModelOrVoice(config: VoiceConfig): string {
			if ((config.ttsBackend ?? "local") !== "local") {
				return config.ttsDeepgramVoiceId ?? "aura-asteria-en";
			}

			const activeModelId = config.ttsLocalModel ?? "kitten-nano-en-v0_2";
			const activeSid = typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0;

			// Keep the status compact; the rows below carry the full labels.
			const entry = TTS_LOCAL_MODELS_REF.find(m => m.id === activeModelId);
			const displayName = entry?.name ?? activeModelId;
			const matchedVoice = entry?.voices.find(v => v.sid === activeSid);
			const voicePart = matchedVoice ? matchedVoice.name : `sid ${activeSid}`;
			return `${displayName} · ${voicePart}`;
		}

		/**
		 * Label for the Speak tab's "Model" row.
		 *
		 * @param id Configured local model id; `undefined` means the default
		 *           "kitten-nano-en-v0_2".
		 * @returns "<name> (<size>)" followed by a check mark when the model is
		 *          installed or a download notice when it is not; the bare id
		 *          when the catalog has no such model.
		 */
		private formatLocalModelLabel(id: string | undefined): string {
			const modelId = id ?? "kitten-nano-en-v0_2";
			const model = TTS_LOCAL_MODELS_REF.find(m => m.id === modelId);
			if (!model) return modelId;
			const installed = TTS_INSTALLED_CHECK_REF(modelId);
			const installedTag = installed ? this.success(" ✓") : this.warning(" ⬇ download on select");
			return `${model.name} (${model.size})${installedTag}`;
		}

		/**
		 * Label for the Speak tab's "Voice" row.
		 *
		 * Local backend: "<voice name> (sid <n>)" when the active model lists
		 * that speaker id, otherwise just "sid <n>". Any other backend: the
		 * configured Deepgram voice id (default "aura-asteria-en").
		 */
		private formatVoiceLabel(config: VoiceConfig): string {
			const backend = config.ttsBackend ?? "local";
			if (backend !== "local") {
				return config.ttsDeepgramVoiceId ?? "aura-asteria-en";
			}

			const activeModelId = config.ttsLocalModel ?? "kitten-nano-en-v0_2";
			const activeSid = typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0;
			const matchedVoice = TTS_LOCAL_MODELS_REF
				.find(m => m.id === activeModelId)
				?.voices.find(v => v.sid === activeSid);

			if (!matchedVoice) return `sid ${activeSid}`;
			return `${matchedVoice.name} (sid ${activeSid})`;
		}

		// ─── Device tab ───────────────────────────────────────────────────────
		@@ -777,8 +868,11 @@
		break;
		case 2: // Voice — v6.0 ships read-only with edit-in-config hint
		// In v6.1 this opens an inline picker; for now the
		// recommended path is /voice-settings → close → edit
		// settings.json directly.
		case 2: // Model picker (local only — Deepgram has no model concept)
		if ((config.ttsBackend ?? "local") === "local") {
		this.openTtsModelPicker();
		}
		break;
		case 3: { // Speed cycle
		case 3: // Voice picker
		this.openTtsVoicePicker();
		break;
		case 4: { // Speed cycle
		const ladder = [0.75, 1.0, 1.25, 1.5, 2.0, 0.5];
		@@ -791,3 +885,3 @@ const current = config.ttsSpeed ?? 1.0;
		}
		case 4: { // Test — emit a special panel-close action so the
		case 5: { // Test — emit a special panel-close action so the
		// caller (voice.ts:openSettingsPanel) can route it to
		@@ -868,2 +962,268 @@ // /voice-speak-test without us depending on the

		// ─── TTS Model picker ──────────────────────────────────────────────────

		/**
		 * TTS catalog entries matching the model picker's search box.
		 *
		 * The query is trimmed and compared case-insensitively against each
		 * entry's name, id, notes and language list. A blank query returns the
		 * catalog array itself (not a copy). Recomputed on every call so the
		 * list follows the search text as it changes.
		 */
		private getFilteredTtsModels(): TtsLocalModelInfo[] {
			const needle = this.ttsModelSearch.trim().toLowerCase();
			if (needle.length === 0) return TTS_LOCAL_MODELS_REF;

			const matchesNeedle = (m: TtsLocalModelInfo): boolean => {
				const haystack = `${m.name} ${m.id} ${m.notes} ${m.languages.join(" ")}`.toLowerCase();
				return haystack.includes(needle);
			};
			return TTS_LOCAL_MODELS_REF.filter(matchesNeedle);
		}

		/**
		 * Switch to the TTS model sub-picker: clear the search text and put
		 * the cursor on the currently configured model (first row when that
		 * model is not in the catalog).
		 */
		private openTtsModelPicker(): void {
			this.ttsModelSearch = "";

			const activeId = this.p.config.ttsLocalModel ?? "kitten-nano-en-v0_2";
			const position = TTS_LOCAL_MODELS_REF.findIndex(m => m.id === activeId);
			// findIndex yields -1 on a miss; clamp that to the first row.
			this.ttsModelRow = Math.max(0, position);

			this.sub = "tts-model-picker";
		}

		/**
		 * Render the TTS model sub-picker.
		 *
		 * Output: a title, the search line, then up to 12 catalog rows in a
		 * window around the selection (name, size, language summary, status),
		 * with the selected row's notes on an extra line, an optional
		 * "showing a–b of n" line when the list is windowed, and a key-hint
		 * footer.
		 *
		 * @param _w Unused.
		 * @param iw Width used only to size the name column.
		 * @returns The rendered lines.
		 */
		private renderTtsModelPicker(_w: number, iw: number): string[] {
			const lines: string[] = [];
			const currentId = this.p.config.ttsLocalModel ?? "kitten-nano-en-v0_2";
			const filtered = this.getFilteredTtsModels();

			lines.push(` ${this.bold("Pick TTS model")}`);
			const cursor = this.ttsModelSearch ? this.ttsModelSearch : this.dim("type to filter…");
			lines.push(` ${this.dim("Search:")} ${cursor}`);
			lines.push("");

			if (filtered.length === 0) {
				lines.push(this.dim(" No matching models."));
				lines.push("");
				lines.push(this.dim(" esc back type to filter"));
				return lines;
			}

			// Viewport window around the selection: start half a window above
			// it, then pull the start back if the window runs past the end.
			const maxVisible = 12;
			const total = filtered.length;
			const sel = Math.min(this.ttsModelRow, total - 1);
			let start = Math.max(0, sel - Math.floor(maxVisible / 2));
			const end = Math.min(start + maxVisible, total);
			if (end - start < maxVisible) start = Math.max(0, end - maxVisible);

			const nameW = Math.min(28, Math.max(18, iw - 32));
			for (let i = start; i < end; i++) {
				const m = filtered[i]!;
				const isSelected = i === sel;
				const isCurrent = m.id === currentId;
				const installed = TTS_INSTALLED_CHECK_REF(m.id);
				const prefix = isSelected ? this.accent(" → ") : " ";
				const name = isSelected ? this.accent(m.name) : m.name;
				// Pad using the plain name length; the styled string may be longer.
				const namePad = m.name.length < nameW ? " ".repeat(nameW - m.name.length) : "";
				const size = this.dim(m.size.padStart(8));
				// An entry with an empty language list renders a blank column
				// instead of throwing.
				const langs = this.dim(m.languages.length > 1
					? `${m.languages.length} langs`.padEnd(13)
					: (m.languages[0] ?? "").padEnd(13));
				const status = isCurrent
					? this.success("active")
					: installed
						? this.success("ready")
						: this.warning("⬇ download");
				lines.push(`${prefix}${name}${namePad} ${size} ${langs} ${status}`);
				if (isSelected) {
					lines.push(` ${this.dim(m.notes)}`);
				}
			}

			if (start > 0 || end < total) {
				lines.push(this.dim(` showing ${start + 1}–${end} of ${total}`));
			}
			lines.push("");
			const selectedModel = filtered[sel];
			if (selectedModel) {
				const installed = TTS_INSTALLED_CHECK_REF(selectedModel.id);
				const enterHint = installed ? "activate" : `download (${selectedModel.size}) + activate`;
				lines.push(this.dim(` ↵ ${enterHint} esc back type to filter`));
			} else {
				lines.push(this.dim(" esc back"));
			}
			return lines;
		}

		/**
		 * Key handler for the TTS model sub-picker.
		 *
		 * - escape: back to the main view without changes
		 * - up / down: move the cursor, wrapping at either end
		 * - enter: make the highlighted model active, save, return to the
		 *   main view, and — if the model is not installed — close the panel
		 *   with a `tts-install` action
		 * - backspace / printable ASCII: edit the search text and reset the
		 *   cursor to the first row
		 */
		private handleTtsModelInput(data: string): void {
			if (matchesKey(data, Key.escape)) {
				this.sub = "main";
				return;
			}
			const filtered = this.getFilteredTtsModels();
			if (matchesKey(data, Key.up)) {
				if (filtered.length > 0) {
					this.ttsModelRow = this.ttsModelRow === 0 ? filtered.length - 1 : this.ttsModelRow - 1;
				}
				return;
			}
			if (matchesKey(data, Key.down)) {
				if (filtered.length > 0) {
					this.ttsModelRow = this.ttsModelRow === filtered.length - 1 ? 0 : this.ttsModelRow + 1;
				}
				return;
			}
			if (matchesKey(data, Key.enter)) {
				const m = filtered[this.ttsModelRow];
				if (!m) return;
				const previousId = this.p.config.ttsLocalModel ?? "kitten-nano-en-v0_2";
				this.p.config.ttsLocalModel = m.id;
				// Reset the voice only when the model actually changes:
				// re-selecting the active model must not discard the user's
				// chosen voice.
				if (previousId !== m.id) {
					this.p.config.ttsLocalVoiceId = m.defaultSid;
				}
				this.save();
				this.sub = "main";
				// A missing model is reported to the caller through the panel
				// close action so the download can run outside the panel.
				if (!TTS_INSTALLED_CHECK_REF(m.id)) {
					this.onClose?.({ type: "tts-install", modelId: m.id });
				}
				return;
			}
			if (matchesKey(data, Key.backspace)) {
				this.ttsModelSearch = this.ttsModelSearch.slice(0, -1);
				this.ttsModelRow = 0;
				return;
			}
			// Single printable ASCII character: append to the search text.
			if (data.length === 1 && data >= " " && data <= "~") {
				this.ttsModelSearch += data;
				this.ttsModelRow = 0;
			}
		}

		// ─── TTS Voice picker ──────────────────────────────────────────────────

		/**
		 * Voices selectable for the current backend, in picker-row shape.
		 *
		 * Local backend: the voices of the configured model (numeric `sid` as
		 * id), or an empty list when the model is not in the catalog.
		 * Otherwise: the Deepgram voices for the current language, or the
		 * whole Deepgram list when the language filter matches nothing.
		 */
		private getCurrentVoiceCatalog(): { id: string | number; label: string; meta?: string }[] {
			const { config } = this.p;
			const isLocal = (config.ttsBackend ?? "local") === "local";
			if (isLocal) {
				const modelId = config.ttsLocalModel ?? "kitten-nano-en-v0_2";
				const model = TTS_LOCAL_MODELS_REF.find(m => m.id === modelId);
				if (!model) return [];
				return model.voices.map((v: TtsVoice) => ({
					id: v.sid,
					label: v.name,
					meta: v.gender,
				}));
			}
			// Deepgram: filter voices by current language for relevance.
			// `||` (not `??`) so an empty-string language also falls through.
			const lang = config.ttsLanguage || config.language || "en";
			const filtered = filterDeepgramVoicesByLanguage(lang);
			const list = filtered.length > 0 ? filtered : DEEPGRAM_TTS_VOICES;
			return list.map(v => ({ id: v.id, label: v.name, meta: v.gender }));
		}

		/**
		 * Voices from `getCurrentVoiceCatalog()` matching the voice picker's
		 * search box. The query is trimmed and compared case-insensitively
		 * against each voice's label, meta and id; a blank query returns the
		 * full list.
		 */
		private getFilteredTtsVoices(): { id: string | number; label: string; meta?: string }[] {
			const all = this.getCurrentVoiceCatalog();
			const q = this.ttsVoiceSearch.trim().toLowerCase();
			if (!q) return all;
			return all.filter(v => `${v.label} ${v.meta ?? ""} ${v.id}`.toLowerCase().includes(q));
		}

		/**
		 * Switch to the TTS voice sub-picker: clear the search text and put
		 * the cursor on the currently configured voice for the active backend
		 * (first row when that voice is not in the list).
		 */
		private openTtsVoicePicker(): void {
			this.ttsVoiceSearch = "";
			const all = this.getCurrentVoiceCatalog();
			const { config } = this.p;
			const isLocal = (config.ttsBackend ?? "local") === "local";
			// Local voices are identified by numeric sid, Deepgram voices by
			// string id.
			const currentId: string | number = isLocal
				? (typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0)
				: (config.ttsDeepgramVoiceId ?? "aura-asteria-en");
			const idx = all.findIndex(v => v.id === currentId);
			this.ttsVoiceRow = idx >= 0 ? idx : 0;
			this.sub = "tts-voice-picker";
		}

		/**
		 * Render the TTS voice sub-picker.
		 *
		 * Output: a backend-specific title, the search line, then up to 12
		 * voice rows in a window around the selection (label, optional meta,
		 * id, and a check mark on the configured voice), an optional
		 * "showing a–b of n" line when the list is windowed, and a key-hint
		 * footer.
		 *
		 * @param _w  Unused.
		 * @param _iw Unused.
		 * @returns The rendered lines.
		 */
		private renderTtsVoicePicker(_w: number, _iw: number): string[] {
			const lines: string[] = [];
			const filtered = this.getFilteredTtsVoices();
			const { config } = this.p;
			const isLocal = (config.ttsBackend ?? "local") === "local";
			const currentId: string | number = isLocal
				? (typeof config.ttsLocalVoiceId === "number" ? config.ttsLocalVoiceId : 0)
				: (config.ttsDeepgramVoiceId ?? "aura-asteria-en");

			lines.push(` ${this.bold(isLocal ? "Pick local voice" : "Pick Deepgram voice")}`);
			const cursor = this.ttsVoiceSearch ? this.ttsVoiceSearch : this.dim("type to filter…");
			lines.push(` ${this.dim("Search:")} ${cursor}`);
			lines.push("");

			if (filtered.length === 0) {
				lines.push(this.dim(" No matching voices."));
				lines.push("");
				lines.push(this.dim(" esc back type to filter"));
				return lines;
			}

			// Viewport window around the selection: start half a window above
			// it, then pull the start back if the window runs past the end.
			const maxVisible = 12;
			const total = filtered.length;
			const sel = Math.min(this.ttsVoiceRow, total - 1);
			let start = Math.max(0, sel - Math.floor(maxVisible / 2));
			const end = Math.min(start + maxVisible, total);
			if (end - start < maxVisible) start = Math.max(0, end - maxVisible);

			for (let i = start; i < end; i++) {
				const v = filtered[i]!;
				const isSelected = i === sel;
				const isCurrent = v.id === currentId;
				const prefix = isSelected ? this.accent(" → ") : " ";
				// Numeric ids are local speaker ids; string ids are shown as-is.
				const idStr = typeof v.id === "number" ? `sid ${v.id}` : v.id;
				const text = isSelected ? this.accent(v.label) : v.label;
				const meta = v.meta ? this.dim(` (${v.meta})`) : "";
				const idTag = this.dim(` — ${idStr}`);
				const check = isCurrent ? this.success(" ✓") : "";
				lines.push(`${prefix}${text}${meta}${idTag}${check}`);
			}

			if (start > 0 || end < total) {
				lines.push(this.dim(` showing ${start + 1}–${end} of ${total}`));
			}
			lines.push("");
			lines.push(this.dim(" ↵ select esc back type to filter"));
			return lines;
		}

		/**
		 * Key handler for the TTS voice sub-picker.
		 *
		 * - escape: back to the main view without changes
		 * - up / down: move the cursor, wrapping at either end
		 * - enter: store the highlighted voice in the field matching the
		 *   active backend (numeric id → local voice, string id → Deepgram
		 *   voice), save, and return to the main view
		 * - backspace / printable ASCII: edit the search text and reset the
		 *   cursor to the first row
		 */
		private handleTtsVoiceInput(data: string): void {
			if (matchesKey(data, Key.escape)) {
				this.sub = "main";
				return;
			}

			const visible = this.getFilteredTtsVoices();
			const count = visible.length;

			if (matchesKey(data, Key.up)) {
				if (count > 0) {
					this.ttsVoiceRow = this.ttsVoiceRow !== 0 ? this.ttsVoiceRow - 1 : count - 1;
				}
				return;
			}

			if (matchesKey(data, Key.down)) {
				if (count > 0) {
					this.ttsVoiceRow = this.ttsVoiceRow !== count - 1 ? this.ttsVoiceRow + 1 : 0;
				}
				return;
			}

			if (matchesKey(data, Key.enter)) {
				const picked = visible[this.ttsVoiceRow];
				if (!picked) return;

				const settings = this.p.config;
				const usingLocal = (settings.ttsBackend ?? "local") === "local";
				const chosenId = picked.id;
				// An id whose type does not fit the backend is ignored; the
				// panel still saves and returns to the main view.
				if (usingLocal) {
					if (typeof chosenId === "number") settings.ttsLocalVoiceId = chosenId;
				} else if (typeof chosenId === "string") {
					settings.ttsDeepgramVoiceId = chosenId;
				}
				this.save();
				this.sub = "main";
				return;
			}

			if (matchesKey(data, Key.backspace)) {
				this.ttsVoiceSearch = this.ttsVoiceSearch.slice(0, -1);
				this.ttsVoiceRow = 0;
				return;
			}

			// Single printable ASCII character: append to the search text.
			const isPrintable = data.length === 1 && data >= " " && data <= "~";
			if (isPrintable) {
				this.ttsVoiceSearch += data;
				this.ttsVoiceRow = 0;
			}
		}

		// ─── Helpers ──────────────────────────────────────────────────────────
		@@ -880,3 +1240,3 @@
		}
		case "speak": return 5;
		case "speak": return 6;
		case "device": return 0;
		@@ -883,0 +1243,0 @@ }

+53

-3

extensions/voice/tts-engine.ts

		@@ -252,2 +252,43 @@ /**

		// ─── Warmup ──────────────────────────────────────────────────────────────────

		/**
		 * Best-effort background warmup: load the sherpa module, then build
		 * (or fetch) the TTS instance for `model` via `getOrCreateTts`, and
		 * discard the result. Intended to move the cold-start cost off the
		 * user's first `/voice-speak`.
		 *
		 * Cancellation is cooperative only: `signal` is checked before
		 * starting and again after the module load. Once `getOrCreateTts` has
		 * been called it is always awaited to completion; an abort at that
		 * point does not stop it.
		 *
		 * @param model    Catalog entry to warm up.
		 * @param modelDir Directory passed through to `getOrCreateTts`.
		 * @param opts     Optional `signal` for the two abort checkpoints.
		 * @returns true when the instance was created; false when aborted,
		 *          when the module failed to load, or on any thrown error.
		 *          This function never rejects.
		 */
		export async function warmupTts(
			model: TtsLocalModelInfo,
			modelDir: string,
			opts: { signal?: AbortSignal } = {},
		): Promise<boolean> {
			if (opts.signal?.aborted) return false;

			try {
				const sherpaReady = await loadSherpa();
				if (!sherpaReady || opts.signal?.aborted) return false;
				await getOrCreateTts(model, modelDir);
			} catch {
				// Swallowed on purpose so a backgrounded warmup can never
				// surface as an unhandled rejection.
				return false;
			}
			return true;
		}

		// ─── Synthesis ───────────────────────────────────────────────────────────────
		@@ -425,3 +466,3 @@
		const tokens = path.join(modelDir, "tokens.txt");
		const numThreads = getTtsThreads();
		const numThreads = getTtsThreads(model.sherpaSlot);

		@@ -497,7 +538,16 @@ switch (model.sherpaSlot) {
		*/
		function getTtsThreads(): number {
		function getTtsThreads(slot: TtsLocalModelInfo["sherpaSlot"]): number {
		// Core count as reported by os.cpus(); treated as 2 when the list is empty.
		const cpus = os.cpus().length \|\| 2;
		if (cpus <= 2) return 1;
		if (cpus <= 4) return 2;
		return Math.min(4, cpus - 2);
		// Per-slot tuning, mirroring the STT path's TRANSDUCER_MAX_THREADS=6
		// vs the Whisper-class cap of 4. Decisions per sherpa-onnx published
		// RTF curves and #2910 (CoreML regression for transformer graphs):
		//
		// - kitten (Kitten Nano TTS): small model, scales to 4 threads
		// - vits (Piper): single-speaker VITS, scales to 4
		// - kokoro (Kokoro v0.19/v1.0): larger transformer encoder, scales
		// to ~6 P-cores on M-series
		// Only reached with cpus >= 5 (smaller counts return above), so the
		// result is at least 3: two cores are left free, capped at `max`.
		const max = slot === "kokoro" ? 6 : 4;
		return Math.min(max, cpus - 2);
		}
		@@ -504,0 +554,0 @@

+336

-78

extensions/voice/tts-local-models.ts

		@@ -34,2 +34,3 @@ /**
		import { spawn } from "node:child_process";
		import { createHash } from "node:crypto";

		@@ -94,2 +95,11 @@ const TTS_RELEASE = "https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models";
		archiveUrl: string;
		/**
		* Optional SHA-256 hex digest of the archive bytes for integrity
		* verification. When set, ensureTtsModelInstalled() rejects a
		* download whose computed hash differs. v7.0.0 ships catalog entries
		* without hashes (we don't ship known-good values for sherpa-onnx
		* releases yet); the verification pipeline runs anyway and produces
		* a hash that can be pinned in v7.1+ to lock-in the bytes.
		*/
		archiveSha256?: string;
		/** Sample rate (Hz) of generated audio. Drives WAV header on playback. */
		@@ -287,2 +297,120 @@ sampleRate: number;

		// ─── Smart default selection ─────────────────────────────────────────────────

		/**
		* Recommend an initial TTS model based on the user's system locale.
		*
		* Returns ONE catalog entry id — the recommendation, not an installation
		* decision. The caller (onboarding picker, settings panel) presents this
		* as a pre-highlighted suggestion with disclosure of size and language
		* coverage. The user always confirms before download starts.
		*
		* Mapping rules:
		* - English locale (en-*) → kitten-nano-en-v0_2 (smallest, 25 MB)
		* - Single-language Piper match → that Piper voice (~20 MB each)
		* - Locale covered only by Kokoro
		* (ja, ko) → kokoro-int8-multi-lang-v1_0 (126 MB)
		* - Locale with no coverage → kitten-nano-en-v0_2 + warn
		*
		* The single-Piper-match path is preferred over the multilingual Kokoro
		* because Piper is 1/6 the size when only one language is needed. Kokoro
		* is the right pick when the user reads multiple languages OR when no
		* Piper voice exists for their locale.
		*/
		// Result shape returned by recommendDefaultModel().
		export interface SmartDefaultRecommendation {
		/** Catalog id of the recommended TTS model. */
		modelId: string;
		/** Why this model was picked, surfaceable in onboarding UI. */
		reason: string;
		/**
		* True iff no model in the catalog actually covers `locale` — this
		* includes the case where no locale was supplied at all.
		*/
		fallback: boolean;
		}

		/**
		 * Per-language single-Piper-voice mapping — only languages where the
		 * catalog has exactly one Piper voice for the language. Multi-region
		 * languages (en, pt) are intentionally NOT here — those route through
		 * either the en→kitten path or kokoro multilingual.
		 *
		 * Keys are lowercase base language subtags. Always query this table with
		 * an own-property check: it is a plain object, so a bare index with an
		 * arbitrary key would also match inherited `Object.prototype` members.
		 */
		const SINGLE_PIPER_BY_BASE_LANG: Readonly<Record<string, string>> = {
			es: "piper-es_ES-davefx-medium-int8",
			fr: "piper-fr_FR-siwis-medium-int8",
			de: "piper-de_DE-thorsten-medium-int8",
			hi: "piper-hi_IN-pratham-medium-int8",
			zh: "piper-zh_CN-chaowen-medium-int8",
			it: "piper-it_IT-paola-medium-int8",
			ru: "piper-ru_RU-denis-medium-int8",
			ar: "piper-ar_JO-kareem-medium-int8",
			tr: "piper-tr_TR-fahrettin-medium-int8",
			nl: "piper-nl_NL-pim-medium-int8",
		};

		/**
		 * Recommend an initial TTS model for `systemLocale`.
		 *
		 * Accepts POSIX-style (`en_US.UTF-8`, `de_DE@euro`) and BCP 47-style
		 * (`pt-BR`) locale strings; only the first subtag decides the language.
		 * Resolution order: English → Portuguese → single Piper voice →
		 * Kokoro-only languages (ja, ko) → English fallback.
		 *
		 * @param systemLocale Locale string as reported by the environment. Empty,
		 *                     non-string, and the POSIX placeholder locales
		 *                     `C` / `POSIX` are treated as "no locale detected".
		 * @returns The recommended catalog id, a human-readable reason, and
		 *          `fallback: true` when the catalog has no voice for the locale.
		 */
		export function recommendDefaultModel(systemLocale: string): SmartDefaultRecommendation {
			const locale = typeof systemLocale === "string" ? systemLocale.trim() : "";

			// Normalize: lowercase first subtag, e.g. "en_US.UTF-8" → "en".
			// "@" is included so modifier-only forms like "de@euro" still resolve.
			const base = locale.split(/[-_.@]/)[0]!.toLowerCase();

			// "C" and "POSIX" are the unconfigured-system placeholders, not languages.
			if (!locale || base === "c" || base === "posix") {
				return {
					modelId: DEFAULT_TTS_MODEL,
					reason: "No system locale detected — defaulting to the smallest English model.",
					fallback: true,
				};
			}

			// English locales — Kitten Nano is the smallest viable English TTS
			// at 25 MB, and we ship it as the catalog default for first-run
			// experience reasons.
			if (base === "en") {
				return {
					modelId: DEFAULT_TTS_MODEL,
					reason: `English locale detected — recommending ${DEFAULT_TTS_MODEL} (25 MB, 8 voices).`,
					fallback: false,
				};
			}

			// Special-case Portuguese: catalog has Brazilian-only Piper. We pick
			// pt-BR for any pt-* locale and surface the regional gap in the
			// reason — region-strict matching at speak time will warn if the
			// user explicitly types pt-PT.
			if (base === "pt") {
				return {
					modelId: "piper-pt_BR-cadu-medium-int8",
					reason: `Portuguese locale detected — recommending Brazilian Portuguese voice (${
						locale.toLowerCase().includes("br") ? "exact match" : "closest available"
					}, 20 MB).`,
					fallback: false,
				};
			}

			// Single-language Piper match. Own-property check so that inputs such
			// as "constructor" or "__proto__" cannot resolve to prototype members.
			const single = Object.prototype.hasOwnProperty.call(SINGLE_PIPER_BY_BASE_LANG, base)
				? SINGLE_PIPER_BY_BASE_LANG[base]
				: undefined;
			if (single) {
				return {
					modelId: single,
					reason: `${base.toUpperCase()} locale detected — recommending ${single} (~20 MB).`,
					fallback: false,
				};
			}

			// Languages covered only by Kokoro multilingual (ja, ko)
			if (base === "ja" || base === "ko") {
				return {
					modelId: "kokoro-int8-multi-lang-v1_0",
					reason: `${base.toUpperCase()} locale detected — recommending Kokoro multilingual (126 MB, ` +
						`covers en/zh/ja/ko/es/fr/hi/it/pt in one model).`,
					fallback: false,
				};
			}

			// No coverage — fall back to English default with a warning the
			// caller can surface verbatim.
			return {
				modelId: DEFAULT_TTS_MODEL,
				reason: `Locale ${locale} has no built-in TTS voice. Falling back to English (${DEFAULT_TTS_MODEL}). ` +
					`Browse /voice-settings → Speak tab → Models for the full catalog.`,
				fallback: true,
			};
		}

		/** Look up a model by id; throws if unknown so callers fail loudly. */
		@@ -415,2 +543,8 @@ export function getTtsModel(id: string): TtsLocalModelInfo {
		export interface TtsInstallProgress {
		/**
		* - "download" — fetching archive bytes (with phase totals)
		* - "extract" — running tar over the saved archive
		* - "verify" — hashing the downloaded archive (SHA-256) and comparing it with the pinned hash, if any; emitted before "extract"
		* - "done" — install complete
		*/
		phase: "download" \| "extract" \| "verify" \| "done";
		@@ -422,9 +556,30 @@ bytes?: number;
		/**
		* Result returned alongside install completion — exposes the computed
		* SHA-256 so callers (and v7.1+ catalog updates) can pin known-good hashes.
		*/
		export interface TtsInstallResult {
		dir: string;
		archiveSha256: string;
		}

		/**
		* Download and extract `modelId` if not already installed. Idempotent —
		* if already installed, resolves immediately. The download is streamed
		* directly to `tar -xj` so we never buffer the whole archive in memory.
		* if already installed, resolves immediately.
		*
		* The flow is download-to-disk-then-extract, not streaming-to-tar:
		* 1. Resume-aware fetch → write archive bytes to
		* `~/.pi/models/tts/<id>.partial.tar.bz2`. If the partial file
		* exists from a prior interrupted run, send `Range: bytes=N-` and
		* append. SHA-256 is computed across the full file by re-reading
		* it once on completion (cheap — ~200ms for 126 MB on M-series).
		* 2. If the catalog entry has `archiveSha256`, compare against the
		* computed hash. Mismatch → reject + cleanup partial.
		* 3. `tar -xj -f <archive> -C <stagingDir>` to extract.
		* 4. Move staging contents to final `<modelDir>` via rename (atomic).
		* 5. Delete the archive file.
		*
		* Errors:
		* - "Network error: ..." on fetch failure
		* - "Download failed: HTTP <status>" on non-2xx
		* - "Download failed: HTTP <status>" on non-2xx (and not 206/200 retry)
		* - "Network error: <message>" on fetch failure
		* - "Archive integrity check failed: ..." on SHA-256 mismatch
		* - "tar exited with code N" on extraction failure
		@@ -439,3 +594,3 @@ * - DOMException("AbortError") if signal fires
		} = {},
		): Promise<string> {
		): Promise<TtsInstallResult> {
		const model = getTtsModel(modelId);
		@@ -446,97 +601,200 @@ const dir = getTtsModelDir(modelId);
		opts.onProgress?.({ phase: "done" });
		return dir;
		return { dir, archiveSha256: model.archiveSha256 ?? "" };
		}
		if (opts.signal?.aborted) throw makeAbortErr();

		// Download to a temp file under tts/ then extract → atomic-ish:
		// partial extracts go into a temp dir that we rename on success, so
		// `isTtsModelInstalled` never sees a half-extracted state.
		const ttsDir = getTtsModelsDir();
		fs.mkdirSync(ttsDir, { recursive: true });
		const stagingDir = `${dir}.staging-${process.pid}`;
		fs.mkdirSync(stagingDir, { recursive: true });
		const archivePath = path.join(ttsDir, `${modelId}.partial.tar.bz2`);

		try {
		opts.onProgress?.({ phase: "download" });
		const res = await fetch(model.archiveUrl, { signal: opts.signal });
		if (!res.ok \|\| !res.body) {
		throw new Error(`Download failed: HTTP ${res.status} from ${model.archiveUrl}`);
		// Phase 1 — download archive bytes (with resume).
		await downloadArchive(model.archiveUrl, archivePath, opts);
		if (opts.signal?.aborted) throw makeAbortErr();

		// Phase 2 — verify hash.
		opts.onProgress?.({ phase: "verify" });
		const computedSha256 = await sha256OfFile(archivePath, opts.signal);
		if (model.archiveSha256 && model.archiveSha256.toLowerCase() !== computedSha256.toLowerCase()) {
		throw new Error(
		`Archive integrity check failed for ${modelId}: ` +
		`expected ${model.archiveSha256}, got ${computedSha256}. ` +
		`Delete ${archivePath} and retry, or check for a corrupted upstream release.`,
		);
		}

		// Phase 3 — extract.
		opts.onProgress?.({ phase: "extract", totalBytes: model.sizeBytes });
		// Stream the archive directly into `tar -xj -C <stagingDir>`. This
		// works on macOS, Linux, and Windows 10+ (which ships bsdtar).
		// Using stdin keeps the archive bytes off disk — important for
		// a 126 MB Kokoro download.
		const tar = spawn("tar", ["-xj", "-C", stagingDir], {
		stdio: ["pipe", "ignore", "pipe"],
		...(opts.signal ? { signal: opts.signal } : {}),
		});
		let tarStderr = "";
		tar.stderr?.on("data", (d: Buffer) => {
		if (tarStderr.length < 1024) tarStderr += d.toString();
		});
		const stagingDir = `${dir}.staging-${process.pid}`;
		fs.mkdirSync(stagingDir, { recursive: true });
		try {
		await runTarExtract(archivePath, stagingDir, opts.signal);

		const tarExit = new Promise<void>((resolve, reject) => {
		tar.on("error", (err: NodeJS.ErrnoException) => {
		if (err.name === "AbortError" \|\| opts.signal?.aborted) reject(makeAbortErr());
		else reject(new Error(`tar failed to start: ${err.message}`));
		});
		tar.on("close", (code, sig) => {
		if (code === 0) resolve();
		else if (opts.signal?.aborted) reject(makeAbortErr());
		else reject(new Error(`tar exited with code ${code}${sig ? ` (${sig})` : ""}: ${tarStderr.trim().slice(-200)}`));
		});
		});

		// Pipe response body → tar stdin. The web ReadableStream needs to
		// be drained chunk-by-chunk; we write each chunk to the tar pipe
		// and surface progress.
		let bytesSeen = 0;
		const reader = res.body.getReader();
		try {
		while (true) {
		if (opts.signal?.aborted) {
		try { tar.kill("SIGTERM"); } catch {}
		throw makeAbortErr();
		}
		const { value, done } = await reader.read();
		if (done) break;
		if (value) {
		bytesSeen += value.byteLength;
		opts.onProgress?.({ phase: "extract", bytes: bytesSeen, totalBytes: model.sizeBytes });
		// Write to tar's stdin — backpressure-respecting via the
		// fact that tar.stdin is a Writable; if it's full this
		// returns false but our small chunk size means it
		// rarely matters.
		tar.stdin?.write(Buffer.from(value));
		}
		}
		// Phase 4 — move into final location. The archive's top-level
		// directory differs per model (e.g.
		// `vits-piper-en_US-lessac-medium-int8/`). Flatten to
		// `<modelDir>/tokens.txt` etc.
		const stagingEntries = fs.readdirSync(stagingDir);
		const innerDir = stagingEntries.length === 1 && fs.statSync(path.join(stagingDir, stagingEntries[0]!)).isDirectory()
		? path.join(stagingDir, stagingEntries[0]!)
		: stagingDir;
		// rename is atomic when innerDir and dir are on the same
		// filesystem (~/.pi/models/tts/.staging is a sibling of dir).
		fs.renameSync(innerDir, dir);
		} finally {
		try { reader.releaseLock(); } catch {}
		try { fs.rmSync(stagingDir, { recursive: true, force: true }); } catch {}
		}
		tar.stdin?.end();
		await tarExit;

		// The archive's top-level directory differs per model (e.g.
		// `vits-piper-en_US-lessac-medium-int8/`). Move its contents
		// up one level so we end up with a flat `<dir>/tokens.txt` etc.
		opts.onProgress?.({ phase: "verify" });
		const stagingEntries = fs.readdirSync(stagingDir);
		const innerDir = stagingEntries.length === 1 && fs.statSync(path.join(stagingDir, stagingEntries[0]!)).isDirectory()
		? path.join(stagingDir, stagingEntries[0]!)
		: stagingDir;
		fs.renameSync(innerDir, dir);
		// Phase 5 — clean up the archive file. Successful install means we
		// no longer need the partial; resume is moot.
		try { fs.unlinkSync(archivePath); } catch {}

		opts.onProgress?.({ phase: "done" });
		return dir;
		return { dir, archiveSha256: computedSha256 };
		} catch (err) {
		// Best-effort cleanup of partial state.
		try { fs.rmSync(stagingDir, { recursive: true, force: true }); } catch {}
		// On failure, leave the partial archive file in place so the
		// next attempt can resume. But clean up the destination dir if
		// extraction created it.
		try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
		throw err;
		}
		}

		/**
		 * Download bytes to `archivePath` with `Range` resume.
		 *
		 * If the file already exists, we send `Range: bytes=<size>-` and handle
		 * the server's answer as follows:
		 * - 206 Partial Content → append to the existing file.
		 * - 200 OK → server ignored the range; overwrite the file from scratch.
		 * - 416 Range Not Satisfiable → nothing lies beyond our offset. When the
		 *   server's `Content-Range: bytes *&#47;<total>` equals the size on disk the
		 *   archive is already complete and we return without writing. Otherwise
		 *   the partial is unusable: delete it and download again without a range.
		 *   (Without this, an archive kept for resume after a failed extract
		 *   would make every retry fail with HTTP 416.)
		 *
		 * Surfaces byte-count progress via `opts.onProgress`. The total byte
		 * count comes from `Content-Length` on the response — for 206 responses
		 * the existing partial size is added so the user sees a continuous
		 * progress bar across resumed sessions.
		 *
		 * @throws Error("Network error: …") when fetch itself fails.
		 * @throws Error("Download failed: …") on a non-2xx status or empty body.
		 * @throws the abort error when `opts.signal` fires.
		 * @throws the underlying fs error when writing the archive fails.
		 */
		async function downloadArchive(
			url: string,
			archivePath: string,
			opts: { signal?: AbortSignal; onProgress?: (info: TtsInstallProgress) => void },
		): Promise<void> {
			let existingBytes = 0;
			if (fs.existsSync(archivePath)) {
				try { existingBytes = fs.statSync(archivePath).size; } catch {}
			}

			const headers: Record<string, string> = {};
			if (existingBytes > 0) headers.Range = `bytes=${existingBytes}-`;

			let res: Response;
			try {
				res = await fetch(url, { signal: opts.signal, headers });
			} catch (err: any) {
				if (err?.name === "AbortError") throw err;
				throw new Error(`Network error: ${err?.message ?? String(err)}`);
			}

			if (res.status === 416 && existingBytes > 0) {
				// The error body is irrelevant; release the connection.
				try { await res.body?.cancel(); } catch {}
				const remoteTotal = parseUnsatisfiedRangeTotal(res.headers.get("content-range"));
				if (remoteTotal === existingBytes) {
					// Archive on disk is already complete — nothing left to fetch.
					opts.onProgress?.({ phase: "download", bytes: existingBytes, totalBytes: existingBytes });
					return;
				}
				// Size unknown or mismatched: the partial cannot be resumed.
				// Restart cleanly; the retry sends no Range, so it cannot loop.
				fs.rmSync(archivePath, { force: true });
				return downloadArchive(url, archivePath, opts);
			}

			let appendMode = false;
			if (res.status === 206 && existingBytes > 0) {
				appendMode = true;
			} else if (res.status === 200) {
				// Server ignored our Range — start over.
				appendMode = false;
				existingBytes = 0;
			} else if (!res.ok) {
				throw new Error(`Download failed: HTTP ${res.status} from ${url}`);
			}

			if (!res.body) throw new Error(`Download failed: empty body from ${url}`);

			// Total = bytes already on disk + Content-Length of this response.
			const contentLength = parseInt(res.headers.get("content-length") ?? "0", 10);
			const totalBytes = existingBytes + (Number.isFinite(contentLength) ? contentLength : 0);

			const sink = fs.createWriteStream(archivePath, { flags: appendMode ? "a" : "w" });
			// Listen for stream errors from the start: an `error` event without
			// a listener would otherwise crash the process mid-download.
			const sinkState: { error: Error | undefined } = { error: undefined };
			sink.on("error", (err: Error) => {
				if (!sinkState.error) sinkState.error = err;
			});

			let bytesSeen = existingBytes;
			opts.onProgress?.({ phase: "download", bytes: bytesSeen, totalBytes });

			const reader = res.body.getReader();
			try {
				while (true) {
					if (opts.signal?.aborted) throw makeAbortErr();
					if (sinkState.error) throw sinkState.error;
					const { value, done } = await reader.read();
					if (done) break;
					if (value) {
						bytesSeen += value.byteLength;
						const accepted = sink.write(Buffer.from(value));
						if (sinkState.error) throw sinkState.error;
						if (!accepted) {
							// Honor backpressure, but stop waiting if the stream
							// fails — `drain` never fires after an error.
							await new Promise<void>((resolve, reject) => {
								const onDrain = (): void => {
									sink.off("error", onError);
									resolve();
								};
								const onError = (err: Error): void => {
									sink.off("drain", onDrain);
									reject(err);
								};
								sink.once("drain", onDrain);
								sink.once("error", onError);
							});
						}
						opts.onProgress?.({ phase: "download", bytes: bytesSeen, totalBytes });
					}
				}
			} finally {
				try { reader.releaseLock(); } catch {}
				// Flush and close the file. This step never rejects, so an
				// in-flight error (abort, network) is not masked by a close error.
				await new Promise<void>((resolve) => {
					if (sink.destroyed) {
						resolve();
						return;
					}
					sink.once("close", () => resolve());
					sink.end(() => resolve());
				});
			}

			// A write/flush failure that surfaced only while closing.
			if (sinkState.error) throw sinkState.error;
		}

		/** Extract `<total>` from a 416 response's `Content-Range: bytes *&#47;<total>` header. */
		function parseUnsatisfiedRangeTotal(header: string | null): number | undefined {
			const match = header ? /^bytes\s+\*\/(\d+)$/i.exec(header.trim()) : null;
			return match ? Number(match[1]) : undefined;
		}

		/**
		 * Compute the SHA-256 hex digest of `filePath`, streaming the file via
		 * `fs.createReadStream` so the archive is never held in memory at once.
		 *
		 * @param filePath File to hash.
		 * @param signal   Optional abort signal. An already-aborted signal rejects
		 *                 immediately; aborting mid-read destroys the stream.
		 * @returns Lowercase hex digest of the file's bytes.
		 * @throws the abort error when `signal` is or becomes aborted, or the
		 *         underlying fs error when the file cannot be read.
		 */
		async function sha256OfFile(filePath: string, signal?: AbortSignal): Promise<string> {
			// `abort` listeners never fire for a signal that is already aborted,
			// so this case has to be checked explicitly before starting the read.
			if (signal?.aborted) throw makeAbortErr();

			return new Promise<string>((resolve, reject) => {
				const hash = createHash("sha256");
				const stream = fs.createReadStream(filePath);
				const onAbort = (): void => {
					stream.destroy();
					reject(makeAbortErr());
				};
				signal?.addEventListener("abort", onAbort, { once: true });
				stream.on("data", (chunk) => hash.update(chunk));
				stream.on("end", () => {
					signal?.removeEventListener("abort", onAbort);
					resolve(hash.digest("hex"));
				});
				stream.on("error", (err) => {
					signal?.removeEventListener("abort", onAbort);
					reject(err);
				});
			});
		}

		/**
		 * Extract a bzip2-compressed tar archive by running
		 * `tar -xj -f <archive> -C <stagingDir>`.
		 *
		 * Resolves when tar exits with code 0. Rejects with the abort error when
		 * `signal` cancelled the child, with "tar failed to start: …" when the
		 * process could not be spawned, and otherwise with
		 * "tar exited with code N …" carrying the tail of tar's stderr.
		 */
		async function runTarExtract(archivePath: string, stagingDir: string, signal?: AbortSignal): Promise<void> {
			const child = spawn("tar", ["-xj", "-f", archivePath, "-C", stagingDir], {
				stdio: ["ignore", "ignore", "pipe"],
				...(signal ? { signal } : {}),
			});

			// Keep roughly the first 1 KB of stderr for the failure message.
			let stderrText = "";
			child.stderr?.on("data", (chunk: Buffer) => {
				if (stderrText.length >= 1024) return;
				stderrText += chunk.toString();
			});

			return new Promise<void>((resolve, reject) => {
				child.on("error", (err: NodeJS.ErrnoException) => {
					const cancelled = err.name === "AbortError" || signal?.aborted;
					reject(cancelled ? makeAbortErr() : new Error(`tar failed to start: ${err.message}`));
				});
				child.on("close", (exitCode, killSignal) => {
					if (exitCode === 0) {
						resolve();
						return;
					}
					if (signal?.aborted) {
						reject(makeAbortErr());
						return;
					}
					const suffix = killSignal ? ` (${killSignal})` : "";
					reject(new Error(`tar exited with code ${exitCode}${suffix}: ${stderrText.trim().slice(-200)}`));
				});
			});
		}

		function makeAbortErr(): Error {
		@@ -543,0 +801,0 @@ if (typeof DOMException === "function") {

+1

-1

package.json

		{
		"name": "@codexstar/pi-listen",
		"version": "6.0.0",
		"version": "7.0.0",
		"description": "Voice in + voice out for Pi CLI — hold-to-talk STT (Deepgram or 19 offline models) plus TTS (Kitten Nano, Piper, Kokoro, or Deepgram Aura)",
		@@ -5,0 +5,0 @@ "type": "module",

+9

-8

README.md

		@@ -15,10 +15,11 @@ [English](README.md) \| [简体中文](README.zh-CN.md) \| [日本語](README.ja.md) \| [한국어](README.ko.md) \| [Español](README.es.md) \| [Français](README.fr.md) \| [Português](README.pt-BR.md) \| [हिन्दी](README.hi.md)

		> v6.0.0 — TTS! pi-listen now speaks back — voice in + voice out in one
		> extension. Hold-to-talk STT unchanged; new `/voice-speak <text>` synthesizes
		> and plays via 12 local models (Kitten Nano 25 MB default, Piper per-language
		> voices, Kokoro multilingual) or Deepgram Aura (cloud, same `DEEPGRAM_API_KEY`
		> as STT). Region-strict language matching, sentence-aware chunking via
		> `Intl.Segmenter` (no more breaking on `Dr. Smith` / `v2.0` / URLs),
		> cross-platform audio playback with cooperative abort. Plus the cleaner
		> v5.1 settings panel grouped by model family. [Full changelog →](CHANGELOG.md)
		> v7.0.0 — World-class TTS UX — pick models from `/voice-settings` Speak
		> tab (no more JSON editing), auto-download on selection with progress, voice
		> picker for every backend, first-run onboarding with smart-default
		> recommendation by your system locale, and `ttsAutoSpeak: true` finally
		> works — auto-speaks the agent's responses with code-block stripping and
		> rate limiting. Diagnostic command `/voice-speak-info` shows everything.
		> Resume-on-interrupt downloads. Plus all v6 features (14 local models from
		> 25 MB Kitten Nano up, Deepgram Aura cloud, region-strict language matching,
		> sentence-aware chunking). [Full changelog →](CHANGELOG.md)

		@@ -25,0 +26,0 @@ ---

extensions/voice.ts

Sorry, the diff of this file is too big to display

@codexstar/pi-listen - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics