<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
    <channel>
        <title>AI Voice Studio on Producthunt daily</title>
        <link>https://producthunt.programnotes.cn/en/tags/ai-voice-studio/</link>
        <description>Recent content in AI Voice Studio on Producthunt daily</description>
        <generator>Hugo -- gohugo.io</generator>
        <language>en</language>
        <lastBuildDate>Mon, 22 Jun 2026 21:00:58 +0800</lastBuildDate><atom:link href="https://producthunt.programnotes.cn/en/tags/ai-voice-studio/index.xml" rel="self" type="application/rss+xml" /><item>
        <title>voicebox</title>
        <link>https://producthunt.programnotes.cn/en/p/voicebox/</link>
        <pubDate>Mon, 22 Jun 2026 21:00:58 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/voicebox/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1703984383588-c2522031bd35?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3ODIxMzMxMzl8&amp;ixlib=rb-4.1.0" alt="Featured image of post voicebox" /&gt;&lt;h1 id=&#34;jamiepinevoicebox&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;jamiepine/voicebox&lt;/a&gt;
&lt;/h1&gt;&lt;p align=&#34;center&#34;&gt;
  &lt;img src=&#34;https://raw.githubusercontent.com/jamiepine/voicebox/main/.github/assets/icon-dark.webp&#34; alt=&#34;Voicebox&#34; width=&#34;120&#34; height=&#34;120&#34; /&gt;
&lt;/p&gt;
&lt;h1 align=&#34;center&#34;&gt;Voicebox&lt;/h1&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;strong&gt;The open-source AI voice studio.&lt;/strong&gt;&lt;br/&gt;
  Clone any voice. Generate speech. Dictate into any app. Talk to agents in voices you own.&lt;br/&gt;
  The full voice I/O stack, running locally on your machine.
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;a href=&#34;https://github.com/jamiepine/voicebox/releases&#34;&gt;
    &lt;img src=&#34;https://img.shields.io/github/downloads/jamiepine/voicebox/total?style=flat&amp;color=blue&#34; alt=&#34;Downloads&#34; /&gt;
  &lt;/a&gt;
  &lt;a href=&#34;https://github.com/jamiepine/voicebox/releases/latest&#34;&gt;
    &lt;img src=&#34;https://img.shields.io/github/v/release/jamiepine/voicebox?style=flat&#34; alt=&#34;Release&#34; /&gt;
  &lt;/a&gt;
  &lt;a href=&#34;https://github.com/jamiepine/voicebox/stargazers&#34;&gt;
    &lt;img src=&#34;https://img.shields.io/github/stars/jamiepine/voicebox?style=flat&#34; alt=&#34;Stars&#34; /&gt;
  &lt;/a&gt;
  &lt;a href=&#34;https://github.com/jamiepine/voicebox/blob/main/LICENSE&#34;&gt;
    &lt;img src=&#34;https://img.shields.io/github/license/jamiepine/voicebox?style=flat&#34; alt=&#34;License&#34; /&gt;
  &lt;/a&gt;
  &lt;a href=&#34;https://deepwiki.com/jamiepine/voicebox&#34;&gt;
    &lt;img src=&#34;https://img.shields.io/static/v1?label=Ask&amp;message=DeepWiki&amp;color=5B6EF7&#34; alt=&#34;Ask DeepWiki&#34; /&gt;
  &lt;/a&gt;
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
    &lt;a href=&#34;https://trendshift.io/repositories/21213&#34; target=&#34;_blank&#34;&gt;&lt;img src=&#34;https://trendshift.io/api/badge/repositories/21213&#34; alt=&#34;jamiepine%2Fvoicebox | Trendshift&#34; style=&#34;width: 250px; height: 55px;&#34; width=&#34;250&#34; height=&#34;55&#34;/&gt;&lt;/a&gt;
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;a href=&#34;https://voicebox.sh&#34;&gt;voicebox.sh&lt;/a&gt; •
  &lt;a href=&#34;https://docs.voicebox.sh&#34;&gt;Docs&lt;/a&gt; •
  &lt;a href=&#34;#download&#34;&gt;Download&lt;/a&gt; •
  &lt;a href=&#34;#features&#34;&gt;Features&lt;/a&gt; •
  &lt;a href=&#34;#api&#34;&gt;API&lt;/a&gt; •
  &lt;a href=&#34;https://github.com/jamiepine/voicebox/blob/main/docs/content/docs/overview/troubleshooting.mdx&#34;&gt;Troubleshooting&lt;/a&gt;
&lt;/p&gt;
&lt;br/&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;a href=&#34;https://voicebox.sh&#34;&gt;
    &lt;img src=&#34;https://raw.githubusercontent.com/jamiepine/voicebox/main/landing/public/assets/app-screenshot-1.webp&#34; alt=&#34;Voicebox App Screenshot&#34; width=&#34;800&#34; /&gt;
  &lt;/a&gt;
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;em&gt;Click the image above to watch the demo video on &lt;a href=&#34;https://voicebox.sh&#34;&gt;voicebox.sh&lt;/a&gt;&lt;/em&gt;
&lt;/p&gt;
&lt;br/&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;img src=&#34;https://raw.githubusercontent.com/jamiepine/voicebox/main/landing/public/assets/app-screenshot-2.webp&#34; alt=&#34;Voicebox Screenshot 2&#34; width=&#34;800&#34; /&gt;
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;img src=&#34;https://raw.githubusercontent.com/jamiepine/voicebox/main/landing/public/assets/app-screenshot-3.webp&#34; alt=&#34;Voicebox Screenshot 3&#34; width=&#34;800&#34; /&gt;
&lt;/p&gt;
&lt;br/&gt;
&lt;h2 id=&#34;what-is-voicebox&#34;&gt;What is Voicebox?
&lt;/h2&gt;&lt;p&gt;Voicebox is a &lt;strong&gt;local-first AI voice studio&lt;/strong&gt; — a free and open-source alternative to &lt;strong&gt;ElevenLabs&lt;/strong&gt; and &lt;strong&gt;WisprFlow&lt;/strong&gt; in one app. Clone voices from a few seconds of audio, generate speech in 23 languages across 7 TTS engines, dictate into any text field with a global hotkey, and give any MCP-aware AI agent a voice of your choosing.&lt;/p&gt;
&lt;p&gt;The two cloud incumbents sit on opposite halves of the voice I/O loop — ElevenLabs on output, WisprFlow on input. Voicebox does both, bridges them with a bundled local LLM for refinement and per-profile personas, and runs the whole thing on your machine.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Complete privacy&lt;/strong&gt; — models, voice data, and captures never leave your machine&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;7 TTS engines&lt;/strong&gt; — Qwen3-TTS, Qwen CustomVoice, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, HumeAI TADA, and Kokoro&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Voice cloning and preset voices&lt;/strong&gt; — zero-shot cloning from a reference sample, or 50+ curated preset voices via Kokoro and Qwen CustomVoice&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;23 languages&lt;/strong&gt; — from English to Arabic, Japanese, Hindi, Swahili, and more&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Post-processing effects&lt;/strong&gt; — pitch shift, reverb, delay, chorus, compression, and filters&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Expressive speech&lt;/strong&gt; — paralinguistic tags like &lt;code&gt;[laugh]&lt;/code&gt;, &lt;code&gt;[sigh]&lt;/code&gt;, &lt;code&gt;[gasp]&lt;/code&gt; via Chatterbox Turbo; natural-language delivery control via Qwen CustomVoice&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Unlimited length&lt;/strong&gt; — auto-chunking with crossfade for scripts, articles, and chapters&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Stories editor&lt;/strong&gt; — multi-track timeline for conversations, podcasts, and narratives&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Voice input&lt;/strong&gt; — global dictation hotkey with push-to-talk and toggle modes, accessibility-verified auto-paste on macOS, in-app mic on every text field, Whisper-based STT&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Agent voice output&lt;/strong&gt; — one tool call (&lt;code&gt;voicebox.speak&lt;/code&gt;) and any MCP-aware agent (Claude Code, Cursor, Cline) speaks to you in a voice you&amp;rsquo;ve cloned&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Voice personalities&lt;/strong&gt; — attach a free-form persona to any voice profile, then Compose, Rewrite, or Respond via a bundled local LLM — agents can invoke the same modes over MCP&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;API-first&lt;/strong&gt; — REST API plus a built-in MCP server for integrating voice I/O into your own apps and agents&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Native performance&lt;/strong&gt; — built with Tauri (Rust), not Electron&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Runs everywhere&lt;/strong&gt; — macOS (MLX/Metal), Windows (CUDA), Linux, AMD ROCm, Intel Arc, Docker&lt;/li&gt;
&lt;/ul&gt;
&lt;hr&gt;
&lt;h2 id=&#34;download&#34;&gt;Download
&lt;/h2&gt;&lt;table&gt;
	&lt;thead&gt;
			&lt;tr&gt;
					&lt;th&gt;Platform&lt;/th&gt;
					&lt;th&gt;Download&lt;/th&gt;
			&lt;/tr&gt;
	&lt;/thead&gt;
	&lt;tbody&gt;
			&lt;tr&gt;
					&lt;td&gt;macOS (Apple Silicon)&lt;/td&gt;
					&lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://voicebox.sh/download/mac-arm&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Download DMG&lt;/a&gt;&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;macOS (Intel)&lt;/td&gt;
					&lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://voicebox.sh/download/mac-intel&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Download DMG&lt;/a&gt;&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Windows&lt;/td&gt;
					&lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://voicebox.sh/download/windows&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Download MSI&lt;/a&gt;&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Docker&lt;/td&gt;
					&lt;td&gt;&lt;code&gt;docker compose up&lt;/code&gt;&lt;/td&gt;
			&lt;/tr&gt;
	&lt;/tbody&gt;
&lt;/table&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/releases/latest&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;View all binaries →&lt;/a&gt;&lt;/strong&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;Linux&lt;/strong&gt; — Pre-built binaries are not yet available. See &lt;a class=&#34;link&#34; href=&#34;https://voicebox.sh/linux-install&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;voicebox.sh/linux-install&lt;/a&gt; for build-from-source instructions.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;Having trouble?&lt;/strong&gt; See the &lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/blob/main/docs/content/docs/overview/troubleshooting.mdx&#34; &gt;Troubleshooting Guide&lt;/a&gt; for common install, generation, model-download, and GPU issues.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;hr&gt;
&lt;h2 id=&#34;features&#34;&gt;Features
&lt;/h2&gt;&lt;h3 id=&#34;multi-engine-voice-cloning&#34;&gt;Multi-Engine Voice Cloning
&lt;/h3&gt;&lt;p&gt;Seven TTS engines with different strengths, switchable per-generation:&lt;/p&gt;
&lt;table&gt;
	&lt;thead&gt;
			&lt;tr&gt;
					&lt;th&gt;Engine&lt;/th&gt;
					&lt;th&gt;Languages&lt;/th&gt;
					&lt;th&gt;Strengths&lt;/th&gt;
			&lt;/tr&gt;
	&lt;/thead&gt;
	&lt;tbody&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Qwen3-TTS&lt;/strong&gt; (0.6B / 1.7B)&lt;/td&gt;
					&lt;td&gt;10&lt;/td&gt;
					&lt;td&gt;High-quality multilingual cloning, delivery instructions (&amp;ldquo;speak slowly&amp;rdquo;, &amp;ldquo;whisper&amp;rdquo;)&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Qwen CustomVoice&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;10&lt;/td&gt;
					&lt;td&gt;9 curated preset voices with natural-language delivery control — no reference audio required&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;LuxTTS&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;English&lt;/td&gt;
					&lt;td&gt;Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Chatterbox Multilingual&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;23&lt;/td&gt;
					&lt;td&gt;Broadest language coverage — Arabic, Danish, Finnish, Greek, Hebrew, Hindi, Malay, Norwegian, Polish, Swahili, Swedish, Turkish and more&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Chatterbox Turbo&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;English&lt;/td&gt;
					&lt;td&gt;Fast 350M model with paralinguistic emotion/sound tags&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;TADA&lt;/strong&gt; (1B / 3B)&lt;/td&gt;
					&lt;td&gt;10&lt;/td&gt;
					&lt;td&gt;HumeAI speech-language model — 700s+ coherent audio, text-acoustic dual alignment&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Kokoro&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;8&lt;/td&gt;
					&lt;td&gt;50 curated preset voices, tiny 82M model, fast CPU inference&lt;/td&gt;
			&lt;/tr&gt;
	&lt;/tbody&gt;
&lt;/table&gt;
&lt;h3 id=&#34;emotions--paralinguistic-tags&#34;&gt;Emotions &amp;amp; Paralinguistic Tags
&lt;/h3&gt;&lt;p&gt;Only &lt;strong&gt;Chatterbox Turbo&lt;/strong&gt; interprets paralinguistic tags like &lt;code&gt;[laugh]&lt;/code&gt; and
&lt;code&gt;[sigh]&lt;/code&gt;. Qwen3-TTS, LuxTTS, Chatterbox Multilingual, and HumeAI TADA read them
literally as text.&lt;/p&gt;
&lt;p&gt;With &lt;strong&gt;Chatterbox Turbo&lt;/strong&gt; selected, type &lt;code&gt;/&lt;/code&gt; in the text input to open the tag
inserter and add expressive tags inline with speech:&lt;/p&gt;
&lt;p&gt;&lt;code&gt;[laugh]&lt;/code&gt; &lt;code&gt;[chuckle]&lt;/code&gt; &lt;code&gt;[gasp]&lt;/code&gt; &lt;code&gt;[cough]&lt;/code&gt; &lt;code&gt;[sigh]&lt;/code&gt; &lt;code&gt;[groan]&lt;/code&gt; &lt;code&gt;[sniff]&lt;/code&gt; &lt;code&gt;[shush]&lt;/code&gt; &lt;code&gt;[clear throat]&lt;/code&gt;&lt;/p&gt;
&lt;h3 id=&#34;post-processing-effects&#34;&gt;Post-Processing Effects
&lt;/h3&gt;&lt;p&gt;8 audio effects powered by Spotify&amp;rsquo;s &lt;code&gt;pedalboard&lt;/code&gt; library. Apply after generation, preview in real time, build reusable presets.&lt;/p&gt;
&lt;table&gt;
	&lt;thead&gt;
			&lt;tr&gt;
					&lt;th&gt;Effect&lt;/th&gt;
					&lt;th&gt;Description&lt;/th&gt;
			&lt;/tr&gt;
	&lt;/thead&gt;
	&lt;tbody&gt;
			&lt;tr&gt;
					&lt;td&gt;Pitch Shift&lt;/td&gt;
					&lt;td&gt;Up or down by up to 12 semitones&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Reverb&lt;/td&gt;
					&lt;td&gt;Configurable room size, damping, wet/dry mix&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Delay&lt;/td&gt;
					&lt;td&gt;Echo with adjustable time, feedback, and mix&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Chorus / Flanger&lt;/td&gt;
					&lt;td&gt;Modulated delay for metallic or lush textures&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Compressor&lt;/td&gt;
					&lt;td&gt;Dynamic range compression&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Gain&lt;/td&gt;
					&lt;td&gt;Volume adjustment (-40 to +40 dB)&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;High-Pass Filter&lt;/td&gt;
					&lt;td&gt;Remove low frequencies&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Low-Pass Filter&lt;/td&gt;
					&lt;td&gt;Remove high frequencies&lt;/td&gt;
			&lt;/tr&gt;
	&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Ships with 4 built-in presets (Robotic, Radio, Echo Chamber, Deep Voice) and supports custom presets. Effects can be assigned per-profile as defaults.&lt;/p&gt;
&lt;h3 id=&#34;unlimited-generation-length&#34;&gt;Unlimited Generation Length
&lt;/h3&gt;&lt;p&gt;Text is automatically split at sentence boundaries and each chunk is generated independently, then crossfaded together. Works with all engines.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Configurable auto-chunking limit (100–5,000 chars)&lt;/li&gt;
&lt;li&gt;Crossfade slider (0–200ms) for smooth transitions&lt;/li&gt;
&lt;li&gt;Max text length: 50,000 characters&lt;/li&gt;
&lt;li&gt;Smart splitting respects abbreviations, CJK punctuation, and &lt;code&gt;[tags]&lt;/code&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;generation-versions&#34;&gt;Generation Versions
&lt;/h3&gt;&lt;p&gt;Every generation supports multiple versions with provenance tracking:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Original&lt;/strong&gt; — clean TTS output, always preserved&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Effects versions&lt;/strong&gt; — apply different effects chains from any source version&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Takes&lt;/strong&gt; — regenerate with a new seed for variation&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Source tracking&lt;/strong&gt; — each version records its lineage&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Favorites&lt;/strong&gt; — star generations for quick access&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;async-generation-queue&#34;&gt;Async Generation Queue
&lt;/h3&gt;&lt;p&gt;Generation is non-blocking. Submit and immediately start typing the next one.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Serial execution queue prevents GPU contention&lt;/li&gt;
&lt;li&gt;Real-time SSE status streaming&lt;/li&gt;
&lt;li&gt;Failed generations can be retried&lt;/li&gt;
&lt;li&gt;Stale generations from crashes auto-recover on startup&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;voice-profile-management&#34;&gt;Voice Profile Management
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;Create profiles from audio files or record directly in-app&lt;/li&gt;
&lt;li&gt;Import/export profiles to share or back up&lt;/li&gt;
&lt;li&gt;Multi-sample support for higher quality cloning&lt;/li&gt;
&lt;li&gt;Per-profile default effects chains&lt;/li&gt;
&lt;li&gt;Organize with descriptions and language tags&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;stories-editor&#34;&gt;Stories Editor
&lt;/h3&gt;&lt;p&gt;Multi-voice timeline editor for conversations, podcasts, and narratives.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Multi-track composition with drag-and-drop&lt;/li&gt;
&lt;li&gt;Inline audio trimming and splitting&lt;/li&gt;
&lt;li&gt;Auto-playback with synchronized playhead&lt;/li&gt;
&lt;li&gt;Version pinning per track clip&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;global-dictation--voice-input&#34;&gt;Global Dictation &amp;amp; Voice Input
&lt;/h3&gt;&lt;p&gt;The other half of the voice I/O loop. Hold a hotkey anywhere on your system, speak, release — on macOS the transcript pastes straight into the focused text field. Or hit the mic on any Voicebox text input and dictate directly into the app.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Configurable chord bindings&lt;/strong&gt; — hold-to-speak and tap-to-toggle chords, each rebindable in the in-app chord picker. Holding push-to-talk and tapping &lt;code&gt;Space&lt;/code&gt; mid-hold upgrades into a toggle session without a gap in audio&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Target-aware paste (macOS)&lt;/strong&gt; — accessibility-verified injection into the focused text field, with atomic clipboard save/restore so your clipboard isn&amp;rsquo;t clobbered&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;First-run permissions UX&lt;/strong&gt; — in-app gates walk you through the macOS Accessibility and Input Monitoring grants with deep-links to System Settings&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;In-app mic button&lt;/strong&gt; on every Voicebox text field — generation form, profile descriptions, story titles, anywhere you&amp;rsquo;d type&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;LLM refinement&lt;/strong&gt; — optional cleanup of ums, stutters, and false starts before paste&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;On-screen pill&lt;/strong&gt; — floating overlay surfacing &lt;code&gt;recording&lt;/code&gt;, &lt;code&gt;transcribing&lt;/code&gt;, &lt;code&gt;refining&lt;/code&gt;, and &lt;code&gt;speaking&lt;/code&gt; states. Same pill agents use when they speak to you, so there&amp;rsquo;s one mental model for both directions of the loop&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;speech-to-text&#34;&gt;Speech-to-Text
&lt;/h3&gt;&lt;p&gt;Voicebox runs OpenAI Whisper for transcription — the same model that backs dictation, the Captures tab, and the &lt;code&gt;/transcribe&lt;/code&gt; API. It runs on MLX (Apple Silicon) or PyTorch (CUDA / ROCm / DirectML / CPU), depending on your platform.&lt;/p&gt;
&lt;table&gt;
	&lt;thead&gt;
			&lt;tr&gt;
					&lt;th&gt;Size&lt;/th&gt;
					&lt;th&gt;Notes&lt;/th&gt;
			&lt;/tr&gt;
	&lt;/thead&gt;
	&lt;tbody&gt;
			&lt;tr&gt;
					&lt;td&gt;Base / Small / Medium / Large&lt;/td&gt;
					&lt;td&gt;Standard Whisper quality ladder&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Turbo&lt;/td&gt;
					&lt;td&gt;~8x faster than Whisper Large, minimal quality loss&lt;/td&gt;
			&lt;/tr&gt;
	&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;More engines (Parakeet v3, Qwen3-ASR) are planned — see &lt;a class=&#34;link&#34; href=&#34;#roadmap&#34; &gt;Roadmap&lt;/a&gt;.&lt;/p&gt;
&lt;h3 id=&#34;captures&#34;&gt;Captures
&lt;/h3&gt;&lt;p&gt;Every dictation, in-app recording, and uploaded audio file lands in the Captures tab — original audio paired with transcript, always preserved.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Replay, re-transcribe, refine&lt;/strong&gt; — rerun STT with any Whisper size, or re-run the raw transcript through the local LLM with different flags (filler cleanup, self-correction removal, technical-term preservation)&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Edit inline&lt;/strong&gt; — tweak the transcript and save on blur&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Play as voice profile&lt;/strong&gt; — turn any capture into speech with a cloned voice, one click&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Promote to voice sample&lt;/strong&gt; — use a capture&amp;rsquo;s audio + transcript as a reference sample on any voice profile&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Local capture storage&lt;/strong&gt; — original audio and transcript stay in your Voicebox data directory, with a folder shortcut in Settings&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;agent-voice-output&#34;&gt;Agent Voice Output
&lt;/h3&gt;&lt;p&gt;Every agent gets a voice. One tool call and any MCP-aware agent can speak to you in a voice you&amp;rsquo;ve cloned — task completions, questions, notifications. The same pill that surfaces during dictation surfaces during agent speech, so you always see what&amp;rsquo;s coming out of your machine.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-ts&#34; data-lang=&#34;ts&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;// In any MCP-aware agent:
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;await&lt;/span&gt; &lt;span class=&#34;nx&#34;&gt;voicebox&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;nx&#34;&gt;speak&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;({&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;nx&#34;&gt;text&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;Deploy complete.&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;nx&#34;&gt;profile&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;Morgan&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;});&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Also exposed as &lt;code&gt;POST /speak&lt;/code&gt; for anything that doesn&amp;rsquo;t speak MCP — ACP, A2A, shell scripts, custom harnesses.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Bidirectional pill&lt;/strong&gt; — &lt;code&gt;recording&lt;/code&gt;, &lt;code&gt;transcribing&lt;/code&gt;, &lt;code&gt;refining&lt;/code&gt;, and &lt;code&gt;speaking&lt;/code&gt; are all states of the same OS-level overlay, so dictation and agent speech share one surface&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Per-agent voice binding&lt;/strong&gt; — in &lt;strong&gt;Settings → MCP&lt;/strong&gt;, pin Claude Code to Morgan and Cursor to Scarlett so you can tell which agent is talking without looking. Each client&amp;rsquo;s &lt;code&gt;last_seen_at&lt;/code&gt; timestamp confirms the install actually took&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Always visible&lt;/strong&gt; — no silent background TTS; every agent-initiated speak surfaces the pill with the voice profile name for the full duration&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;HTTP + stdio transports&lt;/strong&gt; — install as a URL in Claude Code / Cursor / Windsurf / VS Code MCP, or point stdio-only clients at the bundled &lt;code&gt;voicebox-mcp&lt;/code&gt; binary&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;voice-personalities&#34;&gt;Voice Personalities
&lt;/h3&gt;&lt;p&gt;Attach a free-form personality to any voice profile — who this voice is, how they speak, what they care about. Two actions appear on the generate box when a personality is set, powered by a bundled Qwen3 LLM running entirely locally.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Compose&lt;/strong&gt; — a shuffle button that drops a fresh in-character line into the textarea; edit and speak, or click again for a different take&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Speak in character&lt;/strong&gt; — a toggle that routes your input text through the personality LLM to be rewritten in their voice before TTS&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Agents can reach the same rewrite path over MCP by passing &lt;code&gt;personality: true&lt;/code&gt; to &lt;code&gt;voicebox.speak&lt;/code&gt;, turning the tool into a text-in → personality-LLM → TTS pipeline. The same LLM backs dictation&amp;rsquo;s refinement step — one LLM in the app, one model cache, one GPU-memory footprint.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Local LLM options:&lt;/strong&gt; Qwen3 0.6B / 1.7B / 4B, sharing the TTS runtime (MLX on Apple Silicon, PyTorch elsewhere).&lt;/p&gt;
&lt;p&gt;Use cases: agent dev loops (dictate a question, hear the answer in a cloned voice), interactive characters for games and narrative tools, speech assistance for people who can&amp;rsquo;t speak in their original voice.&lt;/p&gt;
&lt;h3 id=&#34;model-management&#34;&gt;Model Management
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;Per-model unload to free GPU memory without deleting downloads&lt;/li&gt;
&lt;li&gt;Custom models directory via &lt;code&gt;VOICEBOX_MODELS_DIR&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;Model folder migration with progress tracking&lt;/li&gt;
&lt;li&gt;Download cancel/clear UI&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;gpu-support&#34;&gt;GPU Support
&lt;/h3&gt;&lt;table&gt;
	&lt;thead&gt;
			&lt;tr&gt;
					&lt;th&gt;Platform&lt;/th&gt;
					&lt;th&gt;Backend&lt;/th&gt;
					&lt;th&gt;Notes&lt;/th&gt;
			&lt;/tr&gt;
	&lt;/thead&gt;
	&lt;tbody&gt;
			&lt;tr&gt;
					&lt;td&gt;macOS (Apple Silicon)&lt;/td&gt;
					&lt;td&gt;MLX (Metal)&lt;/td&gt;
					&lt;td&gt;4-5x faster via Neural Engine&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Windows / Linux (NVIDIA)&lt;/td&gt;
					&lt;td&gt;PyTorch (CUDA)&lt;/td&gt;
					&lt;td&gt;Auto-downloads CUDA binary from within the app&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Linux (AMD)&lt;/td&gt;
					&lt;td&gt;PyTorch (ROCm)&lt;/td&gt;
					&lt;td&gt;Auto-configures HSA_OVERRIDE_GFX_VERSION&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Windows (any GPU)&lt;/td&gt;
					&lt;td&gt;DirectML&lt;/td&gt;
					&lt;td&gt;Universal Windows GPU support&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Intel Arc&lt;/td&gt;
					&lt;td&gt;IPEX/XPU&lt;/td&gt;
					&lt;td&gt;Intel discrete GPU acceleration&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Any&lt;/td&gt;
					&lt;td&gt;CPU&lt;/td&gt;
					&lt;td&gt;Works everywhere, just slower&lt;/td&gt;
			&lt;/tr&gt;
	&lt;/tbody&gt;
&lt;/table&gt;
&lt;hr&gt;
&lt;h2 id=&#34;api&#34;&gt;API
&lt;/h2&gt;&lt;p&gt;Voicebox exposes a REST API for integrating voice I/O into your own apps and agents.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Generate speech&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;curl -X POST http://127.0.0.1:17493/generate &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  -H &lt;span class=&#34;s2&#34;&gt;&amp;#34;Content-Type: application/json&amp;#34;&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  -d &lt;span class=&#34;s1&#34;&gt;&amp;#39;{&amp;#34;text&amp;#34;: &amp;#34;Hello world&amp;#34;, &amp;#34;profile_id&amp;#34;: &amp;#34;abc123&amp;#34;, &amp;#34;language&amp;#34;: &amp;#34;en&amp;#34;}&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Agent voice output — any app or script can speak in a cloned voice&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;curl -X POST http://127.0.0.1:17493/speak &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  -H &lt;span class=&#34;s2&#34;&gt;&amp;#34;Content-Type: application/json&amp;#34;&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  -H &lt;span class=&#34;s2&#34;&gt;&amp;#34;X-Voicebox-Client-Id: my-script&amp;#34;&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  -d &lt;span class=&#34;s1&#34;&gt;&amp;#39;{&amp;#34;text&amp;#34;: &amp;#34;Deploy complete.&amp;#34;, &amp;#34;profile&amp;#34;: &amp;#34;Morgan&amp;#34;}&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Transcribe an audio file&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;curl -X POST http://127.0.0.1:17493/transcribe &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  -F &lt;span class=&#34;s2&#34;&gt;&amp;#34;audio=@recording.wav&amp;#34;&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  -F &lt;span class=&#34;s2&#34;&gt;&amp;#34;model=whisper-turbo&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# List voice profiles&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;curl http://127.0.0.1:17493/profiles
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;&lt;code&gt;POST /speak&lt;/code&gt; accepts &lt;code&gt;profile&lt;/code&gt; as a name (case-insensitive) or id, and resolves via the same precedence as the MCP tool: explicit arg → per-client binding → &lt;code&gt;capture_settings.default_playback_voice_id&lt;/code&gt;.&lt;/p&gt;
&lt;h3 id=&#34;mcp-server&#34;&gt;MCP server
&lt;/h3&gt;&lt;p&gt;Voicebox ships a built-in &lt;strong&gt;Model Context Protocol&lt;/strong&gt; server so any MCP-aware agent (Claude Code, Cursor, Windsurf, Cline, VS Code MCP extensions) can speak, transcribe, and browse captures and profiles.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Claude Code one-liner:&lt;/strong&gt;&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-fallback&#34; data-lang=&#34;fallback&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;claude mcp add voicebox \
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --transport http \
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --url http://127.0.0.1:17493/mcp \
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --header &amp;#34;X-Voicebox-Client-Id: claude-code&amp;#34;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;&lt;strong&gt;Any HTTP MCP client&lt;/strong&gt; (Cursor, Windsurf, VS Code, etc.):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-json&#34; data-lang=&#34;json&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;nt&#34;&gt;&amp;#34;mcpServers&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nt&#34;&gt;&amp;#34;voicebox&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;      &lt;span class=&#34;nt&#34;&gt;&amp;#34;url&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;http://127.0.0.1:17493/mcp&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;      &lt;span class=&#34;nt&#34;&gt;&amp;#34;headers&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt; &lt;span class=&#34;nt&#34;&gt;&amp;#34;X-Voicebox-Client-Id&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;cursor&amp;#34;&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;&lt;strong&gt;Stdio fallback&lt;/strong&gt; for clients that don&amp;rsquo;t speak HTTP MCP — point at the bundled &lt;code&gt;voicebox-mcp&lt;/code&gt; binary inside the app:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-json&#34; data-lang=&#34;json&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;nt&#34;&gt;&amp;#34;mcpServers&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nt&#34;&gt;&amp;#34;voicebox&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;      &lt;span class=&#34;nt&#34;&gt;&amp;#34;command&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;/Applications/Voicebox.app/Contents/MacOS/voicebox-mcp&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;      &lt;span class=&#34;nt&#34;&gt;&amp;#34;env&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt; &lt;span class=&#34;nt&#34;&gt;&amp;#34;VOICEBOX_CLIENT_ID&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;claude-desktop&amp;#34;&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Four tools ship: &lt;code&gt;voicebox.speak&lt;/code&gt;, &lt;code&gt;voicebox.transcribe&lt;/code&gt;, &lt;code&gt;voicebox.list_captures&lt;/code&gt;, &lt;code&gt;voicebox.list_profiles&lt;/code&gt;. Per-client voice bindings are managed in &lt;strong&gt;Voicebox → Settings → MCP&lt;/strong&gt;. See the &lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/blob/HEAD/docs/content/docs/overview/mcp-server.mdx&#34; &gt;full MCP guide&lt;/a&gt; for tool signatures, resolution precedence, the speaking-pill contract, and security notes.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-ts&#34; data-lang=&#34;ts&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;// In any MCP-aware agent:
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;await&lt;/span&gt; &lt;span class=&#34;nx&#34;&gt;voicebox&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;nx&#34;&gt;speak&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;({&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;nx&#34;&gt;text&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;Tests passing. Ready to merge.&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;nx&#34;&gt;profile&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;Morgan&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;      &lt;span class=&#34;c1&#34;&gt;// optional — falls back to the per-client binding
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;nx&#34;&gt;personality&lt;/span&gt;: &lt;span class=&#34;kt&#34;&gt;true&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;      &lt;span class=&#34;c1&#34;&gt;// optional — rewrites text through the profile&amp;#39;s personality LLM first
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;});&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;&lt;strong&gt;Use cases:&lt;/strong&gt; agent dev loops (voice in, voice out), game dialogue, podcast production, accessibility tools, voice assistants, content automation.&lt;/p&gt;
&lt;p&gt;Full API documentation available at &lt;code&gt;http://127.0.0.1:17493/docs&lt;/code&gt;.&lt;/p&gt;
&lt;hr&gt;
&lt;h2 id=&#34;tech-stack&#34;&gt;Tech Stack
&lt;/h2&gt;&lt;table&gt;
	&lt;thead&gt;
			&lt;tr&gt;
					&lt;th&gt;Layer&lt;/th&gt;
					&lt;th&gt;Technology&lt;/th&gt;
			&lt;/tr&gt;
	&lt;/thead&gt;
	&lt;tbody&gt;
			&lt;tr&gt;
					&lt;td&gt;Desktop App&lt;/td&gt;
					&lt;td&gt;Tauri (Rust)&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Frontend&lt;/td&gt;
					&lt;td&gt;React, TypeScript, Tailwind CSS&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;State&lt;/td&gt;
					&lt;td&gt;Zustand, React Query&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Backend&lt;/td&gt;
					&lt;td&gt;FastAPI (Python)&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;TTS Engines&lt;/td&gt;
					&lt;td&gt;Qwen3-TTS, Qwen CustomVoice, LuxTTS, Chatterbox, Chatterbox Turbo, TADA, Kokoro&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;STT&lt;/td&gt;
					&lt;td&gt;Whisper / Whisper Turbo (PyTorch or MLX)&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Local LLM&lt;/td&gt;
					&lt;td&gt;Qwen3 (0.6B / 1.7B / 4B), shared runtime with TTS / STT&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;MCP Server&lt;/td&gt;
					&lt;td&gt;FastMCP mounted at &lt;code&gt;/mcp&lt;/code&gt; (Streamable HTTP) + bundled stdio shim binary&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Native Shim&lt;/td&gt;
					&lt;td&gt;Rust (inside Tauri) for global hotkey, paste injection, focus introspection&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Effects&lt;/td&gt;
					&lt;td&gt;Pedalboard (Spotify)&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Inference&lt;/td&gt;
					&lt;td&gt;MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU)&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Database&lt;/td&gt;
					&lt;td&gt;SQLite&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;Audio&lt;/td&gt;
					&lt;td&gt;WaveSurfer.js, librosa&lt;/td&gt;
			&lt;/tr&gt;
	&lt;/tbody&gt;
&lt;/table&gt;
&lt;hr&gt;
&lt;h2 id=&#34;roadmap&#34;&gt;Roadmap
&lt;/h2&gt;&lt;table&gt;
	&lt;thead&gt;
			&lt;tr&gt;
					&lt;th&gt;Feature&lt;/th&gt;
					&lt;th&gt;Description&lt;/th&gt;
			&lt;/tr&gt;
	&lt;/thead&gt;
	&lt;tbody&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Windows / Linux auto-paste&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;Dictation paste parity — &lt;code&gt;SendInput&lt;/code&gt; on Windows, &lt;code&gt;uinput&lt;/code&gt; / AT-SPI on Linux&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;STT engine expansion&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;Parakeet v3 and Qwen3-ASR joining Whisper — 50+ languages, better non-English quality&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Pipeline routing&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;Configurable source → transform → sink chains with webhook + MCP sinks and a preset editor&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Streaming transcription&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;WebSocket &lt;code&gt;/transcribe/stream&lt;/code&gt; for partial transcripts as you speak&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;End-to-end speech LLMs&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;Moshi, GLM-4-Voice, Qwen2.5 Omni — real voice-to-voice, no text between&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Voice Design&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;Create new voices from text descriptions&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Long-form capture&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;Dual-stream recorder (mic + system audio) with summary LLM transform&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Platform sinks&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;Apple Notes, Obsidian, and other opt-in integrations&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Plugin architecture&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;Extend with custom models, transforms, and sinks&lt;/td&gt;
			&lt;/tr&gt;
			&lt;tr&gt;
					&lt;td&gt;&lt;strong&gt;Mobile companion&lt;/strong&gt;&lt;/td&gt;
					&lt;td&gt;Control Voicebox from your phone&lt;/td&gt;
			&lt;/tr&gt;
	&lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;For the &lt;strong&gt;full engineering status, open-issue triage, and prioritized work queue&lt;/strong&gt;, see &lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/blob/HEAD/docs/PROJECT_STATUS.md&#34; &gt;&lt;code&gt;docs/PROJECT_STATUS.md&lt;/code&gt;&lt;/a&gt; — a living document that tracks what&amp;rsquo;s shipped, what&amp;rsquo;s in-flight, candidate TTS engines under evaluation, and why we&amp;rsquo;ve accepted or backlogged specific integrations.&lt;/p&gt;
&lt;hr&gt;
&lt;h2 id=&#34;development&#34;&gt;Development
&lt;/h2&gt;&lt;p&gt;See &lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/blob/HEAD/CONTRIBUTING.md&#34; &gt;CONTRIBUTING.md&lt;/a&gt; for detailed setup and contribution guidelines.&lt;/p&gt;
&lt;h3 id=&#34;quick-start&#34;&gt;Quick Start
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone https://github.com/jamiepine/voicebox.git
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; voicebox
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;just setup   &lt;span class=&#34;c1&#34;&gt;# creates Python venv, installs all deps&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;just dev     &lt;span class=&#34;c1&#34;&gt;# starts backend + desktop app&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Install &lt;a class=&#34;link&#34; href=&#34;https://github.com/casey/just&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;just&lt;/a&gt;: &lt;code&gt;brew install just&lt;/code&gt; or &lt;code&gt;cargo install just&lt;/code&gt;. Run &lt;code&gt;just --list&lt;/code&gt; to see all commands.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Prerequisites:&lt;/strong&gt; &lt;a class=&#34;link&#34; href=&#34;https://bun.sh&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Bun&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://rustup.rs&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Rust&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://python.org&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Python 3.11+&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://v2.tauri.app/start/prerequisites/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Tauri Prerequisites&lt;/a&gt;, and &lt;a class=&#34;link&#34; href=&#34;https://developer.apple.com/xcode/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Xcode&lt;/a&gt; on macOS.&lt;/p&gt;
&lt;p&gt;The repo ships a pre-wired &lt;code&gt;.mcp.json&lt;/code&gt; at the root — running Claude Code inside this checkout picks up the Voicebox MCP tools automatically once the dev app is running.&lt;/p&gt;
&lt;h3 id=&#34;building-locally&#34;&gt;Building Locally
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;just build          &lt;span class=&#34;c1&#34;&gt;# Build CPU server binary + Tauri app&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;just build-local    &lt;span class=&#34;c1&#34;&gt;# (Windows) Build CPU + CUDA server binaries + Tauri app&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;adding-new-voice-models&#34;&gt;Adding New Voice Models
&lt;/h3&gt;&lt;p&gt;The multi-engine architecture makes adding new TTS engines straightforward. A &lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/blob/HEAD/docs/content/docs/developer/tts-engines.mdx&#34; &gt;step-by-step guide&lt;/a&gt; covers the full process: dependency research, backend protocol implementation, frontend wiring, and PyInstaller bundling.&lt;/p&gt;
&lt;p&gt;The guide is optimized for AI coding agents. An &lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/blob/HEAD/.agents/skills/add-tts-engine/SKILL.md&#34; &gt;agent skill&lt;/a&gt; can pick up a model name and handle the entire integration autonomously — you just test the build locally.&lt;/p&gt;
&lt;h3 id=&#34;project-structure&#34;&gt;Project Structure
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-fallback&#34; data-lang=&#34;fallback&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;voicebox/
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;├── app/              # Shared React frontend
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;├── tauri/            # Desktop app (Tauri + Rust)
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;├── web/              # Web deployment
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;├── backend/          # Python FastAPI server
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;├── landing/          # Marketing website
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;└── scripts/          # Build &amp;amp; release scripts
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;hr&gt;
&lt;h2 id=&#34;contributing&#34;&gt;Contributing
&lt;/h2&gt;&lt;p&gt;Contributions welcome! See &lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/blob/HEAD/CONTRIBUTING.md&#34; &gt;CONTRIBUTING.md&lt;/a&gt; for guidelines.&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Fork the repo&lt;/li&gt;
&lt;li&gt;Create a feature branch&lt;/li&gt;
&lt;li&gt;Make your changes&lt;/li&gt;
&lt;li&gt;Submit a PR&lt;/li&gt;
&lt;/ol&gt;
&lt;h2 id=&#34;security&#34;&gt;Security
&lt;/h2&gt;&lt;p&gt;Found a security vulnerability? Please report it responsibly. See &lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/blob/HEAD/SECURITY.md&#34; &gt;SECURITY.md&lt;/a&gt; for details.&lt;/p&gt;
&lt;hr&gt;
&lt;h2 id=&#34;license&#34;&gt;License
&lt;/h2&gt;&lt;p&gt;MIT License — see &lt;a class=&#34;link&#34; href=&#34;https://github.com/jamiepine/voicebox/blob/HEAD/LICENSE&#34; &gt;LICENSE&lt;/a&gt; for details.&lt;/p&gt;
&lt;hr&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;a href=&#34;https://voicebox.sh&#34;&gt;voicebox.sh&lt;/a&gt;
&lt;/p&gt;
</description>
        </item>
        
    </channel>
</rss>
