#!/usr/bin/env python3
"""
Blog Text-to-Speech generator.

Usage:
    python scripts/tts_generate.py src/content/blog/my-post.md

Environment variables:
    TTS_PROVIDER     - "google" (default), "mistral", or "openai"
    GOOGLE_API_KEY   - Required when TTS_PROVIDER=google
    MISTRAL_API_KEY  - Required when TTS_PROVIDER=mistral
    MISTRAL_VOICE_ID - Required when TTS_PROVIDER=mistral
    OPENAI_API_KEY   - Required when TTS_PROVIDER=openai

Output:
    <slug>.mp3 in the current working directory (or --output-dir if specified)
"""

import argparse
import os
import re
import sys


def load_dotenv() -> None:
    """Load .env from the project root into os.environ (stdlib only, never overwrites).

    The project root is taken to be the parent of the directory containing
    this script. Blank lines, ``#`` comments and lines without ``=`` are
    ignored. A leading ``export `` on the key is accepted. Variables that are
    already set in the environment keep their value.
    """
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    env_path = os.path.join(project_root, ".env")
    if not os.path.isfile(env_path):
        return
    with open(env_path, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            key = key.strip().removeprefix("export ").strip()
            if not key:
                # A line such as "=value" has no usable name; setting an
                # empty environment variable name can raise.
                continue
            value = value.strip().strip('"').strip("'")
            os.environ.setdefault(key, value)


def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Extract YAML frontmatter and return (metadata_dict, body).

    Only flat ``key: value`` lines are understood; this is not a YAML parser.
    Values are split on the first colon and have surrounding quotes removed.
    When the text does not start with ``---`` or has no closing ``---`` line,
    the metadata is empty and the text is returned unchanged.
    """
    if not text.startswith("---"):
        return {}, text
    end = text.find("\n---", 3)
    if end == -1:
        return {}, text
    front = text[3:end].strip()
    # Skip past the closing "\n---" (4 characters).
    body = text[end + 4 :].strip()
    meta: dict = {}
    for line in front.splitlines():
        if ":" in line:
            key, _, value = line.partition(":")
            meta[key.strip()] = value.strip().strip('"').strip("'")
    return meta, body


def clean_markdown(text: str) -> str:
    """Strip markdown syntax so TTS reads clean prose.

    Code blocks, inline code and images are dropped entirely; links keep
    their text; heading, list, blockquote and emphasis markers are removed.
    """
    # Remove code blocks (``` ... ```)
    text = re.sub(r"```[\s\S]*?```", "", text)
    # Remove inline code
    text = re.sub(r"`[^`]+`", "", text)
    # Remove images
    text = re.sub(r"!\[.*?\]\(.*?\)", "", text)
    # Convert links to just the link text
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)
    # Remove ATX headings markers but keep text
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)
    # Line-level markers are removed BEFORE emphasis markers: otherwise the
    # "*" bullets of two consecutive list items are paired up and treated as
    # one emphasis span, leaving stray indentation behind.
    # Remove horizontal rules
    text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)
    # Remove blockquote markers
    text = re.sub(r"^>\s?", "", text, flags=re.MULTILINE)
    # Remove list markers (optionally indented, for nested lists)
    text = re.sub(r"^[ \t]*[\*\-\+]\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[ \t]*\d+\.\s+", "", text, flags=re.MULTILINE)
    # Remove bold/italic markers
    text = re.sub(r"\*{1,3}([^*]+)\*{1,3}", r"\1", text)
    text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
    # Collapse multiple blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _split_long_segment(segment: str, max_chars: int) -> list[str]:
    """Break one over-long segment into pieces of at most max_chars.

    Pieces are packed on word boundaries; a single word longer than
    max_chars is sliced. Runs of whitespace inside the segment become single
    spaces.
    """
    pieces: list[str] = []
    current = ""
    for word in segment.split():
        while len(word) > max_chars:
            # No boundary to split on: flush what we have and slice the word.
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:max_chars])
            word = word[max_chars:]
        if not word:
            continue
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_chars:
            current += " " + word
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def split_into_chunks(text: str, max_chars: int = 4000) -> list[str]:
    """Split text into chunks that fit within max_chars.

    Splits on paragraph boundaries first; falls back to sentence boundaries
    for paragraphs that are still too long, then to word boundaries (and, as
    a last resort, hard slicing) for sentences that are still too long. No
    text is ever discarded and no chunk exceeds max_chars.

    Raises:
        ValueError: if max_chars is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

    # Flatten into atomic segments (paragraphs, sentences, or word groups)
    segments: list[str] = []
    for para in paragraphs:
        if len(para) <= max_chars:
            segments.append(para)
            continue
        for sentence in re.split(r"(?<=[.!?])\s+", para):
            if not sentence.strip():
                continue
            if len(sentence) <= max_chars:
                segments.append(sentence)
            else:
                segments.extend(_split_long_segment(sentence, max_chars))

    # Merge segments greedily into chunks under max_chars
    chunks: list[str] = []
    current = ""
    for segment in segments:
        if not current:
            current = segment
        elif len(current) + 2 + len(segment) <= max_chars:
            current += "\n\n" + segment
        else:
            chunks.append(current)
            current = segment
    if current:
        chunks.append(current)
    return chunks


def merge_audio_chunks(chunks: list[bytes], output_path: str) -> None:
    """Concatenate MP3 byte chunks with a ~400ms silent pause between each.

    The chunks are joined by plain byte concatenation, relying on MP3 being a
    stream of frames. Silent frame: 128kbps/44100Hz frame header + null
    payload = 417 bytes. 16 frames * ~26.1ms each ≈ 418ms of silence.

    NOTE(review): the silent frames are fixed at 128kbps/44100Hz; this assumes
    players tolerate provider audio encoded with other parameters — confirm
    per provider.
    """
    silent_frame = b"\xff\xfb\x90\x00" + b"\x00" * 413  # 417 bytes
    silence = silent_frame * 16
    with open(output_path, "wb") as f:
        # join() puts the silence between chunks only, never after the last.
        f.write(silence.join(chunks))


# ---------------------------------------------------------------------------
# Provider helpers — return raw MP3 bytes for a single text chunk
# ---------------------------------------------------------------------------


def _post_json(url: str, payload: dict, headers: dict, provider: str) -> bytes:
    """POST payload as JSON and return the raw response body.

    HTTP error responses are re-raised as RuntimeError carrying the status
    code and the response body, so the provider's own error message reaches
    the user.
    """
    import json
    import urllib.error
    import urllib.request

    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json", **headers},
    )
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.read()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"{provider} API {exc.code}: {detail}") from exc


def _google_synthesize(text: str, api_key: str) -> bytes:
    """Synthesize one chunk with Google Cloud TTS and return the MP3 bytes."""
    import base64
    import json

    payload = {
        "input": {"text": text},
        "voice": {
            "languageCode": "nl-NL",
            "name": "nl-NL-Wavenet-D",
            "ssmlGender": "MALE",
        },
        "audioConfig": {"audioEncoding": "MP3"},
    }
    url = f"https://texttospeech.googleapis.com/v1/text:synthesize?key={api_key}"
    body = json.loads(_post_json(url, payload, {}, "Google"))
    # The audio arrives base64-encoded inside the JSON response.
    return base64.b64decode(body["audioContent"])


def _mistral_synthesize(text: str, api_key: str) -> bytes:
    """Synthesize one chunk with Mistral Voxtral TTS and return the MP3 bytes.

    Raises:
        EnvironmentError: if MISTRAL_VOICE_ID is not set.
        RuntimeError: if the API answers with an HTTP error.
    """
    import base64
    import json

    voice_id = os.environ.get("MISTRAL_VOICE_ID", "").strip()
    if not voice_id:
        raise EnvironmentError(
            "MISTRAL_VOICE_ID is not set. "
            "Create a voice at https://console.mistral.ai and add its ID to .env"
        )
    payload = {
        "model": "voxtral-mini-tts-2603",
        "input": text,
        "response_format": "mp3",
        "voice": voice_id,
    }
    raw = _post_json(
        "https://api.mistral.ai/v1/audio/speech",
        payload,
        {"Authorization": f"Bearer {api_key}"},
        "Mistral",
    )
    data = json.loads(raw)
    return base64.b64decode(data["audio_data"])


def _openai_synthesize(text: str, api_key: str) -> bytes:
    """Synthesize one chunk with OpenAI TTS and return the response bytes."""
    payload = {
        "model": "gpt-4o-mini-tts",
        "input": text,
        "voice": "ash",
        "instructions": "Read aloud in a warm and friendly tone.",
    }
    # The response body is the audio itself, not JSON.
    return _post_json(
        "https://api.openai.com/v1/audio/speech",
        payload,
        {"Authorization": f"Bearer {api_key}"},
        "OpenAI",
    )


# ---------------------------------------------------------------------------
# Public provider functions — chunk, call API per chunk, merge
# ---------------------------------------------------------------------------


def _synthesize_to_file(text: str, output_path: str, key_env: str, synthesize) -> None:
    """Chunk text, synthesize each chunk with `synthesize`, and write the merged MP3.

    `key_env` names the environment variable holding the API key;
    `synthesize` is called as synthesize(chunk, api_key) and must return bytes.

    Raises:
        EnvironmentError: if the API key variable is unset or empty.
    """
    api_key = os.environ.get(key_env)
    if not api_key:
        raise EnvironmentError(f"{key_env} is not set")
    chunks = split_into_chunks(text)
    audio_chunks: list[bytes] = []
    for i, chunk in enumerate(chunks, 1):
        print(f" Chunk {i}/{len(chunks)} ({len(chunk)} chars)...")
        audio_chunks.append(synthesize(chunk, api_key))
    merge_audio_chunks(audio_chunks, output_path)


def tts_google(text: str, slug: str, output_path: str) -> None:
    """Generate audio with Google Cloud TTS (free tier: 1M chars/month).

    `slug` is accepted for a uniform provider signature and is not used.
    """
    _synthesize_to_file(text, output_path, "GOOGLE_API_KEY", _google_synthesize)


def tts_mistral(text: str, slug: str, output_path: str) -> None:
    """Generate audio with Mistral Voxtral TTS (~$16/M chars).

    `slug` is accepted for a uniform provider signature and is not used.
    """
    _synthesize_to_file(text, output_path, "MISTRAL_API_KEY", _mistral_synthesize)


def tts_openai(text: str, slug: str, output_path: str) -> None:
    """Generate audio with OpenAI gpt-4o-mini-tts (~$15/M chars).

    `slug` is accepted for a uniform provider signature and is not used.
    """
    _synthesize_to_file(text, output_path, "OPENAI_API_KEY", _openai_synthesize)


PROVIDERS = {
    "google": tts_google,
    "mistral": tts_mistral,
    "openai": tts_openai,
}


def main() -> None:
    """Command-line entry point: read a blog post and write <slug>.mp3."""
    load_dotenv()

    parser = argparse.ArgumentParser(description="Generate TTS audio for a blog post")
    parser.add_argument("file", help="Path to the .md blog post")
    parser.add_argument(
        "--output-dir", default=".", help="Directory to write the .mp3 (default: .)"
    )
    args = parser.parse_args()

    md_path = args.file
    if not os.path.isfile(md_path):
        print(f"ERROR: file not found: {md_path}", file=sys.stderr)
        sys.exit(1)

    with open(md_path, encoding="utf-8") as f:
        raw = f.read()

    meta, body = parse_frontmatter(raw)
    slug = meta.get("slug") or os.path.splitext(os.path.basename(md_path))[0]
    title = meta.get("title", "")
    cleaned = clean_markdown(body)
    full_text = f"{title}.\n\n{cleaned}" if title else cleaned
    if not full_text.strip():
        # Without this check an empty .mp3 would be written and reported as saved.
        print(f"ERROR: no readable text found in: {md_path}", file=sys.stderr)
        sys.exit(1)

    # An empty TTS_PROVIDER (e.g. "TTS_PROVIDER=" in .env) means "use the default".
    provider_name = (os.environ.get("TTS_PROVIDER") or "google").strip().lower()
    if provider_name not in PROVIDERS:
        print(
            f"ERROR: unknown TTS_PROVIDER '{provider_name}'. Choose from: {', '.join(PROVIDERS)}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Create the output directory before any API call, so a missing directory
    # cannot waste a full (billed) synthesis run.
    try:
        os.makedirs(args.output_dir, exist_ok=True)
    except OSError as exc:
        print(f"ERROR: cannot create output directory: {exc}", file=sys.stderr)
        sys.exit(1)
    output_path = os.path.join(args.output_dir, f"{slug}.mp3")

    print(
        f"Generating audio for '{slug}' using provider '{provider_name}' ({len(full_text)} chars)..."
    )

    try:
        PROVIDERS[provider_name](full_text, slug, output_path)
    except Exception as exc:
        # Top-level boundary: report any provider/network failure and exit non-zero.
        print(f"ERROR: TTS generation failed: {exc}", file=sys.stderr)
        sys.exit(1)

    print(f"Saved: {output_path}")


if __name__ == "__main__":
    main()