# Cozy-Den/scripts/tts_generate.py

#!/usr/bin/env python3
"""
Blog Text-to-Speech generator.

Usage:
    python scripts/tts_generate.py src/content/blog/my-post.md

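Environment variables (a .env file in the project root is loaded as well;
variables that are already set take precedence):
    TTS_PROVIDER     - "google" (default), "mistral", or "openai"
    GOOGLE_API_KEY   - Required when TTS_PROVIDER=google
    MISTRAL_API_KEY  - Required when TTS_PROVIDER=mistral
    MISTRAL_VOICE_ID - Required when TTS_PROVIDER=mistral (ID of the voice to use)
    OPENAI_API_KEY   - Required when TTS_PROVIDER=openai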

Output:
    <slug>.mp3 in the current working directory (or --output-dir if specified)
"""

import argparse
import os
import re
import sys


def load_dotenv() -> None:
    """Populate ``os.environ`` from the project's ``.env`` file, if there is one.

    The file is looked up one directory above the directory that contains this
    script. Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Every other line is split at its first ``=``; surrounding
    whitespace and quote characters are trimmed from the value. Only the
    standard library is used, and variables that already exist in the
    environment are never overwritten.
    """
    scripts_dir = os.path.dirname(os.path.abspath(__file__))
    dotenv_file = os.path.join(os.path.dirname(scripts_dir), ".env")
    if not os.path.isfile(dotenv_file):
        return

    with open(dotenv_file, encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            is_assignment = bool(entry) and not entry.startswith("#") and "=" in entry
            if not is_assignment:
                continue
            name, _, raw_value = entry.partition("=")
            unquoted = raw_value.strip().strip('"').strip("'")
            # setdefault keeps whatever value the real environment already holds.
            os.environ.setdefault(name.strip(), unquoted)


def parse_frontmatter(text: str) -> tuple[dict, str]:
    """Split a Markdown document into front matter metadata and body.

    Only a flat ``key: value`` subset of YAML is understood: each front matter
    line is split at its first ``:``. Lines without a colon and YAML comment
    lines (starting with ``#``) are skipped. A value wrapped in one matching
    pair of single or double quotes has exactly that pair removed, so quotes
    that belong to the value itself (``"The 'best'"``) are preserved.

    Args:
        text: Full file contents.

    Returns:
        ``(metadata, body)``. When *text* does not start with ``---`` or the
        closing ``---`` line is missing, ``({}, text)`` is returned with
        *text* untouched; otherwise the body is stripped of surrounding
        whitespace.
    """
    if not text.startswith("---"):
        return {}, text

    # Search from offset 3 so the opening delimiter cannot match itself.
    end = text.find("\n---", 3)
    if end == -1:
        return {}, text

    front = text[3:end].strip()
    body = text[end + 4 :].strip()  # +4 skips the "\n---" closing delimiter

    meta: dict = {}
    for line in front.splitlines():
        if line.lstrip().startswith("#") or ":" not in line:
            continue
        key, _, value = line.partition(":")
        value = value.strip()
        # Remove one *matching* pair of quotes only; stripping both quote
        # characters unconditionally would eat quotes inside the value.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        meta[key.strip()] = value

    return meta, body


def clean_markdown(text: str) -> str:
    """Strip Markdown syntax so a TTS engine reads clean prose.

    Fenced code blocks, inline code and images are removed entirely; links
    are reduced to their link text. Line-level markers (blockquotes,
    horizontal rules, headings, list bullets and numbers) are removed
    *before* inline emphasis so that a ``*`` list bullet can never be paired
    with a ``*`` emphasis marker later on the same line. Emphasis markers
    must hug their content (no whitespace just inside the markers) and
    ``_`` markers must not sit inside a word, so ``2 * 3`` and
    ``snake_case`` are left alone.

    Args:
        text: Markdown body (without front matter).

    Returns:
        The cleaned text with runs of blank lines collapsed and surrounding
        whitespace stripped.
    """
    # --- Constructs that are dropped or reduced wholesale -----------------
    text = re.sub(r"```[\s\S]*?```", "", text)  # fenced code blocks
    text = re.sub(r"`[^`]+`", "", text)  # inline code
    text = re.sub(r"!\[.*?\]\(.*?\)", "", text)  # images
    text = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", text)  # links -> link text

    # --- Line-level markers (outermost first) ------------------------------
    # Blockquote markers go first so a quoted heading or list item is then
    # seen at the start of its line; nested ">>" is removed in one pass.
    text = re.sub(r"^(?:>[ \t]?)+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[-*_]{3,}\s*$", "", text, flags=re.MULTILINE)  # rules
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)  # ATX headings
    text = re.sub(r"^[ \t]*[\*\-\+]\s+", "", text, flags=re.MULTILINE)  # bullets
    text = re.sub(r"^[ \t]*\d+\.\s+", "", text, flags=re.MULTILINE)  # numbered

    # --- Inline emphasis ----------------------------------------------------
    # The lookarounds require non-space characters directly inside the
    # markers, which keeps arithmetic such as "2 * 3 * 4" intact.
    text = re.sub(r"\*{1,3}(?=\S)([^*]+)(?<=\S)\*{1,3}", r"\1", text)
    # Underscore emphasis additionally must not be intraword.
    text = re.sub(
        r"(?<!\w)_{1,3}(?=\S)([^_]+)(?<=\S)_{1,3}(?!\w)", r"\1", text
    )

    # Collapse multiple blank lines left behind by the removals above.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def _hard_wrap(piece: str, max_chars: int) -> list[str]:
    """Break *piece* into parts of at most *max_chars*, preferring word gaps."""
    parts: list[str] = []
    line = ""
    for word in piece.split():
        # A single "word" longer than the limit has no better split point
        # than a fixed-width cut.
        while len(word) > max_chars:
            if line:
                parts.append(line)
                line = ""
            parts.append(word[:max_chars])
            word = word[max_chars:]
        if not line:
            line = word
        elif len(line) + 1 + len(word) <= max_chars:
            line += " " + word
        else:
            parts.append(line)
            line = word
    if line:
        parts.append(line)
    return parts


def split_into_chunks(text: str, max_chars: int = 4000) -> list[str]:
    """Split *text* into chunks of at most *max_chars* characters.

    Paragraphs (separated by a blank line) are kept whole when they fit.
    A paragraph that is too long is split at sentence boundaries, and a
    sentence that is still too long is wrapped at word boundaries (and, as a
    last resort, cut at a fixed width), so every returned chunk is guaranteed
    to respect the limit. No words are discarded; only the whitespace inside
    an over-long sentence is normalized to single spaces.

    When pieces are merged back into a chunk, whole paragraphs are joined
    with a blank line, while pieces that come from the same paragraph are
    joined with a single space so no artificial paragraph break is
    introduced mid-paragraph.

    Args:
        text: The prose to split.
        max_chars: Maximum length of a chunk; must be at least 1.

    Returns:
        The chunks in reading order; an empty list for blank input.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

    # Flatten into (joiner, piece) pairs; the joiner is what goes in front of
    # the piece when it is appended to a chunk that already has content.
    segments: list[tuple[str, str]] = []
    for para in paragraphs:
        if len(para) <= max_chars:
            segments.append(("\n\n", para))
            continue
        joiner = "\n\n"  # the first piece of a paragraph starts a new paragraph
        for sentence in re.split(r"(?<=[.!?])\s+", para):
            if not sentence.strip():
                continue
            if len(sentence) <= max_chars:
                pieces = [sentence]
            else:
                pieces = _hard_wrap(sentence, max_chars)
            for piece in pieces:
                segments.append((joiner, piece))
                joiner = " "  # later pieces continue the same paragraph

    # Merge segments greedily into chunks that stay within max_chars.
    chunks: list[str] = []
    current = ""
    for joiner, piece in segments:
        if not current:
            current = piece
        elif len(current) + len(joiner) + len(piece) <= max_chars:
            current += joiner + piece
        else:
            chunks.append(current)
            current = piece
    if current:
        chunks.append(current)

    return chunks


def merge_audio_chunks(chunks: list[bytes], output_path: str) -> None:
    """Write all audio chunks to *output_path*, separated by a short silence.

    The chunks are concatenated byte-for-byte; between every two consecutive
    chunks a block of 16 identical 417-byte frames is inserted. Each frame is
    the 4-byte header ``FF FB 90 00`` followed by zero bytes. According to the
    original author's notes this is a 128 kbps / 44.1 kHz MP3 frame of
    silence, i.e. roughly 418 ms in total.

    NOTE(review): plain concatenation relies on the provider output being a
    raw MP3 stream that tolerates these inserted frames — confirm per
    provider (sample rate, leading tags).

    Args:
        chunks: Encoded audio for each text chunk, in playback order.
        output_path: File to create or overwrite.
    """
    frame_header = b"\xff\xfb\x90\x00"
    frame_size = 417
    quiet_frame = frame_header + bytes(frame_size - len(frame_header))
    gap = quiet_frame * 16

    # bytes.join places the gap between chunks only, never after the last one.
    with open(output_path, "wb") as sink:
        sink.write(gap.join(chunks))


# ---------------------------------------------------------------------------
# Provider helpers — return raw MP3 bytes for a single text chunk
# ---------------------------------------------------------------------------


def _google_synthesize(text: str, api_key: str) -> bytes:
    """Synthesize one text chunk with Google Cloud Text-to-Speech.

    The request uses the fixed Dutch voice ``nl-NL-Wavenet-D`` and asks for
    MP3 output. The API key is sent in the ``X-Goog-Api-Key`` header rather
    than in the URL so it does not end up in URLs that get logged.

    Args:
        text: Plain text to speak.
        api_key: Google API key.

    Returns:
        The decoded audio bytes from the response's ``audioContent`` field.

    Raises:
        RuntimeError: If the API answers with an HTTP error status (the
            response body is included in the message) or the response has
            no ``audioContent`` field.
    """
    import base64
    import json
    import urllib.error
    import urllib.request

    payload = json.dumps(
        {
            "input": {"text": text},
            "voice": {
                "languageCode": "nl-NL",
                "name": "nl-NL-Wavenet-D",
                "ssmlGender": "MALE",
            },
            "audioConfig": {"audioEncoding": "MP3"},
        }
    ).encode()

    req = urllib.request.Request(
        "https://texttospeech.googleapis.com/v1/text:synthesize",
        data=payload,
        headers={
            "Content-Type": "application/json",
            "X-Goog-Api-Key": api_key,
        },
    )
    try:
        # A timeout keeps a stalled connection from hanging the script forever.
        with urllib.request.urlopen(req, timeout=120) as resp:
            body = json.loads(resp.read())
    except urllib.error.HTTPError as exc:
        # Surface the API's own explanation, matching the Mistral helper.
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Google TTS API {exc.code}: {detail}") from exc

    try:
        encoded_audio = body["audioContent"]
    except (KeyError, TypeError) as exc:
        raise RuntimeError(
            "Google TTS API response did not contain 'audioContent'"
        ) from exc
    return base64.b64decode(encoded_audio)


def _mistral_synthesize(text: str, api_key: str) -> bytes:
    """Synthesize one text chunk through Mistral's ``/v1/audio/speech`` endpoint.

    The voice is taken from the ``MISTRAL_VOICE_ID`` environment variable and
    the request asks for MP3 output from the ``voxtral-mini-tts-2603`` model.

    Args:
        text: Plain text to speak.
        api_key: Mistral API key, sent as a bearer token.

    Returns:
        The base64-decoded contents of the response's ``audio_data`` field.

    Raises:
        EnvironmentError: If ``MISTRAL_VOICE_ID`` is unset or blank.
        RuntimeError: If the API answers with an HTTP error status; the
            response body is included in the message.
    """
    import base64
    import json
    import urllib.error
    import urllib.request

    voice = os.environ.get("MISTRAL_VOICE_ID", "").strip()
    if not voice:
        raise EnvironmentError(
            "MISTRAL_VOICE_ID is not set. "
            "Create a voice at https://console.mistral.ai and add its ID to .env"
        )

    request_json = json.dumps(
        {
            "model": "voxtral-mini-tts-2603",
            "input": text,
            "response_format": "mp3",
            "voice": voice,
        }
    )
    http_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    request = urllib.request.Request(
        "https://api.mistral.ai/v1/audio/speech",
        data=request_json.encode(),
        headers=http_headers,
    )

    try:
        with urllib.request.urlopen(request) as response:
            decoded = json.loads(response.read())
            return base64.b64decode(decoded["audio_data"])
    except urllib.error.HTTPError as http_err:
        # The error body carries the API's explanation; keep it in the message.
        message = http_err.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Mistral API {http_err.code}: {message}") from http_err


def _openai_synthesize(text: str, api_key: str) -> bytes:
    """Synthesize one text chunk through OpenAI's ``/v1/audio/speech`` endpoint.

    Uses the ``gpt-4o-mini-tts`` model with the ``ash`` voice. MP3 output is
    requested explicitly because the chunks are later concatenated as MP3.

    Args:
        text: Plain text to speak.
        api_key: OpenAI API key, sent as a bearer token.

    Returns:
        The raw response body, i.e. the encoded audio.

    Raises:
        RuntimeError: If the API answers with an HTTP error status; the
            response body is included in the message.
    """
    import json
    import urllib.error
    import urllib.request

    payload = json.dumps(
        {
            "model": "gpt-4o-mini-tts",
            "input": text,
            "voice": "ash",
            "instructions": "Read aloud in a warm and friendly tone.",
            "response_format": "mp3",
        }
    ).encode()

    req = urllib.request.Request(
        "https://api.openai.com/v1/audio/speech",
        data=payload,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
    )
    try:
        # A timeout keeps a stalled connection from hanging the script forever.
        with urllib.request.urlopen(req, timeout=120) as resp:
            return resp.read()
    except urllib.error.HTTPError as exc:
        # Surface the API's own explanation, matching the Mistral helper.
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"OpenAI API {exc.code}: {detail}") from exc


# ---------------------------------------------------------------------------
# Public provider functions — chunk, call API per chunk, merge
# ---------------------------------------------------------------------------


def tts_google(text: str, slug: str, output_path: str) -> None:
    """Render *text* to a single MP3 file using Google Cloud TTS.

    The text is split into chunks, each chunk is synthesized separately
    (progress is printed per chunk) and the results are merged into
    *output_path*. The original author noted a free tier of 1M chars/month —
    verify against current pricing.

    Args:
        text: Cleaned prose to read aloud.
        slug: Post identifier; not used by this function.
        output_path: Destination of the merged MP3.

    Raises:
        EnvironmentError: If ``GOOGLE_API_KEY`` is unset or empty.
    """
    key = os.environ.get("GOOGLE_API_KEY")
    if not key:
        raise EnvironmentError("GOOGLE_API_KEY is not set")

    pieces = split_into_chunks(text)
    total = len(pieces)
    rendered: list[bytes] = []
    for position, piece in enumerate(pieces, start=1):
        print(f"  Chunk {position}/{total} ({len(piece)} chars)...")
        rendered.append(_google_synthesize(piece, key))

    merge_audio_chunks(rendered, output_path)


def tts_mistral(text: str, slug: str, output_path: str) -> None:
    """Render *text* to a single MP3 file using Mistral Voxtral TTS.

    The text is split into chunks, each chunk is synthesized separately
    (progress is printed per chunk) and the results are merged into
    *output_path*. The original author noted a price of ~$16/M chars —
    verify against current pricing.

    Args:
        text: Cleaned prose to read aloud.
        slug: Post identifier; not used by this function.
        output_path: Destination of the merged MP3.

    Raises:
        EnvironmentError: If ``MISTRAL_API_KEY`` is unset or empty.
    """
    token = os.environ.get("MISTRAL_API_KEY")
    if not token:
        raise EnvironmentError("MISTRAL_API_KEY is not set")

    parts = split_into_chunks(text)
    count = len(parts)
    audio: list[bytes] = []
    for number, part in enumerate(parts, 1):
        progress = f"  Chunk {number}/{count} ({len(part)} chars)..."
        print(progress)
        audio.append(_mistral_synthesize(part, token))

    merge_audio_chunks(audio, output_path)


def tts_openai(text: str, slug: str, output_path: str) -> None:
    """Render *text* to a single MP3 file using OpenAI's speech API.

    The text is split into chunks, each chunk is synthesized separately
    (progress is printed per chunk) and the results are merged into
    *output_path*. The original author noted a price of ~$15/M chars —
    verify against current pricing.

    Args:
        text: Cleaned prose to read aloud.
        slug: Post identifier; not used by this function.
        output_path: Destination of the merged MP3.

    Raises:
        EnvironmentError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    secret = os.environ.get("OPENAI_API_KEY")
    if not secret:
        raise EnvironmentError("OPENAI_API_KEY is not set")

    blocks = split_into_chunks(text)
    n_blocks = len(blocks)
    clips: list[bytes] = []
    for ordinal, block in enumerate(blocks, start=1):
        print(f"  Chunk {ordinal}/{n_blocks} ({len(block)} chars)...")
        clip = _openai_synthesize(block, secret)
        clips.append(clip)

    merge_audio_chunks(clips, output_path)


# Maps each accepted TTS_PROVIDER value to the function that generates the
# audio; every entry is called as provider(text, slug, output_path).
PROVIDERS = dict(
    google=tts_google,
    mistral=tts_mistral,
    openai=tts_openai,
)


def main() -> None:
    """Command-line entry point: turn one Markdown blog post into an MP3.

    Steps: load ``.env``, parse the command line, read the post, split off
    the front matter, clean the Markdown body, pick the provider named by
    ``TTS_PROVIDER`` (default ``google``) and write ``<slug>.mp3`` into the
    output directory. The slug comes from the front matter's ``slug`` key or,
    failing that, from the file name. The front matter ``title``, when
    present, is read aloud before the body.

    Every failure (missing input file, nothing to read, unknown provider,
    output directory that cannot be created, provider error) is reported on
    stderr and ends the process with exit status 1.
    """
    load_dotenv()

    parser = argparse.ArgumentParser(description="Generate TTS audio for a blog post")
    parser.add_argument("file", help="Path to the .md blog post")
    parser.add_argument(
        "--output-dir", default=".", help="Directory to write the .mp3 (default: .)"
    )
    args = parser.parse_args()

    md_path = args.file
    if not os.path.isfile(md_path):
        print(f"ERROR: file not found: {md_path}", file=sys.stderr)
        sys.exit(1)

    with open(md_path, encoding="utf-8") as f:
        raw = f.read()

    meta, body = parse_frontmatter(raw)

    slug = meta.get("slug") or os.path.splitext(os.path.basename(md_path))[0]
    title = meta.get("title", "").strip()
    prose = clean_markdown(body)

    # End the title with a full stop so the TTS engine pauses after it, but
    # do not produce "What now?." when it already ends in punctuation.
    if title and title[-1] not in ".!?:;…":
        title += "."
    full_text = "\n\n".join(part for part in (title, prose) if part)

    # Without this guard an empty post would yield a 0-byte .mp3 that is
    # still reported as "Saved".
    if not full_text:
        print(f"ERROR: nothing to read aloud in: {md_path}", file=sys.stderr)
        sys.exit(1)

    provider_name = os.environ.get("TTS_PROVIDER", "google").strip().lower()
    if provider_name not in PROVIDERS:
        print(
            f"ERROR: unknown TTS_PROVIDER '{provider_name}'. Choose from: {', '.join(PROVIDERS)}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Create the output directory up front: discovering that it is missing
    # only after every chunk has been synthesized would waste the API calls.
    if args.output_dir:
        try:
            os.makedirs(args.output_dir, exist_ok=True)
        except OSError as exc:
            print(
                f"ERROR: cannot create output directory '{args.output_dir}': {exc}",
                file=sys.stderr,
            )
            sys.exit(1)

    output_path = os.path.join(args.output_dir, f"{slug}.mp3")

    print(
        f"Generating audio for '{slug}' using provider '{provider_name}' ({len(full_text)} chars)..."
    )

    # Top-level boundary: any provider failure becomes a clean CLI error.
    try:
        PROVIDERS[provider_name](full_text, slug, output_path)
    except Exception as exc:
        print(f"ERROR: TTS generation failed: {exc}", file=sys.stderr)
        sys.exit(1)

    print(f"Saved: {output_path}")


if __name__ == "__main__":
    main()