Miscellaneous code and documentation updates

This commit is contained in:
2026-04-05 16:27:42 +02:00
parent 00cff1eb7e
commit a4191658c5
32 changed files with 346 additions and 105 deletions
+198 -77
View File
@@ -6,8 +6,8 @@ Usage:
python scripts/tts_generate.py src/content/blog/my-post.md
Environment variables:
TTS_PROVIDER - "google" (default), "mistral", or "openai"
GOOGLE_API_KEY - Required when TTS_PROVIDER=google
MISTRAL_API_KEY - Required when TTS_PROVIDER=mistral
OPENAI_API_KEY - Required when TTS_PROVIDER=openai
@@ -21,6 +21,23 @@ import re
import sys
def load_dotenv() -> None:
    """Load .env from the project root into os.environ (stdlib only).

    Existing environment variables are never overwritten; blank lines,
    comment lines, and lines without an '=' are silently ignored.
    Surrounding single or double quotes are stripped from values.
    """
    root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    dotenv_path = os.path.join(root, ".env")
    if not os.path.isfile(dotenv_path):
        return
    with open(dotenv_path, encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            name, _, raw_value = entry.partition("=")
            os.environ.setdefault(
                name.strip(),
                raw_value.strip().strip('"').strip("'"),
            )
def parse_frontmatter(text: str) -> tuple[dict, str]:
"""Extract YAML frontmatter and return (metadata_dict, body)."""
if not text.startswith("---"):
@@ -31,7 +48,7 @@ def parse_frontmatter(text: str) -> tuple[dict, str]:
return {}, text
front = text[3:end].strip()
body = text[end + 4:].strip()
body = text[end + 4 :].strip()
meta: dict = {}
for line in front.splitlines():
@@ -69,99 +86,195 @@ def clean_markdown(text: str) -> str:
return text.strip()
def tts_google(text: str, slug: str, output_path: str) -> None:
"""Generate audio with Google Cloud TTS (free tier: 1M chars/month)."""
def split_into_chunks(text: str, max_chars: int = 4000) -> list[str]:
    """Split text into chunks that each fit within max_chars.

    Splits on paragraph boundaries first, falls back to sentence
    boundaries for paragraphs that are still too long, and finally
    hard-wraps any single sentence that exceeds max_chars (previously
    such a sentence passed through unsplit, producing a chunk larger
    than the API limit). No text is ever discarded.

    Args:
        text: The full text to split.
        max_chars: Maximum length of each returned chunk.

    Returns:
        A list of non-empty chunks, each at most max_chars characters.
    """
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]

    # Flatten into atomic segments (paragraphs, sentences, or raw slices),
    # each guaranteed to fit within max_chars.
    segments: list[str] = []
    for para in paragraphs:
        if len(para) <= max_chars:
            segments.append(para)
            continue
        for sentence in re.split(r"(?<=[.!?])\s+", para):
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(sentence) <= max_chars:
                segments.append(sentence)
            else:
                # Pathological sentence longer than max_chars: hard-wrap so
                # no chunk can ever exceed the provider's size limit.
                segments.extend(
                    sentence[i : i + max_chars]
                    for i in range(0, len(sentence), max_chars)
                )

    # Merge segments greedily into chunks under max_chars (the +2 accounts
    # for the "\n\n" separator re-inserted between merged segments).
    chunks: list[str] = []
    current = ""
    for segment in segments:
        if not current:
            current = segment
        elif len(current) + 2 + len(segment) <= max_chars:
            current += "\n\n" + segment
        else:
            chunks.append(current)
            current = segment
    if current:
        chunks.append(current)
    return chunks
def merge_audio_chunks(chunks: list[bytes], output_path: str) -> None:
    """Write MP3 byte chunks to output_path with a ~400ms pause between each.

    MP3 frames are self-contained, so byte concatenation produces a valid
    file. The silent separator is 16 copies of a 417-byte frame (a
    128kbps/44100Hz frame header followed by a null payload); at ~26.1ms
    per frame that is roughly 418ms of silence.
    """
    silent_frame = b"\xff\xfb\x90\x00" + b"\x00" * 413  # 417 bytes
    pause = silent_frame * 16
    with open(output_path, "wb") as out:
        out.write(pause.join(chunks))
# ---------------------------------------------------------------------------
# Provider helpers — return raw MP3 bytes for a single text chunk
# ---------------------------------------------------------------------------
def _google_synthesize(text: str, api_key: str) -> bytes:
    """Synthesize one text chunk via Google Cloud TTS; return MP3 bytes.

    Uses the Dutch WaveNet voice nl-NL-Wavenet-D and decodes the
    base64-encoded "audioContent" field of the JSON response.
    """
    import base64
    import json
    import urllib.request

    request_body = {
        "input": {"text": text},
        "voice": {
            "languageCode": "nl-NL",
            "name": "nl-NL-Wavenet-D",
            "ssmlGender": "MALE",
        },
        "audioConfig": {"audioEncoding": "MP3"},
    }
    request = urllib.request.Request(
        "https://texttospeech.googleapis.com/v1/text:synthesize?key=" + api_key,
        data=json.dumps(request_body).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request) as resp:
        payload = json.loads(resp.read())
    return base64.b64decode(payload["audioContent"])
def _mistral_synthesize(text: str, api_key: str) -> bytes:
    """Synthesize one text chunk via the Mistral speech API; return MP3 bytes.

    Requires MISTRAL_VOICE_ID in the environment (a voice created in the
    Mistral console). On an HTTP error, re-raises as RuntimeError carrying
    the server's response body for easier debugging.
    """
    import base64
    import json
    import urllib.error
    import urllib.request

    voice_id = os.environ.get("MISTRAL_VOICE_ID", "").strip()
    if not voice_id:
        raise EnvironmentError(
            "MISTRAL_VOICE_ID is not set. "
            "Create a voice at https://console.mistral.ai and add its ID to .env"
        )

    payload: dict = {
        "model": "voxtral-mini-tts-2603",
        "input": text,
        "response_format": "mp3",
        "voice": voice_id,
    }
    request = urllib.request.Request(
        "https://api.mistral.ai/v1/audio/speech",
        data=json.dumps(payload).encode(),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
    )
    try:
        with urllib.request.urlopen(request) as resp:
            parsed = json.loads(resp.read())
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Mistral API {exc.code}: {detail}") from exc
    return base64.b64decode(parsed["audio_data"])
def _openai_synthesize(text: str, api_key: str) -> bytes:
    """Synthesize one text chunk via the OpenAI speech API; return MP3 bytes.

    Unlike the other providers, this endpoint returns the raw audio bytes
    directly (no JSON envelope, no base64 decoding needed).
    """
    import json
    import urllib.request

    body = {
        "model": "gpt-4o-mini-tts",
        "input": text,
        "voice": "ash",
        "instructions": "Read aloud in a warm and friendly tone.",
    }
    request = urllib.request.Request(
        "https://api.openai.com/v1/audio/speech",
        data=json.dumps(body).encode(),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
    )
    with urllib.request.urlopen(request) as resp:
        return resp.read()
# ---------------------------------------------------------------------------
# Public provider functions — chunk, call API per chunk, merge
# ---------------------------------------------------------------------------
def tts_google(text: str, slug: str, output_path: str) -> None:
    """Generate audio with Google Cloud TTS (free tier: 1M chars/month).

    Splits the text into API-sized chunks, synthesizes each via
    _google_synthesize, and merges the resulting MP3 chunks into
    output_path. `slug` is unused here but kept for a uniform provider
    signature.

    Raises:
        EnvironmentError: If GOOGLE_API_KEY is not set.
    """
    # NOTE(review): the diff rendering had left the old inline request body
    # interleaved here (duplicated payload, FEMALE gender contradicting the
    # helper's MALE voice); this is the clean post-refactor implementation.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise EnvironmentError("GOOGLE_API_KEY is not set")
    chunks = split_into_chunks(text)
    audio_chunks: list[bytes] = []
    for i, chunk in enumerate(chunks, 1):
        print(f" Chunk {i}/{len(chunks)} ({len(chunk)} chars)...")
        audio_chunks.append(_google_synthesize(chunk, api_key))
    merge_audio_chunks(audio_chunks, output_path)
def tts_mistral(text: str, slug: str, output_path: str) -> None:
    """Generate audio with Mistral Voxtral TTS (~$16/M chars).

    Splits the text into API-sized chunks, synthesizes each via
    _mistral_synthesize, and merges the resulting MP3 chunks into
    output_path. `slug` is unused here but kept for a uniform provider
    signature.

    Raises:
        EnvironmentError: If MISTRAL_API_KEY is not set.
    """
    # NOTE(review): the diff rendering had left the old inline request body
    # interleaved here (stale model/voice, no chunking); this is the clean
    # post-refactor implementation.
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise EnvironmentError("MISTRAL_API_KEY is not set")
    chunks = split_into_chunks(text)
    audio_chunks: list[bytes] = []
    for i, chunk in enumerate(chunks, 1):
        print(f" Chunk {i}/{len(chunks)} ({len(chunk)} chars)...")
        audio_chunks.append(_mistral_synthesize(chunk, api_key))
    merge_audio_chunks(audio_chunks, output_path)
def tts_openai(text: str, slug: str, output_path: str) -> None:
    """Generate audio with OpenAI TTS (~$15/M chars).

    Splits the text into API-sized chunks, synthesizes each via
    _openai_synthesize, and merges the resulting MP3 chunks into
    output_path. `slug` is unused here but kept for a uniform provider
    signature.

    Raises:
        EnvironmentError: If OPENAI_API_KEY is not set.
    """
    # NOTE(review): the diff rendering had left the old inline request body
    # interleaved here (stale tts-1/nova payload, no chunking); this is the
    # clean post-refactor implementation.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise EnvironmentError("OPENAI_API_KEY is not set")
    chunks = split_into_chunks(text)
    audio_chunks: list[bytes] = []
    for i, chunk in enumerate(chunks, 1):
        print(f" Chunk {i}/{len(chunks)} ({len(chunk)} chars)...")
        audio_chunks.append(_openai_synthesize(chunk, api_key))
    merge_audio_chunks(audio_chunks, output_path)
PROVIDERS = {
@@ -172,9 +285,13 @@ PROVIDERS = {
def main() -> None:
load_dotenv()
parser = argparse.ArgumentParser(description="Generate TTS audio for a blog post")
parser.add_argument("file", help="Path to the .md blog post")
parser.add_argument("--output-dir", default=".", help="Directory to write the .mp3 (default: .)")
parser.add_argument(
"--output-dir", default=".", help="Directory to write the .mp3 (default: .)"
)
args = parser.parse_args()
md_path = args.file
@@ -190,17 +307,21 @@ def main() -> None:
slug = meta.get("slug") or os.path.splitext(os.path.basename(md_path))[0]
title = meta.get("title", "")
# Prepend title so TTS reads it aloud
full_text = f"{title}.\n\n{clean_markdown(body)}" if title else clean_markdown(body)
provider_name = os.environ.get("TTS_PROVIDER", "google").lower()
if provider_name not in PROVIDERS:
print(f"ERROR: unknown TTS_PROVIDER '{provider_name}'. Choose from: {', '.join(PROVIDERS)}", file=sys.stderr)
print(
f"ERROR: unknown TTS_PROVIDER '{provider_name}'. Choose from: {', '.join(PROVIDERS)}",
file=sys.stderr,
)
sys.exit(1)
output_path = os.path.join(args.output_dir, f"{slug}.mp3")
print(f"Generating audio for '{slug}' using provider '{provider_name}' ({len(full_text)} chars)...")
print(
f"Generating audio for '{slug}' using provider '{provider_name}' ({len(full_text)} chars)..."
)
try:
PROVIDERS[provider_name](full_text, slug, output_path)