music on atproto
plyr.fm
1#!/usr/bin/env python3
2"""generate audio from a podcast script using gemini TTS.
3
4usage:
5 uv run scripts/generate_tts.py podcast_script.txt output.wav
6
7requires GOOGLE_API_KEY environment variable.
8"""
9# /// script
10# requires-python = ">=3.11"
11# dependencies = ["google-genai"]
12# ///
13
14import io
15import os
16import sys
17import wave
18from pathlib import Path
19
20from google import genai
21from google.genai import types
22
23
def pcm_to_wav(
    pcm_data: bytes,
    sample_rate: int = 24000,
    channels: int = 1,
    sample_width: int = 2,
) -> bytes:
    """build an in-memory WAV file around headerless PCM samples.

    args:
        pcm_data: raw PCM audio bytes to store as the WAV frames.
        sample_rate: frame rate (Hz) recorded in the header.
        channels: channel count recorded in the header.
        sample_width: bytes per sample recorded in the header.

    returns:
        the complete WAV file contents as bytes.
    """
    sink = io.BytesIO()
    with wave.open(sink, "wb") as writer:
        # tuple order: (nchannels, sampwidth, framerate, nframes, comptype,
        # compname). nframes stays 0 so the wave module derives the real
        # frame count from the data it is given.
        writer.setparams(
            (channels, sample_width, sample_rate, 0, "NONE", "not compressed")
        )
        writer.writeframes(pcm_data)
    # closing the writer finalises the header but leaves our BytesIO open
    return sink.getvalue()
35
36
def _collect_audio(response) -> bytes:
    """return the raw audio bytes carried by a generate_content response.

    the google-genai response model declares candidates, content, parts,
    inline_data and data as optional, so each level is checked instead of
    being indexed blindly. only the first candidate is inspected.

    returns:
        the inline audio payload(s) of the first candidate joined together,
        or b"" when the response carries no audio.
    """
    candidates = response.candidates or []
    if not candidates:
        return b""
    content = candidates[0].content
    parts = (content.parts if content is not None else None) or []
    return b"".join(
        part.inline_data.data
        for part in parts
        if part.inline_data is not None and part.inline_data.data
    )


def main() -> None:
    """CLI entry point: read a podcast script and write it out as a WAV file.

    expects exactly two command-line arguments (script file, output file) and
    the GOOGLE_API_KEY environment variable. the script text is sent to the
    gemini TTS model with two configured speakers ("Host" and "Cohost"), and
    the returned PCM audio is wrapped in a WAV header and written to the
    output path.

    exits with status 1 (message on stderr) when the arguments are wrong, the
    script file is missing, the API key is unset, or the response contains no
    audio.
    """
    if len(sys.argv) != 3:
        print("usage: generate_tts.py <script_file> <output_file>", file=sys.stderr)
        sys.exit(1)

    script_path = Path(sys.argv[1])
    output_path = Path(sys.argv[2])

    # is_file() also rejects directories, which exists() would let through
    # only to crash later in read_text()
    if not script_path.is_file():
        print(f"error: {script_path} not found", file=sys.stderr)
        sys.exit(1)

    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print("error: GOOGLE_API_KEY not set", file=sys.stderr)
        sys.exit(1)

    # decode explicitly as UTF-8 instead of relying on the locale default
    script = script_path.read_text(encoding="utf-8")
    print(f"generating audio from {script_path} ({len(script)} chars)")

    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=script,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
                    speaker_voice_configs=[
                        types.SpeakerVoiceConfig(
                            speaker="Host",
                            voice_config=types.VoiceConfig(
                                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                    voice_name="Kore"
                                )
                            ),
                        ),
                        types.SpeakerVoiceConfig(
                            speaker="Cohost",
                            voice_config=types.VoiceConfig(
                                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                    voice_name="Puck"
                                )
                            ),
                        ),
                    ]
                )
            ),
        ),
    )

    # gemini returns raw PCM (audio/L16;codec=pcm;rate=24000), wrap in WAV header
    pcm_data = _collect_audio(response)
    if not pcm_data:
        # fail with a readable message rather than writing an empty WAV file
        print("error: response contained no audio data", file=sys.stderr)
        sys.exit(1)

    wav_data = pcm_to_wav(pcm_data)
    output_path.write_bytes(wav_data)
    print(f"saved audio to {output_path} ({len(wav_data)} bytes)")
93
94
# run the CLI only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()