at main 3.0 kB view raw
#!/usr/bin/env python3
"""generate audio from a podcast script using gemini TTS.

usage:
    uv run scripts/generate_tts.py podcast_script.txt output.wav

requires GOOGLE_API_KEY environment variable.
"""
# /// script
# requires-python = ">=3.11"
# dependencies = ["google-genai"]
# ///

import io
import os
import sys
import wave
from pathlib import Path

from google import genai
from google.genai import types


def pcm_to_wav(
    pcm_data: bytes, sample_rate: int = 24000, channels: int = 1, sample_width: int = 2
) -> bytes:
    """wrap raw PCM data in a WAV header.

    args:
        pcm_data: raw PCM frames to store unchanged in the WAV data chunk.
        sample_rate: frames per second written to the header.
        channels: number of interleaved channels written to the header.
        sample_width: bytes per sample written to the header.

    returns:
        the bytes of a complete WAV file (header followed by pcm_data).
    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav:
        wav.setnchannels(channels)
        wav.setsampwidth(sample_width)
        wav.setframerate(sample_rate)
        wav.writeframes(pcm_data)
    # read the buffer only after the writer is closed so the header is final
    return buffer.getvalue()


def _extract_pcm(response: types.GenerateContentResponse) -> bytes:
    """collect the raw audio bytes from the first candidate of a response.

    every link in candidates -> content -> parts -> inline_data -> data may be
    missing (for example when the request is blocked or the model answers with
    text only), so each one is checked instead of being indexed blindly.

    returns:
        the inline data of all audio parts joined in order, or b"" when the
        response carries no audio.
    """
    candidates = response.candidates or []
    if not candidates:
        return b""
    content = candidates[0].content
    parts = (content.parts if content else None) or []
    return b"".join(
        part.inline_data.data
        for part in parts
        if part.inline_data and part.inline_data.data
    )


def main() -> None:
    """read a podcast script, synthesize it with gemini TTS, save a WAV file.

    expects exactly two command-line arguments: the script file to read and
    the output file to write. any usage, input, or response problem is
    reported on stderr and the process exits with status 1.
    """
    if len(sys.argv) != 3:
        sys.exit("usage: generate_tts.py <script_file> <output_file>")

    script_path = Path(sys.argv[1])
    output_path = Path(sys.argv[2])

    # is_file() also rejects directories, which exists() would let through
    if not script_path.is_file():
        sys.exit(f"error: {script_path} not found")

    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        sys.exit("error: GOOGLE_API_KEY not set")

    # decode explicitly as UTF-8 so the result does not depend on the locale
    script = script_path.read_text(encoding="utf-8")
    if not script.strip():
        sys.exit(f"error: {script_path} is empty")
    print(f"generating audio from {script_path} ({len(script)} chars)")

    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=script,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
                    speaker_voice_configs=[
                        types.SpeakerVoiceConfig(
                            speaker="Host",
                            voice_config=types.VoiceConfig(
                                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                    voice_name="Kore"
                                )
                            ),
                        ),
                        types.SpeakerVoiceConfig(
                            speaker="Cohost",
                            voice_config=types.VoiceConfig(
                                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                    voice_name="Puck"
                                )
                            ),
                        ),
                    ]
                )
            ),
        ),
    )

    # gemini returns raw PCM (audio/L16;codec=pcm;rate=24000), wrap in WAV header
    pcm_data = _extract_pcm(response)
    if not pcm_data:
        sys.exit("error: response contained no audio data")

    wav_data = pcm_to_wav(pcm_data)
    output_path.write_bytes(wav_data)
    print(f"saved audio to {output_path} ({len(wav_data)} bytes)")


if __name__ == "__main__":
    main()