this repo has no description
1use axum::extract::Query;
2use axum::http::header;
3use axum::response::{IntoResponse, Response};
4use msedge_tts::tts::client::connect_async;
5use msedge_tts::tts::SpeechConfig;
6use sha2::{Digest, Sha256};
7use std::path::PathBuf;
8
9use crate::errors::AppError;
10
/// Voice used when the request does not name one.
const DEFAULT_VOICE: &str = "fil-PH-BlessicaNeural";

/// Voices a request is allowed to select; anything else is rejected.
const ALLOWED_VOICES: &[&str] = &[DEFAULT_VOICE, "fil-PH-AngeloNeural"];

/// Upper bound on the length of the (trimmed) `text` query parameter.
const MAX_TEXT_LEN: usize = 500;

/// Output format identifier handed to the TTS service in the speech config.
const AUDIO_FORMAT: &str = "audio-24khz-48kbitrate-mono-mp3";
15
16#[derive(serde::Deserialize)]
17pub struct TtsQuery {
18 text: String,
19 voice: Option<String>,
20}
21
/// Returns the directory that holds cached audio files.
///
/// The base directory is read from the `DATA_DIR` environment variable and
/// falls back to `data` when the variable cannot be read. The `tts-cache`
/// sub-directory is created on a best-effort basis: a creation failure is
/// ignored and the path is returned regardless.
fn cache_dir() -> PathBuf {
    let base = match std::env::var("DATA_DIR") {
        Ok(value) => value,
        Err(_) => "data".to_string(),
    };

    let mut path = PathBuf::from(base);
    path.push("tts-cache");

    // Best-effort: callers treat an unusable cache directory as a cache miss.
    let _ = std::fs::create_dir_all(&path);
    path
}
28
29fn cache_key(text: &str, voice: &str) -> String {
30 let mut hasher = Sha256::new();
31 hasher.update(text.as_bytes());
32 hasher.update(b"|");
33 hasher.update(voice.as_bytes());
34 hex::encode(hasher.finalize())
35}
36
37pub async fn synthesize(Query(params): Query<TtsQuery>) -> Result<Response, AppError> {
38 let text = params.text.trim().to_string();
39 if text.is_empty() {
40 return Err(AppError::BadRequest("text is required".to_string()));
41 }
42 if text.len() > MAX_TEXT_LEN {
43 return Err(AppError::BadRequest(format!(
44 "text exceeds {MAX_TEXT_LEN} characters"
45 )));
46 }
47
48 let voice = params.voice.unwrap_or_else(|| DEFAULT_VOICE.to_string());
49 if !ALLOWED_VOICES.contains(&voice.as_str()) {
50 return Err(AppError::BadRequest(format!("unsupported voice: {voice}")));
51 }
52
53 // Check disk cache
54 let key = cache_key(&text, &voice);
55 let cache_path = cache_dir().join(format!("{key}.mp3"));
56
57 if let Ok(bytes) = tokio::fs::read(&cache_path).await {
58 return Ok(audio_response(bytes));
59 }
60
61 // Synthesize via Edge TTS
62 let mut tts = connect_async()
63 .await
64 .map_err(|e| AppError::Internal(format!("TTS connect failed: {e}")))?;
65
66 let config = SpeechConfig {
67 voice_name: voice,
68 audio_format: AUDIO_FORMAT.to_string(),
69 pitch: 0,
70 rate: 0,
71 volume: 0,
72 };
73
74 let audio = tts
75 .synthesize(&text, &config)
76 .await
77 .map_err(|e| AppError::Internal(format!("TTS synthesis failed: {e}")))?;
78
79 // Write to cache (best-effort)
80 let _ = tokio::fs::write(&cache_path, &audio.audio_bytes).await;
81
82 Ok(audio_response(audio.audio_bytes))
83}
84
85fn audio_response(bytes: Vec<u8>) -> Response {
86 (
87 [
88 (header::CONTENT_TYPE, "audio/mpeg"),
89 (header::CACHE_CONTROL, "public, max-age=86400"),
90 ],
91 bytes,
92 )
93 .into_response()
94}