# personal memory agent
1# SPDX-License-Identifier: AGPL-3.0-only
2# Copyright (c) 2026 sol pbc
3
4"""Tests for observe.vad module."""
5
6from unittest.mock import patch
7
8import numpy as np
9
10from observe.utils import SAMPLE_RATE
11from observe.vad import (
12 GAP_BUFFER,
13 AudioReduction,
14 SpeechSegment,
15 VadResult,
16 compute_nonspeech_rms,
17 get_nonspeech_segments,
18 reduce_audio,
19 restore_statement_timestamps,
20 run_vad,
21)
22
23
class TestVadResult:
    """Unit tests for the VadResult dataclass."""

    def test_vad_result_fields(self):
        """All constructor arguments should be stored on the instance."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            speech_segments=[(1.0, 3.0), (5.0, 8.0)],
        )

        assert vr.duration == 10.0
        assert vr.speech_duration == 5.0
        assert vr.has_speech is True
        assert vr.speech_segments == [(1.0, 3.0), (5.0, 8.0)]

    def test_vad_result_no_speech(self):
        """A speech-free result carries has_speech=False and empty segments."""
        vr = VadResult(
            duration=5.0,
            speech_duration=0.0,
            has_speech=False,
            speech_segments=[],
        )

        assert vr.duration == 5.0
        assert vr.speech_duration == 0.0
        assert vr.has_speech is False
        assert vr.speech_segments == []

    def test_vad_result_default_speech_segments(self):
        """speech_segments defaults to an empty list when omitted."""
        vr = VadResult(duration=5.0, speech_duration=0.0, has_speech=False)

        assert vr.speech_segments == []

    def test_vad_result_rms_fields(self):
        """The RMS-related fields default to None / 0.0."""
        vr = VadResult(duration=10.0, speech_duration=5.0, has_speech=True)

        assert vr.noisy_rms is None
        assert vr.noisy_s == 0.0

    def test_vad_result_with_rms(self):
        """Explicit RMS values are stored unchanged."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.015,
            noisy_s=3.5,
        )

        assert vr.noisy_rms == 0.015
        assert vr.noisy_s == 3.5

    def test_is_noisy_above_threshold(self):
        """is_noisy() is True once RMS exceeds the default threshold."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.015,  # above the default 0.01 threshold
        )

        assert vr.is_noisy() is True

    def test_is_noisy_below_threshold(self):
        """is_noisy() is False while RMS stays under the threshold."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.005,  # below the default 0.01 threshold
        )

        assert vr.is_noisy() is False

    def test_is_noisy_none_rms(self):
        """is_noisy() treats an unknown (None) RMS as not noisy."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=None,
        )

        assert vr.is_noisy() is False

    def test_is_noisy_custom_threshold(self):
        """A caller-supplied threshold overrides the default."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.015,
        )

        # Noisy against the default threshold (0.01) ...
        assert vr.is_noisy() is True
        # ... but quiet against a stricter one.
        assert vr.is_noisy(threshold=0.02) is False

    def test_speech_ratio(self):
        """speech_ratio is speech_duration divided by duration."""
        vr = VadResult(duration=10.0, speech_duration=7.5, has_speech=True)
        assert vr.speech_ratio == 0.75

    def test_speech_ratio_zero_duration(self):
        """A zero-length clip yields a ratio of 0.0 (no ZeroDivisionError)."""
        vr = VadResult(duration=0.0, speech_duration=0.0, has_speech=False)
        assert vr.speech_ratio == 0.0
155
156
class TestGetNonspeechSegments:
    """Behavioural tests for get_nonspeech_segments."""

    def test_leading_silence(self):
        """Silence ahead of the first speech segment is reported."""
        gaps = get_nonspeech_segments([(2.0, 4.0)], 5.0)
        assert (0.0, 2.0) in gaps

    def test_trailing_silence(self):
        """Silence after the final speech segment is reported."""
        gaps = get_nonspeech_segments([(1.0, 3.0)], 5.0)
        assert (3.0, 5.0) in gaps

    def test_gap_between_segments(self):
        """Pauses between two speech segments are reported."""
        gaps = get_nonspeech_segments([(1.0, 2.0), (4.0, 5.0)], 6.0)
        assert (2.0, 4.0) in gaps

    def test_all_regions(self):
        """Leading, middle, and trailing silences are all found, in order."""
        gaps = get_nonspeech_segments([(1.0, 2.0), (4.0, 5.0)], 7.0)
        assert gaps == [(0.0, 1.0), (2.0, 4.0), (5.0, 7.0)]

    def test_no_speech_segments(self):
        """With no speech segments there is nothing to report."""
        assert get_nonspeech_segments([], 5.0) == []

    def test_speech_fills_entire_audio(self):
        """Wall-to-wall speech leaves no non-speech regions."""
        assert get_nonspeech_segments([(0.0, 5.0)], 5.0) == []

    def test_adjacent_segments(self):
        """Back-to-back segments must not produce zero-length gaps."""
        gaps = get_nonspeech_segments([(1.0, 2.0), (2.0, 3.0)], 4.0)
        # Only the leading and trailing silences remain.
        assert gaps == [(0.0, 1.0), (3.0, 4.0)]
208
209
class TestComputeNonspeechRms:
    """Test compute_nonspeech_rms function."""

    def test_silent_audio_returns_zero_rms(self):
        """Silent audio should have RMS near zero."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        speech_segments = [(1.0, 2.0)]  # Speech in middle

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        assert rms is not None
        assert rms < 0.001  # Effectively zero

    def test_noisy_audio_returns_high_rms(self):
        """Noisy audio should have measurable RMS."""
        # Seed the RNG so the generated noise -- and thus the measured RMS --
        # is identical on every run, matching the seeded noisy-background
        # test in TestRunVad. (Uniform(-0.1, 0.1) noise has RMS ~0.058,
        # comfortably above the 0.01 threshold.)
        np.random.seed(42)
        # Create audio with noise (amplitude 0.1)
        audio = np.random.uniform(-0.1, 0.1, 5 * SAMPLE_RATE).astype(np.float32)
        # Put "speech" in middle (doesn't affect RMS calculation of non-speech)
        speech_segments = [(2.0, 3.0)]

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        assert rms is not None
        assert rms > 0.01  # Noisy threshold

    def test_returns_duration_used(self):
        """Should return total duration of non-speech segments used."""
        audio = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        # Speech from 2-4s and 6-8s, leaving gaps at 0-2, 4-6, 8-10
        speech_segments = [(2.0, 4.0), (6.0, 8.0)]

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        # All three gaps are >= 0.5s (MIN_NONSPEECH_SEGMENT)
        # Total non-speech: 2 + 2 + 2 = 6 seconds
        assert duration == 6.0

    def test_filters_short_segments(self):
        """Should filter out non-speech segments shorter than min_segment."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Speech leaves only 0.3s gaps (below default 0.5s threshold)
        speech_segments = [(0.3, 1.0), (1.3, 2.0), (2.3, 5.0)]

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        # No qualifying segments
        assert rms is None
        assert duration == 0.0

    def test_no_speech_segments_returns_none(self):
        """Should return None when no speech segments (can't compute non-speech)."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)

        rms, duration = compute_nonspeech_rms(audio, [], SAMPLE_RATE)

        assert rms is None
        assert duration == 0.0

    def test_custom_min_segment(self):
        """Should respect custom min_segment threshold."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Speech from 1-2s, leaving 1s gap at start
        speech_segments = [(1.0, 2.0)]

        # With default 0.5s threshold, should include leading gap
        rms, duration = compute_nonspeech_rms(
            audio, speech_segments, SAMPLE_RATE, min_segment=0.5
        )
        assert duration == 4.0  # 1s leading + 3s trailing

        # With 2.0s threshold, should only include trailing gap (3s)
        rms, duration = compute_nonspeech_rms(
            audio, speech_segments, SAMPLE_RATE, min_segment=2.0
        )
        assert duration == 3.0
285
286
class TestRunVad:
    """Tests for run_vad, with the VAD model mocked out."""

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_silent_audio_returns_no_speech(self, fake_ts):
        """No detected timestamps means the result reports no speech."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        fake_ts.return_value = []

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.duration == 5.0
        assert out.speech_duration == 0.0
        assert out.has_speech is False

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_speech_audio_returns_has_speech(self, fake_ts):
        """Detected timestamps are summed and converted to seconds."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Two seconds of speech: samples 16000..48000 -> 1.0-3.0 s.
        fake_ts.return_value = [{"start": 16000, "end": 48000}]

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.duration == 5.0
        assert out.speech_duration == 2.0
        assert out.has_speech is True
        # Sample offsets must come back as second-based segments.
        assert out.speech_segments == [(1.0, 3.0)]

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_speech_below_threshold(self, fake_ts):
        """Speech shorter than min_speech_seconds does not count."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Half a second of speech, under the 1.0 s threshold.
        fake_ts.return_value = [{"start": 0, "end": 8000}]

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.duration == 5.0
        assert out.speech_duration == 0.5
        assert out.has_speech is False

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_custom_min_speech_threshold(self, fake_ts):
        """has_speech flips depending on the configured threshold."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        fake_ts.return_value = [{"start": 0, "end": 8000}]  # 0.5 s of speech

        # A lenient threshold accepts it ...
        assert run_vad(pcm, min_speech_seconds=0.3).has_speech is True
        # ... a stricter one rejects it.
        assert run_vad(pcm, min_speech_seconds=1.0).has_speech is False

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_multiple_speech_chunks(self, fake_ts):
        """Durations of separate speech chunks are accumulated."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        fake_ts.return_value = [
            {"start": 16000, "end": 32000},  # one second
            {"start": 48000, "end": 64000},  # one second
        ]

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.duration == 5.0
        assert out.speech_duration == 2.0
        assert out.has_speech is True

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_returns_rms_for_silent_background(self, fake_ts):
        """Silent non-speech regions yield a near-zero background RMS."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Speech spans 1-3 s, leaving non-speech at 0-1 s and 3-5 s.
        fake_ts.return_value = [{"start": 16000, "end": 48000}]

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.noisy_rms is not None
        assert out.noisy_rms < 0.001  # effectively zero
        assert out.noisy_s == 3.0  # 1 s leading + 2 s trailing

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_returns_rms_for_noisy_background(self, fake_ts):
        """Noisy non-speech regions yield a measurable background RMS."""
        np.random.seed(42)
        pcm = np.random.uniform(-0.1, 0.1, 5 * SAMPLE_RATE).astype(np.float32)
        fake_ts.return_value = [{"start": 16000, "end": 48000}]  # speech 1-3 s

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.noisy_rms is not None
        assert out.noisy_rms > 0.01  # above the "noisy" threshold
        assert out.noisy_s == 3.0

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_returns_none_rms_when_no_qualifying_segments(self, fake_ts):
        """RMS is None when every non-speech gap is too short to measure."""
        pcm = np.zeros(2 * SAMPLE_RATE, dtype=np.float32)
        # Speech fills most of the clip, leaving only 0.2 s gaps
        # (below the 0.5 s minimum segment length).
        fake_ts.return_value = [
            {"start": 3200, "end": 12800},  # 0.2 s to 0.8 s
            {"start": 16000, "end": 28800},  # 1.0 s to 1.8 s
        ]

        out = run_vad(pcm, min_speech_seconds=0.5)

        assert out.noisy_rms is None
        assert out.noisy_s == 0.0
404
405
class TestSpeechSegment:
    """Tests for the SpeechSegment dataclass."""

    def test_speech_segment_fields(self):
        """Every field given to the constructor is stored as-is."""
        segment = SpeechSegment(
            original_start=5.0,
            original_end=10.0,
            reduced_start=2.0,
            reduced_end=7.0,
        )

        assert segment.original_start == 5.0
        assert segment.original_end == 10.0
        assert segment.reduced_start == 2.0
        assert segment.reduced_end == 7.0
422
423
class TestAudioReduction:
    """Tests for AudioReduction and its timestamp restoration."""

    def test_empty_reduction(self):
        """With no segments, timestamps pass through untouched."""
        assert AudioReduction().restore_timestamp(5.0) == 5.0

    def test_single_segment_restoration(self):
        """Times inside a lone segment map linearly back to the original."""
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=3.0,
                    original_end=8.0,
                    reduced_start=0.0,
                    reduced_end=5.0,
                )
            ],
            original_duration=10.0,
            reduced_duration=5.0,
        )

        # Segment start: reduced 0.0 -> original 3.0.
        assert mapping.restore_timestamp(0.0) == 3.0
        # Midpoint: reduced 2.5 -> original 5.5.
        assert mapping.restore_timestamp(2.5) == 5.5
        # Segment end: reduced 5.0 -> original 8.0.
        assert mapping.restore_timestamp(5.0) == 8.0

    def test_multiple_segments_restoration(self):
        """Each segment restores independently of its neighbours."""
        # Original 10 s clip with speech at [1-3] and [7-9]; the 4 s gap is
        # trimmed to 2 s, so the second segment shifts from 7.0 to 5.0 and
        # the reduced clip is 8 s long.
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=1.0,
                    original_end=3.0,
                    reduced_start=1.0,
                    reduced_end=3.0,
                ),
                SpeechSegment(
                    original_start=7.0,
                    original_end=9.0,
                    reduced_start=5.0,  # 3.0 + 2.0 trimmed gap
                    reduced_end=7.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=8.0,
        )

        # First segment maps onto itself.
        assert mapping.restore_timestamp(1.0) == 1.0
        assert mapping.restore_timestamp(2.0) == 2.0
        # Second segment is shifted by the 2 s that were cut.
        assert mapping.restore_timestamp(5.0) == 7.0
        assert mapping.restore_timestamp(6.0) == 8.0

    def test_timestamp_in_gap(self):
        """A time inside a trimmed gap maps proportionally into the original gap."""
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=2.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
                SpeechSegment(
                    original_start=8.0,
                    original_end=10.0,
                    reduced_start=4.0,  # 2.0 + 2.0 reduced gap
                    reduced_end=6.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=6.0,
        )

        # Reduced gap is 2.0-4.0 (2 s); original gap is 2.0-8.0 (6 s).
        # The gap midpoint (reduced 3.0) should land near original 5.0.
        assert abs(mapping.restore_timestamp(3.0) - 5.0) < 0.1

    def test_timestamp_after_all_segments(self):
        """Times past the last segment extrapolate from its end."""
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=5.0,
                    reduced_start=0.0,
                    reduced_end=5.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=6.0,
        )

        # Reduced 6.0 is 1 s past the last segment end -> original 6.0.
        assert mapping.restore_timestamp(6.0) == 6.0

    def test_timestamp_before_first_segment(self):
        """Times before the first segment map into the leading buffer."""
        # Original clip: 5 s of silence, then speech at [5-10]. The leading
        # gap was trimmed to a 1 s buffer, so speech starts at reduced 1.0.
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=5.0,
                    original_end=10.0,
                    reduced_start=1.0,  # 1 s buffer before speech
                    reduced_end=6.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=6.0,
        )

        # 1.0 s before the segment start (5.0) -> original 4.0.
        assert mapping.restore_timestamp(0.0) == 4.0
        # 0.5 s before -> original 4.5.
        assert mapping.restore_timestamp(0.5) == 4.5
        # Exactly at the segment start -> original 5.0.
        assert mapping.restore_timestamp(1.0) == 5.0
560
561
class TestRestoreSegmentTimestamps:
    """Tests for restore_statement_timestamps."""

    def test_restores_segment_timestamps(self):
        """Segment-level start/end values are mapped back to original time."""
        statements = [
            {"id": 1, "start": 0.0, "end": 2.0, "text": "Hello"},
            {"id": 2, "start": 4.0, "end": 6.0, "text": "World"},
        ]

        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=2.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
                SpeechSegment(
                    original_start=6.0,
                    original_end=8.0,
                    reduced_start=4.0,
                    reduced_end=6.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=8.0,
        )

        out = restore_statement_timestamps(statements, mapping)

        assert out[0]["start"] == 0.0
        assert out[0]["end"] == 2.0
        assert out[1]["start"] == 6.0
        assert out[1]["end"] == 8.0

    def test_restores_word_timestamps(self):
        """Word-level timestamps are remapped along with the segment."""
        statements = [
            {
                "id": 1,
                "start": 4.0,
                "end": 6.0,
                "text": "Hello world",
                "words": [
                    {"word": "Hello", "start": 4.0, "end": 5.0, "probability": 0.9},
                    {"word": "world", "start": 5.0, "end": 6.0, "probability": 0.9},
                ],
            },
        ]

        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=8.0,
                    original_end=10.0,
                    reduced_start=4.0,
                    reduced_end=6.0,
                ),
            ],
            original_duration=12.0,
            reduced_duration=8.0,
        )

        out = restore_statement_timestamps(statements, mapping)

        assert out[0]["start"] == 8.0
        assert out[0]["end"] == 10.0
        assert out[0]["words"][0]["start"] == 8.0
        assert out[0]["words"][0]["end"] == 9.0
        assert out[0]["words"][1]["start"] == 9.0
        assert out[0]["words"][1]["end"] == 10.0

    def test_preserves_other_fields(self):
        """Fields that are not timestamps survive restoration untouched."""
        statements = [
            {
                "id": 1,
                "start": 0.0,
                "end": 2.0,
                "text": "Hello",
                "custom_field": "preserved",
            },
        ]

        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=2.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
            ],
            original_duration=5.0,
            reduced_duration=2.0,
        )

        out = restore_statement_timestamps(statements, mapping)

        assert out[0]["text"] == "Hello"
        assert out[0]["custom_field"] == "preserved"
        assert out[0]["id"] == 1

    def test_handles_empty_segments(self):
        """An empty statement list restores to an empty list."""
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=5.0,
                    reduced_start=0.0,
                    reduced_end=5.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=5.0,
        )

        assert restore_statement_timestamps([], mapping) == []

    def test_handles_segments_without_words(self):
        """Statements lacking a 'words' key are restored without adding one."""
        statements = [{"id": 1, "start": 0.0, "end": 2.0, "text": "Hello"}]

        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=5.0,
                    original_end=7.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=2.0,
        )

        out = restore_statement_timestamps(statements, mapping)

        assert out[0]["start"] == 5.0
        assert out[0]["end"] == 7.0
        assert "words" not in out[0]
706
707
class TestReduceAudio:
    """Tests for reduce_audio."""

    def test_no_gaps_to_reduce(self):
        """Nothing is returned when no gap exceeds the 2 s trim threshold."""
        # 5 s clip; the pause between segments is only 0.5 s.
        samples = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=5.0,
            speech_duration=2.0,
            has_speech=True,
            speech_segments=[(0.5, 1.5), (2.0, 3.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is None
        assert mapping is None

    def test_no_speech_segments(self):
        """Nothing is returned when the clip has no speech at all."""
        samples = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=5.0,
            speech_duration=0.0,
            has_speech=False,
            speech_segments=[],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is None
        assert mapping is None

    def test_leading_gap_reduction(self):
        """A leading gap longer than 2 s is trimmed down to GAP_BUFFER."""
        # 10 s clip whose speech starts at 5 s (leading gap of 5 s).
        samples = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=10.0,
            speech_duration=3.0,
            has_speech=True,
            speech_segments=[(5.0, 8.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is not None
        assert mapping is not None

        # GAP_BUFFER (1 s) + speech (3 s) + trailing 2 s (<= 2 s, kept whole).
        want = GAP_BUFFER + 3.0 + 2.0
        got = len(shortened) / SAMPLE_RATE
        assert abs(got - want) < 0.1

        # Speech now begins GAP_BUFFER into the reduced audio.
        assert len(mapping.segments) == 1
        assert mapping.segments[0].original_start == 5.0
        assert mapping.segments[0].reduced_start == GAP_BUFFER

    def test_trailing_gap_reduction(self):
        """A trailing gap longer than 2 s is trimmed down to GAP_BUFFER."""
        # 10 s clip, speech at 1-3 s, so the trailing gap is 7 s.
        samples = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=10.0,
            speech_duration=2.0,
            has_speech=True,
            speech_segments=[(1.0, 3.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is not None
        assert mapping is not None

        # Leading 1 s + speech 2 s + GAP_BUFFER (1 s) = 4 s.
        want = 1.0 + 2.0 + GAP_BUFFER
        got = len(shortened) / SAMPLE_RATE
        assert abs(got - want) < 0.1

    def test_middle_gap_reduction(self):
        """An interior gap longer than 2 s shrinks to 2 * GAP_BUFFER."""
        # 10 s clip with speech at 0-2 s and 7-9 s (5 s interior gap).
        samples = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=10.0,
            speech_duration=4.0,
            has_speech=True,
            speech_segments=[(0.0, 2.0), (7.0, 9.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is not None
        assert mapping is not None

        # speech1 (2 s) + trimmed gap (2 s) + speech2 (2 s) + trailing (1 s).
        want = 2.0 + 2 * GAP_BUFFER + 2.0 + 1.0
        got = len(shortened) / SAMPLE_RATE
        assert abs(got - want) < 0.1

        # Second segment lands at speech1 end + trimmed gap = 2.0 + 2.0 = 4.0.
        assert len(mapping.segments) == 2
        assert mapping.segments[0].original_start == 0.0
        assert mapping.segments[0].reduced_start == 0.0
        assert mapping.segments[1].original_start == 7.0
        assert abs(mapping.segments[1].reduced_start - 4.0) < 0.1

    def test_multiple_gaps_reduction(self):
        """Several oversized gaps are all trimmed in a single pass."""
        # 20 s clip, speech at 5-7, 12-14 and 19-20: a 5 s leading gap plus
        # two 5 s interior gaps.
        samples = np.zeros(20 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=20.0,
            speech_duration=5.0,
            has_speech=True,
            speech_segments=[(5.0, 7.0), (12.0, 14.0), (19.0, 20.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is not None
        assert mapping is not None

        # lead buffer (1) + speech1 (2) + gap (2) + speech2 (2) + gap (2)
        # + speech3 (1) = 10 s.
        want = GAP_BUFFER + 2.0 + 2 * GAP_BUFFER + 2.0 + 2 * GAP_BUFFER + 1.0
        got = len(shortened) / SAMPLE_RATE
        assert abs(got - want) < 0.1

        # All three speech segments survive in the mapping.
        assert len(mapping.segments) == 3

    def test_returns_numpy_array(self):
        """The reduced audio comes back as a float32 numpy array."""
        samples = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=10.0,
            speech_duration=2.0,
            has_speech=True,
            speech_segments=[(5.0, 7.0)],  # leading gap > 2 s
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert isinstance(shortened, np.ndarray)
        assert shortened.dtype == np.float32