# personal memory agent
1# SPDX-License-Identifier: AGPL-3.0-only
2# Copyright (c) 2026 sol pbc
3
4"""Tests for observe.vad module."""
5
6from unittest.mock import patch
7
8import numpy as np
9
10from observe.utils import SAMPLE_RATE
11from observe.vad import (
12 GAP_BUFFER,
13 AudioReduction,
14 SpeechSegment,
15 VadResult,
16 compute_nonspeech_rms,
17 get_nonspeech_segments,
18 reduce_audio,
19 restore_statement_timestamps,
20 run_vad,
21)
22
23
class TestVadResult:
    """Unit tests for the VadResult dataclass."""

    def test_vad_result_fields(self):
        """All constructor arguments should be stored on the instance."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            speech_segments=[(1.0, 3.0), (5.0, 8.0)],
        )

        assert vr.duration == 10.0
        assert vr.speech_duration == 5.0
        assert vr.has_speech is True
        assert vr.speech_segments == [(1.0, 3.0), (5.0, 8.0)]

    def test_vad_result_no_speech(self):
        """A speech-free result carries has_speech=False and empty segments."""
        vr = VadResult(
            duration=5.0,
            speech_duration=0.0,
            has_speech=False,
            speech_segments=[],
        )

        assert vr.duration == 5.0
        assert vr.speech_duration == 0.0
        assert vr.has_speech is False
        assert vr.speech_segments == []

    def test_vad_result_default_speech_segments(self):
        """speech_segments defaults to an empty list when omitted."""
        vr = VadResult(duration=5.0, speech_duration=0.0, has_speech=False)

        assert vr.speech_segments == []

    def test_vad_result_rms_fields(self):
        """The RMS-related fields default to None / 0.0."""
        vr = VadResult(duration=10.0, speech_duration=5.0, has_speech=True)

        assert vr.noisy_rms is None
        assert vr.noisy_s == 0.0

    def test_vad_result_with_rms(self):
        """Explicit RMS values are stored unchanged."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.015,
            noisy_s=3.5,
        )

        assert vr.noisy_rms == 0.015
        assert vr.noisy_s == 3.5

    def test_is_noisy_above_threshold(self):
        """is_noisy() is True once RMS exceeds the default threshold."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.015,  # above the default 0.01 threshold
        )

        assert vr.is_noisy() is True

    def test_is_noisy_below_threshold(self):
        """is_noisy() is False while RMS stays under the threshold."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.005,  # below the default 0.01 threshold
        )

        assert vr.is_noisy() is False

    def test_is_noisy_none_rms(self):
        """is_noisy() treats an unknown (None) RMS as not noisy."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=None,
        )

        assert vr.is_noisy() is False

    def test_is_noisy_custom_threshold(self):
        """A caller-supplied threshold overrides the default."""
        vr = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.015,
        )

        # Noisy against the default threshold (0.01) ...
        assert vr.is_noisy() is True
        # ... but quiet against a stricter one.
        assert vr.is_noisy(threshold=0.02) is False

    def test_speech_ratio(self):
        """speech_ratio is speech_duration divided by duration."""
        vr = VadResult(duration=10.0, speech_duration=7.5, has_speech=True)
        assert vr.speech_ratio == 0.75

    def test_speech_ratio_zero_duration(self):
        """A zero-length clip yields a ratio of 0.0 (no ZeroDivisionError)."""
        vr = VadResult(duration=0.0, speech_duration=0.0, has_speech=False)
        assert vr.speech_ratio == 0.0
155
156
class TestGetNonspeechSegments:
    """Behavioural tests for get_nonspeech_segments."""

    def test_leading_silence(self):
        """Silence ahead of the first speech segment is reported."""
        gaps = get_nonspeech_segments([(2.0, 4.0)], 5.0)
        assert (0.0, 2.0) in gaps

    def test_trailing_silence(self):
        """Silence after the final speech segment is reported."""
        gaps = get_nonspeech_segments([(1.0, 3.0)], 5.0)
        assert (3.0, 5.0) in gaps

    def test_gap_between_segments(self):
        """Pauses between two speech segments are reported."""
        gaps = get_nonspeech_segments([(1.0, 2.0), (4.0, 5.0)], 6.0)
        assert (2.0, 4.0) in gaps

    def test_all_regions(self):
        """Leading, middle, and trailing silences are all found, in order."""
        gaps = get_nonspeech_segments([(1.0, 2.0), (4.0, 5.0)], 7.0)
        assert gaps == [(0.0, 1.0), (2.0, 4.0), (5.0, 7.0)]

    def test_no_speech_segments(self):
        """With no speech segments there is nothing to report."""
        assert get_nonspeech_segments([], 5.0) == []

    def test_speech_fills_entire_audio(self):
        """Wall-to-wall speech leaves no non-speech regions."""
        assert get_nonspeech_segments([(0.0, 5.0)], 5.0) == []

    def test_adjacent_segments(self):
        """Back-to-back segments must not produce zero-length gaps."""
        gaps = get_nonspeech_segments([(1.0, 2.0), (2.0, 3.0)], 4.0)
        # Only the leading and trailing silences remain.
        assert gaps == [(0.0, 1.0), (3.0, 4.0)]
208
209
class TestComputeNonspeechRms:
    """Test compute_nonspeech_rms function."""

    def test_silent_audio_returns_zero_rms(self):
        """Silent audio should have RMS near zero."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        speech_segments = [(1.0, 2.0)]  # Speech in middle

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        assert rms is not None
        assert rms < 0.001  # Effectively zero

    def test_noisy_audio_returns_high_rms(self):
        """Noisy audio should have measurable RMS."""
        # Seed the RNG so the generated noise -- and thus the measured RMS --
        # is identical on every run, matching the seeded noisy-background
        # test in TestRunVad. (Uniform(-0.1, 0.1) noise has RMS ~0.058,
        # comfortably above the 0.01 threshold.)
        np.random.seed(42)
        # Create audio with noise (amplitude 0.1)
        audio = np.random.uniform(-0.1, 0.1, 5 * SAMPLE_RATE).astype(np.float32)
        # Put "speech" in middle (doesn't affect RMS calculation of non-speech)
        speech_segments = [(2.0, 3.0)]

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        assert rms is not None
        assert rms > 0.01  # Noisy threshold

    def test_returns_duration_used(self):
        """Should return total duration of non-speech segments used."""
        audio = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        # Speech from 2-4s and 6-8s, leaving gaps at 0-2, 4-6, 8-10
        speech_segments = [(2.0, 4.0), (6.0, 8.0)]

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        # All three gaps are >= 0.5s (MIN_NONSPEECH_SEGMENT)
        # Total non-speech: 2 + 2 + 2 = 6 seconds
        assert duration == 6.0

    def test_filters_short_segments(self):
        """Should filter out non-speech segments shorter than min_segment."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Speech leaves only 0.3s gaps (below default 0.5s threshold)
        speech_segments = [(0.3, 1.0), (1.3, 2.0), (2.3, 5.0)]

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        # No qualifying segments
        assert rms is None
        assert duration == 0.0

    def test_no_speech_segments_returns_none(self):
        """Should return None when no speech segments (can't compute non-speech)."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)

        rms, duration = compute_nonspeech_rms(audio, [], SAMPLE_RATE)

        assert rms is None
        assert duration == 0.0

    def test_custom_min_segment(self):
        """Should respect custom min_segment threshold."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Speech from 1-2s, leaving 1s gap at start
        speech_segments = [(1.0, 2.0)]

        # With default 0.5s threshold, should include leading gap
        rms, duration = compute_nonspeech_rms(
            audio, speech_segments, SAMPLE_RATE, min_segment=0.5
        )
        assert duration == 4.0  # 1s leading + 3s trailing

        # With 2.0s threshold, should only include trailing gap (3s)
        rms, duration = compute_nonspeech_rms(
            audio, speech_segments, SAMPLE_RATE, min_segment=2.0
        )
        assert duration == 3.0
285
286
class TestRunVad:
    """Tests for run_vad, with the VAD model mocked out."""

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_silent_audio_returns_no_speech(self, fake_ts):
        """No detected timestamps means the result reports no speech."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        fake_ts.return_value = []

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.duration == 5.0
        assert out.speech_duration == 0.0
        assert out.has_speech is False

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_speech_audio_returns_has_speech(self, fake_ts):
        """Detected timestamps are summed and converted to seconds."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Two seconds of speech: samples 16000..48000 -> 1.0-3.0 s.
        fake_ts.return_value = [{"start": 16000, "end": 48000}]

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.duration == 5.0
        assert out.speech_duration == 2.0
        assert out.has_speech is True
        # Sample offsets must come back as second-based segments.
        assert out.speech_segments == [(1.0, 3.0)]

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_speech_below_threshold(self, fake_ts):
        """Speech shorter than min_speech_seconds does not count."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Half a second of speech, under the 1.0 s threshold.
        fake_ts.return_value = [{"start": 0, "end": 8000}]

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.duration == 5.0
        assert out.speech_duration == 0.5
        assert out.has_speech is False

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_custom_min_speech_threshold(self, fake_ts):
        """has_speech flips depending on the configured threshold."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        fake_ts.return_value = [{"start": 0, "end": 8000}]  # 0.5 s of speech

        # A lenient threshold accepts it ...
        assert run_vad(pcm, min_speech_seconds=0.3).has_speech is True
        # ... a stricter one rejects it.
        assert run_vad(pcm, min_speech_seconds=1.0).has_speech is False

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_multiple_speech_chunks(self, fake_ts):
        """Durations of separate speech chunks are accumulated."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        fake_ts.return_value = [
            {"start": 16000, "end": 32000},  # one second
            {"start": 48000, "end": 64000},  # one second
        ]

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.duration == 5.0
        assert out.speech_duration == 2.0
        assert out.has_speech is True

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_returns_rms_for_silent_background(self, fake_ts):
        """Silent non-speech regions yield a near-zero background RMS."""
        pcm = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Speech spans 1-3 s, leaving non-speech at 0-1 s and 3-5 s.
        fake_ts.return_value = [{"start": 16000, "end": 48000}]

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.noisy_rms is not None
        assert out.noisy_rms < 0.001  # effectively zero
        assert out.noisy_s == 3.0  # 1 s leading + 2 s trailing

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_returns_rms_for_noisy_background(self, fake_ts):
        """Noisy non-speech regions yield a measurable background RMS."""
        np.random.seed(42)
        pcm = np.random.uniform(-0.1, 0.1, 5 * SAMPLE_RATE).astype(np.float32)
        fake_ts.return_value = [{"start": 16000, "end": 48000}]  # speech 1-3 s

        out = run_vad(pcm, min_speech_seconds=1.0)

        assert out.noisy_rms is not None
        assert out.noisy_rms > 0.01  # above the "noisy" threshold
        assert out.noisy_s == 3.0

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_returns_none_rms_when_no_qualifying_segments(self, fake_ts):
        """RMS is None when every non-speech gap is too short to measure."""
        pcm = np.zeros(2 * SAMPLE_RATE, dtype=np.float32)
        # Speech fills most of the clip, leaving only 0.2 s gaps
        # (below the 0.5 s minimum segment length).
        fake_ts.return_value = [
            {"start": 3200, "end": 12800},  # 0.2 s to 0.8 s
            {"start": 16000, "end": 28800},  # 1.0 s to 1.8 s
        ]

        out = run_vad(pcm, min_speech_seconds=0.5)

        assert out.noisy_rms is None
        assert out.noisy_s == 0.0
404
405
class TestSpeechSegment:
    """Tests for the SpeechSegment dataclass."""

    def test_speech_segment_fields(self):
        """Every field given to the constructor is stored as-is."""
        segment = SpeechSegment(
            original_start=5.0,
            original_end=10.0,
            reduced_start=2.0,
            reduced_end=7.0,
        )

        assert segment.original_start == 5.0
        assert segment.original_end == 10.0
        assert segment.reduced_start == 2.0
        assert segment.reduced_end == 7.0
422
423
class TestAudioReduction:
    """Tests for AudioReduction and its timestamp restoration."""

    def test_empty_reduction(self):
        """With no segments, timestamps pass through untouched."""
        assert AudioReduction().restore_timestamp(5.0) == 5.0

    def test_single_segment_restoration(self):
        """Times inside a lone segment map linearly back to the original."""
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=3.0,
                    original_end=8.0,
                    reduced_start=0.0,
                    reduced_end=5.0,
                )
            ],
            original_duration=10.0,
            reduced_duration=5.0,
        )

        # Segment start: reduced 0.0 -> original 3.0.
        assert mapping.restore_timestamp(0.0) == 3.0
        # Midpoint: reduced 2.5 -> original 5.5.
        assert mapping.restore_timestamp(2.5) == 5.5
        # Segment end: reduced 5.0 -> original 8.0.
        assert mapping.restore_timestamp(5.0) == 8.0

    def test_multiple_segments_restoration(self):
        """Each segment restores independently of its neighbours."""
        # Original 10 s clip with speech at [1-3] and [7-9]; the 4 s gap is
        # trimmed to 2 s, so the second segment shifts from 7.0 to 5.0 and
        # the reduced clip is 8 s long.
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=1.0,
                    original_end=3.0,
                    reduced_start=1.0,
                    reduced_end=3.0,
                ),
                SpeechSegment(
                    original_start=7.0,
                    original_end=9.0,
                    reduced_start=5.0,  # 3.0 + 2.0 trimmed gap
                    reduced_end=7.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=8.0,
        )

        # First segment maps onto itself.
        assert mapping.restore_timestamp(1.0) == 1.0
        assert mapping.restore_timestamp(2.0) == 2.0
        # Second segment is shifted by the 2 s that were cut.
        assert mapping.restore_timestamp(5.0) == 7.0
        assert mapping.restore_timestamp(6.0) == 8.0

    def test_timestamp_in_gap(self):
        """A time inside a trimmed gap maps proportionally into the original gap."""
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=2.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
                SpeechSegment(
                    original_start=8.0,
                    original_end=10.0,
                    reduced_start=4.0,  # 2.0 + 2.0 reduced gap
                    reduced_end=6.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=6.0,
        )

        # Reduced gap is 2.0-4.0 (2 s); original gap is 2.0-8.0 (6 s).
        # The gap midpoint (reduced 3.0) should land near original 5.0.
        assert abs(mapping.restore_timestamp(3.0) - 5.0) < 0.1

    def test_timestamp_after_all_segments(self):
        """Times past the last segment extrapolate from its end."""
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=5.0,
                    reduced_start=0.0,
                    reduced_end=5.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=6.0,
        )

        # Reduced 6.0 is 1 s past the last segment end -> original 6.0.
        assert mapping.restore_timestamp(6.0) == 6.0

    def test_timestamp_before_first_segment(self):
        """Times before the first segment map into the leading buffer."""
        # Original clip: 5 s of silence, then speech at [5-10]. The leading
        # gap was trimmed to a 1 s buffer, so speech starts at reduced 1.0.
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=5.0,
                    original_end=10.0,
                    reduced_start=1.0,  # 1 s buffer before speech
                    reduced_end=6.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=6.0,
        )

        # 1.0 s before the segment start (5.0) -> original 4.0.
        assert mapping.restore_timestamp(0.0) == 4.0
        # 0.5 s before -> original 4.5.
        assert mapping.restore_timestamp(0.5) == 4.5
        # Exactly at the segment start -> original 5.0.
        assert mapping.restore_timestamp(1.0) == 5.0
560
561
class TestRestoreSegmentTimestamps:
    """Tests for restore_statement_timestamps."""

    def test_restores_segment_timestamps(self):
        """Segment-level start/end values are mapped back to original time."""
        statements = [
            {"id": 1, "start": 0.0, "end": 2.0, "text": "Hello"},
            {"id": 2, "start": 4.0, "end": 6.0, "text": "World"},
        ]

        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=2.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
                SpeechSegment(
                    original_start=6.0,
                    original_end=8.0,
                    reduced_start=4.0,
                    reduced_end=6.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=8.0,
        )

        out = restore_statement_timestamps(statements, mapping)

        assert out[0]["start"] == 0.0
        assert out[0]["end"] == 2.0
        assert out[1]["start"] == 6.0
        assert out[1]["end"] == 8.0

    def test_restores_word_timestamps(self):
        """Word-level timestamps are remapped along with the segment."""
        statements = [
            {
                "id": 1,
                "start": 4.0,
                "end": 6.0,
                "text": "Hello world",
                "words": [
                    {"word": "Hello", "start": 4.0, "end": 5.0, "probability": 0.9},
                    {"word": "world", "start": 5.0, "end": 6.0, "probability": 0.9},
                ],
            },
        ]

        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=8.0,
                    original_end=10.0,
                    reduced_start=4.0,
                    reduced_end=6.0,
                ),
            ],
            original_duration=12.0,
            reduced_duration=8.0,
        )

        out = restore_statement_timestamps(statements, mapping)

        assert out[0]["start"] == 8.0
        assert out[0]["end"] == 10.0
        assert out[0]["words"][0]["start"] == 8.0
        assert out[0]["words"][0]["end"] == 9.0
        assert out[0]["words"][1]["start"] == 9.0
        assert out[0]["words"][1]["end"] == 10.0

    def test_preserves_other_fields(self):
        """Fields that are not timestamps survive restoration untouched."""
        statements = [
            {
                "id": 1,
                "start": 0.0,
                "end": 2.0,
                "text": "Hello",
                "custom_field": "preserved",
            },
        ]

        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=2.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
            ],
            original_duration=5.0,
            reduced_duration=2.0,
        )

        out = restore_statement_timestamps(statements, mapping)

        assert out[0]["text"] == "Hello"
        assert out[0]["custom_field"] == "preserved"
        assert out[0]["id"] == 1

    def test_handles_empty_segments(self):
        """An empty statement list restores to an empty list."""
        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=5.0,
                    reduced_start=0.0,
                    reduced_end=5.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=5.0,
        )

        assert restore_statement_timestamps([], mapping) == []

    def test_handles_segments_without_words(self):
        """Statements lacking a 'words' key are restored without adding one."""
        statements = [{"id": 1, "start": 0.0, "end": 2.0, "text": "Hello"}]

        mapping = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=5.0,
                    original_end=7.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=2.0,
        )

        out = restore_statement_timestamps(statements, mapping)

        assert out[0]["start"] == 5.0
        assert out[0]["end"] == 7.0
        assert "words" not in out[0]
706
707
class TestReduceAudio:
    """Tests for reduce_audio."""

    def test_no_gaps_to_reduce(self):
        """Nothing is returned when no gap exceeds the 2 s trim threshold."""
        # 5 s clip; the pause between segments is only 0.5 s.
        samples = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=5.0,
            speech_duration=2.0,
            has_speech=True,
            speech_segments=[(0.5, 1.5), (2.0, 3.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is None
        assert mapping is None

    def test_no_speech_segments(self):
        """Nothing is returned when the clip has no speech at all."""
        samples = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=5.0,
            speech_duration=0.0,
            has_speech=False,
            speech_segments=[],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is None
        assert mapping is None

    def test_leading_gap_reduction(self):
        """A leading gap longer than 2 s is trimmed down to GAP_BUFFER."""
        # 10 s clip whose speech starts at 5 s (leading gap of 5 s).
        samples = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=10.0,
            speech_duration=3.0,
            has_speech=True,
            speech_segments=[(5.0, 8.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is not None
        assert mapping is not None

        # GAP_BUFFER (1 s) + speech (3 s) + trailing 2 s (<= 2 s, kept whole).
        want = GAP_BUFFER + 3.0 + 2.0
        got = len(shortened) / SAMPLE_RATE
        assert abs(got - want) < 0.1

        # Speech now begins GAP_BUFFER into the reduced audio.
        assert len(mapping.segments) == 1
        assert mapping.segments[0].original_start == 5.0
        assert mapping.segments[0].reduced_start == GAP_BUFFER

    def test_trailing_gap_reduction(self):
        """A trailing gap longer than 2 s is trimmed down to GAP_BUFFER."""
        # 10 s clip, speech at 1-3 s, so the trailing gap is 7 s.
        samples = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=10.0,
            speech_duration=2.0,
            has_speech=True,
            speech_segments=[(1.0, 3.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is not None
        assert mapping is not None

        # Leading 1 s + speech 2 s + GAP_BUFFER (1 s) = 4 s.
        want = 1.0 + 2.0 + GAP_BUFFER
        got = len(shortened) / SAMPLE_RATE
        assert abs(got - want) < 0.1

    def test_middle_gap_reduction(self):
        """An interior gap longer than 2 s shrinks to 2 * GAP_BUFFER."""
        # 10 s clip with speech at 0-2 s and 7-9 s (5 s interior gap).
        samples = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=10.0,
            speech_duration=4.0,
            has_speech=True,
            speech_segments=[(0.0, 2.0), (7.0, 9.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is not None
        assert mapping is not None

        # speech1 (2 s) + trimmed gap (2 s) + speech2 (2 s) + trailing (1 s).
        want = 2.0 + 2 * GAP_BUFFER + 2.0 + 1.0
        got = len(shortened) / SAMPLE_RATE
        assert abs(got - want) < 0.1

        # Second segment lands at speech1 end + trimmed gap = 2.0 + 2.0 = 4.0.
        assert len(mapping.segments) == 2
        assert mapping.segments[0].original_start == 0.0
        assert mapping.segments[0].reduced_start == 0.0
        assert mapping.segments[1].original_start == 7.0
        assert abs(mapping.segments[1].reduced_start - 4.0) < 0.1

    def test_multiple_gaps_reduction(self):
        """Several oversized gaps are all trimmed in a single pass."""
        # 20 s clip, speech at 5-7, 12-14 and 19-20: a 5 s leading gap plus
        # two 5 s interior gaps.
        samples = np.zeros(20 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=20.0,
            speech_duration=5.0,
            has_speech=True,
            speech_segments=[(5.0, 7.0), (12.0, 14.0), (19.0, 20.0)],
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert shortened is not None
        assert mapping is not None

        # lead buffer (1) + speech1 (2) + gap (2) + speech2 (2) + gap (2)
        # + speech3 (1) = 10 s.
        want = GAP_BUFFER + 2.0 + 2 * GAP_BUFFER + 2.0 + 2 * GAP_BUFFER + 1.0
        got = len(shortened) / SAMPLE_RATE
        assert abs(got - want) < 0.1

        # All three speech segments survive in the mapping.
        assert len(mapping.segments) == 3

    def test_returns_numpy_array(self):
        """The reduced audio comes back as a float32 numpy array."""
        samples = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        vad = VadResult(
            duration=10.0,
            speech_duration=2.0,
            has_speech=True,
            speech_segments=[(5.0, 7.0)],  # leading gap > 2 s
        )

        shortened, mapping = reduce_audio(samples, vad)

        assert isinstance(shortened, np.ndarray)
        assert shortened.dtype == np.float32