personal memory agent
at scratch/segment-sense-rd 872 lines 30 kB view raw
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Tests for observe.vad module."""

from unittest.mock import patch

import numpy as np

from observe.utils import SAMPLE_RATE
from observe.vad import (
    GAP_BUFFER,
    AudioReduction,
    SpeechSegment,
    VadResult,
    compute_nonspeech_rms,
    get_nonspeech_segments,
    reduce_audio,
    restore_statement_timestamps,
    run_vad,
)


class TestVadResult:
    """Test VadResult dataclass."""

    def test_vad_result_fields(self):
        """VadResult should have all expected fields."""
        result = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            speech_segments=[(1.0, 3.0), (5.0, 8.0)],
        )

        assert result.duration == 10.0
        assert result.speech_duration == 5.0
        assert result.has_speech is True
        assert result.speech_segments == [(1.0, 3.0), (5.0, 8.0)]

    def test_vad_result_no_speech(self):
        """VadResult with no speech should have has_speech=False."""
        result = VadResult(
            duration=5.0,
            speech_duration=0.0,
            has_speech=False,
            speech_segments=[],
        )

        assert result.duration == 5.0
        assert result.speech_duration == 0.0
        assert result.has_speech is False
        assert result.speech_segments == []

    def test_vad_result_default_speech_segments(self):
        """VadResult speech_segments should default to empty list."""
        result = VadResult(
            duration=5.0,
            speech_duration=0.0,
            has_speech=False,
        )

        assert result.speech_segments == []

    def test_vad_result_rms_fields(self):
        """VadResult should have RMS fields with defaults."""
        result = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
        )

        # Default values
        assert result.noisy_rms is None
        assert result.noisy_s == 0.0

    def test_vad_result_with_rms(self):
        """VadResult should accept RMS values."""
        result = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.015,
            noisy_s=3.5,
        )

        assert result.noisy_rms == 0.015
        assert result.noisy_s == 3.5

    def test_is_noisy_above_threshold(self):
        """is_noisy() should return True when RMS exceeds threshold."""
        result = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.015,  # Above default 0.01 threshold
        )

        assert result.is_noisy() is True

    def test_is_noisy_below_threshold(self):
        """is_noisy() should return False when RMS is below threshold."""
        result = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.005,  # Below default 0.01 threshold
        )

        assert result.is_noisy() is False

    def test_is_noisy_none_rms(self):
        """is_noisy() should return False when RMS is None."""
        result = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=None,
        )

        assert result.is_noisy() is False

    def test_is_noisy_custom_threshold(self):
        """is_noisy() should respect custom threshold."""
        result = VadResult(
            duration=10.0,
            speech_duration=5.0,
            has_speech=True,
            noisy_rms=0.015,
        )

        # With default threshold (0.01), should be noisy
        assert result.is_noisy() is True

        # With higher threshold (0.02), should not be noisy
        assert result.is_noisy(threshold=0.02) is False

    def test_speech_ratio(self):
        """speech_ratio should return speech_duration / duration."""
        result = VadResult(
            duration=10.0,
            speech_duration=7.5,
            has_speech=True,
        )
        assert result.speech_ratio == 0.75

    def test_speech_ratio_zero_duration(self):
        """speech_ratio should return 0.0 when duration is zero."""
        result = VadResult(
            duration=0.0,
            speech_duration=0.0,
            has_speech=False,
        )
        assert result.speech_ratio == 0.0


class TestGetNonspeechSegments:
    """Test get_nonspeech_segments function."""

    def test_leading_silence(self):
        """Should detect leading silence before first speech."""
        speech_segments = [(2.0, 4.0)]
        nonspeech = get_nonspeech_segments(speech_segments, 5.0)

        assert (0.0, 2.0) in nonspeech

    def test_trailing_silence(self):
        """Should detect trailing silence after last speech."""
        speech_segments = [(1.0, 3.0)]
        nonspeech = get_nonspeech_segments(speech_segments, 5.0)

        assert (3.0, 5.0) in nonspeech

    def test_gap_between_segments(self):
        """Should detect gaps between speech segments."""
        speech_segments = [(1.0, 2.0), (4.0, 5.0)]
        nonspeech = get_nonspeech_segments(speech_segments, 6.0)

        assert (2.0, 4.0) in nonspeech

    def test_all_regions(self):
        """Should detect leading, middle, and trailing silence."""
        speech_segments = [(1.0, 2.0), (4.0, 5.0)]
        nonspeech = get_nonspeech_segments(speech_segments, 7.0)

        assert nonspeech == [(0.0, 1.0), (2.0, 4.0), (5.0, 7.0)]

    def test_no_speech_segments(self):
        """Should return empty list when no speech segments."""
        nonspeech = get_nonspeech_segments([], 5.0)

        assert nonspeech == []

    def test_speech_fills_entire_audio(self):
        """Should return empty list when speech fills entire audio."""
        speech_segments = [(0.0, 5.0)]
        nonspeech = get_nonspeech_segments(speech_segments, 5.0)

        assert nonspeech == []

    def test_adjacent_segments(self):
        """Should not create zero-length gaps between adjacent segments."""
        speech_segments = [(1.0, 2.0), (2.0, 3.0)]
        nonspeech = get_nonspeech_segments(speech_segments, 4.0)

        # Should only have leading and trailing, no gap between adjacent segments
        assert nonspeech == [(0.0, 1.0), (3.0, 4.0)]


class TestComputeNonspeechRms:
    """Test compute_nonspeech_rms function."""

    def test_silent_audio_returns_zero_rms(self):
        """Silent audio should have RMS near zero."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        speech_segments = [(1.0, 2.0)]  # Speech in middle

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        assert rms is not None
        assert rms < 0.001  # Effectively zero

    def test_noisy_audio_returns_high_rms(self):
        """Noisy audio should have measurable RMS."""
        # Create audio with noise (amplitude 0.1)
        audio = np.random.uniform(-0.1, 0.1, 5 * SAMPLE_RATE).astype(np.float32)
        # Put "speech" in middle (doesn't affect RMS calculation of non-speech)
        speech_segments = [(2.0, 3.0)]

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        assert rms is not None
        assert rms > 0.01  # Noisy threshold

    def test_returns_duration_used(self):
        """Should return total duration of non-speech segments used."""
        audio = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)
        # Speech from 2-4s and 6-8s, leaving gaps at 0-2, 4-6, 8-10
        speech_segments = [(2.0, 4.0), (6.0, 8.0)]

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        # All three gaps are >= 0.5s (MIN_NONSPEECH_SEGMENT)
        # Total non-speech: 2 + 2 + 2 = 6 seconds
        assert duration == 6.0

    def test_filters_short_segments(self):
        """Should filter out non-speech segments shorter than min_segment."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Speech leaves only 0.3s gaps (below default 0.5s threshold)
        speech_segments = [(0.3, 1.0), (1.3, 2.0), (2.3, 5.0)]

        rms, duration = compute_nonspeech_rms(audio, speech_segments, SAMPLE_RATE)

        # No qualifying segments
        assert rms is None
        assert duration == 0.0

    def test_no_speech_segments_returns_none(self):
        """Should return None when no speech segments (can't compute non-speech)."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)

        rms, duration = compute_nonspeech_rms(audio, [], SAMPLE_RATE)

        assert rms is None
        assert duration == 0.0

    def test_custom_min_segment(self):
        """Should respect custom min_segment threshold."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Speech from 1-2s, leaving 1s gap at start
        speech_segments = [(1.0, 2.0)]

        # With default 0.5s threshold, should include leading gap
        rms, duration = compute_nonspeech_rms(
            audio, speech_segments, SAMPLE_RATE, min_segment=0.5
        )
        assert duration == 4.0  # 1s leading + 3s trailing

        # With 2.0s threshold, should only include trailing gap (3s)
        rms, duration = compute_nonspeech_rms(
            audio, speech_segments, SAMPLE_RATE, min_segment=2.0
        )
        assert duration == 3.0


class TestRunVad:
    """Test run_vad function."""

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_silent_audio_returns_no_speech(self, mock_get_timestamps):
        """Silent audio should return has_speech=False."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        mock_get_timestamps.return_value = []

        result = run_vad(audio, min_speech_seconds=1.0)

        assert result.duration == 5.0
        assert result.speech_duration == 0.0
        assert result.has_speech is False

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_speech_audio_returns_has_speech(self, mock_get_timestamps):
        """Audio with speech should return has_speech=True."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Mock: 2 seconds of speech (samples 16000-48000)
        mock_get_timestamps.return_value = [{"start": 16000, "end": 48000}]

        result = run_vad(audio, min_speech_seconds=1.0)

        assert result.duration == 5.0
        assert result.speech_duration == 2.0
        assert result.has_speech is True
        # Speech segments should be converted to seconds
        assert result.speech_segments == [(1.0, 3.0)]

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_speech_below_threshold(self, mock_get_timestamps):
        """Speech below threshold should return has_speech=False."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Mock: 0.5 seconds of speech (below 1.0s threshold)
        mock_get_timestamps.return_value = [{"start": 0, "end": 8000}]

        result = run_vad(audio, min_speech_seconds=1.0)

        assert result.duration == 5.0
        assert result.speech_duration == 0.5
        assert result.has_speech is False

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_custom_min_speech_threshold(self, mock_get_timestamps):
        """Custom min_speech_seconds threshold should be respected."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Mock: 0.5 seconds of speech
        mock_get_timestamps.return_value = [{"start": 0, "end": 8000}]

        # With 0.3s threshold, should have speech
        result = run_vad(audio, min_speech_seconds=0.3)
        assert result.has_speech is True

        # With 1.0s threshold, should not have speech
        result = run_vad(audio, min_speech_seconds=1.0)
        assert result.has_speech is False

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_multiple_speech_chunks(self, mock_get_timestamps):
        """Multiple speech chunks should be summed correctly."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Mock: Two 1-second speech segments
        mock_get_timestamps.return_value = [
            {"start": 16000, "end": 32000},  # 1 second
            {"start": 48000, "end": 64000},  # 1 second
        ]

        result = run_vad(audio, min_speech_seconds=1.0)

        assert result.duration == 5.0
        assert result.speech_duration == 2.0
        assert result.has_speech is True

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_returns_rms_for_silent_background(self, mock_get_timestamps):
        """run_vad should return low RMS for silent non-speech regions."""
        # Silent audio (zeros)
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)
        # Speech from 1-3s, leaving non-speech at 0-1s and 3-5s
        mock_get_timestamps.return_value = [{"start": 16000, "end": 48000}]

        result = run_vad(audio, min_speech_seconds=1.0)

        assert result.noisy_rms is not None
        assert result.noisy_rms < 0.001  # Effectively zero
        assert result.noisy_s == 3.0  # 1s leading + 2s trailing

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_returns_rms_for_noisy_background(self, mock_get_timestamps):
        """run_vad should return measurable RMS for noisy non-speech regions."""
        # Noisy audio
        np.random.seed(42)
        audio = np.random.uniform(-0.1, 0.1, 5 * SAMPLE_RATE).astype(np.float32)
        # Speech from 1-3s
        mock_get_timestamps.return_value = [{"start": 16000, "end": 48000}]

        result = run_vad(audio, min_speech_seconds=1.0)

        assert result.noisy_rms is not None
        assert result.noisy_rms > 0.01  # Noisy threshold
        assert result.noisy_s == 3.0

    @patch("faster_whisper.vad.get_speech_timestamps")
    def test_returns_none_rms_when_no_qualifying_segments(self, mock_get_timestamps):
        """run_vad should return None RMS when no qualifying non-speech segments."""
        audio = np.zeros(2 * SAMPLE_RATE, dtype=np.float32)
        # Speech fills most of audio, leaving only 0.2s gaps (below 0.5s threshold)
        mock_get_timestamps.return_value = [
            {"start": 3200, "end": 12800},  # 0.2s to 0.8s
            {"start": 16000, "end": 28800},  # 1.0s to 1.8s
        ]

        result = run_vad(audio, min_speech_seconds=0.5)

        assert result.noisy_rms is None
        assert result.noisy_s == 0.0


class TestSpeechSegment:
    """Test SpeechSegment dataclass."""

    def test_speech_segment_fields(self):
        """SpeechSegment should have all expected fields."""
        seg = SpeechSegment(
            original_start=5.0,
            original_end=10.0,
            reduced_start=2.0,
            reduced_end=7.0,
        )

        assert seg.original_start == 5.0
        assert seg.original_end == 10.0
        assert seg.reduced_start == 2.0
        assert seg.reduced_end == 7.0


class TestAudioReduction:
    """Test AudioReduction dataclass and timestamp restoration."""

    def test_empty_reduction(self):
        """Empty reduction should return timestamp unchanged."""
        reduction = AudioReduction()
        assert reduction.restore_timestamp(5.0) == 5.0

    def test_single_segment_restoration(self):
        """Single segment should restore timestamps within segment."""
        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=3.0,
                    original_end=8.0,
                    reduced_start=0.0,
                    reduced_end=5.0,
                )
            ],
            original_duration=10.0,
            reduced_duration=5.0,
        )

        # Reduced time 0.0 -> original 3.0
        assert reduction.restore_timestamp(0.0) == 3.0

        # Reduced time 2.5 -> original 5.5 (midpoint)
        assert reduction.restore_timestamp(2.5) == 5.5

        # Reduced time 5.0 -> original 8.0
        assert reduction.restore_timestamp(5.0) == 8.0

    def test_multiple_segments_restoration(self):
        """Multiple segments should restore timestamps correctly."""
        # Simulates: original 10s audio with speech at [1-3] and [7-9]
        # with 4s gap trimmed to 2s, so reduced audio is 8s
        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=1.0,
                    original_end=3.0,
                    reduced_start=1.0,
                    reduced_end=3.0,
                ),
                SpeechSegment(
                    original_start=7.0,
                    original_end=9.0,
                    reduced_start=5.0,  # 3.0 + 2.0 gap
                    reduced_end=7.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=8.0,
        )

        # First segment: reduced 1.0 -> original 1.0
        assert reduction.restore_timestamp(1.0) == 1.0

        # First segment: reduced 2.0 -> original 2.0
        assert reduction.restore_timestamp(2.0) == 2.0

        # Second segment: reduced 5.0 -> original 7.0
        assert reduction.restore_timestamp(5.0) == 7.0

        # Second segment: reduced 6.0 -> original 8.0
        assert reduction.restore_timestamp(6.0) == 8.0

    def test_timestamp_in_gap(self):
        """Timestamp in reduced gap should map proportionally to original gap."""
        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=2.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
                SpeechSegment(
                    original_start=8.0,
                    original_end=10.0,
                    reduced_start=4.0,  # 2.0 + 2.0 reduced gap
                    reduced_end=6.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=6.0,
        )

        # Gap in reduced: 2.0-4.0 (2s), original: 2.0-8.0 (6s)
        # Reduced 3.0 is midpoint of gap -> original 5.0 (midpoint of 2-8)
        result = reduction.restore_timestamp(3.0)
        assert abs(result - 5.0) < 0.1  # Allow small tolerance

    def test_timestamp_after_all_segments(self):
        """Timestamp after all segments should extrapolate from last segment."""
        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=5.0,
                    reduced_start=0.0,
                    reduced_end=5.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=6.0,
        )

        # Reduced 6.0 is 1.0 after last segment end -> original 6.0
        assert reduction.restore_timestamp(6.0) == 6.0

    def test_timestamp_before_first_segment(self):
        """Timestamp before first segment should map to leading buffer region."""
        # Simulates: original audio with 5s silence then speech at [5-10]
        # Leading 5s gap reduced to 1s buffer, so speech starts at reduced 1.0
        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=5.0,
                    original_end=10.0,
                    reduced_start=1.0,  # 1s buffer before speech
                    reduced_end=6.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=6.0,
        )

        # Reduced 0.0 is 1.0 before first segment start (5.0) -> original 4.0
        assert reduction.restore_timestamp(0.0) == 4.0

        # Reduced 0.5 is 0.5 before first segment start (5.0) -> original 4.5
        assert reduction.restore_timestamp(0.5) == 4.5

        # Reduced 1.0 is exactly at first segment start -> original 5.0
        assert reduction.restore_timestamp(1.0) == 5.0


class TestRestoreSegmentTimestamps:
    """Test restore_statement_timestamps function."""

    def test_restores_segment_timestamps(self):
        """Should restore segment start and end timestamps."""
        segments = [
            {"id": 1, "start": 0.0, "end": 2.0, "text": "Hello"},
            {"id": 2, "start": 4.0, "end": 6.0, "text": "World"},
        ]

        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=2.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
                SpeechSegment(
                    original_start=6.0,
                    original_end=8.0,
                    reduced_start=4.0,
                    reduced_end=6.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=8.0,
        )

        restored = restore_statement_timestamps(segments, reduction)

        assert restored[0]["start"] == 0.0
        assert restored[0]["end"] == 2.0
        assert restored[1]["start"] == 6.0
        assert restored[1]["end"] == 8.0

    def test_restores_word_timestamps(self):
        """Should restore word-level timestamps."""
        segments = [
            {
                "id": 1,
                "start": 4.0,
                "end": 6.0,
                "text": "Hello world",
                "words": [
                    {"word": "Hello", "start": 4.0, "end": 5.0, "probability": 0.9},
                    {"word": "world", "start": 5.0, "end": 6.0, "probability": 0.9},
                ],
            },
        ]

        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=8.0,
                    original_end=10.0,
                    reduced_start=4.0,
                    reduced_end=6.0,
                ),
            ],
            original_duration=12.0,
            reduced_duration=8.0,
        )

        restored = restore_statement_timestamps(segments, reduction)

        assert restored[0]["start"] == 8.0
        assert restored[0]["end"] == 10.0
        assert restored[0]["words"][0]["start"] == 8.0
        assert restored[0]["words"][0]["end"] == 9.0
        assert restored[0]["words"][1]["start"] == 9.0
        assert restored[0]["words"][1]["end"] == 10.0

    def test_preserves_other_fields(self):
        """Should preserve non-timestamp fields."""
        segments = [
            {
                "id": 1,
                "start": 0.0,
                "end": 2.0,
                "text": "Hello",
                "custom_field": "preserved",
            },
        ]

        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=2.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
            ],
            original_duration=5.0,
            reduced_duration=2.0,
        )

        restored = restore_statement_timestamps(segments, reduction)

        assert restored[0]["text"] == "Hello"
        assert restored[0]["custom_field"] == "preserved"
        assert restored[0]["id"] == 1

    def test_handles_empty_segments(self):
        """Should handle empty segment list."""
        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=0.0,
                    original_end=5.0,
                    reduced_start=0.0,
                    reduced_end=5.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=5.0,
        )

        restored = restore_statement_timestamps([], reduction)
        assert restored == []

    def test_handles_segments_without_words(self):
        """Should handle segments without words field."""
        segments = [{"id": 1, "start": 0.0, "end": 2.0, "text": "Hello"}]

        reduction = AudioReduction(
            segments=[
                SpeechSegment(
                    original_start=5.0,
                    original_end=7.0,
                    reduced_start=0.0,
                    reduced_end=2.0,
                ),
            ],
            original_duration=10.0,
            reduced_duration=2.0,
        )

        restored = restore_statement_timestamps(segments, reduction)

        assert restored[0]["start"] == 5.0
        assert restored[0]["end"] == 7.0
        assert "words" not in restored[0]


class TestReduceAudio:
    """Test reduce_audio function."""

    def test_no_gaps_to_reduce(self):
        """Should return None when no gaps > 2s exist."""
        # 5s audio with speech from 0.5-1.5s and 2.0-3.0s (gap = 0.5s < 2s)
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)

        vad_result = VadResult(
            duration=5.0,
            speech_duration=2.0,
            has_speech=True,
            speech_segments=[(0.5, 1.5), (2.0, 3.0)],
        )

        reduced_audio, reduction = reduce_audio(audio, vad_result)

        assert reduced_audio is None
        assert reduction is None

    def test_no_speech_segments(self):
        """Should return None when no speech segments."""
        audio = np.zeros(5 * SAMPLE_RATE, dtype=np.float32)

        vad_result = VadResult(
            duration=5.0,
            speech_duration=0.0,
            has_speech=False,
            speech_segments=[],
        )

        reduced_audio, reduction = reduce_audio(audio, vad_result)

        assert reduced_audio is None
        assert reduction is None

    def test_leading_gap_reduction(self):
        """Should trim leading gap > 2s to GAP_BUFFER."""
        # 10s audio with speech starting at 5s (leading gap = 5s > 2s)
        audio = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)

        vad_result = VadResult(
            duration=10.0,
            speech_duration=3.0,
            has_speech=True,
            speech_segments=[(5.0, 8.0)],  # Speech from 5-8s
        )

        reduced_audio, reduction = reduce_audio(audio, vad_result)

        assert reduced_audio is not None
        assert reduction is not None

        # Should have: GAP_BUFFER (1s) + speech (3s) + trailing (2s) = 6s
        # But trailing is <= 2s, so kept in full
        expected_duration = GAP_BUFFER + 3.0 + 2.0  # 6s
        actual_duration = len(reduced_audio) / SAMPLE_RATE
        assert abs(actual_duration - expected_duration) < 0.1

        # Check mapping: speech should start at GAP_BUFFER in reduced audio
        assert len(reduction.segments) == 1
        assert reduction.segments[0].original_start == 5.0
        assert reduction.segments[0].reduced_start == GAP_BUFFER

    def test_trailing_gap_reduction(self):
        """Should trim trailing gap > 2s to GAP_BUFFER."""
        # 10s audio with speech from 1-3s (trailing gap = 7s > 2s)
        audio = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)

        vad_result = VadResult(
            duration=10.0,
            speech_duration=2.0,
            has_speech=True,
            speech_segments=[(1.0, 3.0)],  # Speech from 1-3s
        )

        reduced_audio, reduction = reduce_audio(audio, vad_result)

        assert reduced_audio is not None
        assert reduction is not None

        # Should have: leading (1s) + speech (2s) + GAP_BUFFER (1s) = 4s
        expected_duration = 1.0 + 2.0 + GAP_BUFFER  # 4s
        actual_duration = len(reduced_audio) / SAMPLE_RATE
        assert abs(actual_duration - expected_duration) < 0.1

    def test_middle_gap_reduction(self):
        """Should trim middle gap > 2s to 2*GAP_BUFFER."""
        # 10s audio with speech at 0-2s and 7-9s (gap = 5s > 2s)
        audio = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)

        vad_result = VadResult(
            duration=10.0,
            speech_duration=4.0,
            has_speech=True,
            speech_segments=[(0.0, 2.0), (7.0, 9.0)],
        )

        reduced_audio, reduction = reduce_audio(audio, vad_result)

        assert reduced_audio is not None
        assert reduction is not None

        # Should have: speech1 (2s) + trimmed gap (2s) + speech2 (2s) + trailing (1s) = 7s
        expected_duration = 2.0 + 2 * GAP_BUFFER + 2.0 + 1.0  # 7s
        actual_duration = len(reduced_audio) / SAMPLE_RATE
        assert abs(actual_duration - expected_duration) < 0.1

        # Check mapping
        assert len(reduction.segments) == 2
        assert reduction.segments[0].original_start == 0.0
        assert reduction.segments[0].reduced_start == 0.0
        assert reduction.segments[1].original_start == 7.0
        # Second segment should start at: speech1_end + trimmed_gap = 2.0 + 2.0 = 4.0
        assert abs(reduction.segments[1].reduced_start - 4.0) < 0.1

    def test_multiple_gaps_reduction(self):
        """Should trim multiple gaps > 2s."""
        # 20s audio with speech at 5-7, 12-14, and 19-20 (two big gaps)
        audio = np.zeros(20 * SAMPLE_RATE, dtype=np.float32)

        vad_result = VadResult(
            duration=20.0,
            speech_duration=5.0,
            has_speech=True,
            speech_segments=[(5.0, 7.0), (12.0, 14.0), (19.0, 20.0)],
        )

        reduced_audio, reduction = reduce_audio(audio, vad_result)

        assert reduced_audio is not None
        assert reduction is not None

        # Should have:
        # - Leading: GAP_BUFFER (1s)
        # - Speech1: 2s
        # - Gap1 (5s->2s): 2*GAP_BUFFER = 2s
        # - Speech2: 2s
        # - Gap2 (5s->2s): 2*GAP_BUFFER = 2s
        # - Speech3: 1s
        # Total: 1 + 2 + 2 + 2 + 2 + 1 = 10s
        expected_duration = (
            GAP_BUFFER + 2.0 + 2 * GAP_BUFFER + 2.0 + 2 * GAP_BUFFER + 1.0
        )
        actual_duration = len(reduced_audio) / SAMPLE_RATE
        assert abs(actual_duration - expected_duration) < 0.1

        # Check we have 3 speech segments in mapping
        assert len(reduction.segments) == 3

    def test_returns_numpy_array(self):
        """Should return numpy array."""
        audio = np.zeros(10 * SAMPLE_RATE, dtype=np.float32)

        vad_result = VadResult(
            duration=10.0,
            speech_duration=2.0,
            has_speech=True,
            speech_segments=[(5.0, 7.0)],  # Leading gap > 2s
        )

        reduced_audio, reduction = reduce_audio(audio, vad_result)

        assert isinstance(reduced_audio, np.ndarray)
        assert reduced_audio.dtype == np.float32