# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

import importlib

import pytest

from think.utils import day_path


def test_cluster(tmp_path, monkeypatch):
    """Test cluster() uses transcripts and agent output summaries (*.md files)."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")
    # Write JSONL format: metadata first, then entry in segment directory
    (day_dir / "default" / "120000_300").mkdir(parents=True)
    (day_dir / "default" / "120000_300" / "audio.jsonl").write_text(
        '{}\n{"text": "hi"}\n'
    )
    (day_dir / "default" / "120500_300").mkdir(parents=True)
    (day_dir / "default" / "120500_300" / "agents").mkdir()
    (day_dir / "default" / "120500_300" / "agents" / "screen.md").write_text(
        "screen summary"
    )
    result, counts = mod.cluster(
        "20240101", sources={"transcripts": True, "percepts": False, "agents": True}
    )
    assert counts["transcripts"] == 1
    assert counts["agents"] == 1
    assert "### Transcript" in result
    # Now uses insight rendering: "### {stem} summary"
    assert "screen summary" in result


def test_cluster_range(tmp_path, monkeypatch):
    """Test cluster_range with transcripts and agents sources."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")
    # Write JSONL format: metadata first, then entry with proper start time and source in segment directory
    (day_dir / "default" / "120000_300").mkdir(parents=True)
    (day_dir / "default" / "120000_300" / "audio.jsonl").write_text(
        '{"raw": "raw.flac", "model": "whisper-1"}\n'
        '{"start": "00:00:01", "source": "mic", "text": "hi from audio"}\n'
    )
    (day_dir / "default" / "120000_300" / "agents").mkdir()
    (day_dir / "default" / "120000_300" / "agents" / "screen.md").write_text(
        "screen summary content"
    )
    # Test with agents=True to include *.md files
    md = mod.cluster_range(
        "20240101",
        "120000",
        "120100",
        sources={"transcripts": True, "percepts": False, "agents": True},
    )
    # Check that the function works and includes expected sections
    assert "### Transcript" in md
    # Now uses insight rendering: "### {stem} summary"
    assert "screen summary" in md
    assert "screen summary content" in md


def test_cluster_scan(tmp_path, monkeypatch):
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")
    # Audio transcripts at 09:01, 09:05, 09:20 and 11:00 (JSONL format with empty metadata)
    (day_dir / "default" / "090101_300").mkdir(parents=True)
    (day_dir / "default" / "090101_300" / "audio.jsonl").write_text("{}\n")
    (day_dir / "default" / "090500_300").mkdir(parents=True)
    (day_dir / "default" / "090500_300" / "audio.jsonl").write_text("{}\n")
    (day_dir / "default" / "092000_300").mkdir(parents=True)
    (day_dir / "default" / "092000_300" / "audio.jsonl").write_text("{}\n")
    (day_dir / "default" / "110000_300").mkdir(parents=True)
    (day_dir / "default" / "110000_300" / "audio.jsonl").write_text("{}\n")
    # Screen transcripts at 10:01, 10:05, 10:20 and 12:00
    (day_dir / "default" / "100101_300").mkdir(parents=True)
    (day_dir / "default" / "100101_300" / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
    )
    (day_dir / "default" / "100500_300").mkdir(parents=True)
    (day_dir / "default" / "100500_300" / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
    )
    (day_dir / "default" / "102000_300").mkdir(parents=True)
    (day_dir / "default" / "102000_300" / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
    )
    (day_dir / "default" / "120000_300").mkdir(parents=True)
    (day_dir / "default" / "120000_300" / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
    )
    audio_ranges, screen_ranges = mod.cluster_scan("20240101")
    # Expected ranges: 15-minute slot grouping (segments 09:01-09:05-09:20 group together)
    # Slots: 09:00, 09:00, 09:15 -> ranges: 09:00-09:30; 11:00 -> 11:00-11:15
    assert audio_ranges == [("09:00", "09:30"), ("11:00", "11:15")]
    assert screen_ranges == [("10:00", "10:30"), ("12:00", "12:15")]


def test_cluster_segments(tmp_path, monkeypatch):
    """Test cluster_segments returns individual segments with their types."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with duration: 090000_300 (09:00:00 for 5 minutes)
    (day_dir / "default" / "090000_300").mkdir(parents=True)
    (day_dir / "default" / "090000_300" / "audio.jsonl").write_text("{}\n")

    # Create segment with both audio and screen
    (day_dir / "default" / "100000_600").mkdir(parents=True)
    (day_dir / "default" / "100000_600" / "audio.jsonl").write_text("{}\n")
    (day_dir / "default" / "100000_600" / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
    )

    # Create segment with only screen
    (day_dir / "default" / "110000_300").mkdir(parents=True)
    (day_dir / "default" / "110000_300" / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
    )

    segments = mod.cluster_segments("20240101")

    assert len(segments) == 3

    # Check first segment (audio only)
    assert segments[0]["key"] == "090000_300"
    assert segments[0]["start"] == "09:00"
    assert segments[0]["end"] == "09:05"
    assert segments[0]["types"] == ["audio"]

    # Check second segment (both transcripts and screen)
    assert segments[1]["key"] == "100000_600"
    assert segments[1]["start"] == "10:00"
    assert segments[1]["end"] == "10:10"
    assert "audio" in segments[1]["types"]
    assert "screen" in segments[1]["types"]

    # Check third segment (screen only)
    assert segments[2]["key"] == "110000_300"
    assert segments[2]["start"] == "11:00"
    assert segments[2]["end"] == "11:05"
    assert segments[2]["types"] == ["screen"]


def test_cluster_period_uses_raw_screen(tmp_path, monkeypatch):
    """Test cluster_period uses raw screen.jsonl, not insight *.md files."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with both audio and raw screen data
    segment = day_dir / "default" / "100000_300"
    segment.mkdir(parents=True)
    (segment / "audio.jsonl").write_text(
        '{"raw": "audio.flac"}\n{"start": "00:00:01", "text": "hello"}\n'
    )
    # Raw screen.jsonl with frame analysis (what cluster_period should use)
    (segment / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
        '{"timestamp": 10, "analysis": {"primary": "code_editor", '
        '"visual_description": "VS Code with Python file"}}\n'
    )
    # Also create screen.md (insight) to verify it's NOT used by cluster_period
    (segment / "agents").mkdir()
    (segment / "agents" / "screen.md").write_text("This insight should NOT appear")

    result, counts = mod.cluster_period(
        "20240101",
        "100000_300",
        sources={"transcripts": True, "percepts": True, "agents": False},
    )

    # Should have both transcript and screen entries
    assert counts["transcripts"] == 1
    assert counts["percepts"] == 1
    assert "### Transcript" in result
    # Should use raw screen format header
    assert "Screen Activity" in result
    # Raw screen content should be present
    assert "VS Code with Python file" in result
    # Insight content should NOT be present (agents=False for cluster_period)
    assert "This insight should NOT appear" not in result


def test_load_entries_from_toplevel_segment(tmp_path, monkeypatch):
    """_load_entries_from_segment resolves the day for top-level segment dirs."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")
    segment = day_dir / "100000_300"
    segment.mkdir()

    mod = importlib.import_module("think.cluster")

    entries = mod._load_entries_from_segment(
        str(segment),
        transcripts=True,
        percepts=False,
        agents=False,
    )

    assert entries == []


def test_cluster_range_with_agents(tmp_path, monkeypatch):
    """Test cluster_range with agents source loads all *.md files."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with multiple insight files
    segment = day_dir / "default" / "100000_300"
    segment.mkdir(parents=True)
    (segment / "agents").mkdir()
    (segment / "audio.jsonl").write_text(
        '{"raw": "audio.flac"}\n{"start": "00:00:01", "text": "hello"}\n'
    )
    (segment / "agents" / "screen.md").write_text("Screen activity summary")
    (segment / "agents" / "activity.md").write_text("Activity insight content")
    # Also create screen.jsonl to verify it's NOT used when agents=True, screen=False
    (segment / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
        '{"timestamp": 10, "analysis": {"primary": "code_editor"}}\n'
    )

    # Test agents=True returns *.md summaries, not raw screen data
    result = mod.cluster_range(
        "20240101",
        "100000",
        "100500",
        sources={"transcripts": True, "percepts": False, "agents": True},
    )

    assert "### Transcript" in result
    # Should include both .md files as agent outputs
    assert "### screen summary" in result
    assert "Screen activity summary" in result
    assert "### activity summary" in result
    assert "Activity insight content" in result
    # Should NOT include raw screen data
    assert "code_editor" not in result


def test_cluster_range_with_screen(tmp_path, monkeypatch):
    """Test cluster_range with screen source loads raw screen.jsonl data."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with raw screen data and insight file
    segment = day_dir / "default" / "100000_300"
    segment.mkdir(parents=True)
    (segment / "agents").mkdir()
    (segment / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
        '{"timestamp": 10, "analysis": {"primary": "code_editor"}}\n'
    )
    (segment / "agents" / "screen.md").write_text("Screen summary insight")

    # Test screen=True returns raw screen data, not agent outputs
    result = mod.cluster_range(
        "20240101",
        "100000",
        "100500",
        sources={"transcripts": False, "percepts": True, "agents": False},
    )

    assert "Screen Activity" in result
    assert "code_editor" in result
    # Should NOT include insight content
    assert "Screen summary insight" not in result
    assert "### screen summary" not in result


def test_cluster_range_with_multiple_screen_files(tmp_path, monkeypatch):
    """Test cluster_range loads multiple *_screen.jsonl files per segment."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with multiple screen files (like multi-monitor setup)
    segment = day_dir / "default" / "100000_300"
    segment.mkdir(parents=True)
    (segment / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
        '{"timestamp": 10, "analysis": {"primary": "code_editor", '
        '"visual_description": "Primary monitor with VS Code"}}\n'
    )
    (segment / "monitor_2_screen.jsonl").write_text(
        '{"raw": "monitor_2.webm"}\n'
        '{"timestamp": 10, "analysis": {"primary": "browser", '
        '"visual_description": "Secondary monitor with documentation"}}\n'
    )

    # Test screen=True returns data from both screen files
    result = mod.cluster_range(
        "20240101",
        "100000",
        "100500",
        sources={"transcripts": False, "percepts": True, "agents": False},
    )

    # Should include content from both screen files
    assert "Primary monitor with VS Code" in result
    assert "Secondary monitor with documentation" in result


def test_cluster_scan_with_split_screen(tmp_path, monkeypatch):
    """Test cluster_scan detects *_screen.jsonl files."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with only *_screen.jsonl (no screen.jsonl)
    (day_dir / "default" / "100000_300").mkdir(parents=True)
    (day_dir / "default" / "100000_300" / "monitor_1_screen.jsonl").write_text(
        '{"raw": "m1.webm"}\n'
    )

    audio_ranges, screen_ranges = mod.cluster_scan("20240101")

    # Should detect the segment as having screen content (15-minute slot grouping)
    assert screen_ranges == [("10:00", "10:15")]


def test_cluster_segments_with_split_screen(tmp_path, monkeypatch):
    """Test cluster_segments detects *_screen.jsonl files."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with only *_screen.jsonl (no screen.jsonl)
    (day_dir / "default" / "100000_300").mkdir(parents=True)
    (day_dir / "default" / "100000_300" / "wayland_screen.jsonl").write_text(
        '{"raw": "w.webm"}\n'
    )

    segments = mod.cluster_segments("20240101")

    assert len(segments) == 1
    assert segments[0]["key"] == "100000_300"
    assert "screen" in segments[0]["types"]


def test_cluster_span(tmp_path, monkeypatch):
    """Test cluster_span processes a span of segments."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create three segments with different content
    (day_dir / "default" / "090000_300").mkdir(parents=True)
    (day_dir / "default" / "090000_300" / "audio.jsonl").write_text(
        '{"raw": "audio.flac"}\n{"start": "00:00:01", "text": "morning segment"}\n'
    )

    (day_dir / "default" / "100000_300").mkdir(parents=True)
    (day_dir / "default" / "100000_300" / "audio.jsonl").write_text(
        '{"raw": "audio.flac"}\n{"start": "00:00:01", "text": "mid-morning segment"}\n'
    )
    (day_dir / "default" / "100000_300" / "screen.jsonl").write_text(
        '{"raw": "screen.webm"}\n'
        '{"timestamp": 10, "analysis": {"primary": "code_editor"}}\n'
    )

    (day_dir / "default" / "110000_300").mkdir(parents=True)
    (day_dir / "default" / "110000_300" / "audio.jsonl").write_text(
        '{"raw": "audio.flac"}\n{"start": "00:00:01", "text": "late morning segment"}\n'
    )

    # Process only first and third segments as a span (audio only, no screen)
    result, counts = mod.cluster_span(
        "20240101",
        ["090000_300", "110000_300"],
        sources={"transcripts": True, "percepts": False, "agents": False},
    )

    # Should have 2 transcript entries (one per segment)
    assert counts["transcripts"] == 2
    assert counts["percepts"] == 0
    assert "morning segment" in result
    assert "late morning segment" in result
    # Should NOT include the skipped segment
    assert "mid-morning segment" not in result
    assert "code_editor" not in result


def test_cluster_span_missing_segment(tmp_path, monkeypatch):
    """Test cluster_span fails fast when segment is missing."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create only one segment
    (day_dir / "default" / "090000_300").mkdir(parents=True)
    (day_dir / "default" / "090000_300" / "audio.jsonl").write_text(
        '{"raw": "audio.flac"}\n'
    )

    # Try to process existing and non-existing segments
    with pytest.raises(ValueError) as exc_info:
        mod.cluster_span(
            "20240101",
            ["090000_300", "100000_300"],
            sources={"transcripts": True, "percepts": False, "agents": False},
        )

    assert "100000_300" in str(exc_info.value)
    assert "not found" in str(exc_info.value)


def test_cluster_with_agent_filter_dict(tmp_path, monkeypatch):
    """Test cluster() with dict-valued agents source for selective filtering."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with multiple agent output files
    segment = day_dir / "default" / "120000_300"
    segment.mkdir(parents=True)
    (segment / "agents").mkdir()
    (segment / "audio.jsonl").write_text('{}\n{"text": "hello"}\n')
    (segment / "agents" / "entities.md").write_text("Entity extraction results")
    (segment / "agents" / "meetings.md").write_text("Meeting summary results")
    (segment / "agents" / "flow.md").write_text("Flow analysis results")

    # Test filtering to only include entities
    result, counts = mod.cluster(
        "20240101",
        sources={"transcripts": True, "percepts": False, "agents": {"entities": True}},
    )

    assert counts["transcripts"] == 1
    assert counts["agents"] == 1  # Only entities should be counted
    assert "Entity extraction results" in result
    assert "Meeting summary results" not in result
    assert "Flow analysis results" not in result


def test_cluster_with_agent_filter_multiple(tmp_path, monkeypatch):
    """Test cluster() with dict selecting multiple agents."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with multiple agent output files
    segment = day_dir / "default" / "120000_300"
    segment.mkdir(parents=True)
    (segment / "agents").mkdir()
    (segment / "audio.jsonl").write_text('{}\n{"text": "hello"}\n')
    (segment / "agents" / "entities.md").write_text("Entity extraction results")
    (segment / "agents" / "meetings.md").write_text("Meeting summary results")
    (segment / "agents" / "flow.md").write_text("Flow analysis results")

    # Test filtering to include entities and meetings but not flow
    result, counts = mod.cluster(
        "20240101",
        sources={
            "transcripts": True,
            "percepts": False,
            "agents": {"entities": True, "meetings": "required", "flow": False},
        },
    )

    assert counts["transcripts"] == 1
    assert counts["agents"] == 2  # entities + meetings
    assert "Entity extraction results" in result
    assert "Meeting summary results" in result
    assert "Flow analysis results" not in result


def test_cluster_with_agent_filter_app_namespaced(tmp_path, monkeypatch):
    """Test cluster() with dict filtering app-namespaced agent outputs."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    # Create segment with app-namespaced agent output files
    # App agent output naming: "app:agent" -> "_app_agent.md"
    segment = day_dir / "default" / "120000_300"
    segment.mkdir(parents=True)
    (segment / "agents").mkdir()
    (segment / "audio.jsonl").write_text('{}\n{"text": "hello"}\n')
    (segment / "agents" / "entities.md").write_text("System entity results")
    (segment / "agents" / "_todos_review.md").write_text("Todos review results")

    # Test filtering to include app-namespaced agent
    result, counts = mod.cluster(
        "20240101",
        sources={
            "transcripts": True,
            "percepts": False,
            "agents": {"entities": False, "todos:review": True},
        },
    )

    assert counts["transcripts"] == 1
    assert counts["agents"] == 1  # Only todos:review
    assert "System entity results" not in result
    assert "Todos review results" in result


def test_cluster_with_empty_agent_filter(tmp_path, monkeypatch):
    """Test cluster() with empty dict means no agents."""
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    segment = day_dir / "default" / "120000_300"
    segment.mkdir(parents=True)
    (segment / "agents").mkdir()
    (segment / "audio.jsonl").write_text('{}\n{"text": "hello"}\n')
    (segment / "agents" / "entities.md").write_text("Entity extraction results")

    # Empty dict should mean no agents
    result, counts = mod.cluster(
        "20240101",
        sources={"transcripts": True, "percepts": False, "agents": {}},
    )

    assert counts["transcripts"] == 1
    assert counts["agents"] == 0
    assert "Entity extraction results" not in result


def test_filename_to_agent_key():
    """Test _filename_to_agent_key conversion."""
    from think.cluster import _filename_to_agent_key

    # System agents
    assert _filename_to_agent_key("entities") == "entities"
    assert _filename_to_agent_key("flow") == "flow"

    # App-namespaced agents
    assert _filename_to_agent_key("_todos_review") == "todos:review"
    assert _filename_to_agent_key("_entities_observer") == "entities:observer"

    # Edge case: single underscore component
    assert _filename_to_agent_key("_app") == "_app"  # No second part, returns as-is


def test_agent_matches_filter():
    """Test _agent_matches_filter logic."""
    from think.cluster import _agent_matches_filter

    # None filter means all agents
    assert _agent_matches_filter("entities", None) is True
    assert _agent_matches_filter("_todos_review", None) is True

    # Empty dict means no agents
    assert _agent_matches_filter("entities", {}) is False
    assert _agent_matches_filter("_todos_review", {}) is False

    # Specific filtering
    filter_dict = {"entities": True, "meetings": False, "todos:review": "required"}
    assert _agent_matches_filter("entities", filter_dict) is True
    assert _agent_matches_filter("meetings", filter_dict) is False
    assert _agent_matches_filter("_todos_review", filter_dict) is True
    assert _agent_matches_filter("flow", filter_dict) is False  # Not in filter


def test_scan_day_combined(tmp_path, monkeypatch):
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))
    day_dir = day_path("20240101")

    mod = importlib.import_module("think.cluster")

    first = day_dir / "default" / "090000_300"
    first.mkdir(parents=True)
    (first / "audio.jsonl").write_text("{}\n")
    (first / "screen.jsonl").write_text('{"raw": "screen.webm"}\n')

    second = day_dir / "default" / "093000_300"
    second.mkdir(parents=True)
    (second / "audio.jsonl").write_text("{}\n")

    audio_ranges, screen_ranges, segments = mod.scan_day("20240101")
    expected_ranges = mod.cluster_scan("20240101")
    expected_segments = mod.cluster_segments("20240101")

    assert audio_ranges == [("09:00", "09:15"), ("09:30", "09:45")]
    assert screen_ranges == [("09:00", "09:15")]
    assert segments == [
        {
            "key": "090000_300",
            "start": "09:00",
            "end": "09:05",
            "types": ["audio", "screen"],
            "stream": "default",
        },
        {
            "key": "093000_300",
            "start": "09:30",
            "end": "09:35",
            "types": ["audio"],
            "stream": "default",
        },
    ]
    assert (audio_ranges, screen_ranges) == expected_ranges
    assert segments == expected_segments


def test_scan_day_empty(tmp_path, monkeypatch):
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))

    mod = importlib.import_module("think.cluster")

    assert mod.scan_day("20250101") == ([], [], [])


def test_day_path_create_false(tmp_path, monkeypatch):
    monkeypatch.setenv("_SOLSTONE_JOURNAL_OVERRIDE", str(tmp_path))

    missing = day_path("29990101", create=False)
    assert not missing.exists()

    created = day_path("29990101")
    assert created.exists()