personal memory agent
at main 326 lines 11 kB view raw
# SPDX-License-Identifier: AGPL-3.0-only
# Copyright (c) 2026 sol pbc

"""Integration test for OpenAI provider with real API calls."""

import json
import os
import subprocess
from pathlib import Path

import pytest
from dotenv import load_dotenv

from tests.integration.conftest import require_cli_tool
from think.models import GPT_5


def get_fixtures_env():
    """Load the tests/fixtures/.env file and return the environment.

    Returns:
        Tuple of ``(fixtures_env_path, api_key, journal_path)``.  All three
        are ``None`` when the fixtures .env file does not exist; the latter
        two may individually be ``None`` when the variable is unset.
    """
    fixtures_env = Path(__file__).parent.parent / "fixtures" / ".env"
    if not fixtures_env.exists():
        return None, None, None

    # Load the env file (override=True so fixture values win over stale
    # values already present in the shell environment)
    load_dotenv(fixtures_env, override=True)

    api_key = os.getenv("OPENAI_API_KEY")
    journal_path = os.getenv("_SOLSTONE_JOURNAL_OVERRIDE")

    return fixtures_env, api_key, journal_path


def _env_or_skip():
    """Return a subprocess environment with API credentials, or skip.

    Consolidates the guard boilerplate shared by the CLI-driven tests:
    skips the calling test when the fixtures .env file or either required
    variable is missing, otherwise returns a copy of ``os.environ`` with
    the journal override and API key applied.
    """
    fixtures_env, api_key, journal_path = get_fixtures_env()

    if not fixtures_env:
        pytest.skip("tests/fixtures/.env not found")

    if not api_key:
        pytest.skip("OPENAI_API_KEY not found in tests/fixtures/.env file")

    if not journal_path:
        pytest.skip("_SOLSTONE_JOURNAL_OVERRIDE not found in tests/fixtures/.env file")

    env = os.environ.copy()
    env["_SOLSTONE_JOURNAL_OVERRIDE"] = journal_path
    env["OPENAI_API_KEY"] = api_key
    return env


def _run_provider_check(ndjson_input, env, timeout):
    """Run ``sol providers check`` with *ndjson_input* on stdin.

    Returns the completed :class:`subprocess.CompletedProcess` with text
    stdout/stderr captured.
    """
    return subprocess.run(
        ["sol", "providers", "check"],
        env=env,
        input=ndjson_input,
        capture_output=True,
        text=True,
        timeout=timeout,
    )


def _parse_events(stdout):
    """Parse JSONL *stdout* into a list of event dicts.

    Fails the calling test with a readable message on malformed JSON
    instead of raising a bare ``json.JSONDecodeError``.
    """
    events = []
    for line in stdout.strip().split("\n"):
        if line:
            try:
                events.append(json.loads(line))
            except json.JSONDecodeError as e:
                pytest.fail(f"Failed to parse JSON line: {line}\nError: {e}")
    return events


@pytest.mark.integration
@pytest.mark.requires_api
def test_openai_provider_basic():
    """Test OpenAI provider with basic prompt via CLI."""
    require_cli_tool("OpenAI", "codex")
    env = _env_or_skip()

    # Create NDJSON input (no tool config)
    ndjson_input = json.dumps(
        {
            "prompt": "what is 1+1? Just give me the number.",
            "provider": "openai",
            "name": "default",
            "model": GPT_5,
            "max_output_tokens": 100,
        }
    )

    # Run the `sol providers check` command
    result = _run_provider_check(ndjson_input, env, timeout=10)

    # Check that the command succeeded
    assert result.returncode == 0, f"Command failed with stderr: {result.stderr}"

    # Parse stdout events (should be JSONL format)
    events = _parse_events(result.stdout)

    # Verify we have events
    assert len(events) >= 2, (
        f"Expected at least start and finish events, got {len(events)}"
    )

    # Check start event
    start_event = events[0]
    assert start_event["event"] == "start"
    assert start_event["prompt"] == "what is 1+1? Just give me the number."
    assert start_event["model"] == GPT_5
    assert start_event["name"] == "default"
    assert isinstance(start_event["ts"], int)

    # Check finish event
    finish_event = events[-1]
    assert finish_event["event"] == "finish"
    assert isinstance(finish_event["ts"], int)
    assert "result" in finish_event

    # The result should contain "2"
    result_text = finish_event["result"].lower()
    assert "2" in result_text or "two" in result_text, (
        f"Expected '2' in response, got: {finish_event['result']}"
    )

    # Check for no errors
    error_events = [e for e in events if e.get("event") == "error"]
    assert len(error_events) == 0, f"Found error events: {error_events}"

    # Verify stderr has no errors (deprecation warnings from third-party libs are OK)
    if result.stderr:
        assert (
            "error" not in result.stderr.lower()
            or "deprecationwarning" in result.stderr.lower()
        ), f"Unexpected stderr content: {result.stderr}"


@pytest.mark.integration
@pytest.mark.requires_api
def test_openai_provider_with_reasoning():
    """Test OpenAI provider with reasoning model to verify thinking summaries.

    Uses GPT-5 which supports reasoning with summary="detailed" config.
    The key test is that:
    1. The request succeeds (reasoning config is valid)
    2. We may receive thinking events with summaries (model-dependent)
    3. If thinking events are present, they have the expected structure
    """
    require_cli_tool("OpenAI", "codex")
    env = _env_or_skip()

    # Use a prompt that encourages step-by-step reasoning
    ndjson_input = json.dumps(
        {
            "prompt": "If I have 3 apples and buy 5 more, then give away 2, how many do I have? Think through this step by step.",
            "provider": "openai",
            "name": "default",
            "model": GPT_5,
            "max_output_tokens": 500,
        }
    )

    # Run the `sol providers check` command (longer timeout for reasoning)
    result = _run_provider_check(ndjson_input, env, timeout=30)

    assert result.returncode == 0, f"Command failed with stderr: {result.stderr}"

    # Parse events
    events = _parse_events(result.stdout)

    # Verify no errors
    error_events = [e for e in events if e.get("event") == "error"]
    assert len(error_events) == 0, f"Found error events: {error_events}"

    # Check for thinking events - GPT-5 series should produce these
    # when reasoning config is properly set
    thinking_events = [e for e in events if e.get("event") == "thinking"]

    # If we have thinking events, verify their structure
    for thinking in thinking_events:
        assert "summary" in thinking, f"Thinking event missing 'summary': {thinking}"
        assert isinstance(thinking["summary"], str), (
            f"Thinking summary should be string: {thinking}"
        )
        assert len(thinking["summary"]) > 0, "Thinking summary should not be empty"
        assert "model" in thinking, f"Thinking event missing 'model': {thinking}"
        assert "ts" in thinking, f"Thinking event missing 'ts': {thinking}"
        assert isinstance(thinking["ts"], int), "Timestamp should be int"

    # Verify the answer is correct (6 apples: 3 + 5 - 2 = 6)
    assert events, "Expected at least a finish event, got no events"
    finish_event = events[-1]
    assert finish_event["event"] == "finish"
    result_text = finish_event["result"].lower()
    assert "6" in result_text or "six" in result_text, (
        f"Expected '6' in response, got: {finish_event['result']}"
    )

    # Log whether we got thinking events for debugging
    print(f"Received {len(thinking_events)} thinking events")


@pytest.mark.integration
@pytest.mark.requires_api
def test_openai_provider_with_extra_context():
    """Test OpenAI provider with extra_context to verify Responses API format.

    This exercises the session.add_items() path that was broken when content type
    was 'text' instead of 'input_text'. The key assertion is that we don't get
    the 400 error about invalid content type.
    """
    require_cli_tool("OpenAI", "codex")
    env = _env_or_skip()

    # Include extra_context like get_talent() does in production
    # This exercises the _convert_turns_to_items() code path
    ndjson_input = json.dumps(
        {
            "prompt": "What project was mentioned in the context above? Just the name.",
            "provider": "openai",
            "name": "default",
            "model": GPT_5,
            "max_output_tokens": 50,
            "extra_context": "## Project Context\nYou are working on Project Moonshot.",
        }
    )

    # Run the `sol providers check` command
    result = _run_provider_check(ndjson_input, env, timeout=15)

    # Parse stdout events (return code deliberately not asserted here:
    # the regression check below inspects error events directly)
    events = _parse_events(result.stdout)

    # The critical check: no 400 error about invalid content type
    # This was the original bug - using 'text' instead of 'input_text'
    error_events = [e for e in events if e.get("event") == "error"]
    for err in error_events:
        error_msg = err.get("error", "")
        assert "Invalid value: 'text'" not in error_msg, (
            f"Got content type format error - regression! Error: {error_msg}"
        )
        assert "input_text" not in error_msg or "Supported values" not in error_msg, (
            f"Got content type format error - regression! Error: {error_msg}"
        )

    # Verify we got past the format validation (start event was emitted)
    start_events = [e for e in events if e.get("event") == "start"]
    assert len(start_events) == 1, "Should have start event"

    # If we get a finish event, verify the response references the context
    finish_events = [e for e in events if e.get("event") == "finish"]
    if finish_events:
        result_text = finish_events[0].get("result", "").lower()
        assert "moonshot" in result_text, (
            f"Expected 'moonshot' in response, got: {finish_events[0].get('result')}"
        )


@pytest.mark.integration
@pytest.mark.requires_api
def test_openai_json_truncation_detection():
    """Test that OpenAI provider detects JSON response truncation via finish_reason.

    Uses a small max_output_tokens to force truncation, verifying that
    the provider returns finish_reason='max_tokens' which callers can use
    to detect incomplete responses.
    """
    # This test calls the provider module directly, so it only needs the
    # API key (no journal override, no CLI tool).
    fixtures_env, api_key, _ = get_fixtures_env()

    if not fixtures_env:
        pytest.skip("tests/fixtures/.env not found")

    if not api_key:
        pytest.skip("OPENAI_API_KEY not found in tests/fixtures/.env file")

    # Import provider directly for this test
    from think.providers import openai as openai_provider

    # Request JSON output with small token limit to force truncation
    # Use run_generate which returns GenerateResult, then check finish_reason
    result = openai_provider.run_generate(
        contents="Return a JSON array of the first 50 prime numbers.",
        model=GPT_5,
        json_output=True,
        max_output_tokens=50,  # Too small to complete the response
    )

    # Verify truncation was detected via finish_reason
    assert result["finish_reason"] == "max_tokens", (
        f"Expected max_tokens finish_reason, got: {result['finish_reason']}"
    )
    # Partial text should be present
    assert isinstance(result["text"], str)