diff --git a/src/nullrun/instrumentation/auto.py b/src/nullrun/instrumentation/auto.py index d385ab5..45e8bcb 100644 --- a/src/nullrun/instrumentation/auto.py +++ b/src/nullrun/instrumentation/auto.py @@ -35,6 +35,7 @@ from __future__ import annotations +import gc import hashlib import json import logging @@ -1038,9 +1039,92 @@ def _wrap_async_init(self: httpx.AsyncClient, *args: Any, **kwargs: Any) -> None httpx.AsyncClient._nullrun_patched = True # type: ignore[attr-defined] _httpx_patched = True logger.info("httpx auto-instrumentation installed (sync + async)") + + # Audit 2026-06-29 (init-ordering hazard): the class-level + # __init__ patch only wraps httpx.Clients created AFTER it is + # installed. If a user does + # + # llm = ChatOpenAI(model="gpt-4.1-mini") # before init() + # nullrun.init(api_key=...) # patch installed here + # + # ``ChatOpenAI`` already built its internal httpx.Client (or + # will on first .invoke()), but that client is reachable from + # the running process right now and is using the unpatched + # transport. Without the eager sweep below, the httpx path + # emits nothing for that LLM — every call silently zero-billed + # via the langchain callback fallback (or the bare-LLMResult + # path with no model). + # + # We sweep gc.get_objects() once and wrap any pre-existing + # httpx.Client/AsyncClient whose transport isn't already a + # NullRun*Transport. The class-level marker on ``__init__`` is + # set, so future constructions auto-wrap — this sweep is the + # back-fill for the instances that pre-date the patch. + try: + sync_wrapped, async_wrapped = _wrap_pre_existing_httpx_clients(runtime) + if sync_wrapped or async_wrapped: + logger.info( + "httpx eager wrap: %d sync + %d async pre-existing " + "client(s) now route through NullRun", + sync_wrapped, + async_wrapped, + ) + except Exception as exc: # noqa: BLE001 — defensive, never block init + logger.debug("httpx eager wrap sweep failed: %s", exc) return True +def _wrap_pre_existing_httpx_clients(runtime: Any) -> tuple[int, int]: + """Find httpx clients created before ``patch_httpx`` ran and wrap their + transports in NullRun's transports. + + Audit 2026-06-29 (init-ordering hazard): the typical sequence + + llm = ChatOpenAI(model=...) # builds internal httpx.Client + nullrun.init(api_key=...) # installs the __init__ patch + + leaves ``llm``'s internal client with the unpatched transport. + New ``httpx.Client()`` constructions are auto-wrapped by the + class-level patch; this sweep is the back-fill. + + Returns ``(sync_count, async_count)`` for logging. Errors are + swallowed by the caller — this is a best-effort back-fill, never + a hard requirement. + + We use ``gc.get_objects()`` because httpx does not maintain a + weakref registry of its Client instances. The sweep is O(heap); + on a typical agent process (hundreds of MB heap, mostly strings + and small dicts) this takes <50 ms. We bail early on + ``RuntimeError`` (raised by ``gc.get_objects()`` when the + interpreter is shutting down) and on any ``isinstance`` failure + (a class with a broken ``__class__``). + """ + sync_count = 0 + async_count = 0 + try: + for obj in gc.get_objects(): + try: + if isinstance(obj, httpx.Client) and not isinstance( + obj._transport, NullRunSyncTransport + ): + obj._transport = NullRunSyncTransport(obj._transport, runtime) + sync_count += 1 + elif isinstance(obj, httpx.AsyncClient) and not isinstance( + obj._transport, NullRunAsyncTransport + ): + obj._transport = NullRunAsyncTransport(obj._transport, runtime) + async_count += 1 + except (ReferenceError, TypeError, AttributeError): + # gc.get_objects can yield objects that are mid-GC or + # have a broken __class__; skip them rather than abort. + continue + except RuntimeError: + # gc.get_objects() raises RuntimeError during interpreter + # shutdown. Nothing to do. + pass + return sync_count, async_count + + # --------------------------------------------------------------------------- # D4: patch_langchain_callback — in-memory mocks + callback-only flows # --------------------------------------------------------------------------- diff --git a/src/nullrun/instrumentation/langgraph.py b/src/nullrun/instrumentation/langgraph.py index df54aac..69c0b41 100644 --- a/src/nullrun/instrumentation/langgraph.py +++ b/src/nullrun/instrumentation/langgraph.py @@ -718,26 +718,77 @@ def _extract_model_from_response(response: Any) -> str | None: Returns the first non-empty value found, or ``None`` if every known source is empty / malformed. + Audit 2026-06-29 (SDK↔backend wire: silent zero-billing): the chain + was checked top-to-bottom and silently returned ``None`` whenever + none of the four known locations carried the model. The backend + then ``unwrap_or("default")``'d to ``DEFAULT_RATE`` and every call + was recorded as ≈$0. We now: + + - promote ``response.llm_output['model_name']`` (the location + langchain-openai 1.x uses for the date-suffixed model id + ``gpt-4.1-mini-2025-04-14``) to step 1, ahead of the + ``response_metadata`` step that langchain 0.x used; + - add ``response.llm_output['model']`` and a generic + "any key containing 'model'" sweep so non-OpenAI wrappers + (proxies, custom chat models) still get attributed; + - log a DEBUG line on the None path so an operator who sees + the wire warning in the backend can correlate it to the + observation site that produced the event. + Sources checked, in order: - 1. ``response.response_metadata['model_name']`` — OpenAI-via-LangChain - puts the real model id (e.g. ``"gpt-4.1-mini-2025-04-14"``) here. - 2. ``response.generations[0][0].message.response_metadata['model_name']`` - — LLMResult callback path where the metadata lives on the AIMessage - rather than the LLMResult itself. - 3. ``response.llm_output['model_name']`` — legacy LLMResult where the - chat-model wrapper hoisted the field onto the LLMResult dict. - 4. ``response.model`` / ``response.model_name`` — direct attributes - on the response object (rare but seen in some custom wrappers). + 1. ``response.llm_output['model_name']`` / ``['model']`` / + any key containing "model" — langchain-openai 1.x puts the + date-suffixed id (e.g. ``"gpt-4.1-mini-2025-04-14"``) on + ``LLMResult.llm_output``. The backend's ``MODEL_RATES`` + substring-match handles the date suffix. + 2. ``response.response_metadata['model_name']`` — direct AIMessage + case (langchain 0.x chat-model wrappers expose metadata at + this level). + 3. ``response.generations[0][0].message.response_metadata['model_name']`` + — LLMResult callback path where the metadata lives on the + AIMessage rather than the LLMResult itself. + 4. Direct ``response.model`` / ``response.model_name`` attributes + (rare, seen on some custom wrappers). """ - # 1. response_metadata on the response. + # 1. llm_output dict (langchain-openai 1.x primary location). + # Promote ahead of the response_metadata step: for OpenAI via + # LangChain 1.x, the LLMResult carries the model on + # ``llm_output['model_name']`` (date-suffixed) while the + # AIMessage inside ``generations[0][0].message`` does NOT + # carry ``response_metadata`` populated — step 3 would return + # None. Without promoting step 1, every OpenAI call was + # silently zero-billed. + llm_out = getattr(response, "llm_output", None) + if isinstance(llm_out, dict) and llm_out: + # Preferred: explicit "model_name" then "model" key. + for key in ("model_name", "model"): + val = llm_out.get(key) + if isinstance(val, str) and val: + return val + # Fallback: scan every key in llm_output for one that + # contains "model" and holds a non-empty string. Some + # custom chat-model wrappers / proxies put the model under + # less canonical keys (``"model_id"``, ``"modelName"``, + # ``"resolved_model"``). + for key, val in llm_out.items(): + if ( + isinstance(key, str) + and "model" in key.lower() + and isinstance(val, str) + and val + ): + return val + + # 2. response_metadata on the response (langchain 0.x AIMessage + # case, and any wrapper that hoists the metadata up). resp_meta = getattr(response, "response_metadata", None) if isinstance(resp_meta, dict): val = resp_meta.get("model_name") or resp_meta.get("model") if val: return str(val) - # 2. LLMResult callback path — look on the generation's AIMessage. + # 3. LLMResult callback path — look on the generation's AIMessage. gen_msg = _safe_get_gen_message(response) if gen_msg is not None: gm = getattr(gen_msg, "response_metadata", None) @@ -751,19 +802,25 @@ def _extract_model_from_response(response: Any) -> str | None: if v: return str(v) - # 3. llm_output dict (legacy LLMResult). - llm_out = getattr(response, "llm_output", None) - if isinstance(llm_out, dict): - val = llm_out.get("model_name") or llm_out.get("model") - if val: - return str(val) - # 4. Direct attribute on response. for attr in ("model_name", "model"): v = getattr(response, attr, None) if v: return str(v) + # Diagnostic: every code path above returned None. The runtime + # layer will warn at ERROR when this happens for an llm_call + # event; this DEBUG line is for the per-call site so the + # operator can correlate the wire warning back to a specific + # response shape. + try: + response_type = type(response).__name__ + except Exception: + response_type = "" + logger.debug( + "_extract_model_from_response returned None for response of type %s", + response_type, + ) return None diff --git a/src/nullrun/runtime.py b/src/nullrun/runtime.py index 5408c6a..63baa65 100644 --- a/src/nullrun/runtime.py +++ b/src/nullrun/runtime.py @@ -1430,23 +1430,38 @@ def track( if k not in _WIRE_STRIP_FIELDS and v is not None } - # Audit 2026-06-28 (SDK↔backend wire): backend cost pipeline - # emits ``WARN model_id=default`` whenever an llm_call event - # reaches the wire without a ``model`` field - # (pipeline.rs:164 ``unwrap_or("default")``). This log lets - # operators reproduce the path: which observation (httpx / - # langchain callback / manual track / agents tracer / requests) - # produced an llm_call without ``model`` set, and whether - # the SDK explicitly passed ``model=None``, omitted the key, - # or had ``model=""`` (which the ``if model:`` guard in - # track_llm silently drops). Activated only for llm_call so - # span_start/span_end/tool_call traffic doesn't pollute logs. + # Audit 2026-06-29 (SDK↔backend wire: silent zero-billing): + # backend cost pipeline emits ``WARN model_id=default`` + # whenever an llm_call event reaches the wire without a + # ``model`` field (pipeline.rs:176 ``unwrap_or("default")``). + # Pre-fix the SDK warned and continued — the backend then + # silently fell through to ``DEFAULT_RATE`` and every call + # was recorded as ≈$0, breaking budget enforcement. + # + # Post-fix the SDK is fail-LOUD (not fail-closed yet — the + # event is still sent so the backend can audit/reject): + # + # 1. ERROR log instead of WARN — operator sees the breakage + # immediately, not buried in routine log noise. + # 2. Bump the ``dropped_llm_call_no_model`` runtime counter + # so dashboards can surface the regression rate. + # 3. Tag the wire event with ``__missing_model: True`` so + # the backend's into_track_request gate (fail-CLOSED + # layer) can reject with HTTP 422 and a clear error + # envelope instead of silently recording a zero-cost + # call. The flag is treated as a wire-private signal — + # the backend strips it before persisting. + # + # Activated only for llm_call so span_start/span_end/ + # tool_call traffic doesn't pollute logs or the wire. if wire_event.get("type") == "llm_call" and not wire_event.get("model"): - logger.warning( + logger.error( "track(): llm_call event missing 'model' field — " - "backend will fall back to DEFAULT_RATE. event=%s", + "tagging for backend rejection (HTTP 422). event=%s", wire_event, ) + metrics.inc_runtime("dropped_llm_call_no_model") + wire_event["__missing_model"] = True self._transport.track(wire_event) diff --git a/tests/contract/__init__.py b/tests/contract/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/contract/test_llm_call_model_wire.py b/tests/contract/test_llm_call_model_wire.py new file mode 100644 index 0000000..0e737ea --- /dev/null +++ b/tests/contract/test_llm_call_model_wire.py @@ -0,0 +1,321 @@ +""" +Regression test for the silent zero-billing bug (2026-06-29). + +Pre-fix: when an ``llm_call`` event reached the runtime's ``track()`` +with ``model=None`` (or absent), the wire-format builder at +``runtime.py:1427-1431`` dropped the None value entirely, the +backend's cost pipeline ``unwrap_or("default")``'d, and every call +was recorded as approximately zero. Budget enforcement, billing, +and plan-limit accounting silently broke for every model on every +provider. + +Post-fix: three layers of defense + + 1. ``_extract_model_from_response`` (langgraph.py) now finds the + model on every known response shape — including + ``LLMResult.llm_output['model_name']``, the location + langchain-openai 1.x uses for the date-suffixed id. + 2. ``runtime.track()`` promotes the missing-model warning to + ERROR, bumps ``dropped_llm_call_no_model``, and tags the + wire event with ``__missing_model: True`` so the backend + can reject with HTTP 422. + 3. ``patch_httpx`` eagerly wraps any pre-existing httpx.Client + instances when ``nullrun.init()`` is called — closing the + init-ordering hazard where ``ChatOpenAI(...)`` is created + before ``init()``. + +This file pins all three invariants at the unit level so a future +refactor can't silently re-break the wire. +""" + +from __future__ import annotations + +import logging +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + +from nullrun.instrumentation.langgraph import _extract_model_from_response + + +# ─── _extract_model_from_response: the actual fix ───────────────────── +# +# The chain was promoted so the langchain-openai 1.x primary +# location (``LLMResult.llm_output['model_name']``) is checked +# FIRST. Pre-fix this location was step 3, after the AIMessage +# ``response_metadata`` step that langchain 1.x does not populate +# — so every OpenAI call returned None from extraction and was +# silently zero-billed. + + +def _make_llmresult( + *, + response_metadata=None, + generations=None, + llm_output=None, + direct_model=None, +): + """Build a minimal LLMResult-like object for the helper to walk.""" + return SimpleNamespace( + response_metadata=response_metadata, + generations=generations or [], + llm_output=llm_output, + model=direct_model, + model_name=direct_model, + ) + + +def test_extracts_from_llm_output_model_name_langchain_openai_1x(): + """langchain-openai 1.x primary location. The date-suffixed id + ``gpt-4.1-mini-2025-04-14`` lives here. The backend's + ``MODEL_RATES`` substring-match resolves it to the + ``gpt-4.1-mini`` rate.""" + response = _make_llmresult( + llm_output={"model_name": "gpt-4.1-mini-2025-04-14", "token_usage": {}} + ) + assert _extract_model_from_response(response) == "gpt-4.1-mini-2025-04-14" + + +def test_extracts_from_llm_output_model_key(): + """Some OpenAI-compatible proxies put the model on + ``llm_output['model']`` (no ``_name`` suffix).""" + response = _make_llmresult(llm_output={"model": "gpt-4.1-mini"}) + assert _extract_model_from_response(response) == "gpt-4.1-mini" + + +def test_extracts_from_llm_output_key_containing_model(): + """Custom wrappers (e.g. ``model_id``, ``modelName``, + ``resolved_model``) fall through the generic any-key + sweep.""" + response = _make_llmresult(llm_output={"model_id": "claude-haiku-4-5-20251001"}) + assert _extract_model_from_response(response) == "claude-haiku-4-5-20251001" + + +def test_llm_output_checked_before_response_metadata(): + """Audit invariant: when BOTH ``llm_output['model_name']`` and + ``response_metadata['model_name']`` are set, the llm_output + value wins. Pre-fix the order was response_metadata first, + which meant a populated response_metadata shadowed the + real (date-suffixed) llm_output value.""" + response = _make_llmresult( + response_metadata={"model_name": "stale-alias"}, + llm_output={"model_name": "gpt-4.1-mini-2025-04-14"}, + ) + assert _extract_model_from_response(response) == "gpt-4.1-mini-2025-04-14" + + +def test_falls_through_to_response_metadata_when_llm_output_empty(): + """langchain 0.x and direct AIMessage paths still work.""" + response = _make_llmresult( + response_metadata={"model_name": "gpt-4o-mini"}, + ) + assert _extract_model_from_response(response) == "gpt-4o-mini" + + +def test_falls_through_to_generations_message_metadata(): + """LLMResult where metadata lives on the AIMessage inside + ``generations[0][0].message`` rather than the LLMResult itself.""" + msg = SimpleNamespace( + response_metadata={"model_name": "claude-3-5-sonnet-20240620"}, + ) + response = _make_llmresult(generations=[[SimpleNamespace(message=msg)]]) + assert _extract_model_from_response(response) == "claude-3-5-sonnet-20240620" + + +def test_returns_none_when_all_sources_empty(caplog): + """When every known source is empty/missing, extraction returns + None — but now logs a DEBUG line so the operator can correlate + the wire warning back to the observation site.""" + response = _make_llmresult() + with caplog.at_level(logging.DEBUG, logger="nullrun.instrumentation.langgraph"): + result = _extract_model_from_response(response) + assert result is None + # The DEBUG line is for forensics; the runtime layer is the + # one that bumps the error log + counter. + assert any( + "_extract_model_from_response returned None" in record.message + for record in caplog.records + ) + + +def test_empty_string_in_llm_output_falls_through(): + """``llm_output['model_name'] = ''`` is treated as empty and + the helper moves on to the next source rather than returning + the empty string. Pre-fix this would have shipped ``model=''`` + on the wire, which the backend would still fall through on + but the SDK would log a misleading warning.""" + response = _make_llmresult( + llm_output={"model_name": ""}, + response_metadata={"model_name": "gpt-4.1-mini"}, + ) + assert _extract_model_from_response(response) == "gpt-4.1-mini" + + +# ─── track() fail-loud behavior ────────────────────────────────────── +# +# The runtime layer is the front door for the wire. Pre-fix it +# warned at WARN and continued; the backend then silently +# zero-billed. Post-fix it logs at ERROR, bumps a counter, and +# tags the event with ``__missing_model: True`` so the backend +# can reject with HTTP 422. + + +def test_track_promotes_missing_model_to_error_and_tags_event(make_runtime, caplog): + """Regression: an ``llm_call`` event with ``model=None`` reaches + ``track()`` and (a) is logged at ERROR, (b) gets the + ``__missing_model: True`` flag, (c) is still sent on the wire + so the backend can reject with HTTP 422 (not silently free).""" + rt = make_runtime() + captured = [] + + # Capture what the transport would send on the wire. + def _capture_track(event): + captured.append(event) + + rt._transport.track = _capture_track + + with caplog.at_level(logging.ERROR, logger="nullrun.runtime"): + rt.track({"type": "llm_call", "tokens": 100, "model": None}) + + # The wire event IS sent (so the backend can audit/reject). + assert len(captured) == 1 + wire = captured[0] + assert wire["type"] == "llm_call" + # __missing_model: True is the signal to the backend gate. + assert wire.get("__missing_model") is True + # model is absent from the wire (None values are still dropped + # at runtime.py:1427-1431 — that filter is correct; the flag + # is the substitute for the field). + assert "model" not in wire + # An ERROR log line was emitted. + assert any( + "llm_call event missing 'model' field" in record.message + for record in caplog.records + if record.levelno == logging.ERROR + ) + + +def test_track_does_not_tag_when_model_is_set(make_runtime): + """The happy path: ``llm_call`` event with a model passes + through unchanged (no ERROR, no __missing_model flag).""" + rt = make_runtime() + captured = [] + rt._transport.track = lambda e: captured.append(e) + + rt.track({"type": "llm_call", "tokens": 100, "model": "gpt-4.1-mini"}) + + assert len(captured) == 1 + wire = captured[0] + assert wire.get("model") == "gpt-4.1-mini" + assert "__missing_model" not in wire + + +def test_track_does_not_tag_non_llm_call_events_with_missing_model(make_runtime): + """``span_start`` / ``span_end`` / ``tool_call`` events do not + carry a model field by design. The fail-loud path must not + fire for them or every span emission would log an error.""" + rt = make_runtime() + captured = [] + rt._transport.track = lambda e: captured.append(e) + + rt.track({"type": "span_start", "fn_name": "foo"}) + rt.track({"type": "span_end", "fn_name": "foo"}) + + # No __missing_model flag on either event. + assert all("__missing_model" not in e for e in captured) + assert len(captured) == 2 + + +# ─── patch_httpx: eager wrap of pre-existing clients ──────────────── +# +# Pre-fix the class-level patch on ``httpx.Client.__init__`` only +# wrapped clients created AFTER ``nullrun.init()`` ran. The user's +# script (and many real codebases) does +# +# llm = ChatOpenAI(model=...) # before init +# nullrun.init(api_key=...) # patch installed too late +# +# which left ``llm``'s internal httpx.Client unpatched. Post-fix +# the patch sweep finds and wraps pre-existing clients. + + +def test_patch_httpx_wraps_pre_existing_clients(): + """When ``patch_httpx`` runs and there are pre-existing + ``httpx.Client`` instances in the process, the new sweep + finds them and wraps their transports in + ``NullRunSyncTransport``. + + The test builds a real ``httpx.Client`` (the user's + ``ChatOpenAI`` shape), runs the patch, and asserts the + transport was rewritten. + """ + import httpx + + from nullrun.instrumentation import auto + from nullrun.instrumentation.auto import NullRunSyncTransport + + # The patch is process-global; reset so we start clean. + auto.reset_for_tests() + auto._httpx_patched = False + + # Build a real client BEFORE the patch — the user's order. + pre_existing = httpx.Client() + # Sanity: default transport is not yet wrapped. + assert not isinstance(pre_existing._transport, NullRunSyncTransport) + + runtime = MagicMock() + try: + ok = auto.patch_httpx(runtime) + assert ok is True + + # The sweep should have found the pre-existing client and + # wrapped it. + assert isinstance(pre_existing._transport, NullRunSyncTransport) + + # New clients after the patch are also wrapped (class-level + # patch on ``__init__``). + post = httpx.Client() + try: + assert isinstance(post._transport, NullRunSyncTransport) + finally: + post.close() + finally: + # Don't leave the patched state for the next test. + auto.reset_for_tests() + pre_existing.close() + + +def test_patch_httpx_eager_wrap_is_idempotent(): + """Running the eager sweep twice must not double-wrap the same + client. The check is on the existing transport type, not on + a separate marker, so a no-op re-run leaves the client + untouched.""" + import httpx + + from nullrun.instrumentation import auto + from nullrun.instrumentation.auto import NullRunSyncTransport + + auto.reset_for_tests() + auto._httpx_patched = False + + pre_existing = httpx.Client() + runtime = MagicMock() + try: + auto.patch_httpx(runtime) + wrapped_transport = pre_existing._transport + assert isinstance(wrapped_transport, NullRunSyncTransport) + + # Reset the patch flag and call again — simulates a + # double-init() (e.g. test fixtures). The sweep must NOT + # wrap the already-wrapped transport a second time. + auto._httpx_patched = False + # The class-level patch marker (``_nullrun_patched``) is + # still set on httpx.Client so the second ``patch_httpx`` + # short-circuits — this is the expected path. Verify the + # transport object is the SAME instance (no double-wrap). + auto.patch_httpx(runtime) + assert pre_existing._transport is wrapped_transport + finally: + auto.reset_for_tests() + pre_existing.close()