Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions src/nullrun/instrumentation/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

from __future__ import annotations

import gc
import hashlib
import json
import logging
Expand Down Expand Up @@ -1038,9 +1039,92 @@ def _wrap_async_init(self: httpx.AsyncClient, *args: Any, **kwargs: Any) -> None
httpx.AsyncClient._nullrun_patched = True # type: ignore[attr-defined]
_httpx_patched = True
logger.info("httpx auto-instrumentation installed (sync + async)")

# Audit 2026-06-29 (init-ordering hazard): the class-level
# __init__ patch only wraps httpx.Clients created AFTER it is
# installed. If a user does
#
# llm = ChatOpenAI(model="gpt-4.1-mini") # before init()
# nullrun.init(api_key=...) # patch installed here
#
# ``ChatOpenAI`` already built its internal httpx.Client (or
# will on first .invoke()), but that client is reachable from
# the running process right now and is using the unpatched
# transport. Without the eager sweep below, the httpx path
# emits nothing for that LLM — every call silently zero-billed
# via the langchain callback fallback (or the bare-LLMResult
# path with no model).
#
# We sweep gc.get_objects() once and wrap any pre-existing
# httpx.Client/AsyncClient whose transport isn't already a
# NullRun*Transport. The class-level marker on ``__init__`` is
# set, so future constructions auto-wrap — this sweep is the
# back-fill for the instances that pre-date the patch.
try:
sync_wrapped, async_wrapped = _wrap_pre_existing_httpx_clients(runtime)
if sync_wrapped or async_wrapped:
logger.info(
"httpx eager wrap: %d sync + %d async pre-existing "
"client(s) now route through NullRun",
sync_wrapped,
async_wrapped,
)
except Exception as exc: # noqa: BLE001 — defensive, never block init
logger.debug("httpx eager wrap sweep failed: %s", exc)
return True


def _wrap_pre_existing_httpx_clients(runtime: Any) -> tuple[int, int]:
    """Back-fill NullRun transports onto httpx clients that pre-date the patch.

    Audit 2026-06-29 (init-ordering hazard): the typical sequence

        llm = ChatOpenAI(model=...)    # builds internal httpx.Client
        nullrun.init(api_key=...)      # installs the __init__ patch

    leaves ``llm``'s internal client with the unpatched transport.
    New ``httpx.Client()`` constructions are auto-wrapped by the
    class-level patch; this sweep is the back-fill for instances that
    already exist.

    ``gc.get_objects()`` is used because httpx does not maintain a
    registry of its Client instances. The sweep is O(heap); the
    original audit note estimates <50 ms on a typical agent process.

    Failure isolation: this is a best-effort back-fill and never
    raises for a single misbehaving heap object. ``isinstance`` may
    execute arbitrary user code (lazy/proxy objects expose ``__class__``
    as a property), and wrapping may fail on a half-initialised client,
    so *any* ``Exception`` raised while examining one object skips that
    object only — it must not abort the sweep and hide the remaining
    clients. Only a ``RuntimeError`` from ``gc.get_objects()`` itself
    (expected during interpreter shutdown) ends the sweep early.

    Args:
        runtime: The NullRun runtime handed to each wrapping transport.

    Returns:
        ``(sync_count, async_count)`` — how many ``httpx.Client`` and
        ``httpx.AsyncClient`` instances were newly wrapped. Clients whose
        transport is already a NullRun transport are not counted.
    """
    try:
        heap_snapshot = gc.get_objects()
    except RuntimeError:
        # Guard kept from the original implementation: the interpreter
        # is presumably shutting down, so there is nothing to back-fill.
        return 0, 0

    sync_count = 0
    async_count = 0
    for obj in heap_snapshot:
        try:
            if isinstance(obj, httpx.Client) and not isinstance(
                obj._transport, NullRunSyncTransport
            ):
                obj._transport = NullRunSyncTransport(obj._transport, runtime)
                sync_count += 1
            elif isinstance(obj, httpx.AsyncClient) and not isinstance(
                obj._transport, NullRunAsyncTransport
            ):
                obj._transport = NullRunAsyncTransport(obj._transport, runtime)
                async_count += 1
        except Exception:  # noqa: BLE001 — one bad object must not stop the sweep
            # Covers objects that are mid-GC, have a broken or lazy
            # ``__class__``, lack ``_transport`` (half-built client), or
            # whose wrapping raised. The client, if any, keeps its
            # original transport.
            continue
    return sync_count, async_count


# ---------------------------------------------------------------------------
# D4: patch_langchain_callback — in-memory mocks + callback-only flows
# ---------------------------------------------------------------------------
Expand Down
93 changes: 75 additions & 18 deletions src/nullrun/instrumentation/langgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,26 +718,77 @@ def _extract_model_from_response(response: Any) -> str | None:
Returns the first non-empty value found, or ``None`` if every known
source is empty / malformed.

Audit 2026-06-29 (SDK↔backend wire: silent zero-billing): the chain
was checked top-to-bottom and silently returned ``None`` whenever
none of the four known locations carried the model. The backend
then ``unwrap_or("default")``'d to ``DEFAULT_RATE`` and every call
was recorded as ≈$0. We now:

- promote ``response.llm_output['model_name']`` (the location
langchain-openai 1.x uses for the date-suffixed model id
``gpt-4.1-mini-2025-04-14``) to step 1, ahead of the
``response_metadata`` step that langchain 0.x used;
- add ``response.llm_output['model']`` and a generic
"any key containing 'model'" sweep so non-OpenAI wrappers
(proxies, custom chat models) still get attributed;
- log a DEBUG line on the None path so an operator who sees
the wire warning in the backend can correlate it to the
observation site that produced the event.

Sources checked, in order:

1. ``response.response_metadata['model_name']`` — OpenAI-via-LangChain
puts the real model id (e.g. ``"gpt-4.1-mini-2025-04-14"``) here.
2. ``response.generations[0][0].message.response_metadata['model_name']``
— LLMResult callback path where the metadata lives on the AIMessage
rather than the LLMResult itself.
3. ``response.llm_output['model_name']`` — legacy LLMResult where the
chat-model wrapper hoisted the field onto the LLMResult dict.
4. ``response.model`` / ``response.model_name`` — direct attributes
on the response object (rare but seen in some custom wrappers).
1. ``response.llm_output['model_name']`` / ``['model']`` /
any key containing "model" — langchain-openai 1.x puts the
date-suffixed id (e.g. ``"gpt-4.1-mini-2025-04-14"``) on
``LLMResult.llm_output``. The backend's ``MODEL_RATES``
substring-match handles the date suffix.
2. ``response.response_metadata['model_name']`` — direct AIMessage
case (langchain 0.x chat-model wrappers expose metadata at
this level).
3. ``response.generations[0][0].message.response_metadata['model_name']``
— LLMResult callback path where the metadata lives on the
AIMessage rather than the LLMResult itself.
4. Direct ``response.model`` / ``response.model_name`` attributes
(rare, seen on some custom wrappers).
"""
# 1. response_metadata on the response.
# 1. llm_output dict (langchain-openai 1.x primary location).
# Promote ahead of the response_metadata step: for OpenAI via
# LangChain 1.x, the LLMResult carries the model on
# ``llm_output['model_name']`` (date-suffixed) while the
# AIMessage inside ``generations[0][0].message`` does NOT
# carry ``response_metadata`` populated — step 3 would return
# None. Without promoting step 1, every OpenAI call was
# silently zero-billed.
llm_out = getattr(response, "llm_output", None)
if isinstance(llm_out, dict) and llm_out:
# Preferred: explicit "model_name" then "model" key.
for key in ("model_name", "model"):
val = llm_out.get(key)
if isinstance(val, str) and val:
return val
# Fallback: scan every key in llm_output for one that
# contains "model" and holds a non-empty string. Some
# custom chat-model wrappers / proxies put the model under
# less canonical keys (``"model_id"``, ``"modelName"``,
# ``"resolved_model"``).
for key, val in llm_out.items():
if (
isinstance(key, str)
and "model" in key.lower()
and isinstance(val, str)
and val
):
return val

# 2. response_metadata on the response (langchain 0.x AIMessage
# case, and any wrapper that hoists the metadata up).
resp_meta = getattr(response, "response_metadata", None)
if isinstance(resp_meta, dict):
val = resp_meta.get("model_name") or resp_meta.get("model")
if val:
return str(val)

# 2. LLMResult callback path — look on the generation's AIMessage.
# 3. LLMResult callback path — look on the generation's AIMessage.
gen_msg = _safe_get_gen_message(response)
if gen_msg is not None:
gm = getattr(gen_msg, "response_metadata", None)
Expand All @@ -751,19 +802,25 @@ def _extract_model_from_response(response: Any) -> str | None:
if v:
return str(v)

# 3. llm_output dict (legacy LLMResult).
llm_out = getattr(response, "llm_output", None)
if isinstance(llm_out, dict):
val = llm_out.get("model_name") or llm_out.get("model")
if val:
return str(val)

# 4. Direct attribute on response.
for attr in ("model_name", "model"):
v = getattr(response, attr, None)
if v:
return str(v)

# Diagnostic: every code path above returned None. The runtime
# layer will warn at ERROR when this happens for an llm_call
# event; this DEBUG line is for the per-call site so the
# operator can correlate the wire warning back to a specific
# response shape.
try:
response_type = type(response).__name__
except Exception:
response_type = "<unknown>"
logger.debug(
"_extract_model_from_response returned None for response of type %s",
response_type,
)
return None


Expand Down
41 changes: 28 additions & 13 deletions src/nullrun/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1430,23 +1430,38 @@ def track(
if k not in _WIRE_STRIP_FIELDS and v is not None
}

# Audit 2026-06-28 (SDK↔backend wire): backend cost pipeline
# emits ``WARN model_id=default`` whenever an llm_call event
# reaches the wire without a ``model`` field
# (pipeline.rs:164 ``unwrap_or("default")``). This log lets
# operators reproduce the path: which observation (httpx /
# langchain callback / manual track / agents tracer / requests)
# produced an llm_call without ``model`` set, and whether
# the SDK explicitly passed ``model=None``, omitted the key,
# or had ``model=""`` (which the ``if model:`` guard in
# track_llm silently drops). Activated only for llm_call so
# span_start/span_end/tool_call traffic doesn't pollute logs.
# Audit 2026-06-29 (SDK↔backend wire: silent zero-billing):
# backend cost pipeline emits ``WARN model_id=default``
# whenever an llm_call event reaches the wire without a
# ``model`` field (pipeline.rs:176 ``unwrap_or("default")``).
# Pre-fix the SDK warned and continued — the backend then
# silently fell through to ``DEFAULT_RATE`` and every call
# was recorded as ≈$0, breaking budget enforcement.
#
# Post-fix the SDK is fail-LOUD (not fail-closed yet — the
# event is still sent so the backend can audit/reject):
#
# 1. ERROR log instead of WARN — operator sees the breakage
# immediately, not buried in routine log noise.
# 2. Bump the ``dropped_llm_call_no_model`` runtime counter
# so dashboards can surface the regression rate.
# 3. Tag the wire event with ``__missing_model: True`` so
# the backend's into_track_request gate (fail-CLOSED
# layer) can reject with HTTP 422 and a clear error
# envelope instead of silently recording a zero-cost
# call. The flag is treated as a wire-private signal —
# the backend strips it before persisting.
#
# Activated only for llm_call so span_start/span_end/
# tool_call traffic doesn't pollute logs or the wire.
if wire_event.get("type") == "llm_call" and not wire_event.get("model"):
logger.warning(
logger.error(
"track(): llm_call event missing 'model' field — "
"backend will fall back to DEFAULT_RATE. event=%s",
"tagging for backend rejection (HTTP 422). event=%s",
wire_event,
)
metrics.inc_runtime("dropped_llm_call_no_model")
wire_event["__missing_model"] = True

self._transport.track(wire_event)

Expand Down
Empty file added tests/contract/__init__.py
Empty file.
Loading
Loading