nullrunio · maltsev-dev · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
diff --git a/src/nullrun/instrumentation/auto.py b/src/nullrun/instrumentation/auto.py
@@ -35,6 +35,7 @@
 
 from __future__ import annotations
 
+import gc
 import hashlib
 import json
 import logging
@@ -1038,9 +1039,92 @@ def _wrap_async_init(self: httpx.AsyncClient, *args: Any, **kwargs: Any) -> None
         httpx.AsyncClient._nullrun_patched = True  # type: ignore[attr-defined]
         _httpx_patched = True
         logger.info("httpx auto-instrumentation installed (sync + async)")
+
+        # Audit 2026-06-29 (init-ordering hazard): the class-level
+        # __init__ patch only wraps httpx.Clients created AFTER it is
+        # installed. If a user does
+        #
+        #     llm = ChatOpenAI(model="gpt-4.1-mini")  # before init()
+        #     nullrun.init(api_key=...)              # patch installed here
+        #
+        # ``ChatOpenAI`` already built its internal httpx.Client (or
+        # will on first .invoke()), but that client is reachable from
+        # the running process right now and is using the unpatched
+        # transport. Without the eager sweep below, the httpx path
+        # emits nothing for that LLM — every call is silently
+        # zero-billed via the langchain callback fallback (or the
+        # bare-LLMResult path with no model).
+        #
+        # We sweep gc.get_objects() once and wrap any pre-existing
+        # httpx.Client/AsyncClient whose transport isn't already a
+        # NullRun*Transport. The class-level marker on ``__init__`` is
+        # set, so future constructions auto-wrap — this sweep is the
+        # back-fill for the instances that pre-date the patch.
+        try:
+            sync_wrapped, async_wrapped = _wrap_pre_existing_httpx_clients(runtime)
+            if sync_wrapped or async_wrapped:
+                logger.info(
+                    "httpx eager wrap: %d sync + %d async pre-existing "
+                    "client(s) now route through NullRun",
+                    sync_wrapped,
+                    async_wrapped,
+                )
+        except Exception as exc:  # noqa: BLE001 — defensive, never block init
+            logger.debug("httpx eager wrap sweep failed: %s", exc)
         return True
 
 
+def _wrap_pre_existing_httpx_clients(runtime: Any) -> tuple[int, int]:
+    """Wrap the transports of httpx clients created before ``patch_httpx``.
+
+    Audit 2026-06-29 (init-ordering hazard): the typical sequence
+
+        llm = ChatOpenAI(model=...)  # builds internal httpx.Client
+        nullrun.init(api_key=...)    # installs the __init__ patch
+
+    leaves ``llm``'s internal client with the unpatched transport.
+    New ``httpx.Client()`` constructions are auto-wrapped by the
+    class-level patch; this sweep is the back-fill.
+
+    We use ``gc.get_objects()`` because httpx does not maintain a
+    weakref registry of its Client instances, so the sweep is O(heap).
+    Each ``httpx.Client`` / ``httpx.AsyncClient`` whose ``_transport``
+    is not already the matching NullRun transport gets it replaced by
+    one wrapping the old transport and ``runtime``.
+
+    Best-effort: an object that cannot be inspected or wrapped is
+    skipped, whatever it raises. ``isinstance`` reads ``__class__``,
+    which proxy objects may implement with arbitrary code, so one odd
+    object must not end the sweep for the clients that follow it.
+
+    Returns ``(sync_count, async_count)`` for logging. Anything that
+    still escapes is swallowed by the caller.
+    """
+    sync_count = 0
+    async_count = 0
+    try:
+        candidates = gc.get_objects()
+    except RuntimeError:
+        # Kept from the original, which expected RuntimeError here
+        # during interpreter shutdown (unverified). Nothing to do.
+        return sync_count, async_count
+    for obj in candidates:
+        try:
+            if isinstance(obj, httpx.Client):
+                if not isinstance(obj._transport, NullRunSyncTransport):
+                    obj._transport = NullRunSyncTransport(obj._transport, runtime)
+                    sync_count += 1
+            elif isinstance(obj, httpx.AsyncClient):
+                if not isinstance(obj._transport, NullRunAsyncTransport):
+                    obj._transport = NullRunAsyncTransport(obj._transport, runtime)
+                    async_count += 1
+        except Exception:  # noqa: BLE001 — per-object isolation
+            # Mid-GC objects, a broken ``__class__``, or a proxy that
+            # raises on attribute access: skip it, keep sweeping.
+            continue
+    return sync_count, async_count
+
+
 # ---------------------------------------------------------------------------
 # D4: patch_langchain_callback — in-memory mocks + callback-only flows
 # ---------------------------------------------------------------------------

diff --git a/src/nullrun/instrumentation/langgraph.py b/src/nullrun/instrumentation/langgraph.py
@@ -718,26 +718,77 @@ def _extract_model_from_response(response: Any) -> str | None:
     Returns the first non-empty value found, or ``None`` if every known
     source is empty / malformed.
 
+    Audit 2026-06-29 (SDK↔backend wire: silent zero-billing): the chain
+    was checked top-to-bottom and silently returned ``None`` whenever
+    none of the four known locations carried the model. The backend
+    then fell back to ``DEFAULT_RATE`` via ``unwrap_or("default")``
+    and every such call was recorded as ≈$0. We now:
+
+      - promote ``response.llm_output['model_name']`` (the location
+        langchain-openai 1.x uses for the date-suffixed model id
+        ``gpt-4.1-mini-2025-04-14``) to step 1, ahead of the
+        ``response_metadata`` step that langchain 0.x used;
+      - add ``response.llm_output['model']`` and a generic
+        "any key containing 'model'" sweep so non-OpenAI wrappers
+        (proxies, custom chat models) still get attributed;
+      - log a DEBUG line on the None path so an operator who sees
+        the wire warning in the backend can correlate it to the
+        observation site that produced the event.
+
     Sources checked, in order:
 
-    1. ``response.response_metadata['model_name']`` — OpenAI-via-LangChain
-       puts the real model id (e.g. ``"gpt-4.1-mini-2025-04-14"``) here.
-    2. ``response.generations[0][0].message.response_metadata['model_name']``
-       — LLMResult callback path where the metadata lives on the AIMessage
-       rather than the LLMResult itself.
-    3. ``response.llm_output['model_name']`` — legacy LLMResult where the
-       chat-model wrapper hoisted the field onto the LLMResult dict.
-    4. ``response.model`` / ``response.model_name`` — direct attributes
-       on the response object (rare but seen in some custom wrappers).
+    1. ``response.llm_output['model_name']`` / ``['model']`` /
+       any key containing "model" — langchain-openai 1.x puts the
+       date-suffixed id (e.g. ``"gpt-4.1-mini-2025-04-14"``) on
+       ``LLMResult.llm_output``. The backend's ``MODEL_RATES``
+       substring-match handles the date suffix.
+    2. ``response.response_metadata['model_name']`` — direct AIMessage
+       case (langchain 0.x chat-model wrappers expose metadata at
+       this level).
+    3. ``response.generations[0][0].message.response_metadata['model_name']``
+       — LLMResult callback path where the metadata lives on the
+       AIMessage rather than the LLMResult itself.
+    4. Direct ``response.model`` / ``response.model_name`` attributes
+       (rare, seen on some custom wrappers).
     """
-    # 1. response_metadata on the response.
+    # 1. llm_output dict (langchain-openai 1.x primary location).
+    #    Promote ahead of the response_metadata step: for OpenAI via
+    #    LangChain 1.x, the LLMResult carries the model on
+    #    ``llm_output['model_name']`` (date-suffixed) while the
+    #    AIMessage inside ``generations[0][0].message`` does NOT
+    #    carry ``response_metadata`` populated — step 3 would return
+    #    None. Without promoting step 1, every OpenAI call was
+    #    silently zero-billed.
+    llm_out = getattr(response, "llm_output", None)
+    if isinstance(llm_out, dict) and llm_out:
+        # Preferred: explicit "model_name" then "model" key.
+        for key in ("model_name", "model"):
+            val = llm_out.get(key)
+            if isinstance(val, str) and val:
+                return val
+        # Fallback: scan every key in llm_output for one that contains
+        # "model" and holds a non-empty string. Some custom chat-model
+        # wrappers / proxies put the model under less canonical keys
+        # (``"model_id"``, ``"modelName"``, ``"resolved_model"``).
+        # NOTE(review): also matches e.g. ``"model_provider"`` — confirm.
+        for key, val in llm_out.items():
+            if (
+                isinstance(key, str)
+                and "model" in key.lower()
+                and isinstance(val, str)
+                and val
+            ):
+                return val
+
+    # 2. response_metadata on the response (langchain 0.x AIMessage
+    #    case, and any wrapper that hoists the metadata up).
     resp_meta = getattr(response, "response_metadata", None)
     if isinstance(resp_meta, dict):
         val = resp_meta.get("model_name") or resp_meta.get("model")
         if val:
             return str(val)
 
-    # 2. LLMResult callback path — look on the generation's AIMessage.
+    # 3. LLMResult callback path — look on the generation's AIMessage.
     gen_msg = _safe_get_gen_message(response)
     if gen_msg is not None:
         gm = getattr(gen_msg, "response_metadata", None)
@@ -751,19 +802,25 @@ def _extract_model_from_response(response: Any) -> str | None:
             if v:
                 return str(v)
 
-    # 3. llm_output dict (legacy LLMResult).
-    llm_out = getattr(response, "llm_output", None)
-    if isinstance(llm_out, dict):
-        val = llm_out.get("model_name") or llm_out.get("model")
-        if val:
-            return str(val)
-
     # 4. Direct attribute on response.
     for attr in ("model_name", "model"):
         v = getattr(response, attr, None)
         if v:
             return str(v)
 
+    # Diagnostic: every code path above returned None. The runtime
+    # layer will log at ERROR when this happens for an llm_call
+    # event; this DEBUG line is for the per-call site so the
+    # operator can correlate the wire warning back to a specific
+    # response shape.
+    try:
+        response_type = type(response).__name__
+    except Exception:
+        response_type = "<unknown>"
+    logger.debug(
+        "_extract_model_from_response returned None for response of type %s",
+        response_type,
+    )
     return None
 
 

diff --git a/src/nullrun/runtime.py b/src/nullrun/runtime.py
@@ -1430,23 +1430,38 @@ def track(
             if k not in _WIRE_STRIP_FIELDS and v is not None
         }
 
-        # Audit 2026-06-28 (SDK↔backend wire): backend cost pipeline
-        # emits ``WARN model_id=default`` whenever an llm_call event
-        # reaches the wire without a ``model`` field
-        # (pipeline.rs:164 ``unwrap_or("default")``). This log lets
-        # operators reproduce the path: which observation (httpx /
-        # langchain callback / manual track / agents tracer / requests)
-        # produced an llm_call without ``model`` set, and whether
-        # the SDK explicitly passed ``model=None``, omitted the key,
-        # or had ``model=""`` (which the ``if model:`` guard in
-        # track_llm silently drops). Activated only for llm_call so
-        # span_start/span_end/tool_call traffic doesn't pollute logs.
+        # Audit 2026-06-29 (SDK↔backend wire: silent zero-billing):
+        # backend cost pipeline emits ``WARN model_id=default``
+        # whenever an llm_call event reaches the wire without a
+        # ``model`` field (pipeline.rs:176 ``unwrap_or("default")``).
+        # Pre-fix the SDK warned and continued — the backend then
+        # silently fell through to ``DEFAULT_RATE`` and every call
+        # was recorded as ≈$0, breaking budget enforcement.
+        #
+        # Post-fix the SDK is fail-LOUD (not fail-closed yet — the
+        # event is still sent so the backend can audit/reject):
+        #
+        #   1. ERROR log instead of WARN — operator sees the breakage
+        #      immediately, not buried in routine log noise.
+        #   2. Bump the ``dropped_llm_call_no_model`` runtime counter
+        #      so dashboards can surface the regression rate.
+        #   3. Tag the wire event with ``__missing_model: True`` so
+        #      the backend's into_track_request gate (fail-CLOSED
+        #      layer) can reject with HTTP 422 and a clear error
+        #      envelope instead of silently recording a zero-cost
+        #      call. The flag is treated as a wire-private signal —
+        #      the backend strips it before persisting.
+        #
+        # Activated only for llm_call so span_start/span_end/
+        # tool_call traffic doesn't pollute logs or the wire.
         if wire_event.get("type") == "llm_call" and not wire_event.get("model"):
-            logger.warning(
+            logger.error(
                 "track(): llm_call event missing 'model' field — "
-                "backend will fall back to DEFAULT_RATE. event=%s",
+                "tagging for backend rejection (HTTP 422). event=%s",
                 wire_event,
             )
+            metrics.inc_runtime("dropped_llm_call_no_model")
+            wire_event["__missing_model"] = True
 
         self._transport.track(wire_event)
 

diff --git a/tests/contract/__init__.py b/tests/contract/__init__.py