diff --git a/gateway/run.py b/gateway/run.py index adbba6928..3f950685f 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1755,7 +1755,7 @@ def _preserve_queued_followup_history_offset( return merged -async def _dispose_unused_adapter(adapter: "BasePlatformAdapter") -> None: +async def _dispose_unused_adapter(adapter: "BasePlatformAdapter | None") -> None: """Best-effort dispose for an adapter that never made it onto ``self.adapters``. The reconnect watcher in ``GatewayRunner._platform_reconnect_watcher`` @@ -1778,6 +1778,11 @@ async def _dispose_unused_adapter(adapter: "BasePlatformAdapter") -> None: failure paths in the reconnect watcher can all call it without each one having to know that ``disconnect()`` may itself raise on a half-constructed adapter. + + ``adapter`` may be ``None``: the reconnect watcher initialises + ``adapter = None`` before the ``try`` so the ``except Exception`` + arm can dispose a half-constructed object, and also early-returns + here when ``_create_adapter()`` returned ``None``. """ if adapter is None: return @@ -1788,6 +1793,15 @@ async def _dispose_unused_adapter(adapter: "BasePlatformAdapter") -> None: # crashed during aiohttp app setup) can raise from # disconnect() on objects that never finished initializing. # We must not let that escape and abort the watcher loop. + # + # On Python 3.8+, ``asyncio.CancelledError`` inherits from + # ``BaseException`` (not ``Exception``), so this ``except + # Exception`` does not swallow task cancellation. We don't + # re-raise explicitly because the watcher loop intentionally + # treats dispose failures as best-effort: a failed ``disconnect`` + # call should not take down the reconnect watcher that + # itself is what's keeping the gateway alive during a partial + # outage. logger.debug( "Adapter dispose raised on unowned adapter %r", getattr(adapter, "name", type(adapter).__name__), diff --git a/tests/gateway/test_platform_reconnect_fd_leak.py b/tests/gateway/test_platform_reconnect_fd_leak.py index 68cb19b81..221cf55ee 100644 --- a/tests/gateway/test_platform_reconnect_fd_leak.py +++ b/tests/gateway/test_platform_reconnect_fd_leak.py @@ -165,11 +165,18 @@ class TestReconnectFDLeakRegression: new=AsyncMock(return_value=False)): await _run_watcher_one_iteration(runner) - assert adapter._disconnect_calls >= 1, ( + # The intent of this test is "the watcher calls disconnect() + # exactly once on the unowned adapter" — not "at least once". + # An accidental double-dispose would be a new bug to catch + # (e.g. the watcher's two failure paths both calling dispose + # for the same adapter instance). Tighten to == 1. + assert adapter._disconnect_calls == 1, ( f"non-retryable reconnect failure must call adapter.disconnect() " - f"at least once; got {adapter._disconnect_calls} calls. " + f"exactly once; got {adapter._disconnect_calls} calls. " "Without it, 2 fds leak per retry at the 300s backoff cap " - "(#37011)." + "(#37011). More than one call would also be a bug — the " + "adapter has already been disposed once, a second call is " + "wasted work and may itself raise." ) assert adapter._open_fds == 0, ( f"adapter fds not released after disconnect(); " @@ -303,15 +310,21 @@ class TestAPIServerDisconnectClosesResponseStore: to ``~/.hermes/response_store.db`` (or :memory: as a fallback), which is exactly the resource that was leaking pre-fix. """ + import sqlite3 + store = ResponseStore(max_size=10, db_path=str(tmp_path / "rs.db")) adapter = self._build_adapter_with_store(store) await adapter.disconnect() # Post-disconnect, the underlying sqlite3 conn should be closed. - # Any further query raises ``ProgrammingError: Cannot operate - # on a closed database``. - with pytest.raises(Exception): + # sqlite3 raises ``ProgrammingError: Cannot operate on a closed + # database`` for any further operation. We assert on the + # specific exception type (not bare ``Exception``) so the test + # only passes when the close actually took effect — a generic + # ``Exception`` catcher would mask unrelated failures (env + # issues, AttributeError, etc.). + with pytest.raises(sqlite3.ProgrammingError): store._conn.execute("SELECT 1").fetchone() @pytest.mark.asyncio