From 82c157b267e405ef81ace088d93bdc681b6eb05a Mon Sep 17 00:00:00 2001 From: Ben Barclay Date: Fri, 5 Jun 2026 10:19:08 +1000 Subject: [PATCH] fix(docker): clean up orphaned container when docker run fails (salvage #7440) (#39412) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `docker run -d` fails after Docker has already created the container object (e.g. exit 125 when the daemon isn't ready, or a timeout mid image pull), the code raised before `self._container_id` was set — so the container leaked permanently in "Created" state. Reported in #7439: 110+ orphaned containers accumulated over 3 days from hourly cron- scheduled gateway sessions hitting a Docker Desktop startup race. The orphan reaper added in #33645 (reap_orphan_containers) does NOT cover this case: it filters `status=exited`, but a failed-create container is in `Created` state, so it slips through and is never reaped. Wrap the `docker run -d` call in try/except and `docker rm -f` the container by its known name before re-raising. Salvages #7440 by @Tranquil-Flow. Their branch predated the cross-process reuse + labels rework on `main`, so a cherry-pick conflicted; reconstructed the same intent (plus their two regression tests, adapted to mock the new reuse `docker ps` probe) against current `main`. Verified adversarially: reverted just the product change to origin/main's `docker.py`, ran the two new tests -> both FAIL with `assert 0 == 1 ("docker rm should be called once")`. With the fix applied, both pass; full test_docker_environment.py is 65/65 green. Closes #7440. Fixes #7439. Co-authored-by: Evi Nova <66773372+Tranquil-Flow@users.noreply.github.com> --- tests/tools/test_docker_environment.py | 78 ++++++++++++++++++++++++++ tools/environments/docker.py | 32 ++++++++--- 2 files changed, 103 insertions(+), 7 deletions(-) diff --git a/tests/tools/test_docker_environment.py b/tests/tools/test_docker_environment.py index 099e167e7..90f4d5dcd 100644 --- a/tests/tools/test_docker_environment.py +++ b/tests/tools/test_docker_environment.py @@ -786,6 +786,84 @@ def test_reuse_falls_back_to_fresh_run_when_start_fails(monkeypatch): assert run_invocations, "fallback to fresh docker run must happen on start failure" +def test_failed_docker_run_cleans_up_orphaned_container(monkeypatch): + """When ``docker run`` fails (e.g. exit 125), the partially-created + container must be removed by name. + + Docker can create the container object before failing to start it, + leaving a stale ``Created`` container. The exited-only orphan reaper + (``reap_orphan_containers``, ``status=exited``) never catches a + ``Created`` orphan, so without this cleanup it leaks permanently. + Regression for #7439. Salvage of #7440 (@Tranquil-Flow). + """ + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default") + + cleanup_calls = [] + + def _run(cmd, **kwargs): + if isinstance(cmd, list) and len(cmd) >= 2: + sub = cmd[1] + if sub == "version": + return subprocess.CompletedProcess(cmd, 0, stdout="Docker version", stderr="") + if sub == "ps": + # No reusable container -> fall through to a fresh `docker run`. + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if sub == "run": + raise subprocess.CalledProcessError( + 125, cmd, output="", stderr="docker: Error response from daemon" + ) + if sub == "rm": + cleanup_calls.append(list(cmd)) + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(docker_env.subprocess, "run", _run) + + with pytest.raises(subprocess.CalledProcessError): + _make_dummy_env() + + assert len(cleanup_calls) == 1, "docker rm should be called once for the orphaned container" + rm_cmd = cleanup_calls[0] + assert rm_cmd[1] == "rm" and rm_cmd[2] == "-f" + assert rm_cmd[3].startswith("hermes-"), "should remove the container by its generated name" + + +def test_docker_run_timeout_cleans_up_orphaned_container(monkeypatch): + """When ``docker run`` times out (e.g. slow image pull), the + partially-created container must be removed. Salvage of #7440 + (@Tranquil-Flow); regression for #7439. + """ + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default") + + cleanup_calls = [] + + def _run(cmd, **kwargs): + if isinstance(cmd, list) and len(cmd) >= 2: + sub = cmd[1] + if sub == "version": + return subprocess.CompletedProcess(cmd, 0, stdout="Docker version", stderr="") + if sub == "ps": + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + if sub == "run": + raise subprocess.TimeoutExpired(cmd, 120) + if sub == "rm": + cleanup_calls.append(list(cmd)) + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(docker_env.subprocess, "run", _run) + + with pytest.raises(subprocess.TimeoutExpired): + _make_dummy_env() + + assert len(cleanup_calls) == 1, "docker rm should be called once for the orphaned container" + rm_cmd = cleanup_calls[0] + assert rm_cmd[1] == "rm" and rm_cmd[2] == "-f" + assert rm_cmd[3].startswith("hermes-"), "should remove the container by its generated name" + + def test_no_reuse_when_persist_across_processes_disabled(monkeypatch): """Opt-out path: ``persist_across_processes=False`` skips the ps probe entirely and always starts a fresh container, matching the pre-fix diff --git a/tools/environments/docker.py b/tools/environments/docker.py index 5a0af2692..b87bdb125 100644 --- a/tools/environments/docker.py +++ b/tools/environments/docker.py @@ -854,13 +854,31 @@ class DockerEnvironment(BaseEnvironment): "sleep", "infinity", # no fixed lifetime — idle reaper handles cleanup ] logger.debug(f"Starting container: {' '.join(run_cmd)}") - result = subprocess.run( - run_cmd, - capture_output=True, - text=True, - timeout=120, # image pull may take a while - check=True, - ) + try: + result = subprocess.run( + run_cmd, + capture_output=True, + text=True, + timeout=120, # image pull may take a while + check=True, + ) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + # Docker may create the container object before `docker run` + # fails to start it (e.g. exit code 125 when the daemon isn't + # ready, or a timeout mid-pull). That orphan is left in + # "Created" state — which the exited-only orphan reaper + # (reap_orphan_containers, status=exited) never catches, so it + # leaks permanently. Remove it by its known name before + # re-raising. See #7439. + logger.warning( + "docker run failed for %s, cleaning up orphaned container: %s", + container_name, e, + ) + subprocess.run( + [self._docker_exe, "rm", "-f", container_name], + capture_output=True, timeout=10, + ) + raise self._container_id = result.stdout.strip() logger.info(f"Started container {container_name} ({self._container_id[:12]})")