When `docker run -d` fails after Docker has already created the container object (e.g. exit 125 when the daemon isn't ready, or a timeout mid image pull), the code raised before `self._container_id` was set — so the container leaked permanently in "Created" state. Reported in #7439: 110+ orphaned containers accumulated over 3 days from hourly cron- scheduled gateway sessions hitting a Docker Desktop startup race. The orphan reaper added in #33645 (reap_orphan_containers) does NOT cover this case: it filters `status=exited`, but a failed-create container is in `Created` state, so it slips through and is never reaped. Wrap the `docker run -d` call in try/except and `docker rm -f` the container by its known name before re-raising. Salvages #7440 by @Tranquil-Flow. Their branch predated the cross-process reuse + labels rework on `main`, so a cherry-pick conflicted; reconstructed the same intent (plus their two regression tests, adapted to mock the new reuse `docker ps` probe) against current `main`. Verified adversarially: reverted just the product change to origin/main's `docker.py`, ran the two new tests -> both FAIL with `assert 0 == 1 ("docker rm should be called once")`. With the fix applied, both pass; full test_docker_environment.py is 65/65 green. Closes #7440. Fixes #7439. Co-authored-by: Evi Nova <66773372+Tranquil-Flow@users.noreply.github.com>
This commit is contained in:
@ -786,6 +786,84 @@ def test_reuse_falls_back_to_fresh_run_when_start_fails(monkeypatch):
|
||||
assert run_invocations, "fallback to fresh docker run must happen on start failure"
|
||||
|
||||
|
||||
def test_failed_docker_run_cleans_up_orphaned_container(monkeypatch):
|
||||
"""When ``docker run`` fails (e.g. exit 125), the partially-created
|
||||
container must be removed by name.
|
||||
|
||||
Docker can create the container object before failing to start it,
|
||||
leaving a stale ``Created`` container. The exited-only orphan reaper
|
||||
(``reap_orphan_containers``, ``status=exited``) never catches a
|
||||
``Created`` orphan, so without this cleanup it leaks permanently.
|
||||
Regression for #7439. Salvage of #7440 (@Tranquil-Flow).
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
|
||||
cleanup_calls = []
|
||||
|
||||
def _run(cmd, **kwargs):
|
||||
if isinstance(cmd, list) and len(cmd) >= 2:
|
||||
sub = cmd[1]
|
||||
if sub == "version":
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="Docker version", stderr="")
|
||||
if sub == "ps":
|
||||
# No reusable container -> fall through to a fresh `docker run`.
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
if sub == "run":
|
||||
raise subprocess.CalledProcessError(
|
||||
125, cmd, output="", stderr="docker: Error response from daemon"
|
||||
)
|
||||
if sub == "rm":
|
||||
cleanup_calls.append(list(cmd))
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _run)
|
||||
|
||||
with pytest.raises(subprocess.CalledProcessError):
|
||||
_make_dummy_env()
|
||||
|
||||
assert len(cleanup_calls) == 1, "docker rm should be called once for the orphaned container"
|
||||
rm_cmd = cleanup_calls[0]
|
||||
assert rm_cmd[1] == "rm" and rm_cmd[2] == "-f"
|
||||
assert rm_cmd[3].startswith("hermes-"), "should remove the container by its generated name"
|
||||
|
||||
|
||||
def test_docker_run_timeout_cleans_up_orphaned_container(monkeypatch):
|
||||
"""When ``docker run`` times out (e.g. slow image pull), the
|
||||
partially-created container must be removed. Salvage of #7440
|
||||
(@Tranquil-Flow); regression for #7439.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
|
||||
cleanup_calls = []
|
||||
|
||||
def _run(cmd, **kwargs):
|
||||
if isinstance(cmd, list) and len(cmd) >= 2:
|
||||
sub = cmd[1]
|
||||
if sub == "version":
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="Docker version", stderr="")
|
||||
if sub == "ps":
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
if sub == "run":
|
||||
raise subprocess.TimeoutExpired(cmd, 120)
|
||||
if sub == "rm":
|
||||
cleanup_calls.append(list(cmd))
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _run)
|
||||
|
||||
with pytest.raises(subprocess.TimeoutExpired):
|
||||
_make_dummy_env()
|
||||
|
||||
assert len(cleanup_calls) == 1, "docker rm should be called once for the orphaned container"
|
||||
rm_cmd = cleanup_calls[0]
|
||||
assert rm_cmd[1] == "rm" and rm_cmd[2] == "-f"
|
||||
assert rm_cmd[3].startswith("hermes-"), "should remove the container by its generated name"
|
||||
|
||||
|
||||
def test_no_reuse_when_persist_across_processes_disabled(monkeypatch):
|
||||
"""Opt-out path: ``persist_across_processes=False`` skips the ps probe
|
||||
entirely and always starts a fresh container, matching the pre-fix
|
||||
|
||||
Reference in New Issue
Block a user