fix(uv): move venv aside instead of gutting it in place on Windows rebuild

hermes update can brick a Windows install. When 'hermes update --force' runs
past the concurrent-process guard, rebuild_venv runs while the venv is still in
use: shutil.rmtree(ignore_errors=True) deletes site-packages + certifi's cert
bundle but can't remove the locked python.exe, leaving a half-gutted venv that
uv venv then refuses to overwrite. Every later HTTPS call dies with
FileNotFoundError for the missing cacert and there is no recovery.

--clear alone (the c136eb4de retry path) does not fix the real lock case: when
the locked interpreter is *inside* the venv being rebuilt, neither rmtree nor
uv venv --clear can delete it. os.replace of the parent directory *is* allowed
on Windows (a running .exe is tracked by handle, not path), so we move the old
venv aside atomically to <venv>.old, rebuild with --clear in its place, and the
still-running gateway/desktop keep using the moved-aside copy until they
restart. If the venv genuinely can't be moved, we abort cleanly and leave it
fully intact; if the rebuild fails, we restore the moved-aside copy.

Folds in the call-site guards from #38511 (@f3rs3n):
- rebuild_venv() returns False (and restores the backup) if uv exits 0 without
  producing an interpreter.
- both hermes update venv-rebuild call sites abort with RuntimeError instead of
  continuing into dependency install when rebuild_venv() returns False.

Also gitignore /venv.old/ so the update autostash (git stash --include-untracked)
doesn't sweep the moved-aside venv on every run.

Root-cause fix for #37881. Supersedes the --clear-only retry from c136eb4de.

Co-authored-by: f3rs3n <32328813+f3rs3n@users.noreply.github.com>
This commit is contained in:
Jeff
2026-06-04 08:04:01 -07:00
committed by ethernet
parent ee7948ea6e
commit 1f347ee543
5 changed files with 175 additions and 115 deletions

1
.gitignore vendored
View File

@ -1,5 +1,6 @@
.DS_Store .DS_Store
/venv/ /venv/
/venv.old/
/_pycache/ /_pycache/
*.pyc* *.pyc*
__pycache__/ __pycache__/

View File

@ -8030,7 +8030,10 @@ def _update_via_zip(args):
# may point to a Python without FTS5. Rebuild it so the new managed # may point to a Python without FTS5. Rebuild it so the new managed
# uv provides a fresh interpreter with FTS5 guaranteed. # uv provides a fresh interpreter with FTS5 guaranteed.
if fresh_bootstrap and uv_bin: if fresh_bootstrap and uv_bin:
rebuild_venv(uv_bin, PROJECT_ROOT / "venv") if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"):
raise RuntimeError(
"venv rebuild failed; aborting update before dependency install"
)
pip_cmd = [sys.executable, "-m", "pip"] pip_cmd = [sys.executable, "-m", "pip"]
if not uv_bin: if not uv_bin:
@ -10573,7 +10576,10 @@ def _cmd_update_impl(args, gateway_mode: bool):
# may point to a Python without FTS5. Rebuild it so the new managed # may point to a Python without FTS5. Rebuild it so the new managed
# uv provides a fresh interpreter with FTS5 guaranteed. # uv provides a fresh interpreter with FTS5 guaranteed.
if fresh_bootstrap and uv_bin: if fresh_bootstrap and uv_bin:
rebuild_venv(uv_bin, PROJECT_ROOT / "venv") if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"):
raise RuntimeError(
"venv rebuild failed; aborting update before dependency install"
)
pip_cmd = [sys.executable, "-m", "pip"] pip_cmd = [sys.executable, "-m", "pip"]
if not uv_bin: if not uv_bin:

View File

@ -106,41 +106,69 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b
fresh interpreter from the current managed uv. Returns ``True`` on fresh interpreter from the current managed uv. Returns ``True`` on
success. success.
On Windows, ``shutil.rmtree(..., ignore_errors=True)`` can silently leave The old venv is moved aside *atomically* (``os.replace`` to ``<venv>.old``)
the venv directory partially intact when another process is holding an before recreating — never deleted in place. On Windows a still-running
open handle to a file inside it (typical culprits: a running ``hermes.exe`` (gateway/desktop) holds ``venv\\Scripts\\python.exe`` open;
``hermes.exe`` REPL, the gateway, AV scanners). If we don't notice that ``shutil.rmtree(ignore_errors=True)`` would delete everything it *can*
and just call ``uv venv``, uv refuses with (site-packages, certifi's cert bundle) and silently leave a half-gutted
``Caused by: A directory already exists at: venv`` and the *whole venv that the following ``uv venv`` then refuses to overwrite ("directory
update* falls back to installing on top of the stale venv — which has already exists") — bricking the install with no recovery (every later HTTPS
historically produced partial installs where a freshly added dependency call dies with ``FileNotFoundError`` for the missing cert bundle).
(e.g. ``pathspec``) silently fails to land. Retry with ``--clear`` to ``--clear`` alone does not fix this: when the locked interpreter is *inside*
force uv past that condition before giving up. the venv being rebuilt, neither ``rmtree`` nor ``uv venv --clear`` can
delete the held ``python.exe``. ``os.replace`` of the parent directory *is*
allowed (Windows tracks a running ``.exe`` by handle, not path), so the
rebuild completes while the running process keeps using the moved-aside copy
until it restarts. If the venv genuinely cannot be moved, we abort cleanly
and leave it fully intact; and if the rebuild itself fails we move the old
venv back so Hermes is never left with no venv at all.
""" """
backup: Optional[Path] = None
if venv_dir.exists(): if venv_dir.exists():
print(f" → Rebuilding venv (old Python may lack FTS5)...") print(f" → Rebuilding venv (old Python may lack FTS5)...")
shutil.rmtree(venv_dir, ignore_errors=True) backup = venv_dir.with_name(venv_dir.name + ".old")
shutil.rmtree(backup, ignore_errors=True) # clear any stale backup
try:
# Atomic move — fails (without partial deletion) if a process still
# holds files inside the venv, which is exactly the Windows
# file-lock case that previously bricked the install.
os.replace(venv_dir, backup)
except OSError as exc:
logger.warning("venv rebuild aborted — venv in use: %s", exc)
print(
" ✗ venv rebuild aborted — the venv is in use; stop the "
f"gateway/desktop and retry ({exc})"
)
return False
def _run_uv_venv(extra_args: list[str]) -> subprocess.CompletedProcess[str]: result = subprocess.run(
return subprocess.run( [uv_bin, "venv", str(venv_dir), "--python", python_version, "--clear"],
[uv_bin, "venv", str(venv_dir), "--python", python_version, *extra_args], capture_output=True,
capture_output=True, text=True,
text=True, check=False,
check=False, )
)
result = _run_uv_venv([]) def _restore_backup() -> None:
if backup is not None and backup.exists():
# If uv refused because the directory still exists (rmtree above was shutil.rmtree(venv_dir, ignore_errors=True)
# blocked by an open file handle, common on Windows), retry with try:
# --clear so uv overwrites it. Match on stderr because uv's exit code os.replace(backup, venv_dir)
# alone doesn't distinguish "dir exists" from real failures. print(" ↩ Restored previous venv after failed rebuild.")
if result.returncode != 0 and "already exists" in (result.stderr or "").lower(): except OSError:
print(" → venv dir not fully removed (likely an open file handle); retrying with --clear...") pass
result = _run_uv_venv(["--clear"])
if result.returncode == 0: if result.returncode == 0:
venv_python = venv_dir / ("Scripts" if platform.system() == "Windows" else "bin") / "python" venv_python = venv_dir / ("Scripts" if platform.system() == "Windows" else "bin") / "python"
# uv can exit 0 yet leave no usable interpreter (e.g. a half-written
# venv). Don't report success on a venv that has no python — restore the
# moved-aside copy so the caller can abort without losing a working env.
if not venv_python.exists():
logger.warning("venv rebuild reported success but %s is missing", venv_python)
print(f" ✗ venv rebuild failed: Python interpreter missing at {venv_python}")
_restore_backup()
return False
if backup is not None:
shutil.rmtree(backup, ignore_errors=True)
py_ver = subprocess.run( py_ver = subprocess.run(
[str(venv_python), "--version"], [str(venv_python), "--version"],
capture_output=True, capture_output=True,
@ -150,6 +178,9 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b
print(f" ✓ venv rebuilt ({py_ver})") print(f" ✓ venv rebuilt ({py_ver})")
return True return True
else: else:
# Rebuild failed — restore the old venv so we never leave Hermes with no
# venv (the bricked-install failure mode this function exists to avoid).
_restore_backup()
logger.warning("venv rebuild failed: %s", result.stderr) logger.warning("venv rebuild failed: %s", result.stderr)
print(f" ✗ venv rebuild failed: {result.stderr.strip()}") print(f" ✗ venv rebuild failed: {result.stderr.strip()}")
return False return False

View File

@ -108,121 +108,108 @@ class TestEnsureUv:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
class TestRebuildVenv: class TestRebuildVenv:
def test_removes_old_venv_and_creates_new(self, tmp_path): def test_moves_old_venv_aside_and_creates_new(self, tmp_path):
"""The old venv is moved aside to <venv>.old (never rmtree'd in place),
uv is invoked with --clear, the moved-aside backup is removed on
success, and the rebuilt interpreter is reported."""
venv_dir = tmp_path / "venv" venv_dir = tmp_path / "venv"
venv_dir.mkdir() venv_dir.mkdir()
(venv_dir / "old_file").write_text("stale") (venv_dir / "old_file").write_text("stale")
uv_bin = str(tmp_path / "bin" / "uv") uv_bin = str(tmp_path / "bin" / "uv")
call_log: list[list[str]] = []
def fake_run(cmd, **kwargs): def fake_run(cmd, **kwargs):
m = MagicMock(returncode=0) call_log.append(list(cmd))
if cmd[1] == "venv": m = MagicMock(returncode=0, stderr="", stdout="")
# Simulate uv creating the venv dir if len(cmd) >= 2 and cmd[1] == "venv":
venv_dir.mkdir(exist_ok=True) # Simulate uv creating the venv dir with a python interpreter
bin_dir = venv_dir / "bin" bin_dir = venv_dir / ("Scripts" if os.name == "nt" else "bin")
bin_dir.mkdir(parents=True, exist_ok=True) bin_dir.mkdir(parents=True, exist_ok=True)
(bin_dir / "python").write_text("#!/bin/sh\necho Python 3.11.0") python_name = "python.exe" if os.name == "nt" else "python"
(bin_dir / python_name).write_text("#!/bin/sh\necho Python 3.11.0")
elif "--version" in cmd: elif "--version" in cmd:
m.stdout = "Python 3.11.0" m.stdout = "Python 3.11.0"
return m return m
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \ with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run):
patch("hermes_cli.managed_uv.shutil.rmtree") as mock_rmtree:
from hermes_cli.managed_uv import rebuild_venv from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir) result = rebuild_venv(uv_bin, venv_dir)
assert result is True
mock_rmtree.assert_called_once_with(venv_dir, ignore_errors=True) assert result is True
# uv venv was invoked exactly once, always with --clear.
venv_calls = [c for c in call_log if len(c) >= 2 and c[1] == "venv"]
assert len(venv_calls) == 1, f"expected 1 venv call, got {venv_calls}"
assert "--clear" in venv_calls[0]
# The moved-aside backup is cleaned up after a successful rebuild.
assert not (tmp_path / "venv.old").exists()
def test_aborts_without_deleting_when_venv_in_use(self, tmp_path):
"""If os.replace fails (Windows file lock — venv in use), we must abort
cleanly WITHOUT deleting the venv and WITHOUT invoking uv."""
venv_dir = tmp_path / "venv"
venv_dir.mkdir()
(venv_dir / "locked") .write_text("held open")
uv_bin = str(tmp_path / "bin" / "uv")
call_log: list[list[str]] = []
def fake_run(cmd, **kwargs):
call_log.append(list(cmd))
return MagicMock(returncode=0, stderr="", stdout="")
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \
patch("hermes_cli.managed_uv.os.replace", side_effect=OSError("in use")):
from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir)
assert result is False
# venv left fully intact, uv never invoked.
assert venv_dir.exists() and (venv_dir / "locked").exists()
assert [c for c in call_log if len(c) >= 2 and c[1] == "venv"] == []
def test_restores_backup_when_rebuild_fails(self, tmp_path):
"""If uv venv exits non-zero, the moved-aside venv is restored so we
never leave Hermes with no venv at all."""
venv_dir = tmp_path / "venv"
venv_dir.mkdir()
(venv_dir / "marker").write_text("original")
uv_bin = str(tmp_path / "bin" / "uv")
def fake_run(cmd, **kwargs):
return MagicMock(returncode=1, stderr="boom", stdout="")
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run):
from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir)
assert result is False
# Original venv restored from the .old backup.
assert venv_dir.exists() and (venv_dir / "marker").read_text() == "original"
assert not (tmp_path / "venv.old").exists()
def test_rebuild_failure_returns_false(self, tmp_path): def test_rebuild_failure_returns_false(self, tmp_path):
venv_dir = tmp_path / "venv" venv_dir = tmp_path / "venv"
uv_bin = str(tmp_path / "bin" / "uv") uv_bin = str(tmp_path / "bin" / "uv")
with patch("hermes_cli.managed_uv.subprocess.run") as mock_run, \ with patch("hermes_cli.managed_uv.subprocess.run") as mock_run:
patch("hermes_cli.managed_uv.shutil.rmtree"):
mock_run.return_value = MagicMock(returncode=1, stderr="nope") mock_run.return_value = MagicMock(returncode=1, stderr="nope")
from hermes_cli.managed_uv import rebuild_venv from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir) result = rebuild_venv(uv_bin, venv_dir)
assert result is False assert result is False
def test_retries_with_clear_when_dir_already_exists(self, tmp_path): def test_rebuild_success_without_python_returns_false(self, tmp_path):
"""On Windows, rmtree can silently fail when an open handle holds a """uv can exit 0 yet leave no interpreter; that must not count as success
file in the venv (running hermes.exe, gateway, AV scanner). uv then (guard adapted from #38511)."""
refuses with ``Caused by: A directory already exists at: venv``.
Make sure we don't give up — retry with ``--clear`` to force uv past
the stale directory and rebuild successfully."""
venv_dir = tmp_path / "venv"
venv_dir.mkdir()
(venv_dir / "stale_open_handle").write_text("rmtree couldn't delete me")
uv_bin = str(tmp_path / "bin" / "uv")
call_log: list[list[str]] = []
def fake_run(cmd, **kwargs):
call_log.append(list(cmd))
m = MagicMock()
if cmd[1] == "venv" and "--clear" not in cmd:
# First attempt: uv refuses because dir still exists
m.returncode = 1
m.stderr = (
"error: Failed to create virtual environment\n"
" Caused by: A directory already exists at: venv\n"
"hint: Use the `--clear` flag or set `UV_VENV_CLEAR=1` to replace the existing directory\n"
)
m.stdout = ""
return m
if cmd[1] == "venv" and "--clear" in cmd:
# Retry: succeeds. Simulate uv writing the python shim.
m.returncode = 0
m.stderr = ""
m.stdout = ""
bin_dir = venv_dir / ("Scripts" if os.name == "nt" else "bin")
bin_dir.mkdir(parents=True, exist_ok=True)
python_name = "python.exe" if os.name == "nt" else "python"
(bin_dir / python_name).write_text("#!/bin/sh\necho Python 3.11.0")
return m
if "--version" in cmd:
m.returncode = 0
m.stdout = "Python 3.11.0"
m.stderr = ""
return m
m.returncode = 0
return m
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \
patch("hermes_cli.managed_uv.shutil.rmtree"):
from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir)
assert result is True, "rebuild should succeed after --clear retry"
# We expect exactly two ``uv venv`` calls: one without --clear, one with.
venv_calls = [c for c in call_log if len(c) >= 2 and c[1] == "venv"]
assert len(venv_calls) == 2, f"expected 2 venv calls, got {venv_calls}"
assert "--clear" not in venv_calls[0], "first call should not pass --clear"
assert "--clear" in venv_calls[1], "retry must pass --clear"
def test_does_not_retry_when_first_failure_is_not_dir_exists(self, tmp_path):
"""If uv venv fails for some other reason (e.g. interpreter download
failed, disk full), we should NOT silently retry with --clear —
that would mask a real problem. Just surface the original failure."""
venv_dir = tmp_path / "venv" venv_dir = tmp_path / "venv"
uv_bin = str(tmp_path / "bin" / "uv") uv_bin = str(tmp_path / "bin" / "uv")
call_log: list[list[str]] = []
def fake_run(cmd, **kwargs): with patch("hermes_cli.managed_uv.subprocess.run") as mock_run:
call_log.append(list(cmd)) mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
m = MagicMock(returncode=1, stderr="error: No space left on device", stdout="")
return m
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \
patch("hermes_cli.managed_uv.shutil.rmtree"):
from hermes_cli.managed_uv import rebuild_venv from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir) result = rebuild_venv(uv_bin, venv_dir)
assert result is False
assert result is False # Returned before the `python --version` probe ran (only the uv venv call).
venv_calls = [c for c in call_log if len(c) >= 2 and c[1] == "venv"] assert mock_run.call_count == 1
assert len(venv_calls) == 1, "should not retry on non-dir-exists failures"
assert "--clear" not in venv_calls[0]
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

View File

@ -423,6 +423,41 @@ def test_cmd_update_succeeds_with_extras(monkeypatch, tmp_path):
assert ".[all]" in install_cmds[0] assert ".[all]" in install_cmds[0]
def test_cmd_update_aborts_when_fresh_managed_uv_rebuild_fails(monkeypatch, tmp_path):
"""A failed fresh managed-uv venv rebuild must not continue into pip install
(guard adapted from #38511)."""
_setup_update_mocks(monkeypatch, tmp_path)
monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
recorded = []
def fake_run(cmd, **kwargs):
recorded.append(cmd)
# Tolerant matching: the update flow's exact git invocations vary by
# checkout, so key off the verb. Branch detection must return a real name
# and rev-list a parseable count, or the flow aborts early before it ever
# reaches the venv rebuild this test exercises.
if isinstance(cmd, (list, tuple)) and cmd and cmd[0] == "git":
if "rev-parse" in cmd:
return SimpleNamespace(stdout="main\n", stderr="", returncode=0)
if "rev-list" in cmd:
return SimpleNamespace(stdout="1\n", stderr="", returncode=0)
if "pull" in cmd:
return SimpleNamespace(stdout="Updating\n", stderr="", returncode=0)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(hermes_main.subprocess, "run", fake_run)
with patch("hermes_cli.managed_uv.ensure_uv", return_value=("/usr/bin/uv", True)), \
patch("hermes_cli.managed_uv.rebuild_venv", return_value=False), \
pytest.raises(RuntimeError, match="venv rebuild failed"):
hermes_main.cmd_update(SimpleNamespace())
install_cmds = [c for c in recorded if "pip" in c and "install" in c]
assert install_cmds == []
def test_install_with_optional_fallback_honors_custom_group(monkeypatch): def test_install_with_optional_fallback_honors_custom_group(monkeypatch):
"""Termux update path should target .[termux-all] when requested.""" """Termux update path should target .[termux-all] when requested."""
calls = [] calls = []