fix(uv): move venv aside instead of gutting it in place on Windows rebuild

hermes update can brick a Windows install. When 'hermes update --force' runs
past the concurrent-process guard, rebuild_venv runs while the venv is still in
use: shutil.rmtree(ignore_errors=True) deletes site-packages + certifi's cert
bundle but can't remove the locked python.exe, leaving a half-gutted venv that
uv venv then refuses to overwrite. Every later HTTPS call dies with
FileNotFoundError for the missing cacert and there is no recovery.

--clear alone (the c136eb4de retry path) does not fix the real lock case: when
the locked interpreter is *inside* the venv being rebuilt, neither rmtree nor
uv venv --clear can delete it. os.replace of the parent directory *is* allowed
on Windows (a running .exe is tracked by handle, not path), so we move the old
venv aside atomically to <venv>.old, rebuild with --clear in its place, and the
still-running gateway/desktop keep using the moved-aside copy until they
restart. If the venv genuinely can't be moved, we abort cleanly and leave it
fully intact; if the rebuild fails, we restore the moved-aside copy.

Folds in the call-site guards from #38511 (@f3rs3n):
- rebuild_venv() returns False (and restores the backup) if uv exits 0 without
  producing an interpreter.
- both hermes update venv-rebuild call sites abort with RuntimeError instead of
  continuing into dependency install when rebuild_venv() returns False.

Also gitignore /venv.old/ so the update autostash (git stash --include-untracked)
doesn't sweep the moved-aside venv on every run.

Root-cause fix for #37881. Supersedes the --clear-only retry from c136eb4de.

Co-authored-by: f3rs3n <32328813+f3rs3n@users.noreply.github.com>
This commit is contained in:
Jeff
2026-06-04 08:04:01 -07:00
committed by ethernet
parent ee7948ea6e
commit 1f347ee543
5 changed files with 175 additions and 115 deletions

1
.gitignore vendored
View File

@ -1,5 +1,6 @@
.DS_Store
/venv/
/venv.old/
/_pycache/
*.pyc*
__pycache__/

View File

@ -8030,7 +8030,10 @@ def _update_via_zip(args):
# may point to a Python without FTS5. Rebuild it so the new managed
# uv provides a fresh interpreter with FTS5 guaranteed.
if fresh_bootstrap and uv_bin:
rebuild_venv(uv_bin, PROJECT_ROOT / "venv")
if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"):
raise RuntimeError(
"venv rebuild failed; aborting update before dependency install"
)
pip_cmd = [sys.executable, "-m", "pip"]
if not uv_bin:
@ -10573,7 +10576,10 @@ def _cmd_update_impl(args, gateway_mode: bool):
# may point to a Python without FTS5. Rebuild it so the new managed
# uv provides a fresh interpreter with FTS5 guaranteed.
if fresh_bootstrap and uv_bin:
rebuild_venv(uv_bin, PROJECT_ROOT / "venv")
if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"):
raise RuntimeError(
"venv rebuild failed; aborting update before dependency install"
)
pip_cmd = [sys.executable, "-m", "pip"]
if not uv_bin:

View File

@ -106,41 +106,69 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b
fresh interpreter from the current managed uv. Returns ``True`` on
success.
On Windows, ``shutil.rmtree(..., ignore_errors=True)`` can silently leave
the venv directory partially intact when another process is holding an
open handle to a file inside it (typical culprits: a running
``hermes.exe`` REPL, the gateway, AV scanners). If we don't notice that
and just call ``uv venv``, uv refuses with
``Caused by: A directory already exists at: venv`` and the *whole
update* falls back to installing on top of the stale venv — which has
historically produced partial installs where a freshly added dependency
(e.g. ``pathspec``) silently fails to land. Retry with ``--clear`` to
force uv past that condition before giving up.
The old venv is moved aside *atomically* (``os.replace`` to ``<venv>.old``)
before recreating — never deleted in place. On Windows a still-running
``hermes.exe`` (gateway/desktop) holds ``venv\\Scripts\\python.exe`` open;
``shutil.rmtree(ignore_errors=True)`` would delete everything it *can*
(site-packages, certifi's cert bundle) and silently leave a half-gutted
venv that the following ``uv venv`` then refuses to overwrite ("directory
already exists") — bricking the install with no recovery (every later HTTPS
call dies with ``FileNotFoundError`` for the missing cert bundle).
``--clear`` alone does not fix this: when the locked interpreter is *inside*
the venv being rebuilt, neither ``rmtree`` nor ``uv venv --clear`` can
delete the held ``python.exe``. ``os.replace`` of the parent directory *is*
allowed (Windows tracks a running ``.exe`` by handle, not path), so the
rebuild completes while the running process keeps using the moved-aside copy
until it restarts. If the venv genuinely cannot be moved, we abort cleanly
and leave it fully intact; and if the rebuild itself fails we move the old
venv back so Hermes is never left with no venv at all.
"""
backup: Optional[Path] = None
if venv_dir.exists():
print(f" → Rebuilding venv (old Python may lack FTS5)...")
shutil.rmtree(venv_dir, ignore_errors=True)
backup = venv_dir.with_name(venv_dir.name + ".old")
shutil.rmtree(backup, ignore_errors=True) # clear any stale backup
try:
# Atomic move — fails (without partial deletion) if a process still
# holds files inside the venv, which is exactly the Windows
# file-lock case that previously bricked the install.
os.replace(venv_dir, backup)
except OSError as exc:
logger.warning("venv rebuild aborted — venv in use: %s", exc)
print(
" ✗ venv rebuild aborted — the venv is in use; stop the "
f"gateway/desktop and retry ({exc})"
)
return False
def _run_uv_venv(extra_args: list[str]) -> subprocess.CompletedProcess[str]:
return subprocess.run(
[uv_bin, "venv", str(venv_dir), "--python", python_version, *extra_args],
capture_output=True,
text=True,
check=False,
)
result = subprocess.run(
[uv_bin, "venv", str(venv_dir), "--python", python_version, "--clear"],
capture_output=True,
text=True,
check=False,
)
result = _run_uv_venv([])
# If uv refused because the directory still exists (rmtree above was
# blocked by an open file handle, common on Windows), retry with
# --clear so uv overwrites it. Match on stderr because uv's exit code
# alone doesn't distinguish "dir exists" from real failures.
if result.returncode != 0 and "already exists" in (result.stderr or "").lower():
print(" → venv dir not fully removed (likely an open file handle); retrying with --clear...")
result = _run_uv_venv(["--clear"])
def _restore_backup() -> None:
if backup is not None and backup.exists():
shutil.rmtree(venv_dir, ignore_errors=True)
try:
os.replace(backup, venv_dir)
print(" ↩ Restored previous venv after failed rebuild.")
except OSError:
pass
if result.returncode == 0:
venv_python = venv_dir / ("Scripts" if platform.system() == "Windows" else "bin") / "python"
# uv can exit 0 yet leave no usable interpreter (e.g. a half-written
# venv). Don't report success on a venv that has no python — restore the
# moved-aside copy so the caller can abort without losing a working env.
if not venv_python.exists():
logger.warning("venv rebuild reported success but %s is missing", venv_python)
print(f" ✗ venv rebuild failed: Python interpreter missing at {venv_python}")
_restore_backup()
return False
if backup is not None:
shutil.rmtree(backup, ignore_errors=True)
py_ver = subprocess.run(
[str(venv_python), "--version"],
capture_output=True,
@ -150,6 +178,9 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b
print(f" ✓ venv rebuilt ({py_ver})")
return True
else:
# Rebuild failed — restore the old venv so we never leave Hermes with no
# venv (the bricked-install failure mode this function exists to avoid).
_restore_backup()
logger.warning("venv rebuild failed: %s", result.stderr)
print(f" ✗ venv rebuild failed: {result.stderr.strip()}")
return False

View File

@ -108,121 +108,108 @@ class TestEnsureUv:
# ---------------------------------------------------------------------------
class TestRebuildVenv:
def test_removes_old_venv_and_creates_new(self, tmp_path):
def test_moves_old_venv_aside_and_creates_new(self, tmp_path):
"""The old venv is moved aside to <venv>.old (never rmtree'd in place),
uv is invoked with --clear, the moved-aside backup is removed on
success, and the rebuilt interpreter is reported."""
venv_dir = tmp_path / "venv"
venv_dir.mkdir()
(venv_dir / "old_file").write_text("stale")
uv_bin = str(tmp_path / "bin" / "uv")
call_log: list[list[str]] = []
def fake_run(cmd, **kwargs):
m = MagicMock(returncode=0)
if cmd[1] == "venv":
# Simulate uv creating the venv dir
venv_dir.mkdir(exist_ok=True)
bin_dir = venv_dir / "bin"
call_log.append(list(cmd))
m = MagicMock(returncode=0, stderr="", stdout="")
if len(cmd) >= 2 and cmd[1] == "venv":
# Simulate uv creating the venv dir with a python interpreter
bin_dir = venv_dir / ("Scripts" if os.name == "nt" else "bin")
bin_dir.mkdir(parents=True, exist_ok=True)
(bin_dir / "python").write_text("#!/bin/sh\necho Python 3.11.0")
python_name = "python.exe" if os.name == "nt" else "python"
(bin_dir / python_name).write_text("#!/bin/sh\necho Python 3.11.0")
elif "--version" in cmd:
m.stdout = "Python 3.11.0"
return m
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \
patch("hermes_cli.managed_uv.shutil.rmtree") as mock_rmtree:
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run):
from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir)
assert result is True
mock_rmtree.assert_called_once_with(venv_dir, ignore_errors=True)
assert result is True
# uv venv was invoked exactly once, always with --clear.
venv_calls = [c for c in call_log if len(c) >= 2 and c[1] == "venv"]
assert len(venv_calls) == 1, f"expected 1 venv call, got {venv_calls}"
assert "--clear" in venv_calls[0]
# The moved-aside backup is cleaned up after a successful rebuild.
assert not (tmp_path / "venv.old").exists()
def test_aborts_without_deleting_when_venv_in_use(self, tmp_path):
"""If os.replace fails (Windows file lock — venv in use), we must abort
cleanly WITHOUT deleting the venv and WITHOUT invoking uv."""
venv_dir = tmp_path / "venv"
venv_dir.mkdir()
(venv_dir / "locked") .write_text("held open")
uv_bin = str(tmp_path / "bin" / "uv")
call_log: list[list[str]] = []
def fake_run(cmd, **kwargs):
call_log.append(list(cmd))
return MagicMock(returncode=0, stderr="", stdout="")
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \
patch("hermes_cli.managed_uv.os.replace", side_effect=OSError("in use")):
from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir)
assert result is False
# venv left fully intact, uv never invoked.
assert venv_dir.exists() and (venv_dir / "locked").exists()
assert [c for c in call_log if len(c) >= 2 and c[1] == "venv"] == []
def test_restores_backup_when_rebuild_fails(self, tmp_path):
"""If uv venv exits non-zero, the moved-aside venv is restored so we
never leave Hermes with no venv at all."""
venv_dir = tmp_path / "venv"
venv_dir.mkdir()
(venv_dir / "marker").write_text("original")
uv_bin = str(tmp_path / "bin" / "uv")
def fake_run(cmd, **kwargs):
return MagicMock(returncode=1, stderr="boom", stdout="")
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run):
from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir)
assert result is False
# Original venv restored from the .old backup.
assert venv_dir.exists() and (venv_dir / "marker").read_text() == "original"
assert not (tmp_path / "venv.old").exists()
def test_rebuild_failure_returns_false(self, tmp_path):
venv_dir = tmp_path / "venv"
uv_bin = str(tmp_path / "bin" / "uv")
with patch("hermes_cli.managed_uv.subprocess.run") as mock_run, \
patch("hermes_cli.managed_uv.shutil.rmtree"):
with patch("hermes_cli.managed_uv.subprocess.run") as mock_run:
mock_run.return_value = MagicMock(returncode=1, stderr="nope")
from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir)
assert result is False
def test_retries_with_clear_when_dir_already_exists(self, tmp_path):
"""On Windows, rmtree can silently fail when an open handle holds a
file in the venv (running hermes.exe, gateway, AV scanner). uv then
refuses with ``Caused by: A directory already exists at: venv``.
Make sure we don't give up — retry with ``--clear`` to force uv past
the stale directory and rebuild successfully."""
venv_dir = tmp_path / "venv"
venv_dir.mkdir()
(venv_dir / "stale_open_handle").write_text("rmtree couldn't delete me")
uv_bin = str(tmp_path / "bin" / "uv")
call_log: list[list[str]] = []
def fake_run(cmd, **kwargs):
call_log.append(list(cmd))
m = MagicMock()
if cmd[1] == "venv" and "--clear" not in cmd:
# First attempt: uv refuses because dir still exists
m.returncode = 1
m.stderr = (
"error: Failed to create virtual environment\n"
" Caused by: A directory already exists at: venv\n"
"hint: Use the `--clear` flag or set `UV_VENV_CLEAR=1` to replace the existing directory\n"
)
m.stdout = ""
return m
if cmd[1] == "venv" and "--clear" in cmd:
# Retry: succeeds. Simulate uv writing the python shim.
m.returncode = 0
m.stderr = ""
m.stdout = ""
bin_dir = venv_dir / ("Scripts" if os.name == "nt" else "bin")
bin_dir.mkdir(parents=True, exist_ok=True)
python_name = "python.exe" if os.name == "nt" else "python"
(bin_dir / python_name).write_text("#!/bin/sh\necho Python 3.11.0")
return m
if "--version" in cmd:
m.returncode = 0
m.stdout = "Python 3.11.0"
m.stderr = ""
return m
m.returncode = 0
return m
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \
patch("hermes_cli.managed_uv.shutil.rmtree"):
from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir)
assert result is True, "rebuild should succeed after --clear retry"
# We expect exactly two ``uv venv`` calls: one without --clear, one with.
venv_calls = [c for c in call_log if len(c) >= 2 and c[1] == "venv"]
assert len(venv_calls) == 2, f"expected 2 venv calls, got {venv_calls}"
assert "--clear" not in venv_calls[0], "first call should not pass --clear"
assert "--clear" in venv_calls[1], "retry must pass --clear"
def test_does_not_retry_when_first_failure_is_not_dir_exists(self, tmp_path):
"""If uv venv fails for some other reason (e.g. interpreter download
failed, disk full), we should NOT silently retry with --clear —
that would mask a real problem. Just surface the original failure."""
def test_rebuild_success_without_python_returns_false(self, tmp_path):
"""uv can exit 0 yet leave no interpreter; that must not count as success
(guard adapted from #38511)."""
venv_dir = tmp_path / "venv"
uv_bin = str(tmp_path / "bin" / "uv")
call_log: list[list[str]] = []
def fake_run(cmd, **kwargs):
call_log.append(list(cmd))
m = MagicMock(returncode=1, stderr="error: No space left on device", stdout="")
return m
with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \
patch("hermes_cli.managed_uv.shutil.rmtree"):
with patch("hermes_cli.managed_uv.subprocess.run") as mock_run:
mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
from hermes_cli.managed_uv import rebuild_venv
result = rebuild_venv(uv_bin, venv_dir)
assert result is False
venv_calls = [c for c in call_log if len(c) >= 2 and c[1] == "venv"]
assert len(venv_calls) == 1, "should not retry on non-dir-exists failures"
assert "--clear" not in venv_calls[0]
assert result is False
# Returned before the `python --version` probe ran (only the uv venv call).
assert mock_run.call_count == 1
# ---------------------------------------------------------------------------

View File

@ -423,6 +423,41 @@ def test_cmd_update_succeeds_with_extras(monkeypatch, tmp_path):
assert ".[all]" in install_cmds[0]
def test_cmd_update_aborts_when_fresh_managed_uv_rebuild_fails(monkeypatch, tmp_path):
"""A failed fresh managed-uv venv rebuild must not continue into pip install
(guard adapted from #38511)."""
_setup_update_mocks(monkeypatch, tmp_path)
monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None)
monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False)
recorded = []
def fake_run(cmd, **kwargs):
recorded.append(cmd)
# Tolerant matching: the update flow's exact git invocations vary by
# checkout, so key off the verb. Branch detection must return a real name
# and rev-list a parseable count, or the flow aborts early before it ever
# reaches the venv rebuild this test exercises.
if isinstance(cmd, (list, tuple)) and cmd and cmd[0] == "git":
if "rev-parse" in cmd:
return SimpleNamespace(stdout="main\n", stderr="", returncode=0)
if "rev-list" in cmd:
return SimpleNamespace(stdout="1\n", stderr="", returncode=0)
if "pull" in cmd:
return SimpleNamespace(stdout="Updating\n", stderr="", returncode=0)
return SimpleNamespace(returncode=0, stdout="", stderr="")
monkeypatch.setattr(hermes_main.subprocess, "run", fake_run)
with patch("hermes_cli.managed_uv.ensure_uv", return_value=("/usr/bin/uv", True)), \
patch("hermes_cli.managed_uv.rebuild_venv", return_value=False), \
pytest.raises(RuntimeError, match="venv rebuild failed"):
hermes_main.cmd_update(SimpleNamespace())
install_cmds = [c for c in recorded if "pip" in c and "install" in c]
assert install_cmds == []
def test_install_with_optional_fallback_honors_custom_group(monkeypatch):
"""Termux update path should target .[termux-all] when requested."""
calls = []