From 1f347ee543b650bd788b69af65830719703d74d9 Mon Sep 17 00:00:00 2001 From: Jeff Date: Thu, 4 Jun 2026 08:04:01 -0700 Subject: [PATCH] fix(uv): move venv aside instead of gutting it in place on Windows rebuild hermes update can brick a Windows install. When 'hermes update --force' runs past the concurrent-process guard, rebuild_venv runs while the venv is still in use: shutil.rmtree(ignore_errors=True) deletes site-packages + certifi's cert bundle but can't remove the locked python.exe, leaving a half-gutted venv that uv venv then refuses to overwrite. Every later HTTPS call dies with FileNotFoundError for the missing cacert and there is no recovery. --clear alone (the c136eb4de retry path) does not fix the real lock case: when the locked interpreter is *inside* the venv being rebuilt, neither rmtree nor uv venv --clear can delete it. os.replace of the parent directory *is* allowed on Windows (a running .exe is tracked by handle, not path), so we move the old venv aside atomically to .old, rebuild with --clear in its place, and the still-running gateway/desktop keep using the moved-aside copy until they restart. If the venv genuinely can't be moved, we abort cleanly and leave it fully intact; if the rebuild fails, we restore the moved-aside copy. Folds in the call-site guards from #38511 (@f3rs3n): - rebuild_venv() returns False (and restores the backup) if uv exits 0 without producing an interpreter. - both hermes update venv-rebuild call sites abort with RuntimeError instead of continuing into dependency install when rebuild_venv() returns False. Also gitignore /venv.old/ so the update autostash (git stash --include-untracked) doesn't sweep the moved-aside venv on every run. Root-cause fix for #37881. Supersedes the --clear-only retry from c136eb4de. Co-authored-by: f3rs3n <32328813+f3rs3n@users.noreply.github.com> --- .gitignore | 1 + hermes_cli/main.py | 10 +- hermes_cli/managed_uv.py | 85 ++++++++---- tests/hermes_cli/test_managed_uv.py | 159 ++++++++++------------ tests/hermes_cli/test_update_autostash.py | 35 +++++ 5 files changed, 175 insertions(+), 115 deletions(-) diff --git a/.gitignore b/.gitignore index f97db5994..b0abe3140 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .DS_Store /venv/ +/venv.old/ /_pycache/ *.pyc* __pycache__/ diff --git a/hermes_cli/main.py b/hermes_cli/main.py index aba90fb3e..9c91e7a90 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -8030,7 +8030,10 @@ def _update_via_zip(args): # may point to a Python without FTS5. Rebuild it so the new managed # uv provides a fresh interpreter with FTS5 guaranteed. if fresh_bootstrap and uv_bin: - rebuild_venv(uv_bin, PROJECT_ROOT / "venv") + if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"): + raise RuntimeError( + "venv rebuild failed; aborting update before dependency install" + ) pip_cmd = [sys.executable, "-m", "pip"] if not uv_bin: @@ -10573,7 +10576,10 @@ def _cmd_update_impl(args, gateway_mode: bool): # may point to a Python without FTS5. Rebuild it so the new managed # uv provides a fresh interpreter with FTS5 guaranteed. if fresh_bootstrap and uv_bin: - rebuild_venv(uv_bin, PROJECT_ROOT / "venv") + if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"): + raise RuntimeError( + "venv rebuild failed; aborting update before dependency install" + ) pip_cmd = [sys.executable, "-m", "pip"] if not uv_bin: diff --git a/hermes_cli/managed_uv.py b/hermes_cli/managed_uv.py index 31bbbc8b9..722cf5ba3 100644 --- a/hermes_cli/managed_uv.py +++ b/hermes_cli/managed_uv.py @@ -106,41 +106,69 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b fresh interpreter from the current managed uv. Returns ``True`` on success. - On Windows, ``shutil.rmtree(..., ignore_errors=True)`` can silently leave - the venv directory partially intact when another process is holding an - open handle to a file inside it (typical culprits: a running - ``hermes.exe`` REPL, the gateway, AV scanners). If we don't notice that - and just call ``uv venv``, uv refuses with - ``Caused by: A directory already exists at: venv`` and the *whole - update* falls back to installing on top of the stale venv — which has - historically produced partial installs where a freshly added dependency - (e.g. ``pathspec``) silently fails to land. Retry with ``--clear`` to - force uv past that condition before giving up. + The old venv is moved aside *atomically* (``os.replace`` to ``.old``) + before recreating — never deleted in place. On Windows a still-running + ``hermes.exe`` (gateway/desktop) holds ``venv\\Scripts\\python.exe`` open; + ``shutil.rmtree(ignore_errors=True)`` would delete everything it *can* + (site-packages, certifi's cert bundle) and silently leave a half-gutted + venv that the following ``uv venv`` then refuses to overwrite ("directory + already exists") — bricking the install with no recovery (every later HTTPS + call dies with ``FileNotFoundError`` for the missing cert bundle). + ``--clear`` alone does not fix this: when the locked interpreter is *inside* + the venv being rebuilt, neither ``rmtree`` nor ``uv venv --clear`` can + delete the held ``python.exe``. ``os.replace`` of the parent directory *is* + allowed (Windows tracks a running ``.exe`` by handle, not path), so the + rebuild completes while the running process keeps using the moved-aside copy + until it restarts. If the venv genuinely cannot be moved, we abort cleanly + and leave it fully intact; and if the rebuild itself fails we move the old + venv back so Hermes is never left with no venv at all. """ + backup: Optional[Path] = None if venv_dir.exists(): print(f" → Rebuilding venv (old Python may lack FTS5)...") - shutil.rmtree(venv_dir, ignore_errors=True) + backup = venv_dir.with_name(venv_dir.name + ".old") + shutil.rmtree(backup, ignore_errors=True) # clear any stale backup + try: + # Atomic move — fails (without partial deletion) if a process still + # holds files inside the venv, which is exactly the Windows + # file-lock case that previously bricked the install. + os.replace(venv_dir, backup) + except OSError as exc: + logger.warning("venv rebuild aborted — venv in use: %s", exc) + print( + " ✗ venv rebuild aborted — the venv is in use; stop the " + f"gateway/desktop and retry ({exc})" + ) + return False - def _run_uv_venv(extra_args: list[str]) -> subprocess.CompletedProcess[str]: - return subprocess.run( - [uv_bin, "venv", str(venv_dir), "--python", python_version, *extra_args], - capture_output=True, - text=True, - check=False, - ) + result = subprocess.run( + [uv_bin, "venv", str(venv_dir), "--python", python_version, "--clear"], + capture_output=True, + text=True, + check=False, + ) - result = _run_uv_venv([]) - - # If uv refused because the directory still exists (rmtree above was - # blocked by an open file handle, common on Windows), retry with - # --clear so uv overwrites it. Match on stderr because uv's exit code - # alone doesn't distinguish "dir exists" from real failures. - if result.returncode != 0 and "already exists" in (result.stderr or "").lower(): - print(" → venv dir not fully removed (likely an open file handle); retrying with --clear...") - result = _run_uv_venv(["--clear"]) + def _restore_backup() -> None: + if backup is not None and backup.exists(): + shutil.rmtree(venv_dir, ignore_errors=True) + try: + os.replace(backup, venv_dir) + print(" ↩ Restored previous venv after failed rebuild.") + except OSError: + pass if result.returncode == 0: venv_python = venv_dir / ("Scripts" if platform.system() == "Windows" else "bin") / "python" + # uv can exit 0 yet leave no usable interpreter (e.g. a half-written + # venv). Don't report success on a venv that has no python — restore the + # moved-aside copy so the caller can abort without losing a working env. + if not venv_python.exists(): + logger.warning("venv rebuild reported success but %s is missing", venv_python) + print(f" ✗ venv rebuild failed: Python interpreter missing at {venv_python}") + _restore_backup() + return False + if backup is not None: + shutil.rmtree(backup, ignore_errors=True) py_ver = subprocess.run( [str(venv_python), "--version"], capture_output=True, @@ -150,6 +178,9 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b print(f" ✓ venv rebuilt ({py_ver})") return True else: + # Rebuild failed — restore the old venv so we never leave Hermes with no + # venv (the bricked-install failure mode this function exists to avoid). + _restore_backup() logger.warning("venv rebuild failed: %s", result.stderr) print(f" ✗ venv rebuild failed: {result.stderr.strip()}") return False diff --git a/tests/hermes_cli/test_managed_uv.py b/tests/hermes_cli/test_managed_uv.py index f1394f6ef..aff6b8de7 100644 --- a/tests/hermes_cli/test_managed_uv.py +++ b/tests/hermes_cli/test_managed_uv.py @@ -108,121 +108,108 @@ class TestEnsureUv: # --------------------------------------------------------------------------- class TestRebuildVenv: - def test_removes_old_venv_and_creates_new(self, tmp_path): + def test_moves_old_venv_aside_and_creates_new(self, tmp_path): + """The old venv is moved aside to .old (never rmtree'd in place), + uv is invoked with --clear, the moved-aside backup is removed on + success, and the rebuilt interpreter is reported.""" venv_dir = tmp_path / "venv" venv_dir.mkdir() (venv_dir / "old_file").write_text("stale") uv_bin = str(tmp_path / "bin" / "uv") + call_log: list[list[str]] = [] def fake_run(cmd, **kwargs): - m = MagicMock(returncode=0) - if cmd[1] == "venv": - # Simulate uv creating the venv dir - venv_dir.mkdir(exist_ok=True) - bin_dir = venv_dir / "bin" + call_log.append(list(cmd)) + m = MagicMock(returncode=0, stderr="", stdout="") + if len(cmd) >= 2 and cmd[1] == "venv": + # Simulate uv creating the venv dir with a python interpreter + bin_dir = venv_dir / ("Scripts" if os.name == "nt" else "bin") bin_dir.mkdir(parents=True, exist_ok=True) - (bin_dir / "python").write_text("#!/bin/sh\necho Python 3.11.0") + python_name = "python.exe" if os.name == "nt" else "python" + (bin_dir / python_name).write_text("#!/bin/sh\necho Python 3.11.0") elif "--version" in cmd: m.stdout = "Python 3.11.0" return m - with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \ - patch("hermes_cli.managed_uv.shutil.rmtree") as mock_rmtree: + with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run): from hermes_cli.managed_uv import rebuild_venv result = rebuild_venv(uv_bin, venv_dir) - assert result is True - mock_rmtree.assert_called_once_with(venv_dir, ignore_errors=True) + + assert result is True + # uv venv was invoked exactly once, always with --clear. + venv_calls = [c for c in call_log if len(c) >= 2 and c[1] == "venv"] + assert len(venv_calls) == 1, f"expected 1 venv call, got {venv_calls}" + assert "--clear" in venv_calls[0] + # The moved-aside backup is cleaned up after a successful rebuild. + assert not (tmp_path / "venv.old").exists() + + def test_aborts_without_deleting_when_venv_in_use(self, tmp_path): + """If os.replace fails (Windows file lock — venv in use), we must abort + cleanly WITHOUT deleting the venv and WITHOUT invoking uv.""" + venv_dir = tmp_path / "venv" + venv_dir.mkdir() + (venv_dir / "locked") .write_text("held open") + uv_bin = str(tmp_path / "bin" / "uv") + call_log: list[list[str]] = [] + + def fake_run(cmd, **kwargs): + call_log.append(list(cmd)) + return MagicMock(returncode=0, stderr="", stdout="") + + with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \ + patch("hermes_cli.managed_uv.os.replace", side_effect=OSError("in use")): + from hermes_cli.managed_uv import rebuild_venv + result = rebuild_venv(uv_bin, venv_dir) + + assert result is False + # venv left fully intact, uv never invoked. + assert venv_dir.exists() and (venv_dir / "locked").exists() + assert [c for c in call_log if len(c) >= 2 and c[1] == "venv"] == [] + + def test_restores_backup_when_rebuild_fails(self, tmp_path): + """If uv venv exits non-zero, the moved-aside venv is restored so we + never leave Hermes with no venv at all.""" + venv_dir = tmp_path / "venv" + venv_dir.mkdir() + (venv_dir / "marker").write_text("original") + uv_bin = str(tmp_path / "bin" / "uv") + + def fake_run(cmd, **kwargs): + return MagicMock(returncode=1, stderr="boom", stdout="") + + with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run): + from hermes_cli.managed_uv import rebuild_venv + result = rebuild_venv(uv_bin, venv_dir) + + assert result is False + # Original venv restored from the .old backup. + assert venv_dir.exists() and (venv_dir / "marker").read_text() == "original" + assert not (tmp_path / "venv.old").exists() def test_rebuild_failure_returns_false(self, tmp_path): venv_dir = tmp_path / "venv" uv_bin = str(tmp_path / "bin" / "uv") - with patch("hermes_cli.managed_uv.subprocess.run") as mock_run, \ - patch("hermes_cli.managed_uv.shutil.rmtree"): + with patch("hermes_cli.managed_uv.subprocess.run") as mock_run: mock_run.return_value = MagicMock(returncode=1, stderr="nope") from hermes_cli.managed_uv import rebuild_venv result = rebuild_venv(uv_bin, venv_dir) assert result is False - def test_retries_with_clear_when_dir_already_exists(self, tmp_path): - """On Windows, rmtree can silently fail when an open handle holds a - file in the venv (running hermes.exe, gateway, AV scanner). uv then - refuses with ``Caused by: A directory already exists at: venv``. - Make sure we don't give up — retry with ``--clear`` to force uv past - the stale directory and rebuild successfully.""" - venv_dir = tmp_path / "venv" - venv_dir.mkdir() - (venv_dir / "stale_open_handle").write_text("rmtree couldn't delete me") - - uv_bin = str(tmp_path / "bin" / "uv") - call_log: list[list[str]] = [] - - def fake_run(cmd, **kwargs): - call_log.append(list(cmd)) - m = MagicMock() - if cmd[1] == "venv" and "--clear" not in cmd: - # First attempt: uv refuses because dir still exists - m.returncode = 1 - m.stderr = ( - "error: Failed to create virtual environment\n" - " Caused by: A directory already exists at: venv\n" - "hint: Use the `--clear` flag or set `UV_VENV_CLEAR=1` to replace the existing directory\n" - ) - m.stdout = "" - return m - if cmd[1] == "venv" and "--clear" in cmd: - # Retry: succeeds. Simulate uv writing the python shim. - m.returncode = 0 - m.stderr = "" - m.stdout = "" - bin_dir = venv_dir / ("Scripts" if os.name == "nt" else "bin") - bin_dir.mkdir(parents=True, exist_ok=True) - python_name = "python.exe" if os.name == "nt" else "python" - (bin_dir / python_name).write_text("#!/bin/sh\necho Python 3.11.0") - return m - if "--version" in cmd: - m.returncode = 0 - m.stdout = "Python 3.11.0" - m.stderr = "" - return m - m.returncode = 0 - return m - - with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \ - patch("hermes_cli.managed_uv.shutil.rmtree"): - from hermes_cli.managed_uv import rebuild_venv - result = rebuild_venv(uv_bin, venv_dir) - - assert result is True, "rebuild should succeed after --clear retry" - # We expect exactly two ``uv venv`` calls: one without --clear, one with. - venv_calls = [c for c in call_log if len(c) >= 2 and c[1] == "venv"] - assert len(venv_calls) == 2, f"expected 2 venv calls, got {venv_calls}" - assert "--clear" not in venv_calls[0], "first call should not pass --clear" - assert "--clear" in venv_calls[1], "retry must pass --clear" - - def test_does_not_retry_when_first_failure_is_not_dir_exists(self, tmp_path): - """If uv venv fails for some other reason (e.g. interpreter download - failed, disk full), we should NOT silently retry with --clear — - that would mask a real problem. Just surface the original failure.""" + def test_rebuild_success_without_python_returns_false(self, tmp_path): + """uv can exit 0 yet leave no interpreter; that must not count as success + (guard adapted from #38511).""" venv_dir = tmp_path / "venv" uv_bin = str(tmp_path / "bin" / "uv") - call_log: list[list[str]] = [] - def fake_run(cmd, **kwargs): - call_log.append(list(cmd)) - m = MagicMock(returncode=1, stderr="error: No space left on device", stdout="") - return m - - with patch("hermes_cli.managed_uv.subprocess.run", side_effect=fake_run), \ - patch("hermes_cli.managed_uv.shutil.rmtree"): + with patch("hermes_cli.managed_uv.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") from hermes_cli.managed_uv import rebuild_venv result = rebuild_venv(uv_bin, venv_dir) - - assert result is False - venv_calls = [c for c in call_log if len(c) >= 2 and c[1] == "venv"] - assert len(venv_calls) == 1, "should not retry on non-dir-exists failures" - assert "--clear" not in venv_calls[0] + assert result is False + # Returned before the `python --version` probe ran (only the uv venv call). + assert mock_run.call_count == 1 # --------------------------------------------------------------------------- diff --git a/tests/hermes_cli/test_update_autostash.py b/tests/hermes_cli/test_update_autostash.py index adcd24bf3..b29179077 100644 --- a/tests/hermes_cli/test_update_autostash.py +++ b/tests/hermes_cli/test_update_autostash.py @@ -423,6 +423,41 @@ def test_cmd_update_succeeds_with_extras(monkeypatch, tmp_path): assert ".[all]" in install_cmds[0] +def test_cmd_update_aborts_when_fresh_managed_uv_rebuild_fails(monkeypatch, tmp_path): + """A failed fresh managed-uv venv rebuild must not continue into pip install + (guard adapted from #38511).""" + _setup_update_mocks(monkeypatch, tmp_path) + monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/uv" if name == "uv" else None) + monkeypatch.setattr(hermes_main, "_is_termux_env", lambda env=None: False) + + recorded = [] + + def fake_run(cmd, **kwargs): + recorded.append(cmd) + # Tolerant matching: the update flow's exact git invocations vary by + # checkout, so key off the verb. Branch detection must return a real name + # and rev-list a parseable count, or the flow aborts early before it ever + # reaches the venv rebuild this test exercises. + if isinstance(cmd, (list, tuple)) and cmd and cmd[0] == "git": + if "rev-parse" in cmd: + return SimpleNamespace(stdout="main\n", stderr="", returncode=0) + if "rev-list" in cmd: + return SimpleNamespace(stdout="1\n", stderr="", returncode=0) + if "pull" in cmd: + return SimpleNamespace(stdout="Updating\n", stderr="", returncode=0) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr(hermes_main.subprocess, "run", fake_run) + + with patch("hermes_cli.managed_uv.ensure_uv", return_value=("/usr/bin/uv", True)), \ + patch("hermes_cli.managed_uv.rebuild_venv", return_value=False), \ + pytest.raises(RuntimeError, match="venv rebuild failed"): + hermes_main.cmd_update(SimpleNamespace()) + + install_cmds = [c for c in recorded if "pip" in c and "install" in c] + assert install_cmds == [] + + def test_install_with_optional_fallback_honors_custom_group(monkeypatch): """Termux update path should target .[termux-all] when requested.""" calls = []