fix(uv): move venv aside instead of gutting it in place on Windows rebuild

hermes update can brick a Windows install. When 'hermes update --force' runs
past the concurrent-process guard, rebuild_venv runs while the venv is still in
use: shutil.rmtree(ignore_errors=True) deletes site-packages + certifi's cert
bundle but can't remove the locked python.exe, leaving a half-gutted venv that
uv venv then refuses to overwrite. Every later HTTPS call dies with
FileNotFoundError for the missing cacert and there is no recovery.

--clear alone (the c136eb4de retry path) does not fix the real lock case: when
the locked interpreter is *inside* the venv being rebuilt, neither rmtree nor
uv venv --clear can delete it. os.replace of the parent directory *is* allowed
on Windows (a running .exe is tracked by handle, not path), so we move the old
venv aside atomically to <venv>.old, rebuild with --clear in its place, and the
still-running gateway/desktop keep using the moved-aside copy until they
restart. If the venv genuinely can't be moved, we abort cleanly and leave it
fully intact; if the rebuild fails, we restore the moved-aside copy.

Folds in the call-site guards from #38511 (@f3rs3n):
- rebuild_venv() returns False (and restores the backup) if uv exits 0 without
  producing an interpreter.
- both hermes update venv-rebuild call sites abort with RuntimeError instead of
  continuing into dependency install when rebuild_venv() returns False.

Also gitignore /venv.old/ so the update autostash (git stash --include-untracked)
doesn't sweep the moved-aside venv on every run.

Root-cause fix for #37881. Supersedes the --clear-only retry from c136eb4de.

Co-authored-by: f3rs3n <32328813+f3rs3n@users.noreply.github.com>
This commit is contained in:
Jeff
2026-06-04 08:04:01 -07:00
committed by ethernet
parent ee7948ea6e
commit 1f347ee543
5 changed files with 175 additions and 115 deletions

View File

@ -8030,7 +8030,10 @@ def _update_via_zip(args):
# may point to a Python without FTS5. Rebuild it so the new managed
# uv provides a fresh interpreter with FTS5 guaranteed.
if fresh_bootstrap and uv_bin:
rebuild_venv(uv_bin, PROJECT_ROOT / "venv")
if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"):
raise RuntimeError(
"venv rebuild failed; aborting update before dependency install"
)
pip_cmd = [sys.executable, "-m", "pip"]
if not uv_bin:
@ -10573,7 +10576,10 @@ def _cmd_update_impl(args, gateway_mode: bool):
# may point to a Python without FTS5. Rebuild it so the new managed
# uv provides a fresh interpreter with FTS5 guaranteed.
if fresh_bootstrap and uv_bin:
rebuild_venv(uv_bin, PROJECT_ROOT / "venv")
if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"):
raise RuntimeError(
"venv rebuild failed; aborting update before dependency install"
)
pip_cmd = [sys.executable, "-m", "pip"]
if not uv_bin:

View File

@ -106,41 +106,69 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b
fresh interpreter from the current managed uv. Returns ``True`` on
success.
On Windows, ``shutil.rmtree(..., ignore_errors=True)`` can silently leave
the venv directory partially intact when another process is holding an
open handle to a file inside it (typical culprits: a running
``hermes.exe`` REPL, the gateway, AV scanners). If we don't notice that
and just call ``uv venv``, uv refuses with
``Caused by: A directory already exists at: venv`` and the *whole
update* falls back to installing on top of the stale venv — which has
historically produced partial installs where a freshly added dependency
(e.g. ``pathspec``) silently fails to land. Retry with ``--clear`` to
force uv past that condition before giving up.
The old venv is moved aside *atomically* (``os.replace`` to ``<venv>.old``)
before recreating — never deleted in place. On Windows a still-running
``hermes.exe`` (gateway/desktop) holds ``venv\\Scripts\\python.exe`` open;
``shutil.rmtree(ignore_errors=True)`` would delete everything it *can*
(site-packages, certifi's cert bundle) and silently leave a half-gutted
venv that the following ``uv venv`` then refuses to overwrite ("directory
already exists") — bricking the install with no recovery (every later HTTPS
call dies with ``FileNotFoundError`` for the missing cert bundle).
``--clear`` alone does not fix this: when the locked interpreter is *inside*
the venv being rebuilt, neither ``rmtree`` nor ``uv venv --clear`` can
delete the held ``python.exe``. ``os.replace`` of the parent directory *is*
allowed (Windows tracks a running ``.exe`` by handle, not path), so the
rebuild completes while the running process keeps using the moved-aside copy
until it restarts. If the venv genuinely cannot be moved, we abort cleanly
and leave it fully intact; and if the rebuild itself fails we move the old
venv back so Hermes is never left with no venv at all.
"""
backup: Optional[Path] = None
if venv_dir.exists():
print(f" → Rebuilding venv (old Python may lack FTS5)...")
shutil.rmtree(venv_dir, ignore_errors=True)
backup = venv_dir.with_name(venv_dir.name + ".old")
shutil.rmtree(backup, ignore_errors=True) # clear any stale backup
try:
# Atomic move — fails (without partial deletion) if a process still
# holds files inside the venv, which is exactly the Windows
# file-lock case that previously bricked the install.
os.replace(venv_dir, backup)
except OSError as exc:
logger.warning("venv rebuild aborted — venv in use: %s", exc)
print(
" ✗ venv rebuild aborted — the venv is in use; stop the "
f"gateway/desktop and retry ({exc})"
)
return False
def _run_uv_venv(extra_args: list[str]) -> subprocess.CompletedProcess[str]:
return subprocess.run(
[uv_bin, "venv", str(venv_dir), "--python", python_version, *extra_args],
capture_output=True,
text=True,
check=False,
)
result = subprocess.run(
[uv_bin, "venv", str(venv_dir), "--python", python_version, "--clear"],
capture_output=True,
text=True,
check=False,
)
result = _run_uv_venv([])
# If uv refused because the directory still exists (rmtree above was
# blocked by an open file handle, common on Windows), retry with
# --clear so uv overwrites it. Match on stderr because uv's exit code
# alone doesn't distinguish "dir exists" from real failures.
if result.returncode != 0 and "already exists" in (result.stderr or "").lower():
print(" → venv dir not fully removed (likely an open file handle); retrying with --clear...")
result = _run_uv_venv(["--clear"])
def _restore_backup() -> None:
if backup is not None and backup.exists():
shutil.rmtree(venv_dir, ignore_errors=True)
try:
os.replace(backup, venv_dir)
print(" ↩ Restored previous venv after failed rebuild.")
except OSError:
pass
if result.returncode == 0:
venv_python = venv_dir / ("Scripts" if platform.system() == "Windows" else "bin") / "python"
# uv can exit 0 yet leave no usable interpreter (e.g. a half-written
# venv). Don't report success on a venv that has no python — restore the
# moved-aside copy so the caller can abort without losing a working env.
if not venv_python.exists():
logger.warning("venv rebuild reported success but %s is missing", venv_python)
print(f" ✗ venv rebuild failed: Python interpreter missing at {venv_python}")
_restore_backup()
return False
if backup is not None:
shutil.rmtree(backup, ignore_errors=True)
py_ver = subprocess.run(
[str(venv_python), "--version"],
capture_output=True,
@ -150,6 +178,9 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b
print(f" ✓ venv rebuilt ({py_ver})")
return True
else:
# Rebuild failed — restore the old venv so we never leave Hermes with no
# venv (the bricked-install failure mode this function exists to avoid).
_restore_backup()
logger.warning("venv rebuild failed: %s", result.stderr)
print(f" ✗ venv rebuild failed: {result.stderr.strip()}")
return False