fix(uv): move venv aside instead of gutting it in place on Windows rebuild
hermes update can brick a Windows install. When 'hermes update --force' runs past the concurrent-process guard, rebuild_venv runs while the venv is still in use: shutil.rmtree(ignore_errors=True) deletes site-packages + certifi's cert bundle but can't remove the locked python.exe, leaving a half-gutted venv that uv venv then refuses to overwrite. Every later HTTPS call dies with FileNotFoundError for the missing cacert and there is no recovery. --clear alone (the c136eb4de retry path) does not fix the real lock case: when the locked interpreter is *inside* the venv being rebuilt, neither rmtree nor uv venv --clear can delete it. os.replace of the parent directory *is* allowed on Windows (a running .exe is tracked by handle, not path), so we move the old venv aside atomically to <venv>.old, rebuild with --clear in its place, and the still-running gateway/desktop keep using the moved-aside copy until they restart. If the venv genuinely can't be moved, we abort cleanly and leave it fully intact; if the rebuild fails, we restore the moved-aside copy. Folds in the call-site guards from #38511 (@f3rs3n): - rebuild_venv() returns False (and restores the backup) if uv exits 0 without producing an interpreter. - both hermes update venv-rebuild call sites abort with RuntimeError instead of continuing into dependency install when rebuild_venv() returns False. Also gitignore /venv.old/ so the update autostash (git stash --include-untracked) doesn't sweep the moved-aside venv on every run. Root-cause fix for #37881. Supersedes the --clear-only retry from c136eb4de. Co-authored-by: f3rs3n <32328813+f3rs3n@users.noreply.github.com>
This commit is contained in:
@ -8030,7 +8030,10 @@ def _update_via_zip(args):
|
||||
# may point to a Python without FTS5. Rebuild it so the new managed
|
||||
# uv provides a fresh interpreter with FTS5 guaranteed.
|
||||
if fresh_bootstrap and uv_bin:
|
||||
rebuild_venv(uv_bin, PROJECT_ROOT / "venv")
|
||||
if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"):
|
||||
raise RuntimeError(
|
||||
"venv rebuild failed; aborting update before dependency install"
|
||||
)
|
||||
|
||||
pip_cmd = [sys.executable, "-m", "pip"]
|
||||
if not uv_bin:
|
||||
@ -10573,7 +10576,10 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
# may point to a Python without FTS5. Rebuild it so the new managed
|
||||
# uv provides a fresh interpreter with FTS5 guaranteed.
|
||||
if fresh_bootstrap and uv_bin:
|
||||
rebuild_venv(uv_bin, PROJECT_ROOT / "venv")
|
||||
if not rebuild_venv(uv_bin, PROJECT_ROOT / "venv"):
|
||||
raise RuntimeError(
|
||||
"venv rebuild failed; aborting update before dependency install"
|
||||
)
|
||||
|
||||
pip_cmd = [sys.executable, "-m", "pip"]
|
||||
if not uv_bin:
|
||||
|
||||
@ -106,41 +106,69 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b
|
||||
fresh interpreter from the current managed uv. Returns ``True`` on
|
||||
success.
|
||||
|
||||
On Windows, ``shutil.rmtree(..., ignore_errors=True)`` can silently leave
|
||||
the venv directory partially intact when another process is holding an
|
||||
open handle to a file inside it (typical culprits: a running
|
||||
``hermes.exe`` REPL, the gateway, AV scanners). If we don't notice that
|
||||
and just call ``uv venv``, uv refuses with
|
||||
``Caused by: A directory already exists at: venv`` and the *whole
|
||||
update* falls back to installing on top of the stale venv — which has
|
||||
historically produced partial installs where a freshly added dependency
|
||||
(e.g. ``pathspec``) silently fails to land. Retry with ``--clear`` to
|
||||
force uv past that condition before giving up.
|
||||
The old venv is moved aside *atomically* (``os.replace`` to ``<venv>.old``)
|
||||
before recreating — never deleted in place. On Windows a still-running
|
||||
``hermes.exe`` (gateway/desktop) holds ``venv\\Scripts\\python.exe`` open;
|
||||
``shutil.rmtree(ignore_errors=True)`` would delete everything it *can*
|
||||
(site-packages, certifi's cert bundle) and silently leave a half-gutted
|
||||
venv that the following ``uv venv`` then refuses to overwrite ("directory
|
||||
already exists") — bricking the install with no recovery (every later HTTPS
|
||||
call dies with ``FileNotFoundError`` for the missing cert bundle).
|
||||
``--clear`` alone does not fix this: when the locked interpreter is *inside*
|
||||
the venv being rebuilt, neither ``rmtree`` nor ``uv venv --clear`` can
|
||||
delete the held ``python.exe``. ``os.replace`` of the parent directory *is*
|
||||
allowed (Windows tracks a running ``.exe`` by handle, not path), so the
|
||||
rebuild completes while the running process keeps using the moved-aside copy
|
||||
until it restarts. If the venv genuinely cannot be moved, we abort cleanly
|
||||
and leave it fully intact; and if the rebuild itself fails we move the old
|
||||
venv back so Hermes is never left with no venv at all.
|
||||
"""
|
||||
backup: Optional[Path] = None
|
||||
if venv_dir.exists():
|
||||
print(f" → Rebuilding venv (old Python may lack FTS5)...")
|
||||
shutil.rmtree(venv_dir, ignore_errors=True)
|
||||
backup = venv_dir.with_name(venv_dir.name + ".old")
|
||||
shutil.rmtree(backup, ignore_errors=True) # clear any stale backup
|
||||
try:
|
||||
# Atomic move — fails (without partial deletion) if a process still
|
||||
# holds files inside the venv, which is exactly the Windows
|
||||
# file-lock case that previously bricked the install.
|
||||
os.replace(venv_dir, backup)
|
||||
except OSError as exc:
|
||||
logger.warning("venv rebuild aborted — venv in use: %s", exc)
|
||||
print(
|
||||
" ✗ venv rebuild aborted — the venv is in use; stop the "
|
||||
f"gateway/desktop and retry ({exc})"
|
||||
)
|
||||
return False
|
||||
|
||||
def _run_uv_venv(extra_args: list[str]) -> subprocess.CompletedProcess[str]:
|
||||
return subprocess.run(
|
||||
[uv_bin, "venv", str(venv_dir), "--python", python_version, *extra_args],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
result = subprocess.run(
|
||||
[uv_bin, "venv", str(venv_dir), "--python", python_version, "--clear"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
|
||||
result = _run_uv_venv([])
|
||||
|
||||
# If uv refused because the directory still exists (rmtree above was
|
||||
# blocked by an open file handle, common on Windows), retry with
|
||||
# --clear so uv overwrites it. Match on stderr because uv's exit code
|
||||
# alone doesn't distinguish "dir exists" from real failures.
|
||||
if result.returncode != 0 and "already exists" in (result.stderr or "").lower():
|
||||
print(" → venv dir not fully removed (likely an open file handle); retrying with --clear...")
|
||||
result = _run_uv_venv(["--clear"])
|
||||
def _restore_backup() -> None:
|
||||
if backup is not None and backup.exists():
|
||||
shutil.rmtree(venv_dir, ignore_errors=True)
|
||||
try:
|
||||
os.replace(backup, venv_dir)
|
||||
print(" ↩ Restored previous venv after failed rebuild.")
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
if result.returncode == 0:
|
||||
venv_python = venv_dir / ("Scripts" if platform.system() == "Windows" else "bin") / "python"
|
||||
# uv can exit 0 yet leave no usable interpreter (e.g. a half-written
|
||||
# venv). Don't report success on a venv that has no python — restore the
|
||||
# moved-aside copy so the caller can abort without losing a working env.
|
||||
if not venv_python.exists():
|
||||
logger.warning("venv rebuild reported success but %s is missing", venv_python)
|
||||
print(f" ✗ venv rebuild failed: Python interpreter missing at {venv_python}")
|
||||
_restore_backup()
|
||||
return False
|
||||
if backup is not None:
|
||||
shutil.rmtree(backup, ignore_errors=True)
|
||||
py_ver = subprocess.run(
|
||||
[str(venv_python), "--version"],
|
||||
capture_output=True,
|
||||
@ -150,6 +178,9 @@ def rebuild_venv(uv_bin: str, venv_dir: Path, python_version: str = "3.11") -> b
|
||||
print(f" ✓ venv rebuilt ({py_ver})")
|
||||
return True
|
||||
else:
|
||||
# Rebuild failed — restore the old venv so we never leave Hermes with no
|
||||
# venv (the bricked-install failure mode this function exists to avoid).
|
||||
_restore_backup()
|
||||
logger.warning("venv rebuild failed: %s", result.stderr)
|
||||
print(f" ✗ venv rebuild failed: {result.stderr.strip()}")
|
||||
return False
|
||||
|
||||
Reference in New Issue
Block a user