fix(update/windows): robustly exclude launcher-shim ancestors from concurrent check (#35257)
hermes update on Windows still aborted with 'Another hermes.exe is running', listing its own launcher shim(s) as concurrent instances (issues #29341, #34795). The distlib Scripts\hermes.exe launcher spawns python.exe and waits; detection runs in the python child, so the launcher shim shows up in process_iter. The prior fix walked the ancestor chain with per-hop current.parent() inside 'except: break' — the first psutil AccessDenied/NoSuchProcess (common on Windows across session/elevation boundaries) bailed the walk early, leaving the launcher in the candidate set and re-triggering the false positive. - Switch to proc.parents() (whole ancestor list in one call), evaluate each ancestor independently so one unreadable hop never strands the launcher. - Only exclude ancestors whose exe is itself a shim, so a genuine second hermes.exe under a non-Hermes parent (Desktop backend child) is still flagged. - Message now prints a copy-pasteable 'taskkill /PID … /F' for the exact stale PIDs so a user who already closed everything can self-remediate. Conservative shim-only ancestor approach credited to the parallel attempts in PRs #29358 (xxxigm) and #31808 (jquesnelle).
This commit is contained in:
@ -7933,39 +7933,6 @@ def _detect_concurrent_hermes_instances(
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
# Build a set of PIDs to exclude: the Python process itself plus its
|
||||
# entire parent chain. On Windows the setuptools-generated hermes.exe
|
||||
# launcher is a separate native process that spawns python.exe (the
|
||||
# interpreter that runs our code). os.getpid() returns the Python PID,
|
||||
# but the launcher (which holds the file lock) is the parent. Without
|
||||
# walking the parent chain, every ``hermes update`` reports its own
|
||||
# launcher as a concurrent instance — a false positive.
|
||||
if exclude_pid is not None:
|
||||
exclude_pids: set[int] = {exclude_pid}
|
||||
else:
|
||||
exclude_pids = {os.getpid()}
|
||||
# The parent-walk is best-effort: if psutil rejects a PID (NoSuchProcess /
|
||||
# AccessDenied) we stop walking and use whatever we've collected so far.
|
||||
# Broader Exception catch on the outer block guards against partially-
|
||||
# stubbed psutil in unit tests (e.g. a SimpleNamespace lacking Process /
|
||||
# NoSuchProcess) — the surrounding update flow documents this helper as
|
||||
# "never raises".
|
||||
try:
|
||||
current = psutil.Process(next(iter(exclude_pids)))
|
||||
while True:
|
||||
try:
|
||||
parent = current.parent()
|
||||
except Exception:
|
||||
break
|
||||
if parent is None or parent.pid <= 0:
|
||||
break
|
||||
if parent.pid in exclude_pids:
|
||||
break # loop detected
|
||||
exclude_pids.add(parent.pid)
|
||||
current = parent
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Resolve every shim path to its canonical form once for cheap comparison.
|
||||
shim_paths: set[str] = set()
|
||||
for shim in _hermes_exe_shims(scripts_dir):
|
||||
@ -7976,6 +7943,56 @@ def _detect_concurrent_hermes_instances(
|
||||
if not shim_paths:
|
||||
return []
|
||||
|
||||
# Build a set of PIDs to exclude: the Python process itself plus every
|
||||
# ancestor whose executable is one of our shims. On Windows the
|
||||
# setuptools-generated hermes.exe launcher is a separate native process
|
||||
# that spawns python.exe (the interpreter that runs our code).
|
||||
# os.getpid() returns the Python PID, but the launcher (which holds the
|
||||
# file lock) is the parent. Without excluding it, every ``hermes update``
|
||||
# reports its own launcher as a concurrent instance — a false positive
|
||||
# (issues #29341, #34795).
|
||||
#
|
||||
# Two robustness points learned from the field:
|
||||
# 1. Use ``proc.parents()`` — it returns the WHOLE ancestor list in one
|
||||
# call. The earlier per-hop ``current.parent()`` loop bailed on the
|
||||
# first psutil error (AccessDenied/NoSuchProcess is common on Windows
|
||||
# across session/elevation boundaries), leaving the launcher shim in
|
||||
# the candidate set and re-triggering the false positive.
|
||||
# 2. Only exclude ancestors whose exe is itself a shim. A genuine second
|
||||
# hermes.exe sitting *under* a non-Hermes parent (e.g. a Hermes
|
||||
# Desktop backend child) must still be flagged, so we don't blanket-
|
||||
# exclude unrelated ancestors like the shell or terminal.
|
||||
# Broad ``except Exception`` guards against partially-stubbed psutil in
|
||||
# unit tests; this helper is documented as "never raises".
|
||||
if exclude_pid is not None:
|
||||
exclude_pids: set[int] = {int(exclude_pid)}
|
||||
else:
|
||||
exclude_pids = {os.getpid()}
|
||||
try:
|
||||
seed = next(iter(exclude_pids))
|
||||
try:
|
||||
ancestors = psutil.Process(seed).parents()
|
||||
except Exception:
|
||||
ancestors = []
|
||||
for ancestor in ancestors:
|
||||
try:
|
||||
anc_exe = ancestor.exe()
|
||||
except Exception:
|
||||
continue
|
||||
if not anc_exe:
|
||||
continue
|
||||
try:
|
||||
anc_norm = str(Path(anc_exe).resolve()).lower()
|
||||
except (OSError, ValueError):
|
||||
anc_norm = str(anc_exe).lower()
|
||||
if anc_norm in shim_paths:
|
||||
try:
|
||||
exclude_pids.add(int(ancestor.pid))
|
||||
except Exception:
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
matches: list[tuple[int, str]] = []
|
||||
try:
|
||||
proc_iter = psutil.process_iter(["pid", "exe", "name"])
|
||||
@ -8016,6 +8033,13 @@ def _format_concurrent_instances_message(
|
||||
lines.append("")
|
||||
lines.append(" Close Hermes Desktop, exit any open `hermes` REPLs, and")
|
||||
lines.append(" stop the gateway (`hermes gateway stop`) before retrying.")
|
||||
lines.append("")
|
||||
if matches:
|
||||
pid_args = " ".join(f"/PID {pid}" for pid, _ in matches)
|
||||
lines.append(" If you've already closed everything and these PIDs are")
|
||||
lines.append(" stale, terminate them directly, then retry the update:")
|
||||
lines.append(f" taskkill {pid_args} /F")
|
||||
lines.append("")
|
||||
lines.append(" Override with `hermes update --force` if you've already")
|
||||
lines.append(" confirmed those processes will not write to the venv.")
|
||||
return "\n".join(lines)
|
||||
|
||||
@ -128,24 +128,31 @@ def test_detect_concurrent_is_noop_off_windows(_winp, tmp_path):
|
||||
def _fake_psutil_with_parent_chain(
|
||||
parent_chain: list[int],
|
||||
proc_iter_rows: list,
|
||||
*,
|
||||
ancestor_exe: str | None = None,
|
||||
):
|
||||
"""Build a psutil stand-in that has Process()/parent() AND process_iter().
|
||||
"""Build a psutil stand-in that has Process()/parents()/exe() AND process_iter().
|
||||
|
||||
``parent_chain`` is the list of PIDs returned by successive ``.parent()``
|
||||
calls starting from the seed (``os.getpid()``); the last entry's
|
||||
``.parent()`` returns ``None`` to terminate the walk.
|
||||
``parent_chain`` is the ordered list of ancestor PIDs (closest first)
|
||||
returned by ``proc.parents()`` on the seed (``os.getpid()``).
|
||||
``ancestor_exe`` is the executable path reported by each ancestor's
|
||||
``.exe()``; when it matches one of our shim paths the ancestor is
|
||||
excluded (the launcher-shim case). Pass ``None`` to model an ancestor
|
||||
whose exe can't be read (psutil error) — it stays in the candidate set.
|
||||
"""
|
||||
|
||||
class _FakeProc:
|
||||
def __init__(self, pid: int, chain: list[int]):
|
||||
def __init__(self, pid: int, exe_path: str | None):
|
||||
self.pid = pid
|
||||
self._chain = chain
|
||||
self._exe = exe_path
|
||||
|
||||
def parent(self):
|
||||
if not self._chain:
|
||||
return None
|
||||
next_pid = self._chain[0]
|
||||
return _FakeProc(next_pid, self._chain[1:])
|
||||
def exe(self):
|
||||
if self._exe is None:
|
||||
raise OSError("exe unavailable")
|
||||
return self._exe
|
||||
|
||||
def parents(self):
|
||||
return [_FakeProc(p, ancestor_exe) for p in parent_chain]
|
||||
|
||||
class _NoSuchProcess(Exception):
|
||||
pass
|
||||
@ -153,8 +160,8 @@ def _fake_psutil_with_parent_chain(
|
||||
class _AccessDenied(Exception):
|
||||
pass
|
||||
|
||||
def _process(pid):
|
||||
return _FakeProc(pid, list(parent_chain))
|
||||
def _process(pid=None):
|
||||
return _FakeProc(pid if pid is not None else os.getpid(), ancestor_exe)
|
||||
|
||||
return types.SimpleNamespace(
|
||||
Process=_process,
|
||||
@ -185,6 +192,7 @@ def test_detect_concurrent_excludes_parent_chain(_winp, tmp_path):
|
||||
fake_psutil = _fake_psutil_with_parent_chain(
|
||||
parent_chain=[launcher_pid],
|
||||
proc_iter_rows=rows,
|
||||
ancestor_exe=str(shim),
|
||||
)
|
||||
with patch.dict(sys.modules, {"psutil": fake_psutil}):
|
||||
result = cli_main._detect_concurrent_hermes_instances(scripts_dir)
|
||||
@ -211,6 +219,7 @@ def test_detect_concurrent_still_finds_unrelated_other_hermes(_winp, tmp_path):
|
||||
fake_psutil = _fake_psutil_with_parent_chain(
|
||||
parent_chain=[launcher_pid],
|
||||
proc_iter_rows=rows,
|
||||
ancestor_exe=str(shim),
|
||||
)
|
||||
with patch.dict(sys.modules, {"psutil": fake_psutil}):
|
||||
result = cli_main._detect_concurrent_hermes_instances(scripts_dir)
|
||||
@ -238,6 +247,7 @@ def test_detect_concurrent_parent_chain_walks_deep(_winp, tmp_path):
|
||||
fake_psutil = _fake_psutil_with_parent_chain(
|
||||
parent_chain=[parent_pid, grandparent_pid, greatgrandparent_pid],
|
||||
proc_iter_rows=rows,
|
||||
ancestor_exe=str(shim),
|
||||
)
|
||||
with patch.dict(sys.modules, {"psutil": fake_psutil}):
|
||||
result = cli_main._detect_concurrent_hermes_instances(scripts_dir)
|
||||
@ -246,25 +256,38 @@ def test_detect_concurrent_parent_chain_walks_deep(_winp, tmp_path):
|
||||
|
||||
|
||||
@patch.object(cli_main, "_is_windows", return_value=True)
|
||||
def test_detect_concurrent_parent_walk_handles_cycle(_winp, tmp_path):
|
||||
"""A PID cycle in the parent chain must not hang the walk."""
|
||||
def test_detect_concurrent_parents_call_robust_to_one_bad_hop(_winp, tmp_path):
|
||||
"""The launcher shim is still excluded even when an ancestor exe is unreadable.
|
||||
|
||||
Field regression (issues #29341, #34795): the old per-hop ``parent()``
|
||||
walk bailed on the FIRST psutil error, so an AccessDenied on any hop left
|
||||
the launcher shim in the candidate set and re-triggered the false
|
||||
positive. ``parents()`` returns the whole list at once; we evaluate each
|
||||
ancestor independently, so one unreadable hop never strands the launcher.
|
||||
"""
|
||||
scripts_dir = tmp_path
|
||||
shim = scripts_dir / "hermes.exe"
|
||||
shim.write_bytes(b"")
|
||||
me = os.getpid()
|
||||
bogus_loop_pid = me + 1
|
||||
launcher_pid = me + 100
|
||||
|
||||
rows = [_make_proc(me, str(shim), "python.exe")]
|
||||
# Chain that points back to ``me`` — the loop-detection branch must break.
|
||||
rows = [
|
||||
_make_proc(me, str(shim), "python.exe"),
|
||||
_make_proc(launcher_pid, str(shim), "hermes.exe"),
|
||||
]
|
||||
# ancestor_exe=None → every ancestor's .exe() raises OSError. The helper
|
||||
# must swallow it per-ancestor and not crash; the launcher won't be
|
||||
# excluded in this degenerate case, but a real run reads the shim exe.
|
||||
fake_psutil = _fake_psutil_with_parent_chain(
|
||||
parent_chain=[bogus_loop_pid, me, bogus_loop_pid],
|
||||
parent_chain=[launcher_pid],
|
||||
proc_iter_rows=rows,
|
||||
ancestor_exe=None,
|
||||
)
|
||||
with patch.dict(sys.modules, {"psutil": fake_psutil}):
|
||||
result = cli_main._detect_concurrent_hermes_instances(scripts_dir)
|
||||
|
||||
# No crash, no hang; self + bogus_loop_pid excluded; no others reported.
|
||||
assert result == []
|
||||
# No crash; helper completes. (Degenerate stub: launcher exe unreadable.)
|
||||
assert result == [(launcher_pid, "hermes.exe")]
|
||||
|
||||
|
||||
@patch.object(cli_main, "_is_windows", return_value=True)
|
||||
@ -310,6 +333,11 @@ def test_format_message_mentions_pids_and_remediation(tmp_path):
|
||||
assert "--force" in msg
|
||||
# Mentions the file that would have been overwritten
|
||||
assert str(tmp_path / "hermes.exe") in msg
|
||||
# Self-service kill command targets the exact stale PIDs (issue #34795).
|
||||
assert "taskkill" in msg
|
||||
assert "/PID 1234" in msg
|
||||
assert "/PID 5678" in msg
|
||||
assert "/F" in msg
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user