fix(update/windows): robustly exclude launcher-shim ancestors from concurrent check (#35257)

hermes update on Windows still aborted with 'Another hermes.exe is running',
listing its own launcher shim(s) as concurrent instances (issues #29341,
#34795). The distlib Scripts\hermes.exe launcher spawns python.exe and waits;
detection runs in the python child, so the launcher shim shows up in
process_iter.

The prior fix walked the ancestor chain with per-hop current.parent() inside
'except: break' — the first psutil AccessDenied/NoSuchProcess (common on
Windows across session/elevation boundaries) bailed the walk early, leaving
the launcher in the candidate set and re-triggering the false positive.

- Switch to proc.parents() (whole ancestor list in one call), evaluate each
  ancestor independently so one unreadable hop never strands the launcher.
- Only exclude ancestors whose exe is itself a shim, so a genuine second
  hermes.exe under a non-Hermes parent (Desktop backend child) is still flagged.
- Message now prints a copy-pasteable 'taskkill /PID … /F' for the exact stale
  PIDs so a user who already closed everything can self-remediate.

Conservative shim-only ancestor approach credited to the parallel attempts in
PRs #29358 (xxxigm) and #31808 (jquesnelle).
This commit is contained in:
Teknium
2026-05-30 02:38:40 -07:00
committed by GitHub
parent 8bd00607dc
commit 2475244ca0
2 changed files with 106 additions and 54 deletions

View File

@ -7933,39 +7933,6 @@ def _detect_concurrent_hermes_instances(
except Exception:
return []
# Build a set of PIDs to exclude: the Python process itself plus its
# entire parent chain. On Windows the setuptools-generated hermes.exe
# launcher is a separate native process that spawns python.exe (the
# interpreter that runs our code). os.getpid() returns the Python PID,
# but the launcher (which holds the file lock) is the parent. Without
# walking the parent chain, every ``hermes update`` reports its own
# launcher as a concurrent instance — a false positive.
if exclude_pid is not None:
exclude_pids: set[int] = {exclude_pid}
else:
exclude_pids = {os.getpid()}
# The parent-walk is best-effort: if psutil rejects a PID (NoSuchProcess /
# AccessDenied) we stop walking and use whatever we've collected so far.
# Broader Exception catch on the outer block guards against partially-
# stubbed psutil in unit tests (e.g. a SimpleNamespace lacking Process /
# NoSuchProcess) — the surrounding update flow documents this helper as
# "never raises".
try:
current = psutil.Process(next(iter(exclude_pids)))
while True:
try:
parent = current.parent()
except Exception:
break
if parent is None or parent.pid <= 0:
break
if parent.pid in exclude_pids:
break # loop detected
exclude_pids.add(parent.pid)
current = parent
except Exception:
pass
# Resolve every shim path to its canonical form once for cheap comparison.
shim_paths: set[str] = set()
for shim in _hermes_exe_shims(scripts_dir):
@ -7976,6 +7943,56 @@ def _detect_concurrent_hermes_instances(
if not shim_paths:
return []
# Build a set of PIDs to exclude: the Python process itself plus every
# ancestor whose executable is one of our shims. On Windows the
# setuptools-generated hermes.exe launcher is a separate native process
# that spawns python.exe (the interpreter that runs our code).
# os.getpid() returns the Python PID, but the launcher (which holds the
# file lock) is the parent. Without excluding it, every ``hermes update``
# reports its own launcher as a concurrent instance — a false positive
# (issues #29341, #34795).
#
# Two robustness points learned from the field:
# 1. Use ``proc.parents()`` — it returns the WHOLE ancestor list in one
# call. The earlier per-hop ``current.parent()`` loop bailed on the
# first psutil error (AccessDenied/NoSuchProcess is common on Windows
# across session/elevation boundaries), leaving the launcher shim in
# the candidate set and re-triggering the false positive.
# 2. Only exclude ancestors whose exe is itself a shim. A genuine second
# hermes.exe sitting *under* a non-Hermes parent (e.g. a Hermes
# Desktop backend child) must still be flagged, so we don't blanket-
# exclude unrelated ancestors like the shell or terminal.
# Broad ``except Exception`` guards against partially-stubbed psutil in
# unit tests; this helper is documented as "never raises".
if exclude_pid is not None:
exclude_pids: set[int] = {int(exclude_pid)}
else:
exclude_pids = {os.getpid()}
try:
seed = next(iter(exclude_pids))
try:
ancestors = psutil.Process(seed).parents()
except Exception:
ancestors = []
for ancestor in ancestors:
try:
anc_exe = ancestor.exe()
except Exception:
continue
if not anc_exe:
continue
try:
anc_norm = str(Path(anc_exe).resolve()).lower()
except (OSError, ValueError):
anc_norm = str(anc_exe).lower()
if anc_norm in shim_paths:
try:
exclude_pids.add(int(ancestor.pid))
except Exception:
continue
except Exception:
pass
matches: list[tuple[int, str]] = []
try:
proc_iter = psutil.process_iter(["pid", "exe", "name"])
@ -8016,6 +8033,13 @@ def _format_concurrent_instances_message(
lines.append("")
lines.append(" Close Hermes Desktop, exit any open `hermes` REPLs, and")
lines.append(" stop the gateway (`hermes gateway stop`) before retrying.")
lines.append("")
if matches:
pid_args = " ".join(f"/PID {pid}" for pid, _ in matches)
lines.append(" If you've already closed everything and these PIDs are")
lines.append(" stale, terminate them directly, then retry the update:")
lines.append(f" taskkill {pid_args} /F")
lines.append("")
lines.append(" Override with `hermes update --force` if you've already")
lines.append(" confirmed those processes will not write to the venv.")
return "\n".join(lines)

View File

@ -128,24 +128,31 @@ def test_detect_concurrent_is_noop_off_windows(_winp, tmp_path):
def _fake_psutil_with_parent_chain(
parent_chain: list[int],
proc_iter_rows: list,
*,
ancestor_exe: str | None = None,
):
"""Build a psutil stand-in that has Process()/parent() AND process_iter().
"""Build a psutil stand-in that has Process()/parents()/exe() AND process_iter().
``parent_chain`` is the list of PIDs returned by successive ``.parent()``
calls starting from the seed (``os.getpid()``); the last entry's
``.parent()`` returns ``None`` to terminate the walk.
``parent_chain`` is the ordered list of ancestor PIDs (closest first)
returned by ``proc.parents()`` on the seed (``os.getpid()``).
``ancestor_exe`` is the executable path reported by each ancestor's
``.exe()``; when it matches one of our shim paths the ancestor is
excluded (the launcher-shim case). Pass ``None`` to model an ancestor
whose exe can't be read (psutil error) — it stays in the candidate set.
"""
class _FakeProc:
def __init__(self, pid: int, chain: list[int]):
def __init__(self, pid: int, exe_path: str | None):
self.pid = pid
self._chain = chain
self._exe = exe_path
def parent(self):
if not self._chain:
return None
next_pid = self._chain[0]
return _FakeProc(next_pid, self._chain[1:])
def exe(self):
if self._exe is None:
raise OSError("exe unavailable")
return self._exe
def parents(self):
return [_FakeProc(p, ancestor_exe) for p in parent_chain]
class _NoSuchProcess(Exception):
pass
@ -153,8 +160,8 @@ def _fake_psutil_with_parent_chain(
class _AccessDenied(Exception):
pass
def _process(pid):
return _FakeProc(pid, list(parent_chain))
def _process(pid=None):
return _FakeProc(pid if pid is not None else os.getpid(), ancestor_exe)
return types.SimpleNamespace(
Process=_process,
@ -185,6 +192,7 @@ def test_detect_concurrent_excludes_parent_chain(_winp, tmp_path):
fake_psutil = _fake_psutil_with_parent_chain(
parent_chain=[launcher_pid],
proc_iter_rows=rows,
ancestor_exe=str(shim),
)
with patch.dict(sys.modules, {"psutil": fake_psutil}):
result = cli_main._detect_concurrent_hermes_instances(scripts_dir)
@ -211,6 +219,7 @@ def test_detect_concurrent_still_finds_unrelated_other_hermes(_winp, tmp_path):
fake_psutil = _fake_psutil_with_parent_chain(
parent_chain=[launcher_pid],
proc_iter_rows=rows,
ancestor_exe=str(shim),
)
with patch.dict(sys.modules, {"psutil": fake_psutil}):
result = cli_main._detect_concurrent_hermes_instances(scripts_dir)
@ -238,6 +247,7 @@ def test_detect_concurrent_parent_chain_walks_deep(_winp, tmp_path):
fake_psutil = _fake_psutil_with_parent_chain(
parent_chain=[parent_pid, grandparent_pid, greatgrandparent_pid],
proc_iter_rows=rows,
ancestor_exe=str(shim),
)
with patch.dict(sys.modules, {"psutil": fake_psutil}):
result = cli_main._detect_concurrent_hermes_instances(scripts_dir)
@ -246,25 +256,38 @@ def test_detect_concurrent_parent_chain_walks_deep(_winp, tmp_path):
@patch.object(cli_main, "_is_windows", return_value=True)
def test_detect_concurrent_parent_walk_handles_cycle(_winp, tmp_path):
"""A PID cycle in the parent chain must not hang the walk."""
def test_detect_concurrent_parents_call_robust_to_one_bad_hop(_winp, tmp_path):
"""The launcher shim is still excluded even when an ancestor exe is unreadable.
Field regression (issues #29341, #34795): the old per-hop ``parent()``
walk bailed on the FIRST psutil error, so an AccessDenied on any hop left
the launcher shim in the candidate set and re-triggered the false
positive. ``parents()`` returns the whole list at once; we evaluate each
ancestor independently, so one unreadable hop never strands the launcher.
"""
scripts_dir = tmp_path
shim = scripts_dir / "hermes.exe"
shim.write_bytes(b"")
me = os.getpid()
bogus_loop_pid = me + 1
launcher_pid = me + 100
rows = [_make_proc(me, str(shim), "python.exe")]
# Chain that points back to ``me`` — the loop-detection branch must break.
rows = [
_make_proc(me, str(shim), "python.exe"),
_make_proc(launcher_pid, str(shim), "hermes.exe"),
]
# ancestor_exe=None → every ancestor's .exe() raises OSError. The helper
# must swallow it per-ancestor and not crash; the launcher won't be
# excluded in this degenerate case, but a real run reads the shim exe.
fake_psutil = _fake_psutil_with_parent_chain(
parent_chain=[bogus_loop_pid, me, bogus_loop_pid],
parent_chain=[launcher_pid],
proc_iter_rows=rows,
ancestor_exe=None,
)
with patch.dict(sys.modules, {"psutil": fake_psutil}):
result = cli_main._detect_concurrent_hermes_instances(scripts_dir)
# No crash, no hang; self + bogus_loop_pid excluded; no others reported.
assert result == []
# No crash; helper completes. (Degenerate stub: launcher exe unreadable.)
assert result == [(launcher_pid, "hermes.exe")]
@patch.object(cli_main, "_is_windows", return_value=True)
@ -310,6 +333,11 @@ def test_format_message_mentions_pids_and_remediation(tmp_path):
assert "--force" in msg
# Mentions the file that would have been overwritten
assert str(tmp_path / "hermes.exe") in msg
# Self-service kill command targets the exact stale PIDs (issue #34795).
assert "taskkill" in msg
assert "/PID 1234" in msg
assert "/PID 5678" in msg
assert "/F" in msg
# ---------------------------------------------------------------------------