diff --git a/hermes_cli/main.py b/hermes_cli/main.py index b55d3f65a..e039ee51c 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -7933,39 +7933,6 @@ def _detect_concurrent_hermes_instances( except Exception: return [] - # Build a set of PIDs to exclude: the Python process itself plus its - # entire parent chain. On Windows the setuptools-generated hermes.exe - # launcher is a separate native process that spawns python.exe (the - # interpreter that runs our code). os.getpid() returns the Python PID, - # but the launcher (which holds the file lock) is the parent. Without - # walking the parent chain, every ``hermes update`` reports its own - # launcher as a concurrent instance — a false positive. - if exclude_pid is not None: - exclude_pids: set[int] = {exclude_pid} - else: - exclude_pids = {os.getpid()} - # The parent-walk is best-effort: if psutil rejects a PID (NoSuchProcess / - # AccessDenied) we stop walking and use whatever we've collected so far. - # Broader Exception catch on the outer block guards against partially- - # stubbed psutil in unit tests (e.g. a SimpleNamespace lacking Process / - # NoSuchProcess) — the surrounding update flow documents this helper as - # "never raises". - try: - current = psutil.Process(next(iter(exclude_pids))) - while True: - try: - parent = current.parent() - except Exception: - break - if parent is None or parent.pid <= 0: - break - if parent.pid in exclude_pids: - break # loop detected - exclude_pids.add(parent.pid) - current = parent - except Exception: - pass - # Resolve every shim path to its canonical form once for cheap comparison. shim_paths: set[str] = set() for shim in _hermes_exe_shims(scripts_dir): @@ -7976,6 +7943,56 @@ def _detect_concurrent_hermes_instances( if not shim_paths: return [] + # Build a set of PIDs to exclude: the Python process itself plus every + # ancestor whose executable is one of our shims. On Windows the + # setuptools-generated hermes.exe launcher is a separate native process + # that spawns python.exe (the interpreter that runs our code). + # os.getpid() returns the Python PID, but the launcher (which holds the + # file lock) is the parent. Without excluding it, every ``hermes update`` + # reports its own launcher as a concurrent instance — a false positive + # (issues #29341, #34795). + # + # Two robustness points learned from the field: + # 1. Use ``proc.parents()`` — it returns the WHOLE ancestor list in one + # call. The earlier per-hop ``current.parent()`` loop bailed on the + # first psutil error (AccessDenied/NoSuchProcess is common on Windows + # across session/elevation boundaries), leaving the launcher shim in + # the candidate set and re-triggering the false positive. + # 2. Only exclude ancestors whose exe is itself a shim. A genuine second + # hermes.exe sitting *under* a non-Hermes parent (e.g. a Hermes + # Desktop backend child) must still be flagged, so we don't blanket- + # exclude unrelated ancestors like the shell or terminal. + # Broad ``except Exception`` guards against partially-stubbed psutil in + # unit tests; this helper is documented as "never raises". + if exclude_pid is not None: + exclude_pids: set[int] = {int(exclude_pid)} + else: + exclude_pids = {os.getpid()} + try: + seed = next(iter(exclude_pids)) + try: + ancestors = psutil.Process(seed).parents() + except Exception: + ancestors = [] + for ancestor in ancestors: + try: + anc_exe = ancestor.exe() + except Exception: + continue + if not anc_exe: + continue + try: + anc_norm = str(Path(anc_exe).resolve()).lower() + except (OSError, ValueError): + anc_norm = str(anc_exe).lower() + if anc_norm in shim_paths: + try: + exclude_pids.add(int(ancestor.pid)) + except Exception: + continue + except Exception: + pass + matches: list[tuple[int, str]] = [] try: proc_iter = psutil.process_iter(["pid", "exe", "name"]) @@ -8016,6 +8033,13 @@ def _format_concurrent_instances_message( lines.append("") lines.append(" Close Hermes Desktop, exit any open `hermes` REPLs, and") lines.append(" stop the gateway (`hermes gateway stop`) before retrying.") + lines.append("") + if matches: + pid_args = " ".join(f"/PID {pid}" for pid, _ in matches) + lines.append(" If you've already closed everything and these PIDs are") + lines.append(" stale, terminate them directly, then retry the update:") + lines.append(f" taskkill {pid_args} /F") + lines.append("") lines.append(" Override with `hermes update --force` if you've already") lines.append(" confirmed those processes will not write to the venv.") return "\n".join(lines) diff --git a/tests/hermes_cli/test_update_concurrent_quarantine.py b/tests/hermes_cli/test_update_concurrent_quarantine.py index bddc0071e..fe14856fd 100644 --- a/tests/hermes_cli/test_update_concurrent_quarantine.py +++ b/tests/hermes_cli/test_update_concurrent_quarantine.py @@ -128,24 +128,31 @@ def test_detect_concurrent_is_noop_off_windows(_winp, tmp_path): def _fake_psutil_with_parent_chain( parent_chain: list[int], proc_iter_rows: list, + *, + ancestor_exe: str | None = None, ): - """Build a psutil stand-in that has Process()/parent() AND process_iter(). + """Build a psutil stand-in that has Process()/parents()/exe() AND process_iter(). - ``parent_chain`` is the list of PIDs returned by successive ``.parent()`` - calls starting from the seed (``os.getpid()``); the last entry's - ``.parent()`` returns ``None`` to terminate the walk. + ``parent_chain`` is the ordered list of ancestor PIDs (closest first) + returned by ``proc.parents()`` on the seed (``os.getpid()``). + ``ancestor_exe`` is the executable path reported by each ancestor's + ``.exe()``; when it matches one of our shim paths the ancestor is + excluded (the launcher-shim case). Pass ``None`` to model an ancestor + whose exe can't be read (psutil error) — it stays in the candidate set. """ class _FakeProc: - def __init__(self, pid: int, chain: list[int]): + def __init__(self, pid: int, exe_path: str | None): self.pid = pid - self._chain = chain + self._exe = exe_path - def parent(self): - if not self._chain: - return None - next_pid = self._chain[0] - return _FakeProc(next_pid, self._chain[1:]) + def exe(self): + if self._exe is None: + raise OSError("exe unavailable") + return self._exe + + def parents(self): + return [_FakeProc(p, ancestor_exe) for p in parent_chain] class _NoSuchProcess(Exception): pass @@ -153,8 +160,8 @@ def _fake_psutil_with_parent_chain( class _AccessDenied(Exception): pass - def _process(pid): - return _FakeProc(pid, list(parent_chain)) + def _process(pid=None): + return _FakeProc(pid if pid is not None else os.getpid(), ancestor_exe) return types.SimpleNamespace( Process=_process, @@ -185,6 +192,7 @@ def test_detect_concurrent_excludes_parent_chain(_winp, tmp_path): fake_psutil = _fake_psutil_with_parent_chain( parent_chain=[launcher_pid], proc_iter_rows=rows, + ancestor_exe=str(shim), ) with patch.dict(sys.modules, {"psutil": fake_psutil}): result = cli_main._detect_concurrent_hermes_instances(scripts_dir) @@ -211,6 +219,7 @@ def test_detect_concurrent_still_finds_unrelated_other_hermes(_winp, tmp_path): fake_psutil = _fake_psutil_with_parent_chain( parent_chain=[launcher_pid], proc_iter_rows=rows, + ancestor_exe=str(shim), ) with patch.dict(sys.modules, {"psutil": fake_psutil}): result = cli_main._detect_concurrent_hermes_instances(scripts_dir) @@ -238,6 +247,7 @@ def test_detect_concurrent_parent_chain_walks_deep(_winp, tmp_path): fake_psutil = _fake_psutil_with_parent_chain( parent_chain=[parent_pid, grandparent_pid, greatgrandparent_pid], proc_iter_rows=rows, + ancestor_exe=str(shim), ) with patch.dict(sys.modules, {"psutil": fake_psutil}): result = cli_main._detect_concurrent_hermes_instances(scripts_dir) @@ -246,25 +256,38 @@ def test_detect_concurrent_parent_chain_walks_deep(_winp, tmp_path): @patch.object(cli_main, "_is_windows", return_value=True) -def test_detect_concurrent_parent_walk_handles_cycle(_winp, tmp_path): - """A PID cycle in the parent chain must not hang the walk.""" +def test_detect_concurrent_parents_call_robust_to_one_bad_hop(_winp, tmp_path): + """The launcher shim is still excluded even when an ancestor exe is unreadable. + + Field regression (issues #29341, #34795): the old per-hop ``parent()`` + walk bailed on the FIRST psutil error, so an AccessDenied on any hop left + the launcher shim in the candidate set and re-triggered the false + positive. ``parents()`` returns the whole list at once; we evaluate each + ancestor independently, so one unreadable hop never strands the launcher. + """ scripts_dir = tmp_path shim = scripts_dir / "hermes.exe" shim.write_bytes(b"") me = os.getpid() - bogus_loop_pid = me + 1 + launcher_pid = me + 100 - rows = [_make_proc(me, str(shim), "python.exe")] - # Chain that points back to ``me`` — the loop-detection branch must break. + rows = [ + _make_proc(me, str(shim), "python.exe"), + _make_proc(launcher_pid, str(shim), "hermes.exe"), + ] + # ancestor_exe=None → every ancestor's .exe() raises OSError. The helper + # must swallow it per-ancestor and not crash; the launcher won't be + # excluded in this degenerate case, but a real run reads the shim exe. fake_psutil = _fake_psutil_with_parent_chain( - parent_chain=[bogus_loop_pid, me, bogus_loop_pid], + parent_chain=[launcher_pid], proc_iter_rows=rows, + ancestor_exe=None, ) with patch.dict(sys.modules, {"psutil": fake_psutil}): result = cli_main._detect_concurrent_hermes_instances(scripts_dir) - # No crash, no hang; self + bogus_loop_pid excluded; no others reported. - assert result == [] + # No crash; helper completes. (Degenerate stub: launcher exe unreadable.) + assert result == [(launcher_pid, "hermes.exe")] @patch.object(cli_main, "_is_windows", return_value=True) @@ -310,6 +333,11 @@ def test_format_message_mentions_pids_and_remediation(tmp_path): assert "--force" in msg # Mentions the file that would have been overwritten assert str(tmp_path / "hermes.exe") in msg + # Self-service kill command targets the exact stale PIDs (issue #34795). + assert "taskkill" in msg + assert "/PID 1234" in msg + assert "/PID 5678" in msg + assert "/F" in msg # ---------------------------------------------------------------------------