fix: batch of small robustness/correctness fixes from @kyssta-exe

Salvages 8 distinct fixes from a batch of PRs by @kyssta-exe, reapplied
onto current main (original branches were stale) with a few refinements.

- cron(jobs.py): load_jobs() validates top-level JSON shape — a bare
  list auto-repairs into the {"jobs": [...]} dict; scalars/null raise a
  clear RuntimeError instead of an uncaught AttributeError that took
  down the whole cron subsystem (#37065, closes #36867).
- web(web_server.py): close the per-action log file handle after Popen
  so the parent stops leaking one fd per spawned action (#36843).
- web(web_server.py): DELETE /api/env returns 400 for invalid key names
  instead of a misleading 500, mirroring PUT /api/env (#36840).
- gateway(gateway.py): read /proc/<pid>/cmdline inside a with-block so
  the fd is released immediately instead of relying on GC (#36804).
- web-tools(web_tools.py): include "xai" in check_web_api_key() so a
  configured X.AI web backend reports as available (#36802).
- compression(conversation_compression.py): mark the feasibility check
  done only after it completes, and default the gate to "not checked"
  if the attribute is missing (#36803).
- completion(completion.py): replace `ls` with directory globbing in the
  generated bash/zsh/fish profile listers — handles names with spaces
  and skips non-directory entries (#36806).
- terminal-tool(terminal_tool.py): drop a duplicate `import threading`
  (#36808).
- claw(claw.py): the migrate recommendation now points at the real
  `hermes gateway stop` command instead of the non-existent
  `hermes stop` (#36795, #36796, closes #36771).
- tests: guard against a leaked HERMES_CRON_SESSION breaking gateway
  approval tests — add it to the hermetic conftest unset list (root
  cause, protects every test) and pop it in the affected test's
  setup_method (#36796).

Co-authored-by: kyssta-exe <kyssta-exe@users.noreply.github.com>
This commit is contained in:
kyssta-exe
2026-06-01 19:19:15 -07:00
committed by Teknium
parent 64f7f36713
commit d4b533de4e
11 changed files with 64 additions and 26 deletions

View File

@ -308,11 +308,14 @@ def compress_context(
# The check itself sets ``agent._compression_warning`` so the
# status-callback replay machinery still emits the warning to the user
# the first time it would matter.
if not getattr(agent, "_compression_feasibility_checked", True):
try:
check_compression_model_feasibility(agent)
finally:
agent._compression_feasibility_checked = True
if not getattr(agent, "_compression_feasibility_checked", False):
# Mark as checked only after the probe completes. If the check
# raises (e.g. a fatal aux-context ValueError that aborts the
# session), leaving the flag unset is harmless; a non-fatal
# transient failure is swallowed inside the function so the flag
# is set normally on the next successful pass.
check_compression_model_feasibility(agent)
agent._compression_feasibility_checked = True
_pre_msg_count = len(messages)
logger.info(

View File

@ -428,22 +428,18 @@ def load_jobs() -> List[Dict[str, Any]]:
ensure_dirs()
if not JOBS_FILE.exists():
return []
_strict_retry = False # track whether we used the strict=False fallback
try:
with open(JOBS_FILE, 'r', encoding='utf-8') as f:
data = json.load(f)
return data.get("jobs", [])
except json.JSONDecodeError:
# Retry with strict=False to handle bare control chars in string values
_strict_retry = True
try:
with open(JOBS_FILE, 'r', encoding='utf-8') as f:
data = json.loads(f.read(), strict=False)
jobs = data.get("jobs", [])
if jobs:
# Auto-repair: rewrite with proper escaping
save_jobs(jobs)
logger.warning("Auto-repaired jobs.json (had invalid control characters)")
return jobs
except Exception as e:
logger.error("Failed to auto-repair jobs.json: %s", e)
raise RuntimeError(f"Cron database corrupted and unrepairable: {e}") from e
@ -451,6 +447,29 @@ def load_jobs() -> List[Dict[str, Any]]:
logger.error("IOError reading jobs.json: %s", e)
raise RuntimeError(f"Failed to read cron database: {e}") from e
# Validate the top-level JSON shape: accept a dict (expected) or a bare
# list (auto-repair). Anything else (str/number/null) is corruption that
# would otherwise raise an uncaught AttributeError on ``.get()`` and take
# down the whole cron subsystem.
if isinstance(data, dict):
jobs = data.get("jobs", [])
if _strict_retry and jobs:
# Hit control-character corruption — rewrite with proper escaping.
save_jobs(jobs)
logger.warning("Auto-repaired jobs.json (had invalid control characters)")
return jobs
if isinstance(data, list):
# Bare array — likely saved/edited outside save_jobs(). Wrap it back
# into the expected {"jobs": [...]} structure.
if data:
save_jobs(data)
logger.warning("Auto-repaired jobs.json (bare list wrapped as dict)")
return data
raise RuntimeError(
f"Cron database corrupted: expected {{'jobs': [...]}}, got {type(data).__name__}"
)
def save_jobs(jobs: List[Dict[str, Any]]):
"""Save all jobs to storage."""

View File

@ -177,7 +177,7 @@ def _warn_if_gateway_running(auto_yes: bool) -> None:
"conflicts (Telegram, Discord, and Slack only allow one active "
"session per token)."
)
print_info("Recommendation: stop the gateway first with 'hermes stop'.")
print_info("Recommendation: stop the gateway first with 'hermes gateway stop'.")
print()
if not auto_yes and not prompt_yes_no("Continue anyway?", default=False):
print_info("Migration cancelled. Stop the gateway and try again.")

View File

@ -105,7 +105,9 @@ _hermes_profiles() {{
local profiles_dir="$HOME/.hermes/profiles"
local profiles="default"
if [ -d "$profiles_dir" ]; then
profiles="$profiles $(ls "$profiles_dir" 2>/dev/null)"
for f in "$profiles_dir"/*/; do
[ -d "$f" ] && profiles="$profiles $(basename "$f")"
done
fi
echo "$profiles"
}}
@ -206,7 +208,7 @@ _hermes_profiles() {{
local -a profiles
profiles=(default)
if [[ -d "$HOME/.hermes/profiles" ]]; then
profiles+=("${{(@f)$(ls $HOME/.hermes/profiles 2>/dev/null)}}")
profiles+=($HOME/.hermes/profiles/*(N/:t))
fi
_describe 'profile' profiles
}}
@ -260,7 +262,9 @@ def generate_fish(parser: argparse.ArgumentParser) -> str:
"function __hermes_profiles",
" echo default",
" if test -d $HOME/.hermes/profiles",
" ls $HOME/.hermes/profiles 2>/dev/null",
" for d in $HOME/.hermes/profiles/*/",
" basename $d",
" end",
" end",
"end",
"",

View File

@ -453,11 +453,8 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
if pid == my_pid or pid in exclude_pids:
continue
try:
cmdline = (
open(f"/proc/{pid}/cmdline", "rb")
.read()
.decode("utf-8", errors="replace")
)
with open(f"/proc/{pid}/cmdline", "rb") as _f:
cmdline = _f.read().decode("utf-8", errors="replace")
cmdline = cmdline.replace("\x00", " ")
cmdline_lc = cmdline.lower()
if any(p in cmdline_lc for p in patterns) and (

View File

@ -838,6 +838,10 @@ def _spawn_hermes_action(subcommand: List[str], name: str) -> subprocess.Popen:
popen_kwargs["start_new_session"] = True
proc = subprocess.Popen(cmd, **popen_kwargs)
# The child inherits its own duplicated fd for stdout/stderr, so the
# parent's handle can be released immediately — otherwise we leak one
# fd per spawned action.
log_file.close()
_ACTION_RESULTS.pop(name, None)
_ACTION_PROCS[name] = proc
return proc
@ -1808,6 +1812,11 @@ async def remove_env_var(body: EnvVarDelete):
return {"ok": True, "key": body.key}
except HTTPException:
raise
except ValueError as exc:
# remove_env_value raises ValueError for invalid key names. Surface
# the message to the SPA so the user understands why the delete was
# refused instead of seeing an opaque 500. Mirrors PUT /api/env.
raise HTTPException(status_code=400, detail=str(exc)) from exc
except Exception:
_log.exception("DELETE /api/env failed")
raise HTTPException(status_code=500, detail="Internal server error")

View File

@ -64,6 +64,7 @@ AUTHOR_MAP = {
"524706+Twanislas@users.noreply.github.com": "Twanislas",
"9592417+adam91holt@users.noreply.github.com": "adam91holt",
"kchuang1015@users.noreply.github.com": "kchuang1015",
"kyssta-exe@users.noreply.github.com": "kyssta-exe",
"45688690+fujinice@users.noreply.github.com": "fujinice",
"276689385+carltonawong@users.noreply.github.com": "carltonawong",
"195255660+EvilHumphrey@users.noreply.github.com": "EvilHumphrey",

View File

@ -182,6 +182,7 @@ _HERMES_BEHAVIORAL_VARS = frozenset({
"HERMES_SESSION_SOURCE",
"HERMES_SESSION_KEY",
"HERMES_GATEWAY_SESSION",
"HERMES_CRON_SESSION",
"_HERMES_GATEWAY",
"HERMES_PLATFORM",
"HERMES_MODEL",

View File

@ -1395,11 +1395,16 @@ class TestApprovalTimeoutIsNotConsent:
self._saved_env = {
k: os.environ.get(k)
for k in ("HERMES_GATEWAY_SESSION", "HERMES_YOLO_MODE",
for k in ("HERMES_GATEWAY_SESSION", "HERMES_CRON_SESSION",
"HERMES_YOLO_MODE",
"HERMES_SESSION_KEY", "HERMES_INTERACTIVE")
}
os.environ.pop("HERMES_YOLO_MODE", None)
os.environ.pop("HERMES_INTERACTIVE", None)
# HERMES_CRON_SESSION takes priority over HERMES_GATEWAY_SESSION in
# _is_gateway_approval_context(); a leaked value from a parent cron
# process would force the cron path and break these gateway tests.
os.environ.pop("HERMES_CRON_SESSION", None)
os.environ["HERMES_GATEWAY_SESSION"] = "1"
os.environ["HERMES_SESSION_KEY"] = self.SESSION_KEY

View File

@ -170,7 +170,6 @@ _sudo_password_cache_lock = threading.Lock()
# own callback exactly like before. Gateway mode resolves approvals via
# the per-session queue in tools.approval, not through these callbacks,
# so it's unaffected.
import threading
_callback_tls = threading.local()

View File

@ -1155,11 +1155,11 @@ async def web_extract_tool(
def check_web_api_key() -> bool:
"""Check whether the configured web backend is available."""
configured = _load_web_config().get("backend", "").lower().strip()
if configured in {"exa", "parallel", "firecrawl", "tavily", "searxng", "brave-free", "ddgs"}:
if configured in {"exa", "parallel", "firecrawl", "tavily", "searxng", "brave-free", "ddgs", "xai"}:
return _is_backend_available(configured)
return any(
_is_backend_available(backend)
for backend in ("exa", "parallel", "firecrawl", "tavily", "searxng", "brave-free", "ddgs")
for backend in ("exa", "parallel", "firecrawl", "tavily", "searxng", "brave-free", "ddgs", "xai")
)