fix(cron): restore jobs.json emptied by config migration on update

Config-version migrations have been observed to leave cron/jobs.json
valid-but-empty after `hermes update`, silently dropping every scheduled
job (#34600). The existing malformed-shape guards in cron/jobs.py don't
catch this because {"jobs": []} is valid JSON.

Add restore_cron_jobs_if_emptied() as a post-migration safety net: if the
live cron/jobs.json now has zero jobs while the pre-update snapshot held
one or more, restore the snapshot copy in place and warn loudly. The
check is conservative — it only restores on unambiguous evidence of loss
(snapshot had jobs, live file readable-and-empty), so a user who genuinely
cleared their jobs is never second-guessed and an unreadable live file is
left untouched so real corruption still surfaces.

Wired into _cmd_update_impl after migrate_config(), reusing the existing
pre-update quick snapshot (which already captures cron/jobs.json).

Closes #34600
This commit is contained in:
Bartok9
2026-05-29 08:17:03 -04:00
committed by Teknium
parent d473e7c938
commit 3845d86b93
3 changed files with 224 additions and 3 deletions

View File

@ -670,6 +670,105 @@ def restore_quick_snapshot(
return restored > 0
# Relative path of the cron job database inside HERMES_HOME. Kept in sync with
# the entry in ``_QUICK_STATE_FILES`` and with ``cron/jobs.py``'s ``JOBS_FILE``.
_CRON_JOBS_REL = "cron/jobs.json"
def _count_cron_jobs(path: Path) -> Optional[int]:
"""Return the number of cron jobs stored in ``path``.
The canonical on-disk shape is ``{"jobs": [...]}`` (see ``cron/jobs.py``).
A legacy bare-list shape (``[...]``) is also honoured.
Returns:
The job count for any *valid, readable* JSON document, or ``None`` if
the file is missing or cannot be parsed. ``None`` means "unknown"
callers must not treat it as "zero jobs", because acting on an
unreadable file could mask a real corruption the user needs to see.
"""
if not path.is_file():
return None
try:
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
except (OSError, json.JSONDecodeError):
return None
if isinstance(data, dict):
jobs = data.get("jobs", [])
return len(jobs) if isinstance(jobs, list) else None
if isinstance(data, list):
return len(data)
return None
def restore_cron_jobs_if_emptied(
snapshot_id: str,
hermes_home: Optional[Path] = None,
) -> Optional[Dict[str, Any]]:
"""Safety net for silent cron-job loss across ``hermes update``.
Config-version migrations have been observed to leave ``cron/jobs.json``
valid-but-empty after an update, silently dropping every scheduled job
(issue #34600). The existing malformed-shape guards in ``cron/jobs.py``
don't catch this case because ``{"jobs": []}`` is perfectly valid JSON.
This compares the *current* job count against the pre-update snapshot. If
the live file now has **zero** jobs while the snapshot captured **one or
more**, the snapshot copy of ``cron/jobs.json`` is restored in place.
The check is deliberately conservative — it only ever restores when there
is unambiguous evidence of loss (snapshot had jobs, live file has none),
so a user who genuinely deleted all their jobs during/after the update is
never second-guessed, and an unreadable live file (count ``None``) is left
untouched so real corruption still surfaces.
Args:
snapshot_id: The pre-update quick-snapshot id (from
:func:`create_quick_snapshot`).
hermes_home: Override for the Hermes home directory (tests).
Returns:
``None`` when no action was taken (the common, healthy path). On a
successful restore, a dict ``{"restored": True, "job_count": N,
"snapshot_id": ...}`` so the caller can warn the user.
"""
if not snapshot_id:
return None
home = hermes_home or get_hermes_home()
live_path = home / _CRON_JOBS_REL
live_count = _count_cron_jobs(live_path)
# Only act when the live file is readable AND empty. ``None`` (missing or
# unparseable) is intentionally left alone — that's a different failure
# mode the user should see rather than have papered over.
if live_count is None or live_count > 0:
return None
snap_path = _quick_snapshot_root(home) / snapshot_id / _CRON_JOBS_REL
snap_count = _count_cron_jobs(snap_path)
if not snap_count: # None or 0 — nothing worth restoring
return None
try:
live_path.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(snap_path, live_path)
except (OSError, PermissionError) as exc:
logger.error(
"Cron jobs were emptied during update but auto-restore failed: %s", exc
)
return None
logger.warning(
"Restored %d cron job(s) from pre-update snapshot %s "
"(cron/jobs.json was emptied during migration)",
snap_count,
snapshot_id,
)
return {"restored": True, "job_count": snap_count, "snapshot_id": snapshot_id}
def _prune_quick_snapshots(root: Path, keep: int = _QUICK_DEFAULT_KEEP) -> int:
"""Remove oldest quick snapshots beyond the keep limit. Returns count deleted."""
if not root.exists():

View File

@ -9125,12 +9125,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
# though `git pull` can't touch $HERMES_HOME, this is cheap
# belt-and-suspenders insurance and gives the user something to
# restore from via `/snapshot list` / `/snapshot restore <id>`.
pre_update_snapshot_id = None
try:
from hermes_cli.backup import create_quick_snapshot
snap_id = create_quick_snapshot(label="pre-update", keep=1)
if snap_id:
print(f" ✓ Pre-update snapshot: {snap_id}")
pre_update_snapshot_id = create_quick_snapshot(label="pre-update", keep=1)
if pre_update_snapshot_id:
print(f" ✓ Pre-update snapshot: {pre_update_snapshot_id}")
except Exception as exc:
# Never let a snapshot failure block an update.
logger.debug("Pre-update snapshot failed: %s", exc)
@ -9467,6 +9468,25 @@ def _cmd_update_impl(args, gateway_mode: bool):
else:
print(" ✓ Configuration is up to date")
# Safety net: config-version migrations have been observed to leave
# cron/jobs.json valid-but-empty, silently dropping every scheduled
# job (issue #34600). If the live file is now empty while the
# pre-update snapshot held jobs, restore it and warn loudly.
try:
from hermes_cli.backup import restore_cron_jobs_if_emptied
cron_restore = restore_cron_jobs_if_emptied(pre_update_snapshot_id)
if cron_restore:
print()
print(
" ⚠️ cron/jobs.json was emptied during this update — "
f"restored {cron_restore['job_count']} job(s) from "
f"pre-update snapshot {cron_restore['snapshot_id']}."
)
except Exception as exc:
# Never let the cron safety net break an otherwise-good update.
logger.debug("Cron jobs auto-restore check failed: %s", exc)
print()
print("✓ Update complete!")

View File

@ -1679,3 +1679,105 @@ class TestPreMigrationBackup:
_t.sleep(1.05)
# Update backup must still be there
assert update_backup.exists(), "pre-migration rotation wrongly pruned the pre-update backup"
# ---------------------------------------------------------------------------
# Cron jobs auto-restore after silent migration loss (issue #34600)
# ---------------------------------------------------------------------------
class TestRestoreCronJobsIfEmptied:
"""`hermes update` config migration can leave cron/jobs.json valid-but-empty,
silently dropping every scheduled job. `restore_cron_jobs_if_emptied` is the
post-migration safety net that restores from the pre-update snapshot."""
@staticmethod
def _seed_jobs(path: Path, jobs):
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps({"jobs": jobs}))
def _make_snapshot(self, hermes_home: Path, label="pre-update"):
from hermes_cli.backup import create_quick_snapshot
return create_quick_snapshot(label=label, hermes_home=hermes_home, keep=5)
def test_restores_when_emptied_after_migration(self, tmp_path):
from hermes_cli.backup import restore_cron_jobs_if_emptied
hermes_home = tmp_path / ".hermes"
jobs_path = hermes_home / "cron" / "jobs.json"
# Pre-update: 3 real jobs.
self._seed_jobs(jobs_path, [{"id": "a"}, {"id": "b"}, {"id": "c"}])
snap_id = self._make_snapshot(hermes_home)
assert snap_id
# Migration silently empties the file (valid JSON, zero jobs).
jobs_path.write_text(json.dumps({"jobs": []}))
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
assert result is not None
assert result["restored"] is True
assert result["job_count"] == 3
assert result["snapshot_id"] == snap_id
# The live file now has the jobs back.
restored = json.loads(jobs_path.read_text())
assert len(restored["jobs"]) == 3
def test_noop_when_live_file_still_has_jobs(self, tmp_path):
from hermes_cli.backup import restore_cron_jobs_if_emptied
hermes_home = tmp_path / ".hermes"
jobs_path = hermes_home / "cron" / "jobs.json"
self._seed_jobs(jobs_path, [{"id": "a"}, {"id": "b"}])
snap_id = self._make_snapshot(hermes_home)
# Healthy path: file unchanged after update.
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
assert result is None
def test_noop_when_snapshot_had_no_jobs(self, tmp_path):
from hermes_cli.backup import restore_cron_jobs_if_emptied
hermes_home = tmp_path / ".hermes"
jobs_path = hermes_home / "cron" / "jobs.json"
# Pre-update genuinely had zero jobs; current is also empty.
self._seed_jobs(jobs_path, [])
snap_id = self._make_snapshot(hermes_home)
jobs_path.write_text(json.dumps({"jobs": []}))
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
assert result is None
def test_noop_when_live_file_unreadable(self, tmp_path):
"""An unparseable live file is left alone — that's a different failure
mode the user should see, not silently overwrite."""
from hermes_cli.backup import restore_cron_jobs_if_emptied
hermes_home = tmp_path / ".hermes"
jobs_path = hermes_home / "cron" / "jobs.json"
self._seed_jobs(jobs_path, [{"id": "a"}])
snap_id = self._make_snapshot(hermes_home)
jobs_path.write_text("{ this is not valid json")
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
assert result is None
# File left untouched.
assert jobs_path.read_text() == "{ this is not valid json"
def test_noop_when_snapshot_id_missing(self, tmp_path):
from hermes_cli.backup import restore_cron_jobs_if_emptied
hermes_home = tmp_path / ".hermes"
jobs_path = hermes_home / "cron" / "jobs.json"
self._seed_jobs(jobs_path, [])
assert restore_cron_jobs_if_emptied(None, hermes_home=hermes_home) is None
assert restore_cron_jobs_if_emptied("", hermes_home=hermes_home) is None
def test_restores_legacy_bare_list_snapshot_shape(self, tmp_path):
"""A legacy snapshot storing a bare JSON list (not {"jobs": [...]}) is
still counted and restored."""
from hermes_cli.backup import restore_cron_jobs_if_emptied
hermes_home = tmp_path / ".hermes"
jobs_path = hermes_home / "cron" / "jobs.json"
jobs_path.parent.mkdir(parents=True, exist_ok=True)
jobs_path.write_text(json.dumps([{"id": "a"}, {"id": "b"}]))
snap_id = self._make_snapshot(hermes_home)
jobs_path.write_text(json.dumps({"jobs": []}))
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
assert result is not None
assert result["job_count"] == 2