fix(cron): restore jobs.json emptied by config migration on update
Config-version migrations have been observed to leave cron/jobs.json valid-but-empty after `hermes update`, silently dropping every scheduled job (#34600). The existing malformed-shape guards in cron/jobs.py don't catch this because {"jobs": []} is valid JSON. Add restore_cron_jobs_if_emptied() as a post-migration safety net: if the live cron/jobs.json now has zero jobs while the pre-update snapshot held one or more, restore the snapshot copy in place and warn loudly. The check is conservative — it only restores on unambiguous evidence of loss (snapshot had jobs, live file readable-and-empty), so a user who genuinely cleared their jobs is never second-guessed and an unreadable live file is left untouched so real corruption still surfaces. Wired into _cmd_update_impl after migrate_config(), reusing the existing pre-update quick snapshot (which already captures cron/jobs.json). Closes #34600
This commit is contained in:
@ -670,6 +670,105 @@ def restore_quick_snapshot(
|
||||
return restored > 0
|
||||
|
||||
|
||||
# Relative path of the cron job database inside HERMES_HOME. Kept in sync with
|
||||
# the entry in ``_QUICK_STATE_FILES`` and with ``cron/jobs.py``'s ``JOBS_FILE``.
|
||||
_CRON_JOBS_REL = "cron/jobs.json"
|
||||
|
||||
|
||||
def _count_cron_jobs(path: Path) -> Optional[int]:
|
||||
"""Return the number of cron jobs stored in ``path``.
|
||||
|
||||
The canonical on-disk shape is ``{"jobs": [...]}`` (see ``cron/jobs.py``).
|
||||
A legacy bare-list shape (``[...]``) is also honoured.
|
||||
|
||||
Returns:
|
||||
The job count for any *valid, readable* JSON document, or ``None`` if
|
||||
the file is missing or cannot be parsed. ``None`` means "unknown" —
|
||||
callers must not treat it as "zero jobs", because acting on an
|
||||
unreadable file could mask a real corruption the user needs to see.
|
||||
"""
|
||||
if not path.is_file():
|
||||
return None
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
except (OSError, json.JSONDecodeError):
|
||||
return None
|
||||
if isinstance(data, dict):
|
||||
jobs = data.get("jobs", [])
|
||||
return len(jobs) if isinstance(jobs, list) else None
|
||||
if isinstance(data, list):
|
||||
return len(data)
|
||||
return None
|
||||
|
||||
|
||||
def restore_cron_jobs_if_emptied(
|
||||
snapshot_id: str,
|
||||
hermes_home: Optional[Path] = None,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Safety net for silent cron-job loss across ``hermes update``.
|
||||
|
||||
Config-version migrations have been observed to leave ``cron/jobs.json``
|
||||
valid-but-empty after an update, silently dropping every scheduled job
|
||||
(issue #34600). The existing malformed-shape guards in ``cron/jobs.py``
|
||||
don't catch this case because ``{"jobs": []}`` is perfectly valid JSON.
|
||||
|
||||
This compares the *current* job count against the pre-update snapshot. If
|
||||
the live file now has **zero** jobs while the snapshot captured **one or
|
||||
more**, the snapshot copy of ``cron/jobs.json`` is restored in place.
|
||||
|
||||
The check is deliberately conservative — it only ever restores when there
|
||||
is unambiguous evidence of loss (snapshot had jobs, live file has none),
|
||||
so a user who genuinely deleted all their jobs during/after the update is
|
||||
never second-guessed, and an unreadable live file (count ``None``) is left
|
||||
untouched so real corruption still surfaces.
|
||||
|
||||
Args:
|
||||
snapshot_id: The pre-update quick-snapshot id (from
|
||||
:func:`create_quick_snapshot`).
|
||||
hermes_home: Override for the Hermes home directory (tests).
|
||||
|
||||
Returns:
|
||||
``None`` when no action was taken (the common, healthy path). On a
|
||||
successful restore, a dict ``{"restored": True, "job_count": N,
|
||||
"snapshot_id": ...}`` so the caller can warn the user.
|
||||
"""
|
||||
if not snapshot_id:
|
||||
return None
|
||||
|
||||
home = hermes_home or get_hermes_home()
|
||||
live_path = home / _CRON_JOBS_REL
|
||||
|
||||
live_count = _count_cron_jobs(live_path)
|
||||
# Only act when the live file is readable AND empty. ``None`` (missing or
|
||||
# unparseable) is intentionally left alone — that's a different failure
|
||||
# mode the user should see rather than have papered over.
|
||||
if live_count is None or live_count > 0:
|
||||
return None
|
||||
|
||||
snap_path = _quick_snapshot_root(home) / snapshot_id / _CRON_JOBS_REL
|
||||
snap_count = _count_cron_jobs(snap_path)
|
||||
if not snap_count: # None or 0 — nothing worth restoring
|
||||
return None
|
||||
|
||||
try:
|
||||
live_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(snap_path, live_path)
|
||||
except (OSError, PermissionError) as exc:
|
||||
logger.error(
|
||||
"Cron jobs were emptied during update but auto-restore failed: %s", exc
|
||||
)
|
||||
return None
|
||||
|
||||
logger.warning(
|
||||
"Restored %d cron job(s) from pre-update snapshot %s "
|
||||
"(cron/jobs.json was emptied during migration)",
|
||||
snap_count,
|
||||
snapshot_id,
|
||||
)
|
||||
return {"restored": True, "job_count": snap_count, "snapshot_id": snapshot_id}
|
||||
|
||||
|
||||
def _prune_quick_snapshots(root: Path, keep: int = _QUICK_DEFAULT_KEEP) -> int:
|
||||
"""Remove oldest quick snapshots beyond the keep limit. Returns count deleted."""
|
||||
if not root.exists():
|
||||
|
||||
@ -9125,12 +9125,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
# though `git pull` can't touch $HERMES_HOME, this is cheap
|
||||
# belt-and-suspenders insurance and gives the user something to
|
||||
# restore from via `/snapshot list` / `/snapshot restore <id>`.
|
||||
pre_update_snapshot_id = None
|
||||
try:
|
||||
from hermes_cli.backup import create_quick_snapshot
|
||||
|
||||
snap_id = create_quick_snapshot(label="pre-update", keep=1)
|
||||
if snap_id:
|
||||
print(f" ✓ Pre-update snapshot: {snap_id}")
|
||||
pre_update_snapshot_id = create_quick_snapshot(label="pre-update", keep=1)
|
||||
if pre_update_snapshot_id:
|
||||
print(f" ✓ Pre-update snapshot: {pre_update_snapshot_id}")
|
||||
except Exception as exc:
|
||||
# Never let a snapshot failure block an update.
|
||||
logger.debug("Pre-update snapshot failed: %s", exc)
|
||||
@ -9467,6 +9468,25 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
||||
else:
|
||||
print(" ✓ Configuration is up to date")
|
||||
|
||||
# Safety net: config-version migrations have been observed to leave
|
||||
# cron/jobs.json valid-but-empty, silently dropping every scheduled
|
||||
# job (issue #34600). If the live file is now empty while the
|
||||
# pre-update snapshot held jobs, restore it and warn loudly.
|
||||
try:
|
||||
from hermes_cli.backup import restore_cron_jobs_if_emptied
|
||||
|
||||
cron_restore = restore_cron_jobs_if_emptied(pre_update_snapshot_id)
|
||||
if cron_restore:
|
||||
print()
|
||||
print(
|
||||
" ⚠️ cron/jobs.json was emptied during this update — "
|
||||
f"restored {cron_restore['job_count']} job(s) from "
|
||||
f"pre-update snapshot {cron_restore['snapshot_id']}."
|
||||
)
|
||||
except Exception as exc:
|
||||
# Never let the cron safety net break an otherwise-good update.
|
||||
logger.debug("Cron jobs auto-restore check failed: %s", exc)
|
||||
|
||||
print()
|
||||
print("✓ Update complete!")
|
||||
|
||||
|
||||
@ -1679,3 +1679,105 @@ class TestPreMigrationBackup:
|
||||
_t.sleep(1.05)
|
||||
# Update backup must still be there
|
||||
assert update_backup.exists(), "pre-migration rotation wrongly pruned the pre-update backup"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cron jobs auto-restore after silent migration loss (issue #34600)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRestoreCronJobsIfEmptied:
|
||||
"""`hermes update` config migration can leave cron/jobs.json valid-but-empty,
|
||||
silently dropping every scheduled job. `restore_cron_jobs_if_emptied` is the
|
||||
post-migration safety net that restores from the pre-update snapshot."""
|
||||
|
||||
@staticmethod
|
||||
def _seed_jobs(path: Path, jobs):
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps({"jobs": jobs}))
|
||||
|
||||
def _make_snapshot(self, hermes_home: Path, label="pre-update"):
|
||||
from hermes_cli.backup import create_quick_snapshot
|
||||
return create_quick_snapshot(label=label, hermes_home=hermes_home, keep=5)
|
||||
|
||||
def test_restores_when_emptied_after_migration(self, tmp_path):
|
||||
from hermes_cli.backup import restore_cron_jobs_if_emptied
|
||||
hermes_home = tmp_path / ".hermes"
|
||||
jobs_path = hermes_home / "cron" / "jobs.json"
|
||||
# Pre-update: 3 real jobs.
|
||||
self._seed_jobs(jobs_path, [{"id": "a"}, {"id": "b"}, {"id": "c"}])
|
||||
snap_id = self._make_snapshot(hermes_home)
|
||||
assert snap_id
|
||||
|
||||
# Migration silently empties the file (valid JSON, zero jobs).
|
||||
jobs_path.write_text(json.dumps({"jobs": []}))
|
||||
|
||||
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
|
||||
assert result is not None
|
||||
assert result["restored"] is True
|
||||
assert result["job_count"] == 3
|
||||
assert result["snapshot_id"] == snap_id
|
||||
|
||||
# The live file now has the jobs back.
|
||||
restored = json.loads(jobs_path.read_text())
|
||||
assert len(restored["jobs"]) == 3
|
||||
|
||||
def test_noop_when_live_file_still_has_jobs(self, tmp_path):
|
||||
from hermes_cli.backup import restore_cron_jobs_if_emptied
|
||||
hermes_home = tmp_path / ".hermes"
|
||||
jobs_path = hermes_home / "cron" / "jobs.json"
|
||||
self._seed_jobs(jobs_path, [{"id": "a"}, {"id": "b"}])
|
||||
snap_id = self._make_snapshot(hermes_home)
|
||||
|
||||
# Healthy path: file unchanged after update.
|
||||
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
|
||||
assert result is None
|
||||
|
||||
def test_noop_when_snapshot_had_no_jobs(self, tmp_path):
|
||||
from hermes_cli.backup import restore_cron_jobs_if_emptied
|
||||
hermes_home = tmp_path / ".hermes"
|
||||
jobs_path = hermes_home / "cron" / "jobs.json"
|
||||
# Pre-update genuinely had zero jobs; current is also empty.
|
||||
self._seed_jobs(jobs_path, [])
|
||||
snap_id = self._make_snapshot(hermes_home)
|
||||
jobs_path.write_text(json.dumps({"jobs": []}))
|
||||
|
||||
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
|
||||
assert result is None
|
||||
|
||||
def test_noop_when_live_file_unreadable(self, tmp_path):
|
||||
"""An unparseable live file is left alone — that's a different failure
|
||||
mode the user should see, not silently overwrite."""
|
||||
from hermes_cli.backup import restore_cron_jobs_if_emptied
|
||||
hermes_home = tmp_path / ".hermes"
|
||||
jobs_path = hermes_home / "cron" / "jobs.json"
|
||||
self._seed_jobs(jobs_path, [{"id": "a"}])
|
||||
snap_id = self._make_snapshot(hermes_home)
|
||||
jobs_path.write_text("{ this is not valid json")
|
||||
|
||||
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
|
||||
assert result is None
|
||||
# File left untouched.
|
||||
assert jobs_path.read_text() == "{ this is not valid json"
|
||||
|
||||
def test_noop_when_snapshot_id_missing(self, tmp_path):
|
||||
from hermes_cli.backup import restore_cron_jobs_if_emptied
|
||||
hermes_home = tmp_path / ".hermes"
|
||||
jobs_path = hermes_home / "cron" / "jobs.json"
|
||||
self._seed_jobs(jobs_path, [])
|
||||
assert restore_cron_jobs_if_emptied(None, hermes_home=hermes_home) is None
|
||||
assert restore_cron_jobs_if_emptied("", hermes_home=hermes_home) is None
|
||||
|
||||
def test_restores_legacy_bare_list_snapshot_shape(self, tmp_path):
|
||||
"""A legacy snapshot storing a bare JSON list (not {"jobs": [...]}) is
|
||||
still counted and restored."""
|
||||
from hermes_cli.backup import restore_cron_jobs_if_emptied
|
||||
hermes_home = tmp_path / ".hermes"
|
||||
jobs_path = hermes_home / "cron" / "jobs.json"
|
||||
jobs_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
jobs_path.write_text(json.dumps([{"id": "a"}, {"id": "b"}]))
|
||||
snap_id = self._make_snapshot(hermes_home)
|
||||
|
||||
jobs_path.write_text(json.dumps({"jobs": []}))
|
||||
result = restore_cron_jobs_if_emptied(snap_id, hermes_home=hermes_home)
|
||||
assert result is not None
|
||||
assert result["job_count"] == 2
|
||||
|
||||
Reference in New Issue
Block a user