fix(cron): re-validate stale cron-output entries before deletion (#37721)

quick() and dry_run() previously trusted the stored category from
tracked.json without re-validating at delete time. Stale entries from
before #34840 could carry category="cron-output" for cron control-plane
paths (e.g. cron/jobs.json), causing quick() to delete the live
scheduler registry.

Fix:
- Fix guess_category() to only classify cron/output/** as cron-output
  (was classifying ALL cron/* paths, missing the #34840 fix).
- Re-validate cron-output entries via guess_category() at delete time
  in quick() and dry_run(); stale entries that are no longer classified
  as cron-output are skipped and removed from tracked.json.
- Add _is_protected_cron_path() as a hard defense-in-depth guard that
  blocks deletion of cron/cronjobs directories and known control-plane
  files (jobs.json, .tick.lock) regardless of stored category.
- Update test_cron_subtree_categorised to match fixed guess_category
  (only cron/output/* is cron-output, not all of cron/).

Tests: add 5 regression tests in TestStaleCronEntryMigration.
This commit is contained in:
kyssta-exe
2026-06-03 04:10:57 +00:00
committed by Teknium
parent 693f4c7e9c
commit 30412a9771
2 changed files with 186 additions and 0 deletions

View File

@ -145,6 +145,33 @@ ALLOWED_CATEGORIES = {
} }
# Paths under $HERMES_HOME that must NEVER be deleted by quick(),
# regardless of what the stored category says. This is a defense-in-depth
# guard against stale tracked.json entries from before #34840.
_PROTECTED_CRON_PATHS: set[str] = set()
def _is_protected_cron_path(p: Path) -> bool:
"""Return True if *p* is a cron control-plane file/directory that must
never be deleted.
This only matches the directory itself and known control-plane files
(``jobs.json``, ``.tick.lock``) — it does NOT blanket-protect
everything under ``cron/`` because ``cron/output/`` is disposable.
"""
# Lazily build the set once per process so HERMES_HOME is resolved
# exactly once.
if not _PROTECTED_CRON_PATHS:
hermes_home = get_hermes_home()
for parent in ("cron", "cronjobs"):
base = hermes_home / parent
_PROTECTED_CRON_PATHS.add(str(base))
_PROTECTED_CRON_PATHS.add(str(base / "jobs.json"))
_PROTECTED_CRON_PATHS.add(str(base / ".tick.lock"))
resolved = str(p.resolve())
return resolved in _PROTECTED_CRON_PATHS
def fmt_size(n: float) -> str: def fmt_size(n: float) -> str:
for unit in ("B", "KB", "MB", "GB", "TB"): for unit in ("B", "KB", "MB", "GB", "TB"):
if n < 1024: if n < 1024:
@ -226,6 +253,14 @@ def dry_run() -> Tuple[List[Dict], List[Dict]]:
cat = item["category"] cat = item["category"]
size = item["size"] size = item["size"]
# Re-validate stale "cron-output" entries (fixes #37721).
if cat == "cron-output":
re_cat = guess_category(p)
if re_cat != "cron-output":
# Stale entry — would be skipped by quick(); omit from
# dry-run output too.
continue
if cat == "test": if cat == "test":
auto.append(item) auto.append(item)
elif cat == "temp" and age > 7: elif cat == "temp" and age > 7:
@ -269,6 +304,28 @@ def quick() -> Dict[str, Any]:
age = (now - datetime.fromisoformat(item["timestamp"])).days age = (now - datetime.fromisoformat(item["timestamp"])).days
# ---- stale-state migration (fixes #37721) ----
# Old tracked.json entries may carry a "cron-output" category for
# paths that are NOT under cron/output/ (e.g. cron/jobs.json).
# guess_category() was fixed in #34840, but existing entries are
# never re-validated. Re-classify here so stale entries for cron
# control-plane state are not deleted.
if cat == "cron-output":
re_cat = guess_category(p)
if re_cat != "cron-output":
_log(
f"SKIP stale cron-output entry: {p} "
f"(re-classified as {re_cat!r})"
)
# Drop the stale entry — it was misclassified.
continue
# Hard safety net: never delete cron control-plane state even if
# the category somehow slipped through re-validation above.
if _is_protected_cron_path(p):
_log(f"SKIP protected cron path: {p}")
continue
should_delete = ( should_delete = (
cat == "test" cat == "test"
or (cat == "temp" and age > 7) or (cat == "temp" and age > 7)

View File

@ -170,6 +170,135 @@ class TestGuessCategory:
assert dg.guess_category(p) is None assert dg.guess_category(p) is None
class TestStaleCronEntryMigration:
"""Regression tests for #37721 — stale cron-output entries in tracked.json."""
def test_quick_skips_stale_cron_output_for_jobs_json(self, _isolate_env):
"""A stale tracked.json entry with category="cron-output" for
cron/jobs.json must NOT be deleted by quick().
This is the exact scenario from #37721: an old tracked.json has
{"path": ".../cron/jobs.json", "category": "cron-output"} which
would pass the delete filter but must be skipped because
guess_category() now returns None for non-output cron paths.
"""
dg = _load_lib()
cron_dir = _isolate_env / "cron"
cron_dir.mkdir()
jobs_json = cron_dir / "jobs.json"
jobs_json.write_text('{"jobs": []}')
# Simulate a stale tracked.json entry from before #34840 by
# directly writing the tracked file (track() would reject it).
tracked_file = _isolate_env / "disk-cleanup" / "tracked.json"
tracked_file.parent.mkdir(parents=True, exist_ok=True)
tracked_file.write_text(json.dumps([{
"path": str(jobs_json),
"category": "cron-output",
"timestamp": "2025-01-01T00:00:00+00:00", # very old
"size": 123,
}]))
summary = dg.quick()
assert summary["deleted"] == 0, "cron/jobs.json must not be deleted"
assert jobs_json.exists(), "jobs.json must still exist"
# The stale entry should have been dropped from tracking.
remaining = json.loads(tracked_file.read_text())
assert len(remaining) == 0
def test_quick_skips_stale_cron_output_for_cron_dir(self, _isolate_env):
"""Stale entry for the cron/ directory itself must not be deleted."""
dg = _load_lib()
cron_dir = _isolate_env / "cron"
cron_dir.mkdir()
output_dir = cron_dir / "output"
output_dir.mkdir()
(output_dir / "run.md").write_text("x")
tracked_file = _isolate_env / "disk-cleanup" / "tracked.json"
tracked_file.parent.mkdir(parents=True, exist_ok=True)
tracked_file.write_text(json.dumps([{
"path": str(cron_dir),
"category": "cron-output",
"timestamp": "2025-01-01T00:00:00+00:00",
"size": 0,
}]))
summary = dg.quick()
assert summary["deleted"] == 0, "cron/ dir must not be deleted"
assert cron_dir.exists()
def test_quick_skips_protected_cron_paths_defense_in_depth(self, _isolate_env):
"""Defense-in-depth: even if guess_category returned cron-output
(hypothetically), protected cron paths are never deleted."""
dg = _load_lib()
cron_dir = _isolate_env / "cron"
cron_dir.mkdir()
tick_lock = cron_dir / ".tick.lock"
tick_lock.write_text("")
# Manually inject a stale entry with "test" category (would normally
# be auto-deleted) — the protected path guard must still block it.
tracked_file = _isolate_env / "disk-cleanup" / "tracked.json"
tracked_file.parent.mkdir(parents=True, exist_ok=True)
tracked_file.write_text(json.dumps([{
"path": str(tick_lock),
"category": "test",
"timestamp": "2025-01-01T00:00:00+00:00",
"size": 0,
}]))
summary = dg.quick()
assert summary["deleted"] == 0, ".tick.lock must not be deleted"
assert tick_lock.exists()
def test_dry_run_omits_stale_cron_output(self, _isolate_env):
"""dry_run() should also skip stale cron-output entries."""
dg = _load_lib()
cron_dir = _isolate_env / "cron"
cron_dir.mkdir()
jobs_json = cron_dir / "jobs.json"
jobs_json.write_text("[]")
tracked_file = _isolate_env / "disk-cleanup" / "tracked.json"
tracked_file.parent.mkdir(parents=True, exist_ok=True)
tracked_file.write_text(json.dumps([{
"path": str(jobs_json),
"category": "cron-output",
"timestamp": "2025-01-01T00:00:00+00:00",
"size": 123,
}]))
auto, prompt = dg.dry_run()
assert len(auto) == 0, "stale cron-output for jobs.json must not appear"
assert len(prompt) == 0
def test_legitimate_cron_output_still_deleted(self, _isolate_env):
"""A valid cron-output entry under cron/output/ must still be deleted."""
dg = _load_lib()
output_dir = _isolate_env / "cron" / "output" / "job_1"
output_dir.mkdir(parents=True)
run_md = output_dir / "run.md"
run_md.write_text("x")
# Old enough to be deleted (>14 days)
from datetime import datetime, timezone, timedelta
old_ts = (datetime.now(timezone.utc) - timedelta(days=20)).isoformat()
tracked_file = _isolate_env / "disk-cleanup" / "tracked.json"
tracked_file.parent.mkdir(parents=True, exist_ok=True)
tracked_file.write_text(json.dumps([{
"path": str(run_md),
"category": "cron-output",
"timestamp": old_ts,
"size": 10,
}]))
summary = dg.quick()
assert summary["deleted"] == 1, "valid old cron-output should be deleted"
assert not run_md.exists()
class TestTrackForgetQuick: class TestTrackForgetQuick:
def test_track_then_quick_deletes_test(self, _isolate_env): def test_track_then_quick_deletes_test(self, _isolate_env):
dg = _load_lib() dg = _load_lib()