diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index 7b3fb14b1..081f61786 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -13390,6 +13390,11 @@ Examples:
         "--yes", "-y", action="store_true", help="Skip confirmation"
     )
 
+    sessions_subparsers.add_parser(
+        "optimize",
+        help="Reclaim disk space: merge FTS5 segments + VACUUM (no data change)",
+    )
+
     sessions_subparsers.add_parser("stats", help="Show session store statistics")
 
     sessions_rename = sessions_subparsers.add_parser(
@@ -13562,6 +13567,36 @@ Examples:
             relaunch(["--resume", selected_id])
             return  # won't reach here after execvp
 
+        elif action == "optimize":
+            db_path = db.db_path
+            before_mb = (
+                os.path.getsize(db_path) / (1024 * 1024)
+                if db_path.exists()
+                else 0.0
+            )
+            print("Optimizing session store (FTS merge + VACUUM)…")
+            try:
+                # Merge explicitly so the summary reports the indexes that
+                # were actually optimized (not merely present). vacuum()
+                # repeats the merge, which is cheap once already merged.
+                n = db.optimize_fts()
+                db.vacuum()
+            except Exception as e:
+                print(f"Error: optimization failed: {e}")
+                db.close()
+                return
+            after_mb = (
+                os.path.getsize(db_path) / (1024 * 1024)
+                if db_path.exists()
+                else 0.0
+            )
+            saved = before_mb - after_mb
+            print(f"Optimized {n} FTS index(es).")
+            print(
+                f"Database size: {before_mb:.1f} MB -> {after_mb:.1f} MB "
+                f"(reclaimed {saved:.1f} MB)"
+            )
+
         elif action == "stats":
             total = db.session_count()
             msgs = db.message_count()
diff --git a/hermes_state.py b/hermes_state.py
index ced775633..7a03ffbdf 100644
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -3251,6 +3251,58 @@ class SessionDB:
 
     # ── Space reclamation ──
 
+    # FTS5 virtual tables whose b-tree segments we merge on optimize. The
+    # trigram table is created lazily / may be disabled, so we probe before
+    # touching it (see optimize_fts).
+    _FTS_TABLES = ("messages_fts", "messages_fts_trigram")
+
+    def _fts_table_exists(self, name: str) -> bool:
+        """True if an FTS5 virtual table is queryable in this DB."""
+        try:
+            self._conn.execute(f"SELECT 1 FROM {name} LIMIT 0")
+            return True
+        except sqlite3.OperationalError:
+            return False
+
+    def optimize_fts(self) -> int:
+        """Merge fragmented FTS5 b-tree segments into one per index.
+
+        FTS5 indexes grow as a series of incremental segments — one per
+        ``INSERT`` batch driven by the message triggers. Over tens of
+        thousands of messages these segments accumulate, which both bloats
+        the ``*_data`` shadow tables and slows ``MATCH`` queries that must
+        scan every segment. The special ``'optimize'`` command rewrites each
+        index as a single merged segment.
+
+        This is purely a maintenance operation — it changes neither search
+        results nor ``snippet()`` output, only on-disk layout and query
+        speed. It is complementary to VACUUM: ``optimize`` compacts the FTS
+        index internally, then VACUUM returns the freed pages to the OS.
+
+        Skips any FTS table that does not exist (e.g. the trigram index when
+        disabled via ``HERMES_DISABLE_FTS_TRIGRAM`` or not yet created), so
+        it is safe to call unconditionally.
+
+        Returns the number of FTS indexes that were optimized.
+        """
+        optimized = 0
+        with self._lock:
+            for tbl in self._FTS_TABLES:
+                if not self._fts_table_exists(tbl):
+                    continue
+                try:
+                    # The column name in the INSERT must match the table name
+                    # for FTS5 special commands.
+                    self._conn.execute(
+                        f"INSERT INTO {tbl}({tbl}) VALUES('optimize')"
+                    )
+                    optimized += 1
+                except sqlite3.OperationalError as exc:
+                    logger.warning(
+                        "FTS optimize failed for %s: %s", tbl, exc
+                    )
+        return optimized
+
     def vacuum(self) -> None:
         """Run VACUUM to reclaim disk space after large deletes.
 
@@ -3264,7 +3316,17 @@ class SessionDB:
         exclusive lock, so callers must ensure no other writers are active.
         Safe to call at startup before the gateway/CLI starts serving
         traffic.
+
+        FTS5 segments are merged first via :meth:`optimize_fts` so the
+        subsequent VACUUM reclaims the pages freed by the merge. This is a
+        layout-only optimization — search results are unchanged.
         """
+        # Merge FTS5 segments before VACUUM so the freed pages are returned
+        # to the OS in the same pass. optimize_fts() manages its own lock.
+        try:
+            self.optimize_fts()
+        except Exception as exc:
+            logger.warning("FTS optimize before VACUUM failed: %s", exc)
         # VACUUM cannot be executed inside a transaction.
         with self._lock:
             # Best-effort WAL checkpoint first, then VACUUM.
diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py
index 881856ee0..cec3c13f0 100644
--- a/tests/test_hermes_state.py
+++ b/tests/test_hermes_state.py
@@ -2676,6 +2676,64 @@ class TestVacuum:
         db.vacuum()
 
 
+class TestOptimizeFts:
+    def test_optimize_returns_index_count(self, db):
+        """A fresh DB has both FTS indexes; optimize merges both."""
+        db.create_session(session_id="s1", source="cli")
+        db.append_message(session_id="s1", role="user", content="hello world")
+        assert db.optimize_fts() == 2
+
+    def test_optimize_preserves_search_and_snippet(self, db):
+        """Optimize is layout-only: MATCH results + snippets are unchanged."""
+        db.create_session(session_id="s1", source="cli")
+        for i in range(50):
+            db.append_message(
+                session_id="s1",
+                role="user",
+                content=f"needle alpha bravo charlie message {i}",
+            )
+        before = db.search_messages("needle")
+        n = db.optimize_fts()
+        assert n == 2
+        after = db.search_messages("needle")
+        assert len(after) == len(before)
+        assert len(after) > 0
+        # Snippet must still be populated (would be empty/None if the FTS
+        # content shadow were lost during optimize).
+        assert all(row.get("snippet") for row in after)
+        # IDs and snippets are identical before/after — pure layout change.
+        assert [r["id"] for r in after] == [r["id"] for r in before]
+        assert [r["snippet"] for r in after] == [r["snippet"] for r in before]
+
+    def test_optimize_skips_missing_trigram_table(self, db):
+        """When the trigram index is absent, optimize handles only the porter
+        index and does not raise."""
+        db.create_session(session_id="s1", source="cli")
+        db.append_message(session_id="s1", role="user", content="hello")
+        # Drop the trigram table + triggers to simulate a disabled/absent index.
+        with db._lock:
+            for trig in (
+                "messages_fts_trigram_insert",
+                "messages_fts_trigram_delete",
+                "messages_fts_trigram_update",
+            ):
+                db._conn.execute(f"DROP TRIGGER IF EXISTS {trig}")
+            db._conn.execute("DROP TABLE IF EXISTS messages_fts_trigram")
+        assert db._fts_table_exists("messages_fts_trigram") is False
+        assert db._fts_table_exists("messages_fts") is True
+        # Only the porter index remains -> 1 optimized, no error.
+        assert db.optimize_fts() == 1
+
+    def test_optimize_idempotent(self, db):
+        """Running optimize twice is safe (second pass is a no-op merge)."""
+        db.create_session(session_id="s1", source="cli")
+        db.append_message(session_id="s1", role="user", content="repeat me")
+        assert db.optimize_fts() == 2
+        assert db.optimize_fts() == 2
+        # Search still works after repeated optimization.
+        assert len(db.search_messages("repeat")) == 1
+
+
 class TestAutoMaintenance:
     def _make_old_ended(self, db, sid: str, days_old: int = 100):
         """Create a session that is ended and was started `days_old` days ago."""