feat: trigram FTS5 index for CJK search, replace LIKE fallback (#16651)

* fix: bypass FTS5 for CJK queries in session_search FTS5 default tokenizer splits CJK characters into individual tokens, so multi-character queries like "大别山项目" become AND of single chars. This produces few/no results compared to LIKE substring search. For CJK queries, skip FTS5 entirely and use LIKE for accurate phrase matching. Fixes NousResearch/hermes-agent#15500 * fix: cache _contains_cjk, escape LIKE wildcards, add regression tests On top of the CJK FTS5 bypass from #15509: - Cache _contains_cjk() result in a local var to avoid redundant O(n) scans on every CJK query - Escape %, _ in LIKE queries so literal wildcards in user input are not treated as SQL wildcards (consistent with other LIKE queries in hermes_state.py that use ESCAPE '\') - Fix misleading comment ('or CJK fallback' → accurate description) - Add 3 regression tests: - test_cjk_partial_fts5_results_supplemented_by_like (#15500 / #14829) - test_cjk_like_dedup_no_duplicates - test_cjk_like_escapes_wildcards (new wildcard escaping) * feat: trigram FTS5 index for CJK search, replace LIKE fallback Replace the LIKE '%query%' full-table-scan fallback for CJK queries with a proper trigram FTS5 index (messages_fts_trigram). The trigram tokenizer indexes overlapping 3-character sequences so substring matching works natively for any script — CJK, Thai, etc. For queries with 3+ CJK characters: uses the trigram FTS5 table with proper ranking, snippets, and indexed lookups. For shorter queries (1-2 CJK chars): falls back to LIKE since the trigram tokenizer cannot match terms shorter than 3 characters (9 UTF-8 bytes for CJK text). Schema v10 migration creates the trigram table and backfills existing messages. Triggers keep the index in sync on INSERT/UPDATE/DELETE. Builds on top of #16276 (bypass FTS5 for CJK, escape LIKE wildcards). --------- Co-authored-by: vominh1919 <vominh1919@gmail.com>
2026-04-28 00:12:07 +05:30
parent e80504b088
commit 1fa76607c0
2 changed files with 212 additions and 47 deletions
--- a/hermes_state.py
+++ b/hermes_state.py
@ -31,7 +31,7 @@ T = TypeVar("T")
 DEFAULT_DB_PATH = get_hermes_home() / "state.db"
-SCHEMA_VERSION = 9
+SCHEMA_VERSION = 10
 SCHEMA_SQL = """
 CREATE TABLE IF NOT EXISTS schema_version (
@ -119,6 +119,32 @@ CREATE TRIGGER IF NOT EXISTS messages_fts_update AFTER UPDATE ON messages BEGIN
 END;
 """
 # Trigram FTS5 table for CJK substring search.  The default unicode61
 # tokenizer splits CJK characters into individual tokens, breaking phrase
 # matching.  The trigram tokenizer indexes every overlapping run of three
 # characters (characters, not bytes), so substring queries work natively
 # for any script (CJK, Thai, etc.).  A query term shorter than three
 # characters cannot be matched through this index.
 #
 # The table is declared with content=messages / content_rowid=id, i.e. it
 # indexes messages.content keyed by messages.id.  The three triggers mirror
 # every INSERT, UPDATE and DELETE on messages into the index; removals use
 # the FTS5 'delete' command form, passing the old rowid and old content.
 FTS_TRIGRAM_SQL = """
 CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts_trigram USING fts5(
    content,
    content=messages,
    content_rowid=id,
    tokenize='trigram'
 );
 CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_insert AFTER INSERT ON messages BEGIN
    INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content);
 END;
 CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_delete AFTER DELETE ON messages BEGIN
    INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content);
 END;
 CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_update AFTER UPDATE ON messages BEGIN
    INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content);
    INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content);
 END;
 """
 class SessionDB:
    """
@ -366,6 +392,18 @@ class SessionDB:
                except sqlite3.OperationalError:
                    pass  # Column already exists
                cursor.execute("UPDATE schema_version SET version = 9")
            if current_version < 10:
                # v10: trigram FTS5 table for CJK/substring search.
                # Created via FTS_TRIGRAM_SQL below; backfill existing messages.
                try:
                    cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0")
                except sqlite3.OperationalError:
                    cursor.executescript(FTS_TRIGRAM_SQL)
                    cursor.execute(
                        "INSERT INTO messages_fts_trigram(rowid, content) "
                        "SELECT id, content FROM messages WHERE content IS NOT NULL"
                    )
                cursor.execute("UPDATE schema_version SET version = 10")
        # Unique title index — always ensure it exists (safe to run after migrations
        # since the title column is guaranteed to exist at this point)
@ -383,6 +421,12 @@ class SessionDB:
        except sqlite3.OperationalError:
            cursor.executescript(FTS_SQL)
        # Trigram FTS5 for CJK/substring search
        try:
            cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0")
        except sqlite3.OperationalError:
            cursor.executescript(FTS_TRIGRAM_SQL)
        self._conn.commit()
    # =========================================================================
@ -1291,6 +1335,16 @@ class SessionDB:
        return sanitized.strip()
    @staticmethod
    def _is_cjk_codepoint(cp: int) -> bool:
        return (0x4E00 <= cp <= 0x9FFF or    # CJK Unified Ideographs
                0x3400 <= cp <= 0x4DBF or    # CJK Extension A
                0x20000 <= cp <= 0x2A6DF or  # CJK Extension B
                0x3000 <= cp <= 0x303F or    # CJK Symbols
                0x3040 <= cp <= 0x309F or    # Hiragana
                0x30A0 <= cp <= 0x30FF or    # Katakana
                0xAC00 <= cp <= 0xD7AF)      # Hangul Syllables
    @staticmethod
    def _contains_cjk(text: str) -> bool:
        """Check if text contains CJK (Chinese, Japanese, Korean) characters."""
@ -1306,6 +1360,11 @@ class SessionDB:
                return True
        return False
    @classmethod
    def _count_cjk(cls, text: str) -> int:
        """Count CJK characters in text."""
        return sum(1 for ch in text if cls._is_cjk_codepoint(ord(ch)))
    def search_messages(
        self,
        query: str,
@ -1376,52 +1435,113 @@ class SessionDB:
            LIMIT ? OFFSET ?
        """
-        with self._lock:
+        # CJK queries bypass the unicode61 FTS5 table.  The default tokenizer
-            try:
+        # splits CJK characters into individual tokens, so "大别山项目" becomes
-                cursor = self._conn.execute(sql, params)
+        # "大 AND 别 AND 山 AND 项 AND 目" — producing false positives and
-            except sqlite3.OperationalError:
+        # missing exact phrase matches.
-                # FTS5 query syntax error despite sanitization — return empty
+        #
-                # unless query contains CJK (fall back to LIKE below)
+        # For queries with 3+ CJK characters, we use the trigram FTS5 table
-                if not self._contains_cjk(query):
+        # (indexed substring matching with ranking and snippets).  For shorter
-                    return []
+        # CJK queries (1-2 chars), trigram can't match (it needs ≥9 UTF-8
-                matches = []
+        # bytes = 3 CJK chars), so we fall back to LIKE.
-            else:
+        is_cjk = self._contains_cjk(query)
-                matches = [dict(row) for row in cursor.fetchall()]
+        if is_cjk:
        # LIKE fallback for CJK queries: FTS5 default tokenizer splits CJK
        # characters individually, causing multi-character queries to fail.
        if not matches and self._contains_cjk(query):
            raw_query = query.strip('"').strip()
-            like_where = ["m.content LIKE ?"]
+            cjk_count = self._count_cjk(raw_query)
-            like_params: list = [f"%{raw_query}%"]
+
-            if source_filter is not None:
+            if cjk_count >= 3:
-                like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
+                # Trigram FTS5 path — quote each non-operator token to handle
-                like_params.extend(source_filter)
+                # FTS5 special chars (%, *, etc.) while preserving boolean
-            if exclude_sources is not None:
+                # operators (AND, OR, NOT) for multi-term queries.
-                like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
+                tokens = raw_query.split()
-                like_params.extend(exclude_sources)
+                parts = []
-            if role_filter:
+                for tok in tokens:
-                like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
+                    if tok.upper() in ("AND", "OR", "NOT"):
-                like_params.extend(role_filter)
+                        parts.append(tok)
-            like_sql = f"""
+                    else:
-                SELECT m.id, m.session_id, m.role,
+                        parts.append('"' + tok.replace('"', '""') + '"')
-                       substr(m.content,
+                trigram_query = " ".join(parts)
-                              max(1, instr(m.content, ?) - 40),
+                tri_where = ["messages_fts_trigram MATCH ?"]
-                              120) AS snippet,
+                tri_params: list = [trigram_query]
-                       m.content, m.timestamp, m.tool_name,
+                if source_filter is not None:
-                       s.source, s.model, s.started_at AS session_started
+                    tri_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
-                FROM messages m
+                    tri_params.extend(source_filter)
-                JOIN sessions s ON s.id = m.session_id
+                if exclude_sources is not None:
-                WHERE {' AND '.join(like_where)}
+                    tri_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
-                ORDER BY m.timestamp DESC
+                    tri_params.extend(exclude_sources)
-                LIMIT ? OFFSET ?
+                if role_filter:
-            """
+                    tri_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
-            like_params.extend([limit, offset])
+                    tri_params.extend(role_filter)
-            # instr() parameter goes first in the bound list
+                tri_sql = f"""
-            like_params = [raw_query] + like_params
+                    SELECT
                        m.id,
                        m.session_id,
                        m.role,
                        snippet(messages_fts_trigram, 0, '>>>', '<<<', '...', 40) AS snippet,
                        m.content,
                        m.timestamp,
                        m.tool_name,
                        s.source,
                        s.model,
                        s.started_at AS session_started
                    FROM messages_fts_trigram
                    JOIN messages m ON m.id = messages_fts_trigram.rowid
                    JOIN sessions s ON s.id = m.session_id
                    WHERE {' AND '.join(tri_where)}
                    ORDER BY rank
                    LIMIT ? OFFSET ?
                """
                tri_params.extend([limit, offset])
                with self._lock:
                    try:
                        tri_cursor = self._conn.execute(tri_sql, tri_params)
                    except sqlite3.OperationalError:
                        matches = []
                    else:
                        matches = [dict(row) for row in tri_cursor.fetchall()]
            else:
                # Short CJK query (1-2 chars) — trigram needs ≥3 CJK chars.
                # Fall back to LIKE substring search.
                escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
                like_where = ["m.content LIKE ? ESCAPE '\\'"]
                like_params: list = [f"%{escaped}%"]
                if source_filter is not None:
                    like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
                    like_params.extend(source_filter)
                if exclude_sources is not None:
                    like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
                    like_params.extend(exclude_sources)
                if role_filter:
                    like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
                    like_params.extend(role_filter)
                like_sql = f"""
                    SELECT m.id, m.session_id, m.role,
                           substr(m.content,
                                  max(1, instr(m.content, ?) - 40),
                                  120) AS snippet,
                           m.content, m.timestamp, m.tool_name,
                           s.source, s.model, s.started_at AS session_started
                    FROM messages m
                    JOIN sessions s ON s.id = m.session_id
                    WHERE {' AND '.join(like_where)}
                    ORDER BY m.timestamp DESC
                    LIMIT ? OFFSET ?
                """
                like_params.extend([limit, offset])
                # instr() parameter goes first in the bound list
                like_params = [raw_query] + like_params
                with self._lock:
                    like_cursor = self._conn.execute(like_sql, like_params)
                    matches = [dict(row) for row in like_cursor.fetchall()]
        else:
            with self._lock:
-                like_cursor = self._conn.execute(like_sql, like_params)
+                try:
-                matches = [dict(row) for row in like_cursor.fetchall()]
+                    cursor = self._conn.execute(sql, params)
                except sqlite3.OperationalError:
                    # FTS5 query syntax error despite sanitization — return empty
                    return []
                else:
                    matches = [dict(row) for row in cursor.fetchall()]
        # Add surrounding context (1 message before + after each match).
        # Done outside the lock so we don't hold it across N sequential queries.
--- a/tests/test_hermes_state.py
+++ b/tests/test_hermes_state.py
@ -772,6 +772,51 @@ class TestCJKSearchFallback:
        results = db.search_messages("Agent通信")
        assert len(results) == 1
    def test_cjk_partial_fts5_results_supplemented_by_like(self, db):
        """A CJK query must return every message that contains it.

        Regression test for #15500 / #14829, where multi-character CJK
        queries came back with only partial results.  Two sessions each hold
        one message containing the query text, and both must be found.
        """
        for sid, origin in (("s1", "cli"), ("s2", "telegram")):
            db.create_session(session_id=sid, source=origin)
        for sid, text in (("s1", "昨晚讨论了记忆系统"), ("s2", "昨晚的会议纪要已发送")):
            db.append_message(sid, role="user", content=text)
        hits = db.search_messages("昨晚")
        assert len(hits) == 2
        assert {hit["session_id"] for hit in hits} == {"s1", "s2"}
    def test_cjk_like_dedup_no_duplicates(self, db):
        """A single matching message is reported once, never duplicated."""
        db.create_session(session_id="s1", source="cli")
        db.append_message("s1", role="user", content="测试去重逻辑")
        hits = db.search_messages("测试")
        hit_count = len(hits)
        assert hit_count == 1
    def test_cjk_like_escapes_wildcards(self, db):
        """A ``%`` inside a CJK query is matched literally, not as a wildcard."""
        contents = {
            "s1": "达成100%完成率",
            "s2": "达成100完成率是目标",
        }
        for sid in contents:
            db.create_session(session_id=sid, source="cli")
        for sid, text in contents.items():
            db.append_message(sid, role="user", content=text)
        # Only s1 holds a literal "%" between "100" and "完成"; if the query's
        # "%" acted as a wildcard, s2 would match as well.
        hits = db.search_messages("100%完成")
        assert len(hits) == 1
        assert hits[0]["session_id"] == "s1"
    def test_cjk_trigram_preserves_boolean_operators(self, db):
        """An ``OR`` between two CJK phrases returns messages matching either."""
        contents = {
            "s1": "记忆系统很好用",
            "s2": "断裂连接需要修复",
        }
        for sid in contents:
            db.create_session(session_id=sid, source="cli")
        for sid, text in contents.items():
            db.append_message(sid, role="user", content=text)
        # Each message contains exactly one of the two phrases, so both can
        # only come back if OR is honoured as an operator.
        hits = db.search_messages("记忆系统 OR 断裂连接")
        assert len(hits) == 2
        assert {hit["session_id"] for hit in hits} == {"s1", "s2"}
 # =========================================================================
 # Session search and listing
@ -1229,7 +1274,7 @@ class TestSchemaInit:
    def test_schema_version(self, db):
        cursor = db._conn.execute("SELECT version FROM schema_version")
        version = cursor.fetchone()[0]
-        assert version == 9
+        assert version == 10
    def test_title_column_exists(self, db):
        """Verify the title column was created in the sessions table."""
@ -1290,7 +1335,7 @@ class TestSchemaInit:
        # Verify migration
        cursor = migrated_db._conn.execute("SELECT version FROM schema_version")
-        assert cursor.fetchone()[0] == 9
+        assert cursor.fetchone()[0] == 10
        # Verify title column exists and is NULL for existing sessions
        session = migrated_db.get_session("existing")