feat: trigram FTS5 index for CJK search, replace LIKE fallback (#16651)

* fix: bypass FTS5 for CJK queries in session_search

FTS5 default tokenizer splits CJK characters into individual tokens,
so multi-character queries like "大别山项目" become AND of single chars.
This produces few/no results compared to LIKE substring search.

For CJK queries, skip FTS5 entirely and use LIKE for accurate
phrase matching.

Fixes NousResearch/hermes-agent#15500

* fix: cache _contains_cjk, escape LIKE wildcards, add regression tests

On top of the CJK FTS5 bypass from #15509:

- Cache _contains_cjk() result in a local var to avoid redundant O(n)
  scans on every CJK query
- Escape %, _ in LIKE queries so literal wildcards in user input are
  not treated as SQL wildcards (consistent with other LIKE queries in
  hermes_state.py that use ESCAPE '\')
- Fix misleading comment ('or CJK fallback' → accurate description)
- Add 3 regression tests:
  - test_cjk_partial_fts5_results_supplemented_by_like (#15500 / #14829)
  - test_cjk_like_dedup_no_duplicates
  - test_cjk_like_escapes_wildcards (new wildcard escaping)

* feat: trigram FTS5 index for CJK search, replace LIKE fallback

Replace the LIKE '%query%' full-table-scan fallback for CJK queries with
a proper trigram FTS5 index (messages_fts_trigram).  The trigram tokenizer
indexes overlapping 3-character sequences, so substring matching works
natively for any script — CJK, Thai, etc.

For queries with 3+ CJK characters: uses the trigram FTS5 table with
proper ranking, snippets, and indexed lookups.  For shorter queries
(1-2 CJK chars): falls back to LIKE, since the trigram tokenizer cannot
match search terms shorter than 3 characters.

Schema v10 migration creates the trigram table and backfills existing
messages.  Triggers keep the index in sync on INSERT/UPDATE/DELETE.

Builds on top of #16276 (bypass FTS5 for CJK, escape LIKE wildcards).

---------

Co-authored-by: vominh1919 <vominh1919@gmail.com>
This commit is contained in:
Siddharth Balyan
2026-04-28 00:12:07 +05:30
committed by GitHub
parent e80504b088
commit 1fa76607c0
2 changed files with 212 additions and 47 deletions

View File

@ -31,7 +31,7 @@ T = TypeVar("T")
DEFAULT_DB_PATH = get_hermes_home() / "state.db" DEFAULT_DB_PATH = get_hermes_home() / "state.db"
SCHEMA_VERSION = 9 SCHEMA_VERSION = 10
SCHEMA_SQL = """ SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS schema_version ( CREATE TABLE IF NOT EXISTS schema_version (
@ -119,6 +119,32 @@ CREATE TRIGGER IF NOT EXISTS messages_fts_update AFTER UPDATE ON messages BEGIN
END; END;
""" """
# Trigram FTS5 index used for CJK (and general substring) search.
#
# Why a second index: the default unicode61 tokenizer splits CJK characters
# into individual tokens, breaking phrase matching. The trigram tokenizer
# instead indexes overlapping 3-character sequences (characters, not bytes),
# so substring queries work natively for any script (CJK, Thai, etc.).
# Search terms shorter than 3 characters cannot be matched by this index.
#
# The table is declared with content=messages / content_rowid=id, so it
# indexes messages.content keyed by messages.id. The three triggers mirror
# every INSERT, UPDATE and DELETE on messages into the index; the special
# 'delete' insert removes the old row's entry before a new one is added.
FTS_TRIGRAM_SQL = """
CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts_trigram USING fts5(
content,
content=messages,
content_rowid=id,
tokenize='trigram'
);
CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_insert AFTER INSERT ON messages BEGIN
INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content);
END;
CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_delete AFTER DELETE ON messages BEGIN
INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content);
END;
CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_update AFTER UPDATE ON messages BEGIN
INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content);
INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content);
END;
"""
class SessionDB: class SessionDB:
""" """
@ -366,6 +392,18 @@ class SessionDB:
except sqlite3.OperationalError: except sqlite3.OperationalError:
pass # Column already exists pass # Column already exists
cursor.execute("UPDATE schema_version SET version = 9") cursor.execute("UPDATE schema_version SET version = 9")
if current_version < 10:
# v10: trigram FTS5 table for CJK/substring search.
# Created via FTS_TRIGRAM_SQL below; backfill existing messages.
try:
cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0")
except sqlite3.OperationalError:
cursor.executescript(FTS_TRIGRAM_SQL)
cursor.execute(
"INSERT INTO messages_fts_trigram(rowid, content) "
"SELECT id, content FROM messages WHERE content IS NOT NULL"
)
cursor.execute("UPDATE schema_version SET version = 10")
# Unique title index — always ensure it exists (safe to run after migrations # Unique title index — always ensure it exists (safe to run after migrations
# since the title column is guaranteed to exist at this point) # since the title column is guaranteed to exist at this point)
@ -383,6 +421,12 @@ class SessionDB:
except sqlite3.OperationalError: except sqlite3.OperationalError:
cursor.executescript(FTS_SQL) cursor.executescript(FTS_SQL)
# Trigram FTS5 for CJK/substring search
try:
cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0")
except sqlite3.OperationalError:
cursor.executescript(FTS_TRIGRAM_SQL)
self._conn.commit() self._conn.commit()
# ========================================================================= # =========================================================================
@ -1291,6 +1335,16 @@ class SessionDB:
return sanitized.strip() return sanitized.strip()
@staticmethod
def _is_cjk_codepoint(cp: int) -> bool:
    """Return True when code point *cp* falls in one of the CJK blocks below.

    Args:
        cp: A Unicode code point (e.g. the result of ``ord(ch)``).

    Returns:
        True if *cp* lies inside any listed inclusive range, else False.
    """
    # Inclusive (first, last) code point ranges treated as CJK.
    cjk_blocks = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Extension A
        (0x20000, 0x2A6DF),  # CJK Extension B
        (0x3000, 0x303F),    # CJK Symbols
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
        (0xAC00, 0xD7AF),    # Hangul Syllables
    )
    return any(first <= cp <= last for first, last in cjk_blocks)
@staticmethod @staticmethod
def _contains_cjk(text: str) -> bool: def _contains_cjk(text: str) -> bool:
"""Check if text contains CJK (Chinese, Japanese, Korean) characters.""" """Check if text contains CJK (Chinese, Japanese, Korean) characters."""
@ -1306,6 +1360,11 @@ class SessionDB:
return True return True
return False return False
@classmethod
def _count_cjk(cls, text: str) -> int:
    """Return how many characters of *text* ``_is_cjk_codepoint`` accepts.

    Args:
        text: The string to scan; every character is checked individually.

    Returns:
        The number of characters whose code point is classified as CJK.
    """
    is_cjk = cls._is_cjk_codepoint
    return len([char for char in text if is_cjk(ord(char))])
def search_messages( def search_messages(
self, self,
query: str, query: str,
@ -1376,52 +1435,113 @@ class SessionDB:
LIMIT ? OFFSET ? LIMIT ? OFFSET ?
""" """
with self._lock: # CJK queries bypass the unicode61 FTS5 table. The default tokenizer
try: # splits CJK characters into individual tokens, so "大别山项目" becomes
cursor = self._conn.execute(sql, params) # "大 AND 别 AND 山 AND 项 AND 目" — producing false positives and
except sqlite3.OperationalError: # missing exact phrase matches.
# FTS5 query syntax error despite sanitization — return empty #
# unless query contains CJK (fall back to LIKE below) # For queries with 3+ CJK characters, we use the trigram FTS5 table
if not self._contains_cjk(query): # (indexed substring matching with ranking and snippets). For shorter
return [] # CJK queries (1-2 chars), trigram can't match (it needs ≥9 UTF-8
matches = [] # bytes = 3 CJK chars), so we fall back to LIKE.
else: is_cjk = self._contains_cjk(query)
matches = [dict(row) for row in cursor.fetchall()] if is_cjk:
# LIKE fallback for CJK queries: FTS5 default tokenizer splits CJK
# characters individually, causing multi-character queries to fail.
if not matches and self._contains_cjk(query):
raw_query = query.strip('"').strip() raw_query = query.strip('"').strip()
like_where = ["m.content LIKE ?"] cjk_count = self._count_cjk(raw_query)
like_params: list = [f"%{raw_query}%"]
if source_filter is not None: if cjk_count >= 3:
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})") # Trigram FTS5 path — quote each non-operator token to handle
like_params.extend(source_filter) # FTS5 special chars (%, *, etc.) while preserving boolean
if exclude_sources is not None: # operators (AND, OR, NOT) for multi-term queries.
like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})") tokens = raw_query.split()
like_params.extend(exclude_sources) parts = []
if role_filter: for tok in tokens:
like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})") if tok.upper() in ("AND", "OR", "NOT"):
like_params.extend(role_filter) parts.append(tok)
like_sql = f""" else:
SELECT m.id, m.session_id, m.role, parts.append('"' + tok.replace('"', '""') + '"')
substr(m.content, trigram_query = " ".join(parts)
max(1, instr(m.content, ?) - 40), tri_where = ["messages_fts_trigram MATCH ?"]
120) AS snippet, tri_params: list = [trigram_query]
m.content, m.timestamp, m.tool_name, if source_filter is not None:
s.source, s.model, s.started_at AS session_started tri_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
FROM messages m tri_params.extend(source_filter)
JOIN sessions s ON s.id = m.session_id if exclude_sources is not None:
WHERE {' AND '.join(like_where)} tri_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
ORDER BY m.timestamp DESC tri_params.extend(exclude_sources)
LIMIT ? OFFSET ? if role_filter:
""" tri_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
like_params.extend([limit, offset]) tri_params.extend(role_filter)
# instr() parameter goes first in the bound list tri_sql = f"""
like_params = [raw_query] + like_params SELECT
m.id,
m.session_id,
m.role,
snippet(messages_fts_trigram, 0, '>>>', '<<<', '...', 40) AS snippet,
m.content,
m.timestamp,
m.tool_name,
s.source,
s.model,
s.started_at AS session_started
FROM messages_fts_trigram
JOIN messages m ON m.id = messages_fts_trigram.rowid
JOIN sessions s ON s.id = m.session_id
WHERE {' AND '.join(tri_where)}
ORDER BY rank
LIMIT ? OFFSET ?
"""
tri_params.extend([limit, offset])
with self._lock:
try:
tri_cursor = self._conn.execute(tri_sql, tri_params)
except sqlite3.OperationalError:
matches = []
else:
matches = [dict(row) for row in tri_cursor.fetchall()]
else:
# Short CJK query (1-2 chars) — trigram needs ≥3 CJK chars.
# Fall back to LIKE substring search.
escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
like_where = ["m.content LIKE ? ESCAPE '\\'"]
like_params: list = [f"%{escaped}%"]
if source_filter is not None:
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
like_params.extend(source_filter)
if exclude_sources is not None:
like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
like_params.extend(exclude_sources)
if role_filter:
like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
like_params.extend(role_filter)
like_sql = f"""
SELECT m.id, m.session_id, m.role,
substr(m.content,
max(1, instr(m.content, ?) - 40),
120) AS snippet,
m.content, m.timestamp, m.tool_name,
s.source, s.model, s.started_at AS session_started
FROM messages m
JOIN sessions s ON s.id = m.session_id
WHERE {' AND '.join(like_where)}
ORDER BY m.timestamp DESC
LIMIT ? OFFSET ?
"""
like_params.extend([limit, offset])
# instr() parameter goes first in the bound list
like_params = [raw_query] + like_params
with self._lock:
like_cursor = self._conn.execute(like_sql, like_params)
matches = [dict(row) for row in like_cursor.fetchall()]
else:
with self._lock: with self._lock:
like_cursor = self._conn.execute(like_sql, like_params) try:
matches = [dict(row) for row in like_cursor.fetchall()] cursor = self._conn.execute(sql, params)
except sqlite3.OperationalError:
# FTS5 query syntax error despite sanitization — return empty
return []
else:
matches = [dict(row) for row in cursor.fetchall()]
# Add surrounding context (1 message before + after each match). # Add surrounding context (1 message before + after each match).
# Done outside the lock so we don't hold it across N sequential queries. # Done outside the lock so we don't hold it across N sequential queries.

View File

@ -772,6 +772,51 @@ class TestCJKSearchFallback:
results = db.search_messages("Agent通信") results = db.search_messages("Agent通信")
assert len(results) == 1 assert len(results) == 1
def test_cjk_partial_fts5_results_supplemented_by_like(self, db):
    """A short CJK query must return every message that contains it.

    Regression test for #15500 / #14829: two messages in different
    sessions both contain the two-character query, and both must come
    back from ``search_messages``.
    """
    fixtures = (
        ("s1", "cli", "昨晚讨论了记忆系统"),
        ("s2", "telegram", "昨晚的会议纪要已发送"),
    )
    # Create both sessions first, then add one message to each.
    for sid, source, _ in fixtures:
        db.create_session(session_id=sid, source=source)
    for sid, _, text in fixtures:
        db.append_message(sid, role="user", content=text)

    hits = db.search_messages("昨晚")

    assert len(hits) == 2
    assert {hit["session_id"] for hit in hits} == {"s1", "s2"}
def test_cjk_like_dedup_no_duplicates(self, db):
    """A message matching a short CJK query is returned exactly once."""
    db.create_session(session_id="s1", source="cli")
    db.append_message("s1", role="user", content="测试去重逻辑")

    hit_count = len(db.search_messages("测试"))

    assert hit_count == 1
def test_cjk_like_escapes_wildcards(self, db):
    """A literal ``%`` in a CJK query must not act as a wildcard."""
    for sid in ("s1", "s2"):
        db.create_session(session_id=sid, source="cli")
    db.append_message("s1", role="user", content="达成100%完成率")
    db.append_message("s2", role="user", content="达成100完成率是目标")

    # Only s1 contains the literal "100%完成"; s2 would match only if the
    # percent sign were treated as a wildcard.
    hits = db.search_messages("100%完成")

    assert [hit["session_id"] for hit in hits] == ["s1"]
def test_cjk_trigram_preserves_boolean_operators(self, db):
    """An ``OR`` between two CJK terms matches messages containing either."""
    corpus = {
        "s1": "记忆系统很好用",
        "s2": "断裂连接需要修复",
    }
    for sid in corpus:
        db.create_session(session_id=sid, source="cli")
    for sid, text in corpus.items():
        db.append_message(sid, role="user", content=text)

    hits = db.search_messages("记忆系统 OR 断裂连接")

    assert len(hits) == 2
    assert {hit["session_id"] for hit in hits} == set(corpus)
# ========================================================================= # =========================================================================
# Session search and listing # Session search and listing
@ -1229,7 +1274,7 @@ class TestSchemaInit:
def test_schema_version(self, db): def test_schema_version(self, db):
cursor = db._conn.execute("SELECT version FROM schema_version") cursor = db._conn.execute("SELECT version FROM schema_version")
version = cursor.fetchone()[0] version = cursor.fetchone()[0]
assert version == 9 assert version == 10
def test_title_column_exists(self, db): def test_title_column_exists(self, db):
"""Verify the title column was created in the sessions table.""" """Verify the title column was created in the sessions table."""
@ -1290,7 +1335,7 @@ class TestSchemaInit:
# Verify migration # Verify migration
cursor = migrated_db._conn.execute("SELECT version FROM schema_version") cursor = migrated_db._conn.execute("SELECT version FROM schema_version")
assert cursor.fetchone()[0] == 9 assert cursor.fetchone()[0] == 10
# Verify title column exists and is NULL for existing sessions # Verify title column exists and is NULL for existing sessions
session = migrated_db.get_session("existing") session = migrated_db.get_session("existing")