feat: trigram FTS5 index for CJK search, replace LIKE fallback (#16651)
* fix: bypass FTS5 for CJK queries in session_search FTS5 default tokenizer splits CJK characters into individual tokens, so multi-character queries like "大别山项目" become AND of single chars. This produces few/no results compared to LIKE substring search. For CJK queries, skip FTS5 entirely and use LIKE for accurate phrase matching. Fixes NousResearch/hermes-agent#15500 * fix: cache _contains_cjk, escape LIKE wildcards, add regression tests On top of the CJK FTS5 bypass from #15509: - Cache _contains_cjk() result in a local var to avoid redundant O(n) scans on every CJK query - Escape %, _ in LIKE queries so literal wildcards in user input are not treated as SQL wildcards (consistent with other LIKE queries in hermes_state.py that use ESCAPE '\') - Fix misleading comment ('or CJK fallback' → accurate description) - Add 3 regression tests: - test_cjk_partial_fts5_results_supplemented_by_like (#15500 / #14829) - test_cjk_like_dedup_no_duplicates - test_cjk_like_escapes_wildcards (new wildcard escaping) * feat: trigram FTS5 index for CJK search, replace LIKE fallback Replace the LIKE '%query%' full-table-scan fallback for CJK queries with a proper trigram FTS5 index (messages_fts_trigram). The trigram tokenizer indexes overlapping 3-character sequences so substring matching works natively for any script — CJK, Thai, etc. For queries with 3+ CJK characters: uses the trigram FTS5 table with proper ranking, snippets, and indexed lookups. For shorter queries (1-2 CJK chars): falls back to LIKE since the trigram tokenizer cannot match substrings shorter than 3 characters. Schema v10 migration creates the trigram table and backfills existing messages. Triggers keep the index in sync on INSERT/UPDATE/DELETE. Builds on top of #16276 (bypass FTS5 for CJK, escape LIKE wildcards). --------- Co-authored-by: vominh1919 <vominh1919@gmail.com>
This commit is contained in:
210
hermes_state.py
210
hermes_state.py
@ -31,7 +31,7 @@ T = TypeVar("T")
|
|||||||
|
|
||||||
DEFAULT_DB_PATH = get_hermes_home() / "state.db"
|
DEFAULT_DB_PATH = get_hermes_home() / "state.db"
|
||||||
|
|
||||||
SCHEMA_VERSION = 9
|
SCHEMA_VERSION = 10
|
||||||
|
|
||||||
SCHEMA_SQL = """
|
SCHEMA_SQL = """
|
||||||
CREATE TABLE IF NOT EXISTS schema_version (
|
CREATE TABLE IF NOT EXISTS schema_version (
|
||||||
@ -119,6 +119,32 @@ CREATE TRIGGER IF NOT EXISTS messages_fts_update AFTER UPDATE ON messages BEGIN
|
|||||||
END;
|
END;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Trigram FTS5 table for CJK substring search. The default unicode61
|
||||||
|
# tokenizer splits CJK characters into individual tokens, breaking phrase
|
||||||
|
# matching. The trigram tokenizer creates overlapping 3-character sequences so
|
||||||
|
# substring queries work natively for any script (CJK, Thai, etc.).
|
||||||
|
FTS_TRIGRAM_SQL = """
|
||||||
|
CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts_trigram USING fts5(
|
||||||
|
content,
|
||||||
|
content=messages,
|
||||||
|
content_rowid=id,
|
||||||
|
tokenize='trigram'
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_insert AFTER INSERT ON messages BEGIN
|
||||||
|
INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content);
|
||||||
|
END;
|
||||||
|
|
||||||
|
CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_delete AFTER DELETE ON messages BEGIN
|
||||||
|
INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content);
|
||||||
|
END;
|
||||||
|
|
||||||
|
CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_update AFTER UPDATE ON messages BEGIN
|
||||||
|
INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content);
|
||||||
|
INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content);
|
||||||
|
END;
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class SessionDB:
|
class SessionDB:
|
||||||
"""
|
"""
|
||||||
@ -366,6 +392,18 @@ class SessionDB:
|
|||||||
except sqlite3.OperationalError:
|
except sqlite3.OperationalError:
|
||||||
pass # Column already exists
|
pass # Column already exists
|
||||||
cursor.execute("UPDATE schema_version SET version = 9")
|
cursor.execute("UPDATE schema_version SET version = 9")
|
||||||
|
if current_version < 10:
|
||||||
|
# v10: trigram FTS5 table for CJK/substring search.
|
||||||
|
# Created via FTS_TRIGRAM_SQL below; backfill existing messages.
|
||||||
|
try:
|
||||||
|
cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0")
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
cursor.executescript(FTS_TRIGRAM_SQL)
|
||||||
|
cursor.execute(
|
||||||
|
"INSERT INTO messages_fts_trigram(rowid, content) "
|
||||||
|
"SELECT id, content FROM messages WHERE content IS NOT NULL"
|
||||||
|
)
|
||||||
|
cursor.execute("UPDATE schema_version SET version = 10")
|
||||||
|
|
||||||
# Unique title index — always ensure it exists (safe to run after migrations
|
# Unique title index — always ensure it exists (safe to run after migrations
|
||||||
# since the title column is guaranteed to exist at this point)
|
# since the title column is guaranteed to exist at this point)
|
||||||
@ -383,6 +421,12 @@ class SessionDB:
|
|||||||
except sqlite3.OperationalError:
|
except sqlite3.OperationalError:
|
||||||
cursor.executescript(FTS_SQL)
|
cursor.executescript(FTS_SQL)
|
||||||
|
|
||||||
|
# Trigram FTS5 for CJK/substring search
|
||||||
|
try:
|
||||||
|
cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0")
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
cursor.executescript(FTS_TRIGRAM_SQL)
|
||||||
|
|
||||||
self._conn.commit()
|
self._conn.commit()
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
@ -1291,6 +1335,16 @@ class SessionDB:
|
|||||||
return sanitized.strip()
|
return sanitized.strip()
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _is_cjk_codepoint(cp: int) -> bool:
|
||||||
|
return (0x4E00 <= cp <= 0x9FFF or # CJK Unified Ideographs
|
||||||
|
0x3400 <= cp <= 0x4DBF or # CJK Extension A
|
||||||
|
0x20000 <= cp <= 0x2A6DF or # CJK Extension B
|
||||||
|
0x3000 <= cp <= 0x303F or # CJK Symbols
|
||||||
|
0x3040 <= cp <= 0x309F or # Hiragana
|
||||||
|
0x30A0 <= cp <= 0x30FF or # Katakana
|
||||||
|
0xAC00 <= cp <= 0xD7AF) # Hangul Syllables
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _contains_cjk(text: str) -> bool:
|
def _contains_cjk(text: str) -> bool:
|
||||||
"""Check if text contains CJK (Chinese, Japanese, Korean) characters."""
|
"""Check if text contains CJK (Chinese, Japanese, Korean) characters."""
|
||||||
@ -1306,6 +1360,11 @@ class SessionDB:
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _count_cjk(cls, text: str) -> int:
|
||||||
|
"""Count CJK characters in text."""
|
||||||
|
return sum(1 for ch in text if cls._is_cjk_codepoint(ord(ch)))
|
||||||
|
|
||||||
def search_messages(
|
def search_messages(
|
||||||
self,
|
self,
|
||||||
query: str,
|
query: str,
|
||||||
@ -1376,52 +1435,113 @@ class SessionDB:
|
|||||||
LIMIT ? OFFSET ?
|
LIMIT ? OFFSET ?
|
||||||
"""
|
"""
|
||||||
|
|
||||||
with self._lock:
|
# CJK queries bypass the unicode61 FTS5 table. The default tokenizer
|
||||||
try:
|
# splits CJK characters into individual tokens, so "大别山项目" becomes
|
||||||
cursor = self._conn.execute(sql, params)
|
# "大 AND 别 AND 山 AND 项 AND 目" — producing false positives and
|
||||||
except sqlite3.OperationalError:
|
# missing exact phrase matches.
|
||||||
# FTS5 query syntax error despite sanitization — return empty
|
#
|
||||||
# unless query contains CJK (fall back to LIKE below)
|
# For queries with 3+ CJK characters, we use the trigram FTS5 table
|
||||||
if not self._contains_cjk(query):
|
# (indexed substring matching with ranking and snippets). For shorter
|
||||||
return []
|
# CJK queries (1-2 chars), trigram can't match (it cannot match substrings
|
||||||
matches = []
|
# shorter than 3 characters), so we fall back to LIKE.
|
||||||
else:
|
is_cjk = self._contains_cjk(query)
|
||||||
matches = [dict(row) for row in cursor.fetchall()]
|
if is_cjk:
|
||||||
|
|
||||||
# LIKE fallback for CJK queries: FTS5 default tokenizer splits CJK
|
|
||||||
# characters individually, causing multi-character queries to fail.
|
|
||||||
if not matches and self._contains_cjk(query):
|
|
||||||
raw_query = query.strip('"').strip()
|
raw_query = query.strip('"').strip()
|
||||||
like_where = ["m.content LIKE ?"]
|
cjk_count = self._count_cjk(raw_query)
|
||||||
like_params: list = [f"%{raw_query}%"]
|
|
||||||
if source_filter is not None:
|
if cjk_count >= 3:
|
||||||
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
# Trigram FTS5 path — quote each non-operator token to handle
|
||||||
like_params.extend(source_filter)
|
# FTS5 special chars (%, *, etc.) while preserving boolean
|
||||||
if exclude_sources is not None:
|
# operators (AND, OR, NOT) for multi-term queries.
|
||||||
like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
|
tokens = raw_query.split()
|
||||||
like_params.extend(exclude_sources)
|
parts = []
|
||||||
if role_filter:
|
for tok in tokens:
|
||||||
like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
|
if tok.upper() in ("AND", "OR", "NOT"):
|
||||||
like_params.extend(role_filter)
|
parts.append(tok)
|
||||||
like_sql = f"""
|
else:
|
||||||
SELECT m.id, m.session_id, m.role,
|
parts.append('"' + tok.replace('"', '""') + '"')
|
||||||
substr(m.content,
|
trigram_query = " ".join(parts)
|
||||||
max(1, instr(m.content, ?) - 40),
|
tri_where = ["messages_fts_trigram MATCH ?"]
|
||||||
120) AS snippet,
|
tri_params: list = [trigram_query]
|
||||||
m.content, m.timestamp, m.tool_name,
|
if source_filter is not None:
|
||||||
s.source, s.model, s.started_at AS session_started
|
tri_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
||||||
FROM messages m
|
tri_params.extend(source_filter)
|
||||||
JOIN sessions s ON s.id = m.session_id
|
if exclude_sources is not None:
|
||||||
WHERE {' AND '.join(like_where)}
|
tri_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
|
||||||
ORDER BY m.timestamp DESC
|
tri_params.extend(exclude_sources)
|
||||||
LIMIT ? OFFSET ?
|
if role_filter:
|
||||||
"""
|
tri_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
|
||||||
like_params.extend([limit, offset])
|
tri_params.extend(role_filter)
|
||||||
# instr() parameter goes first in the bound list
|
tri_sql = f"""
|
||||||
like_params = [raw_query] + like_params
|
SELECT
|
||||||
|
m.id,
|
||||||
|
m.session_id,
|
||||||
|
m.role,
|
||||||
|
snippet(messages_fts_trigram, 0, '>>>', '<<<', '...', 40) AS snippet,
|
||||||
|
m.content,
|
||||||
|
m.timestamp,
|
||||||
|
m.tool_name,
|
||||||
|
s.source,
|
||||||
|
s.model,
|
||||||
|
s.started_at AS session_started
|
||||||
|
FROM messages_fts_trigram
|
||||||
|
JOIN messages m ON m.id = messages_fts_trigram.rowid
|
||||||
|
JOIN sessions s ON s.id = m.session_id
|
||||||
|
WHERE {' AND '.join(tri_where)}
|
||||||
|
ORDER BY rank
|
||||||
|
LIMIT ? OFFSET ?
|
||||||
|
"""
|
||||||
|
tri_params.extend([limit, offset])
|
||||||
|
with self._lock:
|
||||||
|
try:
|
||||||
|
tri_cursor = self._conn.execute(tri_sql, tri_params)
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
matches = []
|
||||||
|
else:
|
||||||
|
matches = [dict(row) for row in tri_cursor.fetchall()]
|
||||||
|
else:
|
||||||
|
# Short CJK query (1-2 chars) — trigram needs ≥3 CJK chars.
|
||||||
|
# Fall back to LIKE substring search.
|
||||||
|
escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
|
||||||
|
like_where = ["m.content LIKE ? ESCAPE '\\'"]
|
||||||
|
like_params: list = [f"%{escaped}%"]
|
||||||
|
if source_filter is not None:
|
||||||
|
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
||||||
|
like_params.extend(source_filter)
|
||||||
|
if exclude_sources is not None:
|
||||||
|
like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
|
||||||
|
like_params.extend(exclude_sources)
|
||||||
|
if role_filter:
|
||||||
|
like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
|
||||||
|
like_params.extend(role_filter)
|
||||||
|
like_sql = f"""
|
||||||
|
SELECT m.id, m.session_id, m.role,
|
||||||
|
substr(m.content,
|
||||||
|
max(1, instr(m.content, ?) - 40),
|
||||||
|
120) AS snippet,
|
||||||
|
m.content, m.timestamp, m.tool_name,
|
||||||
|
s.source, s.model, s.started_at AS session_started
|
||||||
|
FROM messages m
|
||||||
|
JOIN sessions s ON s.id = m.session_id
|
||||||
|
WHERE {' AND '.join(like_where)}
|
||||||
|
ORDER BY m.timestamp DESC
|
||||||
|
LIMIT ? OFFSET ?
|
||||||
|
"""
|
||||||
|
like_params.extend([limit, offset])
|
||||||
|
# instr() parameter goes first in the bound list
|
||||||
|
like_params = [raw_query] + like_params
|
||||||
|
with self._lock:
|
||||||
|
like_cursor = self._conn.execute(like_sql, like_params)
|
||||||
|
matches = [dict(row) for row in like_cursor.fetchall()]
|
||||||
|
else:
|
||||||
with self._lock:
|
with self._lock:
|
||||||
like_cursor = self._conn.execute(like_sql, like_params)
|
try:
|
||||||
matches = [dict(row) for row in like_cursor.fetchall()]
|
cursor = self._conn.execute(sql, params)
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
# FTS5 query syntax error despite sanitization — return empty
|
||||||
|
return []
|
||||||
|
else:
|
||||||
|
matches = [dict(row) for row in cursor.fetchall()]
|
||||||
|
|
||||||
# Add surrounding context (1 message before + after each match).
|
# Add surrounding context (1 message before + after each match).
|
||||||
# Done outside the lock so we don't hold it across N sequential queries.
|
# Done outside the lock so we don't hold it across N sequential queries.
|
||||||
|
|||||||
@ -772,6 +772,51 @@ class TestCJKSearchFallback:
|
|||||||
results = db.search_messages("Agent通信")
|
results = db.search_messages("Agent通信")
|
||||||
assert len(results) == 1
|
assert len(results) == 1
|
||||||
|
|
||||||
|
def test_cjk_partial_fts5_results_supplemented_by_like(self, db):
|
||||||
|
"""When FTS5 returns *some* CJK results, LIKE must still find all matches.
|
||||||
|
|
||||||
|
Regression test for #15500 / #14829: FTS5 unicode61 tokenizer drops
|
||||||
|
certain CJK characters, so multi-character queries may return partial
|
||||||
|
results. The LIKE path must always run for CJK queries.
|
||||||
|
"""
|
||||||
|
db.create_session(session_id="s1", source="cli")
|
||||||
|
db.create_session(session_id="s2", source="telegram")
|
||||||
|
db.append_message("s1", role="user", content="昨晚讨论了记忆系统")
|
||||||
|
db.append_message("s2", role="user", content="昨晚的会议纪要已发送")
|
||||||
|
results = db.search_messages("昨晚")
|
||||||
|
assert len(results) == 2
|
||||||
|
session_ids = {r["session_id"] for r in results}
|
||||||
|
assert session_ids == {"s1", "s2"}
|
||||||
|
|
||||||
|
def test_cjk_like_dedup_no_duplicates(self, db):
|
||||||
|
"""When FTS5 and LIKE both find the same message, no duplicates."""
|
||||||
|
db.create_session(session_id="s1", source="cli")
|
||||||
|
db.append_message("s1", role="user", content="测试去重逻辑")
|
||||||
|
results = db.search_messages("测试")
|
||||||
|
assert len(results) == 1
|
||||||
|
|
||||||
|
def test_cjk_like_escapes_wildcards(self, db):
|
||||||
|
"""Special characters (%, _) in CJK queries are treated as literals."""
|
||||||
|
db.create_session(session_id="s1", source="cli")
|
||||||
|
db.create_session(session_id="s2", source="cli")
|
||||||
|
db.append_message("s1", role="user", content="达成100%完成率")
|
||||||
|
db.append_message("s2", role="user", content="达成100完成率是目标")
|
||||||
|
# The % in the query must be literal — should only match s1
|
||||||
|
results = db.search_messages("100%完成")
|
||||||
|
assert len(results) == 1
|
||||||
|
assert results[0]["session_id"] == "s1"
|
||||||
|
|
||||||
|
def test_cjk_trigram_preserves_boolean_operators(self, db):
|
||||||
|
"""Boolean operators (OR, AND, NOT) work in CJK trigram queries."""
|
||||||
|
db.create_session(session_id="s1", source="cli")
|
||||||
|
db.create_session(session_id="s2", source="cli")
|
||||||
|
db.append_message("s1", role="user", content="记忆系统很好用")
|
||||||
|
db.append_message("s2", role="user", content="断裂连接需要修复")
|
||||||
|
results = db.search_messages("记忆系统 OR 断裂连接")
|
||||||
|
assert len(results) == 2
|
||||||
|
session_ids = {r["session_id"] for r in results}
|
||||||
|
assert session_ids == {"s1", "s2"}
|
||||||
|
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Session search and listing
|
# Session search and listing
|
||||||
@ -1229,7 +1274,7 @@ class TestSchemaInit:
|
|||||||
def test_schema_version(self, db):
|
def test_schema_version(self, db):
|
||||||
cursor = db._conn.execute("SELECT version FROM schema_version")
|
cursor = db._conn.execute("SELECT version FROM schema_version")
|
||||||
version = cursor.fetchone()[0]
|
version = cursor.fetchone()[0]
|
||||||
assert version == 9
|
assert version == 10
|
||||||
|
|
||||||
def test_title_column_exists(self, db):
|
def test_title_column_exists(self, db):
|
||||||
"""Verify the title column was created in the sessions table."""
|
"""Verify the title column was created in the sessions table."""
|
||||||
@ -1290,7 +1335,7 @@ class TestSchemaInit:
|
|||||||
|
|
||||||
# Verify migration
|
# Verify migration
|
||||||
cursor = migrated_db._conn.execute("SELECT version FROM schema_version")
|
cursor = migrated_db._conn.execute("SELECT version FROM schema_version")
|
||||||
assert cursor.fetchone()[0] == 9
|
assert cursor.fetchone()[0] == 10
|
||||||
|
|
||||||
# Verify title column exists and is NULL for existing sessions
|
# Verify title column exists and is NULL for existing sessions
|
||||||
session = migrated_db.get_session("existing")
|
session = migrated_db.get_session("existing")
|
||||||
|
|||||||
Reference in New Issue
Block a user