fix(search): quote underscored terms in FTS5 query sanitization
FTS5 default tokenizer splits 'sp_new1' into tokens 'sp' and 'new1'.
Without quoting, a search for 'sp_new' becomes an AND query
('sp AND new') that fails to match rows indexed as 'sp_new1'.
Fix: add underscore to the character class in Step 5 regex
([.-] -> [._-]) so underscored terms are wrapped in double quotes.
Also adds test_sanitize_fts5_quotes_underscored_terms.
This commit is contained in:
@ -1355,9 +1355,9 @@ class SessionDB:
|
||||
# quotes. FTS5's tokenizer splits on dots, hyphens and underscores, turning
|
||||
# ``chat-send`` into ``chat AND send`` and ``P2.2`` into ``p2 AND 2``.
|
||||
# Quoting preserves phrase semantics. A single pass avoids the
|
||||
# double-quoting bug that would occur if dotted and hyphenated
|
||||
# double-quoting bug that would occur if dotted, hyphenated and underscored
|
||||
# patterns were applied sequentially (e.g. ``my-app.config``).
|
||||
sanitized = re.sub(r"\b(\w+(?:[.-]\w+)+)\b", r'"\1"', sanitized)
|
||||
sanitized = re.sub(r"\b(\w+(?:[._-]\w+)+)\b", r'"\1"', sanitized)
|
||||
|
||||
# Step 6: Restore preserved quoted phrases
|
||||
for i, quoted in enumerate(_quoted_parts):
|
||||
|
||||
@ -655,6 +655,30 @@ class TestFTS5Search:
|
||||
assert s('my-app.config') == '"my-app.config"'
|
||||
assert s('my-app.config.ts') == '"my-app.config.ts"'
|
||||
|
||||
def test_sanitize_fts5_quotes_underscored_terms(self):
    """Check that the sanitizer wraps underscored terms in double quotes.

    Motivation: the default FTS5 tokenizer splits 'sp_new1' into the
    tokens 'sp' and 'new1', so an unquoted search for 'sp_new' turns
    into the AND query 'sp AND new' and misses rows indexed as
    'sp_new1'.

    Most cases pin the sanitizer's exact output; the final mixed-script
    query is only checked for the fragments it must contain.
    """
    from hermes_state import SessionDB

    sanitize = SessionDB._sanitize_fts5_query

    # (raw query, exact sanitized output), checked in this order.
    exact_cases = (
        # One underscore.
        ('sp_new', '"sp_new"'),
        # Several underscores in a single term.
        ('a_b_c', '"a_b_c"'),
        # Underscore followed by a segment ending in a digit.
        ('sp_new1', '"sp_new1"'),
        # Underscores mixed with hyphens/dots must be quoted exactly once.
        ('docker-compose_up', '"docker-compose_up"'),
        ('my.app_config.ts', '"my.app_config.ts"'),
        # Input that is already quoted must not gain a second pair.
        ('"sp_new"', '"sp_new"'),
    )
    for raw, expected in exact_cases:
        assert sanitize(raw) == expected

    # Underscored term alongside other words, including CJK text.
    mixed = sanitize('sp_new and 血管瘤')
    for fragment in ('"sp_new"', '血管瘤'):
        assert fragment in mixed
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# CJK (Chinese/Japanese/Korean) LIKE fallback
|
||||
|
||||
Reference in New Issue
Block a user