diff --git a/hermes_state.py b/hermes_state.py
index 2cc9615f0..24aa16d28 100644
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -1355,9 +1355,9 @@ class SessionDB:
         # quotes. FTS5's tokenizer splits on dots and hyphens, turning
         # ``chat-send`` into ``chat AND send`` and ``P2.2`` into ``p2 AND 2``.
         # Quoting preserves phrase semantics. A single pass avoids the
-        # double-quoting bug that would occur if dotted and hyphenated
+        # double-quoting bug that would occur if dotted, hyphenated and underscored
         # patterns were applied sequentially (e.g. ``my-app.config``).
-        sanitized = re.sub(r"\b(\w+(?:[.-]\w+)+)\b", r'"\1"', sanitized)
+        sanitized = re.sub(r"\b(\w+(?:[._-]\w+)+)\b", r'"\1"', sanitized)
 
         # Step 6: Restore preserved quoted phrases
         for i, quoted in enumerate(_quoted_parts):
diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py
index a8ba0cbc3..244b87c12 100644
--- a/tests/test_hermes_state.py
+++ b/tests/test_hermes_state.py
@@ -655,6 +655,30 @@ class TestFTS5Search:
         assert s('my-app.config') == '"my-app.config"'
         assert s('my-app.config.ts') == '"my-app.config.ts"'
 
+    def test_sanitize_fts5_quotes_underscored_terms(self):
+        """Underscore-joined terms must come back wrapped in double quotes.
+
+        Rationale as given by the author: FTS5's default tokenizer splits
+        'sp_new1' into 'sp' and 'new1', and an unquoted 'sp_new' is treated as
+        'sp AND new'. NOTE(review): confirm both claims against the FTS5 docs.
+        """
+        from hermes_state import SessionDB
+        s = SessionDB._sanitize_fts5_query
+        # Single underscore joining two word parts
+        assert s('sp_new') == '"sp_new"'
+        # Several underscores in one term
+        assert s('a_b_c') == '"a_b_c"'
+        # Digit suffix, then underscores mixed with hyphens/dots (quoted only once)
+        assert s('sp_new1') == '"sp_new1"'
+        assert s('docker-compose_up') == '"docker-compose_up"'
+        assert s('my.app_config.ts') == '"my.app_config.ts"'
+        # Input that is already quoted must not gain a second pair of quotes
+        assert s('"sp_new"') == '"sp_new"'
+        # Among other words (ASCII and CJK): term is quoted, CJK text is kept
+        result = s('sp_new and 血管瘤')
+        assert '"sp_new"' in result
+        assert '血管瘤' in result
+
     # =========================================================================
     # CJK (Chinese/Japanese/Korean) LIKE fallback