Merge pull request #13715 from NousResearch/bb/tui-markdown-tilde-subscript

fix(tui): don't swallow Kimi/Qwen ~! ~? kaomoji as subscript spans
2026-04-21 18:12:59 -05:00
parent 9fa49206dc 43eb1153e9
commit 5504ee8de8
2 changed files with 39 additions and 2 deletions
--- a/ui-tui/src/tests/markdown.test.ts
+++ b/ui-tui/src/tests/markdown.test.ts
@@ -23,6 +23,31 @@ describe('INLINE_RE emphasis', () => {
    expect(matches('a*b*c')).toEqual(['*b*'])
    expect(matches('a**bold**c')).toEqual(['**bold**'])
  })
+
+  it('matches short alphanumeric subscript (H~2~O, CO~2~, X~n~)', () => {
+    expect(matches('H~2~O')).toEqual(['~2~'])
+    expect(matches('CO~2~ levels')).toEqual(['~2~'])
+    expect(matches('the X~n~ term')).toEqual(['~n~'])
+  })
+
+  it('ignores kaomoji-style ~! and ~? punctuation', () => {
+    // Kimi / Qwen / GLM emit these as decorators and the whole span between
+    // two tildes used to get collapsed into one dim blob.
+    expect(matches('Aww ~! Building step by step, I love it ~!')).toEqual([])
+    expect(matches('cool ~? yeah ~?')).toEqual([])
+    expect(matches('mixed ~! and ~? flow')).toEqual([])
+  })
+
+  it('ignores tilde spans that contain spaces or punctuation', () => {
+    // Real subscript doesn't contain spaces; a tilde followed by words-then-
+    // tilde is almost always conversational. Matching it swallows text.
+    expect(matches('hello ~good idea~ there')).toEqual([])
+    expect(matches('x ~oh no!~ y')).toEqual([])
+  })
+
+  it('does not let strikethrough eat subscript', () => {
+    expect(matches('~~strike~~ and H~2~O')).toEqual(['~~strike~~', '~2~'])
+  })
 })

 describe('stripInlineMarkup', () => {
@@ -31,6 +56,11 @@ describe('stripInlineMarkup', () => {
    expect(stripInlineMarkup('browser_screenshot_ecc.png')).toBe('browser_screenshot_ecc.png')
    expect(stripInlineMarkup('__bold__ and foo__bar__')).toBe('bold and foo__bar__')
  })
+
+  it('leaves ~!/~? kaomoji alone and still handles real subscript', () => {
+    expect(stripInlineMarkup('Yay ~! nice work ~!')).toBe('Yay ~! nice work ~!')
+    expect(stripInlineMarkup('H~2~O and CO~2~')).toBe('H_2O and CO_2')
+  })
 })

 describe('protocol sentinels', () => {
--- a/ui-tui/src/components/markdown.tsx
+++ b/ui-tui/src/components/markdown.tsx
@@ -16,8 +16,15 @@ const MD_URL_RE = '((?:[^\\s()]|\\([^\\s()]*\\))+?)'
 export const MEDIA_LINE_RE = /^\s*[`"']?MEDIA:\s*(\S+?)[`"']?\s*$/
 export const AUDIO_DIRECTIVE_RE = /^\s*\[\[audio_as_voice\]\]\s*$/

+// Subscript (`~x~`) is restricted to short alphanumeric runs so prose like
+// `thing ~! more ~?` from Kimi / Qwen / GLM (kaomoji-style decorators) doesn't
+// get parsed as a span that swallows everything between two stray tildes. Real
+// Pandoc subscript is H~2~O / CO~2~ / X~n~ — always word-char content. Without
+// this constraint the old pattern `~([^~\s][^~]*?)~` paired up `~!` openers
+// with the next `~` anywhere on the line and rendered the interior as dim
+// text with a `_` prefix.
 export const INLINE_RE = new RegExp(
-  `(!\\[(.*?)\\]\\(${MD_URL_RE}\\)|\\[(.+?)\\]\\(${MD_URL_RE}\\)|<((?:https?:\\/\\/|mailto:)[^>\\s]+|[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,})>|~~(.+?)~~|\`([^\\\`]+)\`|\\*\\*(.+?)\\*\\*|(?<!\\w)__(.+?)__(?!\\w)|\\*(.+?)\\*|(?<!\\w)_(.+?)_(?!\\w)|==(.+?)==|\\[\\^([^\\]]+)\\]|\\^([^^\\s][^^]*?)\\^|~([^~\\s][^~]*?)~|(https?:\\/\\/[^\\s<]+))`,
+  `(!\\[(.*?)\\]\\(${MD_URL_RE}\\)|\\[(.+?)\\]\\(${MD_URL_RE}\\)|<((?:https?:\\/\\/|mailto:)[^>\\s]+|[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,})>|~~(.+?)~~|\`([^\\\`]+)\`|\\*\\*(.+?)\\*\\*|(?<!\\w)__(.+?)__(?!\\w)|\\*(.+?)\\*|(?<!\\w)_(.+?)_(?!\\w)|==(.+?)==|\\[\\^([^\\]]+)\\]|\\^([^^\\s][^^]*?)\\^|~([A-Za-z0-9]{1,8})~|(https?:\\/\\/[^\\s<]+))`,
  'g'
 )

@@ -108,7 +115,7 @@ export const stripInlineMarkup = (value: string) =>
    .replace(/==(.+?)==/g, '$1')
    .replace(/\[\^([^\]]+)\]/g, '[$1]')
    .replace(/\^([^^\s][^^]*?)\^/g, '^$1')
-    .replace(/~([^~\s][^~]*?)~/g, '_$1')
+    .replace(/~([A-Za-z0-9]{1,8})~/g, '_$1')

 const renderTable = (key: number, rows: string[][], t: Theme) => {
  const widths = rows[0]!.map((_, ci) => Math.max(...rows.map(r => stripInlineMarkup(r[ci] ?? '').length)))