Merge pull request #13715 from NousResearch/bb/tui-markdown-tilde-subscript

fix(tui): don't swallow Kimi/Qwen ~! ~? kaomoji as subscript spans
This commit is contained in:
brooklyn!
2026-04-21 18:12:59 -05:00
committed by GitHub
2 changed files with 39 additions and 2 deletions

View File

@@ -23,6 +23,31 @@ describe('INLINE_RE emphasis', () => {
expect(matches('a*b*c')).toEqual(['*b*'])
expect(matches('a**bold**c')).toEqual(['**bold**'])
})
it('matches short alphanumeric subscript (H~2~O, CO~2~, X~n~)', () => {
  // Each row pairs an input line with the subscript spans expected from it.
  const cases: Array<[string, string[]]> = [
    ['H~2~O', ['~2~']],
    ['CO~2~ levels', ['~2~']],
    ['the X~n~ term', ['~n~']]
  ]

  for (const [input, spans] of cases) {
    expect(matches(input)).toEqual(spans)
  }
})
it('ignores kaomoji-style ~! and ~? punctuation', () => {
  // Kimi / Qwen / GLM emit `~!` / `~?` as decorators; the whole span between
  // two such tildes used to get collapsed into one dim blob. None of these
  // lines may produce a match.
  const decoratedLines = [
    'Aww ~! Building step by step, I love it ~!',
    'cool ~? yeah ~?',
    'mixed ~! and ~? flow'
  ]

  for (const line of decoratedLines) {
    expect(matches(line)).toEqual([])
  }
})
it('ignores tilde spans that contain spaces or punctuation', () => {
  // Real subscript doesn't contain spaces; tilde + words + tilde is almost
  // always conversational, and matching it would swallow the text in between.
  const conversational = ['hello ~good idea~ there', 'x ~oh no!~ y']

  for (const line of conversational) {
    expect(matches(line)).toEqual([])
  }
})
it('does not let strikethrough eat subscript', () => {
  // A `~~…~~` span and a later `~2~` span must both be reported, in order.
  const found = matches('~~strike~~ and H~2~O')

  expect(found).toEqual(['~~strike~~', '~2~'])
})
})
describe('stripInlineMarkup', () => {
@@ -31,6 +56,11 @@ describe('stripInlineMarkup', () => {
expect(stripInlineMarkup('browser_screenshot_ecc.png')).toBe('browser_screenshot_ecc.png')
expect(stripInlineMarkup('__bold__ and foo__bar__')).toBe('bold and foo__bar__')
})
it('leaves ~!/~? kaomoji alone and still handles real subscript', () => {
  // Kaomoji-decorated prose must come back unchanged.
  const kaomoji = 'Yay ~! nice work ~!'
  expect(stripInlineMarkup(kaomoji)).toBe(kaomoji)

  // Genuine subscript is still rewritten to the `_x` plain-text form.
  const chemistry = 'H~2~O and CO~2~'
  expect(stripInlineMarkup(chemistry)).toBe('H_2O and CO_2')
})
})
describe('protocol sentinels', () => {

View File

@@ -16,8 +16,15 @@ const MD_URL_RE = '((?:[^\\s()]|\\([^\\s()]*\\))+?)'
// Matches a line that consists solely of a `MEDIA:` directive: optional
// leading whitespace, an optional opening backtick or quote, the literal
// `MEDIA:`, optional whitespace, then a run of non-whitespace characters
// (capture group 1, lazy so a closing backtick/quote is not included),
// an optional closing backtick or quote, and optional trailing whitespace.
export const MEDIA_LINE_RE = /^\s*[`"']?MEDIA:\s*(\S+?)[`"']?\s*$/
// Matches a line containing only the literal `[[audio_as_voice]]`, with
// optional whitespace on either side.
export const AUDIO_DIRECTIVE_RE = /^\s*\[\[audio_as_voice\]\]\s*$/
// Subscript (`~x~`) is restricted to a short ASCII-alphanumeric run (1–8
// characters of `[A-Za-z0-9]`) so prose like `thing ~! more ~?` from Kimi /
// Qwen / GLM (kaomoji-style decorators) doesn't get parsed as a span that
// swallows everything between two stray tildes. Real Pandoc subscript is
// H~2~O / CO~2~ / X~n~ — short letter/digit content with no spaces or
// punctuation. Without this constraint the old pattern `~([^~\s][^~]*?)~`
// paired up `~!` openers with the next `~` anywhere on the line and rendered
// the interior as dim text with a `_` prefix.
//
// Alternatives, in match order: `![alt](url)`, `[text](url)`, `<url-or-email>`,
// `~~x~~`, `` `x` ``, `**x**`, `__x__`, `*x*`, `_x_`, `==x==`, `[^x]`, `^x^`,
// `~x~`, and a bare `http(s)://` URL. The `__x__` and `_x_` forms are guarded
// by `(?<!\w)` / `(?!\w)` so underscores inside identifiers are not matched.
export const INLINE_RE = new RegExp(
// Superseded pattern: its subscript alternative is `~([^~\s][^~]*?)~`.
`(!\\[(.*?)\\]\\(${MD_URL_RE}\\)|\\[(.+?)\\]\\(${MD_URL_RE}\\)|<((?:https?:\\/\\/|mailto:)[^>\\s]+|[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,})>|~~(.+?)~~|\`([^\\\`]+)\`|\\*\\*(.+?)\\*\\*|(?<!\\w)__(.+?)__(?!\\w)|\\*(.+?)\\*|(?<!\\w)_(.+?)_(?!\\w)|==(.+?)==|\\[\\^([^\\]]+)\\]|\\^([^^\\s][^^]*?)\\^|~([^~\\s][^~]*?)~|(https?:\\/\\/[^\\s<]+))`,
// Current pattern: same alternatives, but subscript is `~([A-Za-z0-9]{1,8})~`.
`(!\\[(.*?)\\]\\(${MD_URL_RE}\\)|\\[(.+?)\\]\\(${MD_URL_RE}\\)|<((?:https?:\\/\\/|mailto:)[^>\\s]+|[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,})>|~~(.+?)~~|\`([^\\\`]+)\`|\\*\\*(.+?)\\*\\*|(?<!\\w)__(.+?)__(?!\\w)|\\*(.+?)\\*|(?<!\\w)_(.+?)_(?!\\w)|==(.+?)==|\\[\\^([^\\]]+)\\]|\\^([^^\\s][^^]*?)\\^|~([A-Za-z0-9]{1,8})~|(https?:\\/\\/[^\\s<]+))`,
// Global flag: the regex is meant to find every inline span in a string.
'g'
)
@@ -108,7 +115,7 @@ export const stripInlineMarkup = (value: string) =>
.replace(/==(.+?)==/g, '$1')
.replace(/\[\^([^\]]+)\]/g, '[$1]')
.replace(/\^([^^\s][^^]*?)\^/g, '^$1')
.replace(/~([^~\s][^~]*?)~/g, '_$1')
.replace(/~([A-Za-z0-9]{1,8})~/g, '_$1')
const renderTable = (key: number, rows: string[][], t: Theme) => {
const widths = rows[0]!.map((_, ci) => Math.max(...rows.map(r => stripInlineMarkup(r[ci] ?? '').length)))