fix(tui): break TTS→STT feedback loop + colorize REC badge

TTS feedback loop (hermes_cli/voice.py) The VAD loop kept the microphone live while speak_text played the agent's reply over the speakers, so the reply itself was picked up, transcribed, and submitted — the agent then replied to its own echo ("Ha, looks like we're in a loop"). Ported cli.py:_voice_tts_done synchronisation: - _tts_playing: threading.Event (initially set = "not playing"). - speak_text cancels the active recorder before opening the speakers, clears _tts_playing, and on exit waits 300 ms before re-starting the recorder — long enough for the OS audio device to settle so afplay and sounddevice don't race for it. - _continuous_on_silence now waits on _tts_playing (up to 60 s) before re-arming the mic with another 300 ms gap, mirroring cli.py:10619-10621. If the user flips voice off during the wait the loop exits cleanly instead of fighting for the device. Without both halves the loop races: if the silence callback fires before TTS starts it re-arms immediately; if TTS is already playing the pause-and-resume path catches it. Red REC badge (ui-tui appChrome + useMainApp) Classic CLI (cli.py:_get_voice_status_fragments) renders "● REC" in red and "◉ STT" in amber. TUI was showing a dim "REC" with no dot, making it hard to spot at a glance. voiceLabel now emits the same glyphs and appChrome colours them via t.color.error / t.color.warn, falling back to dim for the idle label.
2026-04-24 01:33:10 +03:00
parent 42ff785771
commit 98418afd5d
3 changed files with 91 additions and 2 deletions
--- a/hermes_cli/voice.py
+++ b/hermes_cli/voice.py
@ -87,6 +87,18 @@ _recorder_lock = threading.Lock()
 _continuous_lock = threading.Lock()
 _continuous_active = False
 _continuous_recorder: Any = None
+
+# ── TTS-vs-STT feedback guard ────────────────────────────────────────
+# When TTS plays the agent reply over the speakers, the live microphone
+# picks it up and transcribes the agent's own voice as user input — an
+# infinite loop the agent happily joins ("Ha, looks like we're in a loop").
+# This Event mirrors cli.py:_voice_tts_done: cleared while speak_text is
+# playing, set while silent. _continuous_on_silence waits on it before
+# re-arming the recorder, and speak_text itself cancels any live capture
+# before starting playback so the tail of the previous utterance doesn't
+# leak into the mic.
+_tts_playing = threading.Event()
+_tts_playing.set()  # initially "not playing"
 _continuous_on_transcript: Optional[Callable[[str], None]] = None
 _continuous_on_status: Optional[Callable[[str], None]] = None
 _continuous_on_silent_limit: Optional[Callable[[], None]] = None
@ -379,6 +391,23 @@ def _continuous_on_silence() -> None:
                pass
        return

+    # CLI parity (cli.py:10619-10621): wait for any in-flight TTS to
+    # finish before re-arming the mic, then leave a small gap to avoid
+    # catching the tail of the speaker output.  Without this the voice
+    # loop becomes a feedback loop — the agent's spoken reply lands
+    # back in the mic and gets re-submitted.
+    if not _tts_playing.is_set():
+        _debug("_continuous_on_silence: waiting for TTS to finish")
+        _tts_playing.wait(timeout=60)
+        import time as _time
+        _time.sleep(0.3)
+
+        # User may have stopped the loop during the wait.
+        with _continuous_lock:
+            if not _continuous_active:
+                _debug("_continuous_on_silence: stopped while waiting for TTS")
+                return
+
    # Restart for the next turn.
    _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
    _play_beep(frequency=880, count=1)
@ -409,6 +438,11 @@ def speak_text(text: str) -> None:
    MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup
    of both extensions. Keeping these in sync means a voice-mode TTS
    session in the TUI sounds identical to one in the classic CLI.
+
+    While playback is in flight the module-level _tts_playing Event is
+    cleared so the continuous-recording loop knows to wait before
+    re-arming the mic (otherwise the agent's spoken reply feedback-loops
+    through the microphone and the agent ends up replying to itself).
    """
    if not text or not text.strip():
        return
@ -417,6 +451,26 @@ def speak_text(text: str) -> None:
    import tempfile
    import time

+    # Cancel any live capture before we open the speakers — otherwise the
+    # last ~200ms of the user's turn tail + the first syllables of our TTS
+    # both end up in the next recording window.  The continuous loop will
+    # re-arm itself after _tts_playing flips back (see _continuous_on_silence).
+    paused_recording = False
+    with _continuous_lock:
+        if (
+            _continuous_active
+            and _continuous_recorder is not None
+            and getattr(_continuous_recorder, "is_recording", False)
+        ):
+            try:
+                _continuous_recorder.cancel()
+                paused_recording = True
+            except Exception as e:
+                logger.warning("failed to pause recorder for TTS: %s", e)
+
+    _tts_playing.clear()
+    _debug(f"speak_text: TTS begin (paused_recording={paused_recording})")
+
    try:
        from tools.tts_tool import text_to_speech_tool

@ -463,3 +517,23 @@ def speak_text(text: str) -> None:
    except Exception as e:
        logger.warning("Voice TTS playback failed: %s", e)
        _debug(f"speak_text raised {type(e).__name__}: {e}")
+    finally:
+        _tts_playing.set()
+        _debug("speak_text: TTS done")
+
+        # Re-arm the mic so the user can answer without pressing Ctrl+B.
+        # Small delay lets the OS flush speaker output and afplay fully
+        # release the audio device before sounddevice re-opens the input.
+        if paused_recording:
+            time.sleep(0.3)
+            with _continuous_lock:
+                if _continuous_active and _continuous_recorder is not None:
+                    try:
+                        _continuous_recorder.start(
+                            on_silence_stop=_continuous_on_silence
+                        )
+                        _debug("speak_text: recording resumed after TTS")
+                    except Exception as e:
+                        logger.warning(
+                            "failed to resume recorder after TTS: %s", e
+                        )
--- a/ui-tui/src/app/useMainApp.ts
+++ b/ui-tui/src/app/useMainApp.ts
@ -716,7 +716,9 @@ export function useMainApp(gw: GatewayClient) {
      statusColor: statusColorOf(ui.status, ui.theme.color),
      stickyPrompt,
      turnStartedAt: ui.sid ? turnStartedAt : null,
-      voiceLabel: voiceRecording ? 'REC' : voiceProcessing ? 'STT' : `voice ${voiceEnabled ? 'on' : 'off'}`
+      // CLI parity: the classic prompt_toolkit status bar shows a red dot
+      // on REC (cli.py:_get_voice_status_fragments line 2344).
+      voiceLabel: voiceRecording ? '● REC' : voiceProcessing ? '◉ STT' : `voice ${voiceEnabled ? 'on' : 'off'}`
    }),
    [
      cwd,
--- a/ui-tui/src/components/appChrome.tsx
+++ b/ui-tui/src/components/appChrome.tsx
@ -215,7 +215,20 @@ export function StatusRule({
            </Text>
          ) : null}
          <SpawnHud t={t} />
-          {voiceLabel ? <Text color={t.color.dim}> │ {voiceLabel}</Text> : null}
+          {voiceLabel ? (
+            <Text
+              color={
+                voiceLabel.startsWith('●')
+                  ? t.color.error
+                  : voiceLabel.startsWith('◉')
+                    ? t.color.warn
+                    : t.color.dim
+              }
+            >
+              {' │ '}
+              {voiceLabel}
+            </Text>
+          ) : null}
          {bgCount > 0 ? <Text color={t.color.dim}> │ {bgCount} bg</Text> : null}
          {showCost && typeof usage.cost_usd === 'number' ? (
            <Text color={t.color.dim}> │ ${usage.cost_usd.toFixed(4)}</Text>