Files
hermes-agent/agent/i18n.py
Siddharth Balyan c349eca823 fix(packaging): ship locales/ i18n catalogs in wheel, sdist, and Nix (#38383)
* fix(packaging): ship locales/ i18n catalogs in wheel, sdist, and Nix

locales/ is a bare data dir (no __init__.py), invisible to packages.find
and package-data. Sealed installs (pip wheel, Nix store venv) dropped it,
so gateway/CLI commands rendered raw i18n keys like
gateway.reset.header_default.

- pyproject: [tool.setuptools.data-files] locales = ["locales/*.yaml"] (wheel)
- MANIFEST.in: graft locales (sdist)
- agent/i18n._locales_dir: env override -> source -> sysconfig data scheme
- nix/hermes-agent.nix: copy locales into the store + set HERMES_BUNDLED_LOCALES
  as defense-in-depth. The wheel's data-files already materialize into the
  uv2nix venv, so resolution works with no env var; the override pins the
  store path against a future uv2nix change that could drop data-files.
- tests: metadata regression, wheel + sdist build-install smoke tests, and a
  bundled-locales flake check that verifies BOTH the wrapper override and the
  env-var-less data-files path. Smoke test wired into CI.

Closes #23943, #27632, #35374.
Supersedes #23966, #27716, #30261, #33841, #35429, #35494, #35735, #36697.

* test: cap locale e2e timeout, tighten catalog count guard

The two wheel/sdist e2e tests inherit the global --timeout=30 from
addopts; a cold-CI run (isolated build env + venv create + network pip
install) can plausibly exceed it. Add @pytest.mark.timeout(300) so they
don't ride the unit-test budget and flake intermittently.

Also assert the shipped catalog count equals len(SUPPORTED_LANGUAGES)
instead of a hardcoded >=16 floor, so the guard self-updates and trips
on a single dropped catalog (not just a fully-empty graft).
2026-06-03 12:00:27 -07:00

303 lines
11 KiB
Python

"""Lightweight internationalization (i18n) for Hermes static user-facing messages.
Scope (thin slice, by design): only the highest-impact static strings shown
to the user by Hermes itself -- approval prompts, a handful of gateway slash
command replies, restart-drain notices. Agent-generated output, log lines,
error tracebacks, tool outputs, and slash-command descriptions all stay in
English.
Catalog files live under ``locales/<lang>.yaml`` at the repo root. Each
catalog is a flat dict keyed by dotted paths (e.g. ``approval.choose`` or
``gateway.approval_expired``). Missing keys fall back to English; if English
is missing too, the key path itself is returned so a broken catalog never
crashes the agent.
Usage::
from agent.i18n import t
print(t("approval.choose_long")) # current lang
print(t("gateway.draining", count=3)) # {count} formatted
print(t("approval.choose_long", lang="zh")) # explicit override
Language resolution order:
1. Explicit ``lang=`` argument passed to :func:`t`
2. ``HERMES_LANGUAGE`` environment variable (for tests / quick override)
3. ``display.language`` from config.yaml
4. ``"en"`` (baseline)
Supported languages: en, zh, ja, de, es, fr, tr, uk. Unknown values fall back to en.
"""
from __future__ import annotations
import logging
import os
import sysconfig
import threading
from functools import lru_cache
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
SUPPORTED_LANGUAGES: tuple[str, ...] = (
"en", "zh", "zh-hant", "ja", "de", "es", "fr", "tr", "uk",
"af", "ko", "it", "ga", "pt", "ru", "hu",
)
DEFAULT_LANGUAGE = "en"
# Accept a few natural aliases so users who type "chinese" / "zh-CN" / "jp"
# get the right catalog instead of silently falling back to English.
_LANGUAGE_ALIASES: dict[str, str] = {
"english": "en", "en-us": "en", "en-gb": "en",
# Simplified Chinese — explicit codes route here; bare "chinese" / "mandarin"
# also default to Simplified since that's the larger user base.
"chinese": "zh", "mandarin": "zh", "zh-cn": "zh", "zh-hans": "zh", "zh-sg": "zh",
# Traditional Chinese — distinct catalog. Cover Taiwan / Hong Kong / Macau
# locale tags plus the common "traditional" alias.
"traditional-chinese": "zh-hant", "traditional_chinese": "zh-hant",
"zh-tw": "zh-hant", "zh-hk": "zh-hant", "zh-mo": "zh-hant",
"japanese": "ja", "jp": "ja", "ja-jp": "ja",
"german": "de", "deutsch": "de", "de-de": "de", "de-at": "de", "de-ch": "de",
"spanish": "es", "español": "es", "espanol": "es", "es-es": "es", "es-mx": "es", "es-ar": "es",
"french": "fr", "français": "fr", "france": "fr", "fr-fr": "fr", "fr-be": "fr", "fr-ca": "fr", "fr-ch": "fr",
"ukrainian": "uk", "ukrainisch": "uk", "українська": "uk", "uk-ua": "uk", "ua": "uk",
"turkish": "tr", "türkçe": "tr", "tr-tr": "tr",
# Afrikaans — South African Dutch-derived language; "af-ZA" is the common BCP-47 tag.
"afrikaans": "af", "af-za": "af",
# Korean
"korean": "ko", "한국어": "ko", "ko-kr": "ko",
# Italian
"italian": "it", "italiano": "it", "it-it": "it", "it-ch": "it",
# Irish (Gaeilge) — ga is the BCP-47 code
"irish": "ga", "gaeilge": "ga", "ga-ie": "ga",
# Portuguese — bare "portuguese" routes to European Portuguese; pt-br
# is in the same family but rendered identically here (no separate br catalog).
"portuguese": "pt", "português": "pt", "portugues": "pt",
"pt-pt": "pt", "pt-br": "pt", "brazilian": "pt", "brasileiro": "pt",
# Russian
"russian": "ru", "русский": "ru", "ru-ru": "ru",
# Hungarian
"hungarian": "hu", "magyar": "hu", "hu-hu": "hu",
}
_catalog_cache: dict[str, dict[str, str]] = {}
_catalog_lock = threading.Lock()
def _locales_dir() -> Path:
"""Return the directory containing locale YAML files.
Resolution order, first existing wins:
1. ``HERMES_BUNDLED_LOCALES`` env var -- set by the Nix wrapper (or any
sealed-packaging system) to point at the installed catalog directory.
2. ``<repo-root>/locales`` -- source checkouts and ``pip install -e .``,
where the working tree sits next to ``agent/``.
3. ``<sysconfig data|purelib|platlib>/locales`` -- pip wheel installs.
setuptools ``data-files`` extracts ``locales/*.yaml`` under the
interpreter's ``data`` scheme; the other schemes are checked as a
safety net for nonstandard layouts.
Falling through to the source-style path (even when missing) keeps
``_load_catalog`` error messages informative -- it logs the path it
looked at -- rather than raising.
"""
override = os.getenv("HERMES_BUNDLED_LOCALES", "").strip()
if override:
candidate = Path(override)
if candidate.is_dir():
return candidate
logger.warning(
"HERMES_BUNDLED_LOCALES points to a non-directory path (%s); "
"falling back to bundled/source locale resolution",
override,
)
# agent/i18n.py -> agent/ -> repo root (source checkout, editable install)
source_dir = Path(__file__).resolve().parent.parent / "locales"
if source_dir.is_dir():
return source_dir
# pip wheel install: data-files lands under the interpreter data scheme.
# ``data`` (== sys.prefix in a venv) is where setuptools data-files extract
# and is checked first. ``purelib``/``platlib`` (site-packages) are a safety
# net for nonstandard layouts. NOTE: this does NOT cover ``pip install
# --user`` (user scheme, ~/.local/locales) or ``pip install --target`` --
# both are out of scope; see the plan header.
for scheme in ("data", "purelib", "platlib"):
raw = sysconfig.get_path(scheme)
if not raw:
continue
candidate = Path(raw) / "locales"
if candidate.is_dir():
return candidate
# Last resort: return the source-style path so _load_catalog's catalog-missing
# log (logger.debug "i18n catalog missing for %s at %s") stays informative.
return source_dir
def _normalize_lang(value: Any) -> str:
"""Normalize a user-supplied language value to a supported code.
Accepts supported codes directly, common aliases (``chinese`` -> ``zh``),
and case-insensitive regional tags (``zh-CN`` -> ``zh``). Returns the
default language for unknown values.
"""
if not isinstance(value, str):
return DEFAULT_LANGUAGE
key = value.strip().lower()
if not key:
return DEFAULT_LANGUAGE
if key in SUPPORTED_LANGUAGES:
return key
if key in _LANGUAGE_ALIASES:
return _LANGUAGE_ALIASES[key]
# Try stripping a region suffix (e.g. "pt-br" -> "pt" won't be supported,
# but "zh-CN" -> "zh" will).
base = key.split("-", 1)[0]
if base in SUPPORTED_LANGUAGES:
return base
return DEFAULT_LANGUAGE
def _load_catalog(lang: str) -> dict[str, str]:
"""Load and flatten one locale YAML file into a dotted-key dict.
YAML files can be nested for human readability; this produces the flat
key space :func:`t` expects. Cached per-language for the process.
"""
with _catalog_lock:
cached = _catalog_cache.get(lang)
if cached is not None:
return cached
path = _locales_dir() / f"{lang}.yaml"
if not path.is_file():
logger.debug("i18n catalog missing for %s at %s", lang, path)
with _catalog_lock:
_catalog_cache[lang] = {}
return {}
try:
import yaml # PyYAML is already a hermes dependency
with path.open("r", encoding="utf-8") as f:
raw = yaml.safe_load(f) or {}
except Exception as exc:
logger.warning("Failed to load i18n catalog %s: %s", path, exc)
with _catalog_lock:
_catalog_cache[lang] = {}
return {}
flat: dict[str, str] = {}
_flatten_into(raw, "", flat)
with _catalog_lock:
_catalog_cache[lang] = flat
return flat
def _flatten_into(node: Any, prefix: str, out: dict[str, str]) -> None:
if isinstance(node, dict):
for key, value in node.items():
child_key = f"{prefix}.{key}" if prefix else str(key)
_flatten_into(value, child_key, out)
elif isinstance(node, str):
out[prefix] = node
# Non-string, non-dict leaves are ignored -- catalogs are text-only.
@lru_cache(maxsize=1)
def _config_language_cached() -> str | None:
"""Read ``display.language`` from config.yaml once per process.
Cached because ``t()`` is called in hot paths (every approval prompt,
every gateway reply) and re-reading YAML each call would be wasteful.
``reset_language_cache()`` clears this when config changes at runtime
(e.g. after the setup wizard).
"""
try:
from hermes_cli.config import load_config
cfg = load_config()
lang = (cfg.get("display") or {}).get("language")
if lang:
return _normalize_lang(lang)
except Exception as exc:
logger.debug("Could not read display.language from config: %s", exc)
return None
def reset_language_cache() -> None:
"""Invalidate cached language resolution and catalogs.
Call after :func:`hermes_cli.config.save_config` if a running process
needs to pick up a changed ``display.language`` without restart.
"""
_config_language_cached.cache_clear()
with _catalog_lock:
_catalog_cache.clear()
def get_language() -> str:
"""Resolve the active language using env > config > default order."""
env_lang = os.environ.get("HERMES_LANGUAGE")
if env_lang:
return _normalize_lang(env_lang)
cfg_lang = _config_language_cached()
if cfg_lang:
return cfg_lang
return DEFAULT_LANGUAGE
def t(key: str, lang: str | None = None, **format_kwargs: Any) -> str:
"""Translate a dotted key to the active language.
Parameters
----------
key
Dotted path into the catalog, e.g. ``"approval.choose_long"``.
lang
Explicit language override. Takes precedence over env + config.
**format_kwargs
``str.format`` substitution arguments (``t("gateway.drain", count=3)``
expects a catalog entry with a ``{count}`` placeholder).
Returns
-------
The translated string, or the English fallback if the key is missing in
the target language, or the bare key if English is also missing.
"""
target = _normalize_lang(lang) if lang else get_language()
catalog = _load_catalog(target)
value = catalog.get(key)
if value is None and target != DEFAULT_LANGUAGE:
# Fall through to English rather than showing a key path to the user.
value = _load_catalog(DEFAULT_LANGUAGE).get(key)
if value is None:
# Last-ditch: return the key itself. A broken catalog should not
# crash anything; it just looks ugly until someone fixes it.
logger.debug("i18n miss: key=%r lang=%r", key, target)
value = key
if format_kwargs:
try:
return value.format(**format_kwargs)
except (KeyError, IndexError, ValueError) as exc:
logger.warning(
"i18n format failed for key=%r lang=%r kwargs=%r: %s",
key, target, format_kwargs, exc,
)
return value
return value
__all__ = [
"SUPPORTED_LANGUAGES",
"DEFAULT_LANGUAGE",
"t",
"get_language",
"reset_language_cache",
]