fix(file-tools): handle UTF-8 BOM in read_file / write_file / patch (#35278)

Some Windows editors prepend an invisible UTF-8 BOM (U+FEFF) to text
files. We had no awareness of it, so: read_file surfaced a phantom
U+FEFF as the first character; patch matches against the true first
line could miss; and a write/patch round-trip silently stripped the
marker, changing the file's byte signature.

Now:
- read_file / read_file_raw strip a single leading BOM so the model
  never sees it (only on the first chunk — the marker lives at byte 0).
- patch_replace strips the BOM before fuzzy-matching (so an exact
  first-line match works) and its post-write verification compares
  BOM-stripped content.
- write_file restores the BOM when the original file had one and the
  new content doesn't, mirroring the existing line-ending preservation
  (detect on disk via a cheap `head -c 3` probe or reuse pre_content,
  re-prepend across the edit). Guards against double-BOM.

Mid-content U+FEFF is left alone (it's data there, not a file marker).

Tests: TestBomHandling (real LocalEnvironment) — read-strips, raw-read
strips, write preserves, no-BOM-when-original-had-none, no-double-BOM,
patch round-trip preserves, patch matches first line through a BOM,
plus helper unit tests. 208 file-tool tests green.
This commit is contained in:
Teknium
2026-05-30 06:25:50 -07:00
committed by GitHub
parent 5a1aa9e68c
commit 5f84c9144a
2 changed files with 180 additions and 4 deletions

View File

@ -183,5 +183,101 @@ class TestAtomicWrite:
assert (os.stat(target).st_mode & 0o777) == 0o600
class TestBomHandling:
"""UTF-8 BOM is stripped on read and preserved across write/patch.
A BOM (U+FEFF, bytes EF BB BF) is an invisible leading marker some
Windows editors prepend. The agent should never see it in read output,
but a file that had one on disk must keep it after an edit so the byte
signature is preserved.
"""
BOM = "\ufeff"
@pytest.fixture
def ops(self, tmp_path: Path):
from tools.environments.local import LocalEnvironment
from tools.file_operations import ShellFileOperations
env = LocalEnvironment(cwd=str(tmp_path))
return ShellFileOperations(env, cwd=str(tmp_path))
def test_helpers(self):
from tools.file_operations import _strip_bom, _has_bom
assert _strip_bom("\ufeffhello") == ("hello", True)
assert _strip_bom("hello") == ("hello", False)
assert _strip_bom("") == ("", False)
# mid-string BOM is data, not a marker — left alone
assert _strip_bom("a\ufeffb") == ("a\ufeffb", False)
assert _has_bom("\ufeffx") is True
assert _has_bom("x") is False
assert _has_bom(None) is False
def test_read_strips_bom(self, ops, tmp_path: Path):
target = tmp_path / "bom.py"
# Write raw bytes with a real UTF-8 BOM prefix.
target.write_bytes(self.BOM.encode("utf-8") + b"import os\nx = 1\n")
res = ops.read_file(str(target))
assert res.error is None, res.error
# Line 1 content must NOT carry the phantom U+FEFF.
first_line = res.content.split("\n", 1)[0]
assert self.BOM not in first_line
assert first_line.endswith("import os")
def test_read_raw_strips_bom(self, ops, tmp_path: Path):
target = tmp_path / "bom.txt"
target.write_bytes(self.BOM.encode("utf-8") + b"hello\nworld\n")
res = ops.read_file_raw(str(target))
assert res.error is None, res.error
assert not res.content.startswith(self.BOM)
assert res.content == "hello\nworld\n"
def test_write_preserves_bom(self, ops, tmp_path: Path):
# Existing file has a BOM; agent rewrites with BOM-less content.
target = tmp_path / "config.txt"
target.write_bytes(self.BOM.encode("utf-8") + b"old\n")
res = ops.write_file(str(target), "new content\n")
assert res.error is None, res.error
raw = target.read_bytes()
assert raw.startswith(self.BOM.encode("utf-8")) # BOM restored
assert raw == self.BOM.encode("utf-8") + b"new content\n"
def test_write_no_bom_when_original_had_none(self, ops, tmp_path: Path):
target = tmp_path / "plain.txt"
target.write_text("old\n")
res = ops.write_file(str(target), "new\n")
assert res.error is None, res.error
assert not target.read_bytes().startswith(self.BOM.encode("utf-8"))
def test_write_does_not_double_bom(self, ops, tmp_path: Path):
# If content already carries a BOM and the file had one, don't add a
# second.
target = tmp_path / "config.txt"
target.write_bytes(self.BOM.encode("utf-8") + b"old\n")
res = ops.write_file(str(target), self.BOM + "new\n")
assert res.error is None, res.error
raw = target.read_bytes()
# exactly one BOM
assert raw == self.BOM.encode("utf-8") + b"new\n"
def test_patch_roundtrip_preserves_bom(self, ops, tmp_path: Path):
target = tmp_path / "edit.py"
target.write_bytes(self.BOM.encode("utf-8") + b"a = 1\nb = 2\nc = 3\n")
res = ops.patch_replace(str(target), "b = 2", "b = 22")
assert res.success, res.error
raw = target.read_bytes()
assert raw.startswith(self.BOM.encode("utf-8")) # marker survived
assert raw == self.BOM.encode("utf-8") + b"a = 1\nb = 22\nc = 3\n"
def test_patch_matches_first_line_through_bom(self, ops, tmp_path: Path):
# The whole point: an edit targeting the BOM-prefixed first line
# must match cleanly (the matcher sees BOM-stripped content).
target = tmp_path / "mod.py"
target.write_bytes(self.BOM.encode("utf-8") + b"import os\nimport sys\n")
res = ops.patch_replace(str(target), "import os", "import os, json")
assert res.success, res.error
raw = target.read_bytes()
assert raw == self.BOM.encode("utf-8") + b"import os, json\nimport sys\n"
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@ -113,6 +113,36 @@ def _normalize_line_endings(text: str, target: str) -> str:
return text
# UTF-8 byte order mark. Some Windows editors (Notepad, older Visual Studio,
# some PowerShell redirects) prepend this invisible 3-byte marker
# (EF BB BF == U+FEFF) to UTF-8 text files. It renders as nothing but is a
# real character at the start of the decoded string, so without handling it:
# - read_file would surface a stray U+FEFF as the first character (the
# model sees a phantom char before `import ...`), and
# - patch matches against the true first line would miss, and write_file
# would silently drop or double the marker on rewrite.
# We strip it on read so the model sees clean content, and restore it on
# write when the original file had one — exactly mirroring the line-ending
# preservation above (detect on disk, preserve across the edit).
_UTF8_BOM = "\ufeff"
def _strip_bom(text: str) -> tuple[str, bool]:
"""Return (text-without-leading-BOM, had_bom).
Only a single leading BOM is stripped; a BOM appearing mid-content is
left alone (it's legitimate data there, not a file marker).
"""
if text and text.startswith(_UTF8_BOM):
return text[len(_UTF8_BOM):], True
return text, False
def _has_bom(text: Optional[str]) -> bool:
"""True if ``text`` begins with a UTF-8 BOM."""
return bool(text) and text.startswith(_UTF8_BOM)
def _is_write_denied(path: str) -> bool:
"""Return True if path is on the write deny list."""
return _shared_is_write_denied(path)
@ -801,6 +831,22 @@ class ShellFileOperations(FileOperations):
return None
return _detect_line_ending(head_result.stdout)
def _file_has_bom(self, path: str, pre_content: Optional[str] = None) -> bool:
"""Whether the file on disk starts with a UTF-8 BOM.
Uses ``pre_content`` if we already read the file (zero extra exec
calls); otherwise issues a tiny ``head -c 3`` to sample just the
marker. A missing/empty file returns False (new writes get no BOM
unless the caller explicitly includes one).
"""
if pre_content is not None:
return _has_bom(pre_content)
head_cmd = f"head -c 3 {self._escape_shell_arg(path)} 2>/dev/null"
head_result = self._exec(head_cmd)
if head_result.exit_code != 0 or not head_result.stdout:
return False
return _has_bom(head_result.stdout)
def _unified_diff(self, old_content: str, new_content: str, filename: str) -> str:
"""Generate unified diff between old and new content."""
@ -885,6 +931,11 @@ class ShellFileOperations(FileOperations):
if read_result.exit_code != 0:
return ReadResult(error=f"Failed to read file: {read_result.stdout}")
read_output = _strip_terminal_fence_leaks(read_result.stdout)
# Strip a leading UTF-8 BOM so the model never sees a phantom U+FEFF
# before the first real character. Only meaningful on the first
# chunk (the marker lives at byte 0); later pages can't carry it.
if offset == 1:
read_output, _ = _strip_bom(read_output)
# Get total line count
wc_cmd = f"wc -l < {self._escape_shell_arg(path)}"
@ -989,8 +1040,14 @@ class ShellFileOperations(FileOperations):
cat_result = self._exec(f"cat {self._escape_shell_arg(path)}")
if cat_result.exit_code != 0:
return ReadResult(error=f"Failed to read file: {cat_result.stdout}")
# Strip a leading UTF-8 BOM so patch's fuzzy matcher operates on
# clean content (a phantom U+FEFF before line 1 would defeat an
# exact first-line match). write_file restores the BOM on the way
# back out — it re-probes the on-disk file, which still has the
# marker — so the round-trip preserves it.
raw_content, _ = _strip_bom(_strip_terminal_fence_leaks(cat_result.stdout))
return ReadResult(
content=_strip_terminal_fence_leaks(cat_result.stdout),
content=raw_content,
file_size=file_size,
)
@ -1090,6 +1147,18 @@ class ShellFileOperations(FileOperations):
if original_ending == "\r\n":
content = _normalize_line_endings(content, "\r\n")
# ── BOM preservation ──────────────────────────────────────────
# If the file on disk started with a UTF-8 BOM, keep it. read_file
# strips the BOM so the agent never sees it, which means the
# content it hands back to write_file / patch has no BOM either —
# without restoring it here a round-trip would silently strip the
# marker and change the file's byte signature (some Windows
# toolchains key on it). Only prepend when the original had a BOM
# and the new content doesn't already carry one (guards against
# double-BOM if a caller passed raw bytes).
if self._file_has_bom(path, pre_content) and not _has_bom(content):
content = _UTF8_BOM + content
# Snapshot LSP diagnostics for this file (best-effort) so the
# post-write LSP layer can return only diagnostics introduced
# by this specific edit. Mirrors claude-code's
@ -1193,7 +1262,13 @@ class ShellFileOperations(FileOperations):
return PatchResult(error=f"Failed to read file: {path}")
content = read_result.stdout
# Strip a leading UTF-8 BOM before matching so the fuzzy matcher and
# the diff operate on clean content (a phantom U+FEFF before line 1
# defeats an exact first-line match). write_file restores the BOM on
# the way back out by re-probing the on-disk file, so the round-trip
# preserves the marker.
content, _ = _strip_bom(content)
# Import and use fuzzy matching
from tools.fuzzy_match import fuzzy_find_and_replace
@ -1242,8 +1317,13 @@ class ShellFileOperations(FileOperations):
# ``new_content`` string has bare LFs. Without this normalization
# every patch on Windows returns a bogus "wrote 39, read 42"
# false-negative even though the edit landed correctly. POSIX
# backends don't translate, so this is a no-op there.
_verify_stdout_normalized = verify_result.stdout.replace("\r\n", "\n").replace("\r", "\n")
# backends don't translate, so this is a no-op there. We also
# strip a leading BOM from the re-read: write_file restored the
# marker on disk but ``new_content`` is the BOM-less string we
# matched against, so the comparison must drop it to stay
# apples-to-apples.
_verify_bomless, _ = _strip_bom(verify_result.stdout)
_verify_stdout_normalized = _verify_bomless.replace("\r\n", "\n").replace("\r", "\n")
_new_content_normalized = new_content.replace("\r\n", "\n").replace("\r", "\n")
if _verify_stdout_normalized != _new_content_normalized:
return PatchResult(error=(