fix(file-tools): handle UTF-8 BOM in read_file / write_file / patch (#35278)
Some Windows editors prepend an invisible UTF-8 BOM (U+FEFF) to text files. We had no awareness of it, so: read_file surfaced a phantom U+FEFF as the first character; patch matches against the true first line could miss; and a write/patch round-trip silently stripped the marker, changing the file's byte signature. Now: - read_file / read_file_raw strip a single leading BOM so the model never sees it (only on the first chunk — the marker lives at byte 0). - patch_replace strips the BOM before fuzzy-matching (so an exact first-line match works) and its post-write verification compares BOM-stripped content. - write_file restores the BOM when the original file had one and the new content doesn't, mirroring the existing line-ending preservation (detect on disk via a cheap `head -c 3` probe or reuse pre_content, re-prepend across the edit). Guards against double-BOM. Mid-content U+FEFF is left alone (it's data there, not a file marker). Tests: TestBomHandling (real LocalEnvironment) — read-strips, raw-read strips, write preserves, no-BOM-when-original-had-none, no-double-BOM, patch round-trip preserves, patch matches first line through a BOM, plus helper unit tests. 208 file-tool tests green.
This commit is contained in:
@ -183,5 +183,101 @@ class TestAtomicWrite:
|
||||
assert (os.stat(target).st_mode & 0o777) == 0o600
|
||||
|
||||
|
||||
class TestBomHandling:
|
||||
"""UTF-8 BOM is stripped on read and preserved across write/patch.
|
||||
|
||||
A BOM (U+FEFF, bytes EF BB BF) is an invisible leading marker some
|
||||
Windows editors prepend. The agent should never see it in read output,
|
||||
but a file that had one on disk must keep it after an edit so the byte
|
||||
signature is preserved.
|
||||
"""
|
||||
|
||||
BOM = "\ufeff"
|
||||
|
||||
@pytest.fixture
|
||||
def ops(self, tmp_path: Path):
|
||||
from tools.environments.local import LocalEnvironment
|
||||
from tools.file_operations import ShellFileOperations
|
||||
env = LocalEnvironment(cwd=str(tmp_path))
|
||||
return ShellFileOperations(env, cwd=str(tmp_path))
|
||||
|
||||
def test_helpers(self):
|
||||
from tools.file_operations import _strip_bom, _has_bom
|
||||
assert _strip_bom("\ufeffhello") == ("hello", True)
|
||||
assert _strip_bom("hello") == ("hello", False)
|
||||
assert _strip_bom("") == ("", False)
|
||||
# mid-string BOM is data, not a marker — left alone
|
||||
assert _strip_bom("a\ufeffb") == ("a\ufeffb", False)
|
||||
assert _has_bom("\ufeffx") is True
|
||||
assert _has_bom("x") is False
|
||||
assert _has_bom(None) is False
|
||||
|
||||
def test_read_strips_bom(self, ops, tmp_path: Path):
|
||||
target = tmp_path / "bom.py"
|
||||
# Write raw bytes with a real UTF-8 BOM prefix.
|
||||
target.write_bytes(self.BOM.encode("utf-8") + b"import os\nx = 1\n")
|
||||
res = ops.read_file(str(target))
|
||||
assert res.error is None, res.error
|
||||
# Line 1 content must NOT carry the phantom U+FEFF.
|
||||
first_line = res.content.split("\n", 1)[0]
|
||||
assert self.BOM not in first_line
|
||||
assert first_line.endswith("import os")
|
||||
|
||||
def test_read_raw_strips_bom(self, ops, tmp_path: Path):
|
||||
target = tmp_path / "bom.txt"
|
||||
target.write_bytes(self.BOM.encode("utf-8") + b"hello\nworld\n")
|
||||
res = ops.read_file_raw(str(target))
|
||||
assert res.error is None, res.error
|
||||
assert not res.content.startswith(self.BOM)
|
||||
assert res.content == "hello\nworld\n"
|
||||
|
||||
def test_write_preserves_bom(self, ops, tmp_path: Path):
|
||||
# Existing file has a BOM; agent rewrites with BOM-less content.
|
||||
target = tmp_path / "config.txt"
|
||||
target.write_bytes(self.BOM.encode("utf-8") + b"old\n")
|
||||
res = ops.write_file(str(target), "new content\n")
|
||||
assert res.error is None, res.error
|
||||
raw = target.read_bytes()
|
||||
assert raw.startswith(self.BOM.encode("utf-8")) # BOM restored
|
||||
assert raw == self.BOM.encode("utf-8") + b"new content\n"
|
||||
|
||||
def test_write_no_bom_when_original_had_none(self, ops, tmp_path: Path):
|
||||
target = tmp_path / "plain.txt"
|
||||
target.write_text("old\n")
|
||||
res = ops.write_file(str(target), "new\n")
|
||||
assert res.error is None, res.error
|
||||
assert not target.read_bytes().startswith(self.BOM.encode("utf-8"))
|
||||
|
||||
def test_write_does_not_double_bom(self, ops, tmp_path: Path):
|
||||
# If content already carries a BOM and the file had one, don't add a
|
||||
# second.
|
||||
target = tmp_path / "config.txt"
|
||||
target.write_bytes(self.BOM.encode("utf-8") + b"old\n")
|
||||
res = ops.write_file(str(target), self.BOM + "new\n")
|
||||
assert res.error is None, res.error
|
||||
raw = target.read_bytes()
|
||||
# exactly one BOM
|
||||
assert raw == self.BOM.encode("utf-8") + b"new\n"
|
||||
|
||||
def test_patch_roundtrip_preserves_bom(self, ops, tmp_path: Path):
|
||||
target = tmp_path / "edit.py"
|
||||
target.write_bytes(self.BOM.encode("utf-8") + b"a = 1\nb = 2\nc = 3\n")
|
||||
res = ops.patch_replace(str(target), "b = 2", "b = 22")
|
||||
assert res.success, res.error
|
||||
raw = target.read_bytes()
|
||||
assert raw.startswith(self.BOM.encode("utf-8")) # marker survived
|
||||
assert raw == self.BOM.encode("utf-8") + b"a = 1\nb = 22\nc = 3\n"
|
||||
|
||||
def test_patch_matches_first_line_through_bom(self, ops, tmp_path: Path):
|
||||
# The whole point: an edit targeting the BOM-prefixed first line
|
||||
# must match cleanly (the matcher sees BOM-stripped content).
|
||||
target = tmp_path / "mod.py"
|
||||
target.write_bytes(self.BOM.encode("utf-8") + b"import os\nimport sys\n")
|
||||
res = ops.patch_replace(str(target), "import os", "import os, json")
|
||||
assert res.success, res.error
|
||||
raw = target.read_bytes()
|
||||
assert raw == self.BOM.encode("utf-8") + b"import os, json\nimport sys\n"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@ -113,6 +113,36 @@ def _normalize_line_endings(text: str, target: str) -> str:
|
||||
return text
|
||||
|
||||
|
||||
# UTF-8 byte order mark. Some Windows editors (Notepad, older Visual Studio,
|
||||
# some PowerShell redirects) prepend this invisible 3-byte marker
|
||||
# (EF BB BF == U+FEFF) to UTF-8 text files. It renders as nothing but is a
|
||||
# real character at the start of the decoded string, so without handling it:
|
||||
# - read_file would surface a stray U+FEFF as the first character (the
|
||||
# model sees a phantom char before `import ...`), and
|
||||
# - patch matches against the true first line would miss, and write_file
|
||||
# would silently drop or double the marker on rewrite.
|
||||
# We strip it on read so the model sees clean content, and restore it on
|
||||
# write when the original file had one — exactly mirroring the line-ending
|
||||
# preservation above (detect on disk, preserve across the edit).
|
||||
_UTF8_BOM = "\ufeff"
|
||||
|
||||
|
||||
def _strip_bom(text: str) -> tuple[str, bool]:
|
||||
"""Return (text-without-leading-BOM, had_bom).
|
||||
|
||||
Only a single leading BOM is stripped; a BOM appearing mid-content is
|
||||
left alone (it's legitimate data there, not a file marker).
|
||||
"""
|
||||
if text and text.startswith(_UTF8_BOM):
|
||||
return text[len(_UTF8_BOM):], True
|
||||
return text, False
|
||||
|
||||
|
||||
def _has_bom(text: Optional[str]) -> bool:
|
||||
"""True if ``text`` begins with a UTF-8 BOM."""
|
||||
return bool(text) and text.startswith(_UTF8_BOM)
|
||||
|
||||
|
||||
def _is_write_denied(path: str) -> bool:
|
||||
"""Return True if path is on the write deny list."""
|
||||
return _shared_is_write_denied(path)
|
||||
@ -801,6 +831,22 @@ class ShellFileOperations(FileOperations):
|
||||
return None
|
||||
return _detect_line_ending(head_result.stdout)
|
||||
|
||||
def _file_has_bom(self, path: str, pre_content: Optional[str] = None) -> bool:
|
||||
"""Whether the file on disk starts with a UTF-8 BOM.
|
||||
|
||||
Uses ``pre_content`` if we already read the file (zero extra exec
|
||||
calls); otherwise issues a tiny ``head -c 3`` to sample just the
|
||||
marker. A missing/empty file returns False (new writes get no BOM
|
||||
unless the caller explicitly includes one).
|
||||
"""
|
||||
if pre_content is not None:
|
||||
return _has_bom(pre_content)
|
||||
head_cmd = f"head -c 3 {self._escape_shell_arg(path)} 2>/dev/null"
|
||||
head_result = self._exec(head_cmd)
|
||||
if head_result.exit_code != 0 or not head_result.stdout:
|
||||
return False
|
||||
return _has_bom(head_result.stdout)
|
||||
|
||||
|
||||
def _unified_diff(self, old_content: str, new_content: str, filename: str) -> str:
|
||||
"""Generate unified diff between old and new content."""
|
||||
@ -885,6 +931,11 @@ class ShellFileOperations(FileOperations):
|
||||
if read_result.exit_code != 0:
|
||||
return ReadResult(error=f"Failed to read file: {read_result.stdout}")
|
||||
read_output = _strip_terminal_fence_leaks(read_result.stdout)
|
||||
# Strip a leading UTF-8 BOM so the model never sees a phantom U+FEFF
|
||||
# before the first real character. Only meaningful on the first
|
||||
# chunk (the marker lives at byte 0); later pages can't carry it.
|
||||
if offset == 1:
|
||||
read_output, _ = _strip_bom(read_output)
|
||||
|
||||
# Get total line count
|
||||
wc_cmd = f"wc -l < {self._escape_shell_arg(path)}"
|
||||
@ -989,8 +1040,14 @@ class ShellFileOperations(FileOperations):
|
||||
cat_result = self._exec(f"cat {self._escape_shell_arg(path)}")
|
||||
if cat_result.exit_code != 0:
|
||||
return ReadResult(error=f"Failed to read file: {cat_result.stdout}")
|
||||
# Strip a leading UTF-8 BOM so patch's fuzzy matcher operates on
|
||||
# clean content (a phantom U+FEFF before line 1 would defeat an
|
||||
# exact first-line match). write_file restores the BOM on the way
|
||||
# back out — it re-probes the on-disk file, which still has the
|
||||
# marker — so the round-trip preserves it.
|
||||
raw_content, _ = _strip_bom(_strip_terminal_fence_leaks(cat_result.stdout))
|
||||
return ReadResult(
|
||||
content=_strip_terminal_fence_leaks(cat_result.stdout),
|
||||
content=raw_content,
|
||||
file_size=file_size,
|
||||
)
|
||||
|
||||
@ -1090,6 +1147,18 @@ class ShellFileOperations(FileOperations):
|
||||
if original_ending == "\r\n":
|
||||
content = _normalize_line_endings(content, "\r\n")
|
||||
|
||||
# ── BOM preservation ──────────────────────────────────────────
|
||||
# If the file on disk started with a UTF-8 BOM, keep it. read_file
|
||||
# strips the BOM so the agent never sees it, which means the
|
||||
# content it hands back to write_file / patch has no BOM either —
|
||||
# without restoring it here a round-trip would silently strip the
|
||||
# marker and change the file's byte signature (some Windows
|
||||
# toolchains key on it). Only prepend when the original had a BOM
|
||||
# and the new content doesn't already carry one (guards against
|
||||
# double-BOM if a caller passed raw bytes).
|
||||
if self._file_has_bom(path, pre_content) and not _has_bom(content):
|
||||
content = _UTF8_BOM + content
|
||||
|
||||
# Snapshot LSP diagnostics for this file (best-effort) so the
|
||||
# post-write LSP layer can return only diagnostics introduced
|
||||
# by this specific edit. Mirrors claude-code's
|
||||
@ -1193,7 +1262,13 @@ class ShellFileOperations(FileOperations):
|
||||
return PatchResult(error=f"Failed to read file: {path}")
|
||||
|
||||
content = read_result.stdout
|
||||
|
||||
# Strip a leading UTF-8 BOM before matching so the fuzzy matcher and
|
||||
# the diff operate on clean content (a phantom U+FEFF before line 1
|
||||
# defeats an exact first-line match). write_file restores the BOM on
|
||||
# the way back out by re-probing the on-disk file, so the round-trip
|
||||
# preserves the marker.
|
||||
content, _ = _strip_bom(content)
|
||||
|
||||
# Import and use fuzzy matching
|
||||
from tools.fuzzy_match import fuzzy_find_and_replace
|
||||
|
||||
@ -1242,8 +1317,13 @@ class ShellFileOperations(FileOperations):
|
||||
# ``new_content`` string has bare LFs. Without this normalization
|
||||
# every patch on Windows returns a bogus "wrote 39, read 42"
|
||||
# false-negative even though the edit landed correctly. POSIX
|
||||
# backends don't translate, so this is a no-op there.
|
||||
_verify_stdout_normalized = verify_result.stdout.replace("\r\n", "\n").replace("\r", "\n")
|
||||
# backends don't translate, so this is a no-op there. We also
|
||||
# strip a leading BOM from the re-read: write_file restored the
|
||||
# marker on disk but ``new_content`` is the BOM-less string we
|
||||
# matched against, so the comparison must drop it to stay
|
||||
# apples-to-apples.
|
||||
_verify_bomless, _ = _strip_bom(verify_result.stdout)
|
||||
_verify_stdout_normalized = _verify_bomless.replace("\r\n", "\n").replace("\r", "\n")
|
||||
_new_content_normalized = new_content.replace("\r\n", "\n").replace("\r", "\n")
|
||||
if _verify_stdout_normalized != _new_content_normalized:
|
||||
return PatchResult(error=(
|
||||
|
||||
Reference in New Issue
Block a user