From 5f84c9144a2c1f1248e92f53eeb2ea8146ad0883 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 30 May 2026 06:25:50 -0700 Subject: [PATCH] fix(file-tools): handle UTF-8 BOM in read_file / write_file / patch (#35278) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Windows editors prepend an invisible UTF-8 BOM (U+FEFF) to text files. We had no awareness of it, so: read_file surfaced a phantom U+FEFF as the first character; patch matches against the true first line could miss; and a write/patch round-trip silently stripped the marker, changing the file's byte signature. Now: - read_file / read_file_raw strip a single leading BOM so the model never sees it (only on the first chunk — the marker lives at byte 0). - patch_replace strips the BOM before fuzzy-matching (so an exact first-line match works) and its post-write verification compares BOM-stripped content. - write_file restores the BOM when the original file had one and the new content doesn't, mirroring the existing line-ending preservation (detect on disk via a cheap `head -c 3` probe or reuse pre_content, re-prepend across the edit). Guards against double-BOM. Mid-content U+FEFF is left alone (it's data there, not a file marker). Tests: TestBomHandling (real LocalEnvironment) — read-strips, raw-read strips, write preserves, no-BOM-when-original-had-none, no-double-BOM, patch round-trip preserves, patch matches first line through a BOM, plus helper unit tests. 208 file-tool tests green. --- tests/tools/test_file_write_safety.py | 96 +++++++++++++++++++++++++++ tools/file_operations.py | 88 ++++++++++++++++++++++-- 2 files changed, 180 insertions(+), 4 deletions(-) diff --git a/tests/tools/test_file_write_safety.py b/tests/tools/test_file_write_safety.py index a2bb05dd1..ac44dd1bc 100644 --- a/tests/tools/test_file_write_safety.py +++ b/tests/tools/test_file_write_safety.py @@ -183,5 +183,101 @@ class TestAtomicWrite: assert (os.stat(target).st_mode & 0o777) == 0o600 +class TestBomHandling: + """UTF-8 BOM is stripped on read and preserved across write/patch. + + A BOM (U+FEFF, bytes EF BB BF) is an invisible leading marker some + Windows editors prepend. The agent should never see it in read output, + but a file that had one on disk must keep it after an edit so the byte + signature is preserved. + """ + + BOM = "\ufeff" + + @pytest.fixture + def ops(self, tmp_path: Path): + from tools.environments.local import LocalEnvironment + from tools.file_operations import ShellFileOperations + env = LocalEnvironment(cwd=str(tmp_path)) + return ShellFileOperations(env, cwd=str(tmp_path)) + + def test_helpers(self): + from tools.file_operations import _strip_bom, _has_bom + assert _strip_bom("\ufeffhello") == ("hello", True) + assert _strip_bom("hello") == ("hello", False) + assert _strip_bom("") == ("", False) + # mid-string BOM is data, not a marker — left alone + assert _strip_bom("a\ufeffb") == ("a\ufeffb", False) + assert _has_bom("\ufeffx") is True + assert _has_bom("x") is False + assert _has_bom(None) is False + + def test_read_strips_bom(self, ops, tmp_path: Path): + target = tmp_path / "bom.py" + # Write raw bytes with a real UTF-8 BOM prefix. + target.write_bytes(self.BOM.encode("utf-8") + b"import os\nx = 1\n") + res = ops.read_file(str(target)) + assert res.error is None, res.error + # Line 1 content must NOT carry the phantom U+FEFF. + first_line = res.content.split("\n", 1)[0] + assert self.BOM not in first_line + assert first_line.endswith("import os") + + def test_read_raw_strips_bom(self, ops, tmp_path: Path): + target = tmp_path / "bom.txt" + target.write_bytes(self.BOM.encode("utf-8") + b"hello\nworld\n") + res = ops.read_file_raw(str(target)) + assert res.error is None, res.error + assert not res.content.startswith(self.BOM) + assert res.content == "hello\nworld\n" + + def test_write_preserves_bom(self, ops, tmp_path: Path): + # Existing file has a BOM; agent rewrites with BOM-less content. + target = tmp_path / "config.txt" + target.write_bytes(self.BOM.encode("utf-8") + b"old\n") + res = ops.write_file(str(target), "new content\n") + assert res.error is None, res.error + raw = target.read_bytes() + assert raw.startswith(self.BOM.encode("utf-8")) # BOM restored + assert raw == self.BOM.encode("utf-8") + b"new content\n" + + def test_write_no_bom_when_original_had_none(self, ops, tmp_path: Path): + target = tmp_path / "plain.txt" + target.write_text("old\n") + res = ops.write_file(str(target), "new\n") + assert res.error is None, res.error + assert not target.read_bytes().startswith(self.BOM.encode("utf-8")) + + def test_write_does_not_double_bom(self, ops, tmp_path: Path): + # If content already carries a BOM and the file had one, don't add a + # second. + target = tmp_path / "config.txt" + target.write_bytes(self.BOM.encode("utf-8") + b"old\n") + res = ops.write_file(str(target), self.BOM + "new\n") + assert res.error is None, res.error + raw = target.read_bytes() + # exactly one BOM + assert raw == self.BOM.encode("utf-8") + b"new\n" + + def test_patch_roundtrip_preserves_bom(self, ops, tmp_path: Path): + target = tmp_path / "edit.py" + target.write_bytes(self.BOM.encode("utf-8") + b"a = 1\nb = 2\nc = 3\n") + res = ops.patch_replace(str(target), "b = 2", "b = 22") + assert res.success, res.error + raw = target.read_bytes() + assert raw.startswith(self.BOM.encode("utf-8")) # marker survived + assert raw == self.BOM.encode("utf-8") + b"a = 1\nb = 22\nc = 3\n" + + def test_patch_matches_first_line_through_bom(self, ops, tmp_path: Path): + # The whole point: an edit targeting the BOM-prefixed first line + # must match cleanly (the matcher sees BOM-stripped content). + target = tmp_path / "mod.py" + target.write_bytes(self.BOM.encode("utf-8") + b"import os\nimport sys\n") + res = ops.patch_replace(str(target), "import os", "import os, json") + assert res.success, res.error + raw = target.read_bytes() + assert raw == self.BOM.encode("utf-8") + b"import os, json\nimport sys\n" + + if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tools/file_operations.py b/tools/file_operations.py index 386ca2171..57c36e01e 100644 --- a/tools/file_operations.py +++ b/tools/file_operations.py @@ -113,6 +113,36 @@ def _normalize_line_endings(text: str, target: str) -> str: return text +# UTF-8 byte order mark. Some Windows editors (Notepad, older Visual Studio, +# some PowerShell redirects) prepend this invisible 3-byte marker +# (EF BB BF == U+FEFF) to UTF-8 text files. It renders as nothing but is a +# real character at the start of the decoded string, so without handling it: +# - read_file would surface a stray U+FEFF as the first character (the +# model sees a phantom char before `import ...`), and +# - patch matches against the true first line would miss, and write_file +# would silently drop or double the marker on rewrite. +# We strip it on read so the model sees clean content, and restore it on +# write when the original file had one — exactly mirroring the line-ending +# preservation above (detect on disk, preserve across the edit). +_UTF8_BOM = "\ufeff" + + +def _strip_bom(text: str) -> tuple[str, bool]: + """Return (text-without-leading-BOM, had_bom). + + Only a single leading BOM is stripped; a BOM appearing mid-content is + left alone (it's legitimate data there, not a file marker). + """ + if text and text.startswith(_UTF8_BOM): + return text[len(_UTF8_BOM):], True + return text, False + + +def _has_bom(text: Optional[str]) -> bool: + """True if ``text`` begins with a UTF-8 BOM.""" + return bool(text) and text.startswith(_UTF8_BOM) + + def _is_write_denied(path: str) -> bool: """Return True if path is on the write deny list.""" return _shared_is_write_denied(path) @@ -801,6 +831,22 @@ class ShellFileOperations(FileOperations): return None return _detect_line_ending(head_result.stdout) + def _file_has_bom(self, path: str, pre_content: Optional[str] = None) -> bool: + """Whether the file on disk starts with a UTF-8 BOM. + + Uses ``pre_content`` if we already read the file (zero extra exec + calls); otherwise issues a tiny ``head -c 3`` to sample just the + marker. A missing/empty file returns False (new writes get no BOM + unless the caller explicitly includes one). + """ + if pre_content is not None: + return _has_bom(pre_content) + head_cmd = f"head -c 3 {self._escape_shell_arg(path)} 2>/dev/null" + head_result = self._exec(head_cmd) + if head_result.exit_code != 0 or not head_result.stdout: + return False + return _has_bom(head_result.stdout) + def _unified_diff(self, old_content: str, new_content: str, filename: str) -> str: """Generate unified diff between old and new content.""" @@ -885,6 +931,11 @@ class ShellFileOperations(FileOperations): if read_result.exit_code != 0: return ReadResult(error=f"Failed to read file: {read_result.stdout}") read_output = _strip_terminal_fence_leaks(read_result.stdout) + # Strip a leading UTF-8 BOM so the model never sees a phantom U+FEFF + # before the first real character. Only meaningful on the first + # chunk (the marker lives at byte 0); later pages can't carry it. + if offset == 1: + read_output, _ = _strip_bom(read_output) # Get total line count wc_cmd = f"wc -l < {self._escape_shell_arg(path)}" @@ -989,8 +1040,14 @@ class ShellFileOperations(FileOperations): cat_result = self._exec(f"cat {self._escape_shell_arg(path)}") if cat_result.exit_code != 0: return ReadResult(error=f"Failed to read file: {cat_result.stdout}") + # Strip a leading UTF-8 BOM so patch's fuzzy matcher operates on + # clean content (a phantom U+FEFF before line 1 would defeat an + # exact first-line match). write_file restores the BOM on the way + # back out — it re-probes the on-disk file, which still has the + # marker — so the round-trip preserves it. + raw_content, _ = _strip_bom(_strip_terminal_fence_leaks(cat_result.stdout)) return ReadResult( - content=_strip_terminal_fence_leaks(cat_result.stdout), + content=raw_content, file_size=file_size, ) @@ -1090,6 +1147,18 @@ class ShellFileOperations(FileOperations): if original_ending == "\r\n": content = _normalize_line_endings(content, "\r\n") + # ── BOM preservation ────────────────────────────────────────── + # If the file on disk started with a UTF-8 BOM, keep it. read_file + # strips the BOM so the agent never sees it, which means the + # content it hands back to write_file / patch has no BOM either — + # without restoring it here a round-trip would silently strip the + # marker and change the file's byte signature (some Windows + # toolchains key on it). Only prepend when the original had a BOM + # and the new content doesn't already carry one (guards against + # double-BOM if a caller passed raw bytes). + if self._file_has_bom(path, pre_content) and not _has_bom(content): + content = _UTF8_BOM + content + # Snapshot LSP diagnostics for this file (best-effort) so the # post-write LSP layer can return only diagnostics introduced # by this specific edit. Mirrors claude-code's @@ -1193,7 +1262,13 @@ class ShellFileOperations(FileOperations): return PatchResult(error=f"Failed to read file: {path}") content = read_result.stdout - + # Strip a leading UTF-8 BOM before matching so the fuzzy matcher and + # the diff operate on clean content (a phantom U+FEFF before line 1 + # defeats an exact first-line match). write_file restores the BOM on + # the way back out by re-probing the on-disk file, so the round-trip + # preserves the marker. + content, _ = _strip_bom(content) + # Import and use fuzzy matching from tools.fuzzy_match import fuzzy_find_and_replace @@ -1242,8 +1317,13 @@ class ShellFileOperations(FileOperations): # ``new_content`` string has bare LFs. Without this normalization # every patch on Windows returns a bogus "wrote 39, read 42" # false-negative even though the edit landed correctly. POSIX - # backends don't translate, so this is a no-op there. - _verify_stdout_normalized = verify_result.stdout.replace("\r\n", "\n").replace("\r", "\n") + # backends don't translate, so this is a no-op there. We also + # strip a leading BOM from the re-read: write_file restored the + # marker on disk but ``new_content`` is the BOM-less string we + # matched against, so the comparison must drop it to stay + # apples-to-apples. + _verify_bomless, _ = _strip_bom(verify_result.stdout) + _verify_stdout_normalized = _verify_bomless.replace("\r\n", "\n").replace("\r", "\n") _new_content_normalized = new_content.replace("\r\n", "\n").replace("\r", "\n") if _verify_stdout_normalized != _new_content_normalized: return PatchResult(error=(