fix(file-tools): handle UTF-8 BOM in read_file / write_file / patch (#35278)

Some Windows editors prepend an invisible UTF-8 BOM (U+FEFF) to text files. We had no awareness of it, so: read_file surfaced a phantom U+FEFF as the first character; patch matches against the true first line could miss; and a write/patch round-trip silently stripped the marker, changing the file's byte signature. Now: - read_file / read_file_raw strip a single leading BOM so the model never sees it (only on the first chunk — the marker lives at byte 0). - patch_replace strips the BOM before fuzzy-matching (so an exact first-line match works) and its post-write verification compares BOM-stripped content. - write_file restores the BOM when the original file had one and the new content doesn't, mirroring the existing line-ending preservation (detect on disk via a cheap `head -c 3` probe or reuse pre_content, re-prepend across the edit). Guards against double-BOM. Mid-content U+FEFF is left alone (it's data there, not a file marker). Tests: TestBomHandling (real LocalEnvironment) — read-strips, raw-read strips, write preserves, no-BOM-when-original-had-none, no-double-BOM, patch round-trip preserves, patch matches first line through a BOM, plus helper unit tests. 208 file-tool tests green.
2026-05-30 06:25:50 -07:00
parent 5a1aa9e68c
commit 5f84c9144a
2 changed files with 180 additions and 4 deletions
--- a/tests/tools/test_file_write_safety.py
+++ b/tests/tools/test_file_write_safety.py
@ -183,5 +183,101 @@ class TestAtomicWrite:
        assert (os.stat(target).st_mode & 0o777) == 0o600


+class TestBomHandling:
+    """UTF-8 BOM is stripped on read and preserved across write/patch.
+
+    A BOM (U+FEFF, bytes EF BB BF) is an invisible leading marker some
+    Windows editors prepend. The agent should never see it in read output,
+    but a file that had one on disk must keep it after an edit so the byte
+    signature is preserved.
+    """
+
+    BOM = "\ufeff"
+
+    @pytest.fixture
+    def ops(self, tmp_path: Path):
+        from tools.environments.local import LocalEnvironment
+        from tools.file_operations import ShellFileOperations
+        env = LocalEnvironment(cwd=str(tmp_path))
+        return ShellFileOperations(env, cwd=str(tmp_path))
+
+    def test_helpers(self):
+        from tools.file_operations import _strip_bom, _has_bom
+        assert _strip_bom("\ufeffhello") == ("hello", True)
+        assert _strip_bom("hello") == ("hello", False)
+        assert _strip_bom("") == ("", False)
+        # mid-string BOM is data, not a marker — left alone
+        assert _strip_bom("a\ufeffb") == ("a\ufeffb", False)
+        assert _has_bom("\ufeffx") is True
+        assert _has_bom("x") is False
+        assert _has_bom(None) is False
+
+    def test_read_strips_bom(self, ops, tmp_path: Path):
+        target = tmp_path / "bom.py"
+        # Write raw bytes with a real UTF-8 BOM prefix.
+        target.write_bytes(self.BOM.encode("utf-8") + b"import os\nx = 1\n")
+        res = ops.read_file(str(target))
+        assert res.error is None, res.error
+        # Line 1 content must NOT carry the phantom U+FEFF.
+        first_line = res.content.split("\n", 1)[0]
+        assert self.BOM not in first_line
+        assert first_line.endswith("import os")
+
+    def test_read_raw_strips_bom(self, ops, tmp_path: Path):
+        target = tmp_path / "bom.txt"
+        target.write_bytes(self.BOM.encode("utf-8") + b"hello\nworld\n")
+        res = ops.read_file_raw(str(target))
+        assert res.error is None, res.error
+        assert not res.content.startswith(self.BOM)
+        assert res.content == "hello\nworld\n"
+
+    def test_write_preserves_bom(self, ops, tmp_path: Path):
+        # Existing file has a BOM; agent rewrites with BOM-less content.
+        target = tmp_path / "config.txt"
+        target.write_bytes(self.BOM.encode("utf-8") + b"old\n")
+        res = ops.write_file(str(target), "new content\n")
+        assert res.error is None, res.error
+        raw = target.read_bytes()
+        assert raw.startswith(self.BOM.encode("utf-8"))  # BOM restored
+        assert raw == self.BOM.encode("utf-8") + b"new content\n"
+
+    def test_write_no_bom_when_original_had_none(self, ops, tmp_path: Path):
+        target = tmp_path / "plain.txt"
+        target.write_text("old\n")
+        res = ops.write_file(str(target), "new\n")
+        assert res.error is None, res.error
+        assert not target.read_bytes().startswith(self.BOM.encode("utf-8"))
+
+    def test_write_does_not_double_bom(self, ops, tmp_path: Path):
+        # If content already carries a BOM and the file had one, don't add a
+        # second.
+        target = tmp_path / "config.txt"
+        target.write_bytes(self.BOM.encode("utf-8") + b"old\n")
+        res = ops.write_file(str(target), self.BOM + "new\n")
+        assert res.error is None, res.error
+        raw = target.read_bytes()
+        # exactly one BOM
+        assert raw == self.BOM.encode("utf-8") + b"new\n"
+
+    def test_patch_roundtrip_preserves_bom(self, ops, tmp_path: Path):
+        target = tmp_path / "edit.py"
+        target.write_bytes(self.BOM.encode("utf-8") + b"a = 1\nb = 2\nc = 3\n")
+        res = ops.patch_replace(str(target), "b = 2", "b = 22")
+        assert res.success, res.error
+        raw = target.read_bytes()
+        assert raw.startswith(self.BOM.encode("utf-8"))  # marker survived
+        assert raw == self.BOM.encode("utf-8") + b"a = 1\nb = 22\nc = 3\n"
+
+    def test_patch_matches_first_line_through_bom(self, ops, tmp_path: Path):
+        # The whole point: an edit targeting the BOM-prefixed first line
+        # must match cleanly (the matcher sees BOM-stripped content).
+        target = tmp_path / "mod.py"
+        target.write_bytes(self.BOM.encode("utf-8") + b"import os\nimport sys\n")
+        res = ops.patch_replace(str(target), "import os", "import os, json")
+        assert res.success, res.error
+        raw = target.read_bytes()
+        assert raw == self.BOM.encode("utf-8") + b"import os, json\nimport sys\n"
+
+
 if __name__ == "__main__":
    pytest.main([__file__, "-v"])
--- a/tools/file_operations.py
+++ b/tools/file_operations.py
@ -113,6 +113,36 @@ def _normalize_line_endings(text: str, target: str) -> str:
    return text


+# UTF-8 byte order mark. Some Windows editors (Notepad, older Visual Studio,
+# some PowerShell redirects) prepend this invisible 3-byte marker
+# (EF BB BF == U+FEFF) to UTF-8 text files. It renders as nothing but is a
+# real character at the start of the decoded string, so without handling it:
+#   - read_file would surface a stray U+FEFF as the first character (the
+#     model sees a phantom char before `import ...`), and
+#   - patch matches against the true first line would miss, and write_file
+#     would silently drop or double the marker on rewrite.
+# We strip it on read so the model sees clean content, and restore it on
+# write when the original file had one — exactly mirroring the line-ending
+# preservation above (detect on disk, preserve across the edit).
+_UTF8_BOM = "\ufeff"
+
+
+def _strip_bom(text: str) -> tuple[str, bool]:
+    """Return (text-without-leading-BOM, had_bom).
+
+    Only a single leading BOM is stripped; a BOM appearing mid-content is
+    left alone (it's legitimate data there, not a file marker).
+    """
+    if text and text.startswith(_UTF8_BOM):
+        return text[len(_UTF8_BOM):], True
+    return text, False
+
+
+def _has_bom(text: Optional[str]) -> bool:
+    """True if ``text`` begins with a UTF-8 BOM."""
+    return bool(text) and text.startswith(_UTF8_BOM)
+
+
 def _is_write_denied(path: str) -> bool:
    """Return True if path is on the write deny list."""
    return _shared_is_write_denied(path)
@ -801,6 +831,22 @@ class ShellFileOperations(FileOperations):
            return None
        return _detect_line_ending(head_result.stdout)

+    def _file_has_bom(self, path: str, pre_content: Optional[str] = None) -> bool:
+        """Whether the file on disk starts with a UTF-8 BOM.
+
+        Uses ``pre_content`` if we already read the file (zero extra exec
+        calls); otherwise issues a tiny ``head -c 3`` to sample just the
+        marker. A missing/empty file returns False (new writes get no BOM
+        unless the caller explicitly includes one).
+        """
+        if pre_content is not None:
+            return _has_bom(pre_content)
+        head_cmd = f"head -c 3 {self._escape_shell_arg(path)} 2>/dev/null"
+        head_result = self._exec(head_cmd)
+        if head_result.exit_code != 0 or not head_result.stdout:
+            return False
+        return _has_bom(head_result.stdout)
+

    def _unified_diff(self, old_content: str, new_content: str, filename: str) -> str:
        """Generate unified diff between old and new content."""
@ -885,6 +931,11 @@ class ShellFileOperations(FileOperations):
        if read_result.exit_code != 0:
            return ReadResult(error=f"Failed to read file: {read_result.stdout}")
        read_output = _strip_terminal_fence_leaks(read_result.stdout)
+        # Strip a leading UTF-8 BOM so the model never sees a phantom U+FEFF
+        # before the first real character. Only meaningful on the first
+        # chunk (the marker lives at byte 0); later pages can't carry it.
+        if offset == 1:
+            read_output, _ = _strip_bom(read_output)
        
        # Get total line count
        wc_cmd = f"wc -l < {self._escape_shell_arg(path)}"
@ -989,8 +1040,14 @@ class ShellFileOperations(FileOperations):
        cat_result = self._exec(f"cat {self._escape_shell_arg(path)}")
        if cat_result.exit_code != 0:
            return ReadResult(error=f"Failed to read file: {cat_result.stdout}")
+        # Strip a leading UTF-8 BOM so patch's fuzzy matcher operates on
+        # clean content (a phantom U+FEFF before line 1 would defeat an
+        # exact first-line match). write_file restores the BOM on the way
+        # back out — it re-probes the on-disk file, which still has the
+        # marker — so the round-trip preserves it.
+        raw_content, _ = _strip_bom(_strip_terminal_fence_leaks(cat_result.stdout))
        return ReadResult(
-            content=_strip_terminal_fence_leaks(cat_result.stdout),
+            content=raw_content,
            file_size=file_size,
        )

@ -1090,6 +1147,18 @@ class ShellFileOperations(FileOperations):
        if original_ending == "\r\n":
            content = _normalize_line_endings(content, "\r\n")

+        # ── BOM preservation ──────────────────────────────────────────
+        # If the file on disk started with a UTF-8 BOM, keep it. read_file
+        # strips the BOM so the agent never sees it, which means the
+        # content it hands back to write_file / patch has no BOM either —
+        # without restoring it here a round-trip would silently strip the
+        # marker and change the file's byte signature (some Windows
+        # toolchains key on it). Only prepend when the original had a BOM
+        # and the new content doesn't already carry one (guards against
+        # double-BOM if a caller passed raw bytes).
+        if self._file_has_bom(path, pre_content) and not _has_bom(content):
+            content = _UTF8_BOM + content
+
        # Snapshot LSP diagnostics for this file (best-effort) so the
        # post-write LSP layer can return only diagnostics introduced
        # by this specific edit.  Mirrors claude-code's
@ -1193,7 +1262,13 @@ class ShellFileOperations(FileOperations):
            return PatchResult(error=f"Failed to read file: {path}")
        
        content = read_result.stdout
-        
+        # Strip a leading UTF-8 BOM before matching so the fuzzy matcher and
+        # the diff operate on clean content (a phantom U+FEFF before line 1
+        # defeats an exact first-line match). write_file restores the BOM on
+        # the way back out by re-probing the on-disk file, so the round-trip
+        # preserves the marker.
+        content, _ = _strip_bom(content)
+
        # Import and use fuzzy matching
        from tools.fuzzy_match import fuzzy_find_and_replace
        
@ -1242,8 +1317,13 @@ class ShellFileOperations(FileOperations):
        # ``new_content`` string has bare LFs.  Without this normalization
        # every patch on Windows returns a bogus "wrote 39, read 42"
        # false-negative even though the edit landed correctly.  POSIX
-        # backends don't translate, so this is a no-op there.
-        _verify_stdout_normalized = verify_result.stdout.replace("\r\n", "\n").replace("\r", "\n")
+        # backends don't translate, so this is a no-op there.  We also
+        # strip a leading BOM from the re-read: write_file restored the
+        # marker on disk but ``new_content`` is the BOM-less string we
+        # matched against, so the comparison must drop it to stay
+        # apples-to-apples.
+        _verify_bomless, _ = _strip_bom(verify_result.stdout)
+        _verify_stdout_normalized = _verify_bomless.replace("\r\n", "\n").replace("\r", "\n")
        _new_content_normalized = new_content.replace("\r\n", "\n").replace("\r", "\n")
        if _verify_stdout_normalized != _new_content_normalized:
            return PatchResult(error=(