fix(gdpval): normalize python-docx ns0 namespacing before LibreOffice convert (#1270)

agronskiy · web-flow · commit ed190cdcf67c · 2026-05-08T09:05:03.000-07:00
## Summary

Some `.docx` files in the GDPVal corpus are emitted by `python-docx` (or similar `lxml`-based tools) and serialize the OPC package XML with an explicit `ns0:` namespace prefix:

```xml
<ns0:Relationships xmlns:ns0="http://schemas.openxmlformats.org/...">
<ns0:Types xmlns:ns0="http://schemas.openxmlformats.org/...">
```

These are valid OOXML — Microsoft Word and pandoc accept them — but LibreOffice 24.2 rejects them, reporting nothing more than the generic `Error: source file could not be loaded`. In the gdpval resources server this surfaces as `pdf miss` events when preconverting task input documents.

The prefixing appears in BOTH `_rels/.rels` AND `[Content_Types].xml`. Rewriting only one of them is not enough — both must be normalized for LibreOffice to load the file.

`convert_to_pdf` now detects the `ns0:` prefix and, when present, writes a namespace-normalized copy of the package to a tempdir and runs LibreOffice on that copy. The original file on disk is left untouched and the output PDF still lands next to the original.

## Empirical impact

Reproduced and verified on the actual vadams Kimi corpus on lustre. Across the 46 source files in 27 affected GDPVal tasks:

| Failure mode | Files | Resolved by this PR |
|---|---|---|
| `ns0:` prefix in package XML | 43 | Yes |
| `ns0:` prefix + malformed `<Relationship>` chain | 2 | No (separate cause) |
| Missing rels target parts (one `.pptx`) | 1 | No (out of scope) |

So this PR resolves 43/46 (~93%) of the failing source files. The remaining 3 are outside the scope of this fix.
## Test plan

- [x] Unit tests for `_rewrite_ns0_namespace`, `_ooxml_has_ns0_prefix`, `_normalize_ooxml_zip`, and `convert_to_pdf` selecting the normalized copy when `ns0:` is present (21 tests pass)
- [x] End-to-end: pulled `Compensation Model Ideas.docx` from lustre (a known-failing file), ran `convert_to_pdf` — produced a valid PDF with the message "converted Compensation Model Ideas.docx (after ns0 normalization)"
- [x] Lint + format clean (`ruff check`, `ruff format --check`)

## Notes

- Out of scope: the 2 files with malformed `<Relationship>` chains in `word/_rels/document.xml.rels` (caused by an upstream URL-redaction step that left invalid XML) and the 1 `.pptx` with missing rels targets. These need separate handling.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Signed-off-by: Alex Gronskiy <agronskiy@nvidia.com>
diff --git a/resources_servers/gdpval/preconvert.py b/resources_servers/gdpval/preconvert.py
@@ -12,16 +12,41 @@
 concurrent libreoffice subprocesses don't race on the shared default
 profile lock (``$HOME/.config/libreoffice``) — that race is the reason
 the previous default ``max_concurrent=1`` existed.
+
+OOXML namespace normalization
+-----------------------------
+
+Some files in the GDPVal corpus were emitted by ``python-docx`` (or
+similar lxml-based tools), which serialize the OPC package XML with an
+explicit ``ns0:`` namespace prefix:
+
+    <ns0:Relationships xmlns:ns0="http://schemas.openxmlformats.org/...">
+
+instead of the standard default-namespace form:
+
+    <Relationships xmlns="http://schemas.openxmlformats.org/...">
+
+The two forms are semantically identical XML, and Microsoft Word /
+pandoc accept both. LibreOffice 24.2, however, rejects the prefixed
+form with ``Error: source file could not be loaded``. The prefixing
+shows up in BOTH ``_rels/.rels`` and ``[Content_Types].xml``; rewriting
+only one of them is not enough.
+
+Before invoking libreoffice we detect the ``ns0:``-prefixed form and
+write a namespace-normalized copy to a tempdir, leaving the original on
+disk untouched.
 """
 
 from __future__ import annotations
 
 import asyncio
 import logging
 import os
+import re
 import shutil
 import subprocess
 import tempfile
+import zipfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 
@@ -32,6 +57,42 @@
 
 DEFAULT_MAX_CONCURRENT = 4
 
+_NS0_ROOT_RE = re.compile(r'<ns0:([A-Za-z_][\w.-]*)\b([^>]*?)\bxmlns:ns0="([^"]+)"')  # name, attrs, URI
+_NS0_TAG_RE = re.compile(r"</?ns0:")  # start of any opening or closing ns0-prefixed tag
+_NS0_SENTINEL = b'xmlns:ns0="http://schemas.openxmlformats.org/'  # byte probe used for detection
+
+
+def _rewrite_ns0_namespace(text: str) -> str:
+    """Rewrite the ``ns0:`` declaration and tag prefixes in ``text`` to default-namespace form."""
+    declared = _NS0_ROOT_RE.sub(r'<\1 xmlns="\3"\2', text)
+    return _NS0_TAG_RE.sub(lambda hit: hit.group(0).replace("ns0:", ""), declared)
+
+
+def _ooxml_has_ns0_prefix(path: Path) -> bool:
+    """Tell whether ``path`` is a package whose ``_rels/.rels`` or
+    ``[Content_Types].xml`` carries the python-docx-style ``ns0:`` prefix
+    (valid OOXML that LibreOffice nevertheless fails to load). A file that
+    cannot be opened as a zip archive is reported as ``False``."""
+    probes = ("_rels/.rels", "[Content_Types].xml")
+    try:
+        with zipfile.ZipFile(path) as archive:
+            members = set(archive.namelist())
+            return any(p in members and _NS0_SENTINEL in archive.read(p) for p in probes)
+    except (zipfile.BadZipFile, OSError):
+        # Unreadable or non-zip input: nothing to normalize.
+        return False
+
+
+def _normalize_ooxml_zip(src: Path, dst: Path) -> None:
+    """Copy ``src`` to ``dst``, rewriting ``ns0:``-prefixed ``*.rels`` / ``[Content_Types].xml`` parts."""
+    with zipfile.ZipFile(src) as zin, zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout:
+        for item in zin.namelist():
+            data = zin.read(item)
+            # Only decode parts that carry the prefix, so clean non-UTF-8 parts are copied verbatim.
+            if b"ns0:" in data and (item.endswith(".rels") or item == "[Content_Types].xml"):
+                data = _rewrite_ns0_namespace(data.decode("utf-8")).encode("utf-8")
+            zout.writestr(item, data)
+
 
 def needs_conversion(path: Path) -> bool:
     return path.suffix.lower() in OFFICE_EXTENSIONS and not path.with_suffix(".pdf").exists()
@@ -41,7 +102,16 @@ def convert_to_pdf(path: Path) -> tuple[Path, bool, str]:
     """Convert one file to PDF via host LibreOffice. Returns ``(path, ok, msg)``."""
     output_dir = str(path.parent)
     profile_dir = Path(tempfile.mkdtemp(prefix="lo-profile-"))
+    norm_dir: Path | None = None
+    input_path = path
+    normalized = False
     try:
+        if _ooxml_has_ns0_prefix(path):
+            norm_dir = Path(tempfile.mkdtemp(prefix="gdpval-norm-"))
+            input_path = norm_dir / path.name
+            _normalize_ooxml_zip(path, input_path)
+            normalized = True
+
         result = subprocess.run(
             [
                 "libreoffice",
@@ -55,15 +125,16 @@ def convert_to_pdf(path: Path) -> tuple[Path, bool, str]:
                 "pdf",
                 "--outdir",
                 output_dir,
-                str(path),
+                str(input_path),
             ],
             capture_output=True,
             text=True,
             timeout=120,
         )
         pdf_path = path.with_suffix(".pdf")
         if pdf_path.exists():
-            return path, True, f"converted {path.name}"
+            suffix = " (after ns0 normalization)" if normalized else ""
+            return path, True, f"converted {path.name}{suffix}"
         return (
             path,
             False,
@@ -77,6 +148,8 @@ def convert_to_pdf(path: Path) -> tuple[Path, bool, str]:
         return path, False, f"error converting {path.name}: {exc!r}"
     finally:
         shutil.rmtree(profile_dir, ignore_errors=True)
+        if norm_dir is not None:
+            shutil.rmtree(norm_dir, ignore_errors=True)
 
 
 def find_convertible_files(root_dir: str | os.PathLike) -> list[Path]:
diff --git a/resources_servers/gdpval/tests/test_preconvert.py b/resources_servers/gdpval/tests/test_preconvert.py
@@ -134,3 +134,204 @@ async def test_preconvert_dir_async_propagates_results(tmp_path: Path, monkeypat
     ok, fail, errors = await pcv.preconvert_dir_async(str(tmp_path))
     assert (ok, fail) == (0, 1)
     assert errors == ["boom"]
+
+
+# Fixtures + tests for the ns0-namespace normalization (Mode A in the GDPVal
+# corpus). NS0_* parts carry the explicit ``ns0:`` prefix, DEFAULT_NS_* parts the
+# default-namespace form; see the module docstring of preconvert.py for background.
+
+NS0_RELS = (  # relationships part with the explicit ns0: prefix
+    b"<?xml version='1.0' encoding='utf-8'?>\n"
+    b'<ns0:Relationships xmlns:ns0="http://schemas.openxmlformats.org/package/2006/relationships">'
+    b'<ns0:Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/'
+    b'relationships/officeDocument" Target="word/document.xml" />'
+    b"</ns0:Relationships>"
+)
+
+NS0_CONTENT_TYPES = (  # content-types part with the explicit ns0: prefix
+    b"<?xml version='1.0' encoding='utf-8'?>\n"
+    b'<ns0:Types xmlns:ns0="http://schemas.openxmlformats.org/package/2006/content-types">'
+    b'<ns0:Default Extension="rels" '
+    b'ContentType="application/vnd.openxmlformats-package.relationships+xml" />'
+    b'<ns0:Override PartName="/word/document.xml" '
+    b'ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml" />'
+    b"</ns0:Types>"
+)
+
+DEFAULT_NS_RELS = (  # relationships part in default-namespace form
+    b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+    b'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
+    b'<Relationship Id="rId1" Type="x" Target="word/document.xml"/></Relationships>'
+)
+
+DEFAULT_NS_CONTENT_TYPES = (  # content-types part in default-namespace form
+    b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
+    b'<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"></Types>'
+)
+
+DOCUMENT_XML = (  # minimal, empty w:document body
+    b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><w:document '
+    b'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"></w:document>'
+)
+
+
+def _make_zip(path: Path, parts: dict[str, bytes]) -> Path:
+    """Write ``parts`` (member name -> content) to a deflated zip at ``path``; return ``path``."""
+    from zipfile import ZIP_DEFLATED, ZipFile
+    with ZipFile(path, mode="w", compression=ZIP_DEFLATED) as archive:
+        for member, payload in parts.items():
+            archive.writestr(member, payload)
+    return path
+
+
+class TestRewriteNs0Namespace:
+    """String-level behaviour of ``pcv._rewrite_ns0_namespace``."""
+
+    def test_rewrites_root_to_default_namespace(self) -> None:
+        got = pcv._rewrite_ns0_namespace(str(NS0_RELS, "utf-8"))
+        assert not any(token in got for token in ("<ns0:", "</ns0:", "xmlns:ns0="))
+        assert '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"' in got
+
+    def test_rewrites_content_types(self) -> None:
+        got = pcv._rewrite_ns0_namespace(str(NS0_CONTENT_TYPES, "utf-8"))
+        assert not any(token in got for token in ("<ns0:", "</ns0:"))
+        assert '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"' in got
+        # The prefixed children have to survive the rewrite, minus their prefix.
+        assert "<Override PartName=" in got
+
+    def test_idempotent_on_default_namespace(self) -> None:
+        # Input already in default-namespace form must come back unchanged.
+        untouched = str(DEFAULT_NS_RELS, "utf-8")
+        assert pcv._rewrite_ns0_namespace(untouched) == untouched
+
+
+class TestOoxmlHasNs0Prefix:
+    """Detection is driven by ``_rels/.rels`` and ``[Content_Types].xml`` only."""
+
+    def test_true_when_rels_has_ns0(self, tmp_path: Path) -> None:
+        """An ns0-prefixed ``_rels/.rels`` alone is enough."""
+        parts = {
+            "[Content_Types].xml": DEFAULT_NS_CONTENT_TYPES,
+            "_rels/.rels": NS0_RELS,
+            "word/document.xml": DOCUMENT_XML,
+        }
+        package = _make_zip(tmp_path / "a.docx", parts)
+        assert pcv._ooxml_has_ns0_prefix(package) is True
+
+    def test_true_when_only_content_types_has_ns0(self, tmp_path: Path) -> None:
+        """An ns0-prefixed ``[Content_Types].xml`` alone is enough."""
+        parts = {
+            "[Content_Types].xml": NS0_CONTENT_TYPES,
+            "_rels/.rels": DEFAULT_NS_RELS,
+            "word/document.xml": DOCUMENT_XML,
+        }
+        package = _make_zip(tmp_path / "a.docx", parts)
+        assert pcv._ooxml_has_ns0_prefix(package) is True
+
+    def test_false_when_default_namespace(self, tmp_path: Path) -> None:
+        """Parts in default-namespace form must not be flagged."""
+        parts = {
+            "[Content_Types].xml": DEFAULT_NS_CONTENT_TYPES,
+            "_rels/.rels": DEFAULT_NS_RELS,
+            "word/document.xml": DOCUMENT_XML,
+        }
+        package = _make_zip(tmp_path / "a.docx", parts)
+        assert pcv._ooxml_has_ns0_prefix(package) is False
+
+    def test_false_on_non_zip(self, tmp_path: Path) -> None:
+        """A file that is not a zip archive is reported as not prefixed."""
+        not_a_zip = tmp_path / "a.docx"
+        not_a_zip.write_bytes(b"not a zip")
+        assert pcv._ooxml_has_ns0_prefix(not_a_zip) is False
+
+
+class TestNormalizeOoxmlZip:
+    """Archive-level behaviour of ``pcv._normalize_ooxml_zip``."""
+
+    def test_rewrites_rels_and_content_types_only(self, tmp_path: Path) -> None:
+        from zipfile import ZipFile
+
+        package_xml = {
+            "[Content_Types].xml": NS0_CONTENT_TYPES,
+            "_rels/.rels": NS0_RELS,
+            "word/_rels/document.xml.rels": NS0_RELS,
+        }
+        source = _make_zip(tmp_path / "in.docx", {**package_xml, "word/document.xml": DOCUMENT_XML})
+        target = tmp_path / "out.docx"
+        pcv._normalize_ooxml_zip(source, target)
+
+        with ZipFile(target) as archive:
+            # Every package-level part must come out without any trace of the prefix.
+            for part in package_xml:
+                text = str(archive.read(part), "utf-8")
+                assert "<ns0:" not in text, f"ns0 still present in {part}"
+                assert "xmlns:ns0=" not in text, f"xmlns:ns0 still in {part}"
+            # The document body is not package XML and has to stay byte-identical.
+            assert archive.read("word/document.xml") == DOCUMENT_XML
+
+
+class TestConvertToPdfNormalization:
+    """``convert_to_pdf`` hands LibreOffice a normalized copy only for ns0-prefixed packages."""
+
+    @staticmethod
+    def _stub_libreoffice(monkeypatch: pytest.MonkeyPatch, src: Path) -> list[list[str]]:
+        """Replace ``subprocess.run`` with a fake that records each command and drops the PDF
+        next to ``src`` (where ``--outdir`` points); return the list of recorded commands."""
+        captured: list[list[str]] = []
+
+        class _Completed:
+            returncode = 0
+            stdout = ""
+            stderr = ""
+
+        def _run(cmd, *_a, **_kw):
+            captured.append(cmd)
+            src.with_suffix(".pdf").write_bytes(b"%PDF-1.4 fake\n")
+            return _Completed()
+
+        monkeypatch.setattr(subprocess, "run", _run)
+        return captured
+
+    def test_calls_libreoffice_with_normalized_copy_when_ns0(
+        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        src = _make_zip(
+            tmp_path / "src.docx",
+            {
+                "[Content_Types].xml": NS0_CONTENT_TYPES,
+                "_rels/.rels": NS0_RELS,
+                "word/document.xml": DOCUMENT_XML,
+            },
+        )
+        captured = self._stub_libreoffice(monkeypatch, src)
+        path, ok, msg = pcv.convert_to_pdf(src)
+        assert path == src
+        assert ok is True
+        assert "(after ns0 normalization)" in msg
+        # LibreOffice must get the copy from the gdpval-norm- tempdir (same basename, so the PDF stem
+        # is preserved); compare path components so the check does not depend on the "/" separator.
+        assert len(captured) == 1
+        input_arg = Path(captured[0][-1])
+        assert input_arg.name == "src.docx"
+        assert input_arg.parent.name.startswith("gdpval-norm-")
+        assert input_arg != src
+        # The normalized copy is scratch data and must be gone once the call returns.
+        assert not input_arg.parent.exists()
+
+    def test_calls_libreoffice_with_original_when_not_ns0(
+        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        src = _make_zip(
+            tmp_path / "src.docx",
+            {
+                "[Content_Types].xml": DEFAULT_NS_CONTENT_TYPES,
+                "_rels/.rels": DEFAULT_NS_RELS,
+                "word/document.xml": DOCUMENT_XML,
+            },
+        )
+        captured = self._stub_libreoffice(monkeypatch, src)
+        path, ok, msg = pcv.convert_to_pdf(src)
+        assert path == src
+        assert ok is True
+        assert "(after ns0 normalization)" not in msg
+        assert captured[0][-1] == str(src)