Skip to content

Commit ed190cd

Browse files
authored
fix(gdpval): normalize python-docx ns0 namespacing before LibreOffice convert (#1270)
## Summary Some `.docx` files in the GDPVal corpus are emitted by `python-docx` (or similar `lxml`-based tools) and serialize the OPC package XML with an explicit `ns0:` namespace prefix: ```xml <ns0:Relationships xmlns:ns0="http://schemas.openxmlformats.org/..."> <ns0:Types xmlns:ns0="http://schemas.openxmlformats.org/..."> ``` These are valid OOXML — Microsoft Word and pandoc accept them — but LibreOffice 24.2 rejects them silently with `Error: source file could not be loaded`. In the gdpval resources server this surfaces as `pdf miss` events when preconverting task input documents. The prefixing appears in BOTH `_rels/.rels` AND `[Content_Types].xml`. Rewriting only one of them is not enough — both must be normalized for LibreOffice to load the file. `convert_to_pdf` now detects the `ns0:` prefix and, when present, writes a namespace-normalized copy of the package to a tempdir and runs LibreOffice on that copy. The original file on disk is left untouched and the output PDF still lands next to the original. ## Empirical impact Reproduced and verified on the actual vadams Kimi corpus on lustre. Across the 46 source files in 27 affected GDPVal tasks: | Failure mode | Files | Resolved by this PR | |---|---|---| | `ns0:` prefix in package XML | 43 | Yes | | `ns0:` prefix + malformed `<Relationship>` chain | 2 | No (separate cause) | | Missing rels target parts (one `.pptx`) | 1 | No (out of scope) | So this PR resolves 43/46 (~93%) of the failing source files. The remaining 3 are outside the scope of this fix. 
## Test plan - [x] Unit tests for `_rewrite_ns0_namespace`, `_ooxml_has_ns0_prefix`, `_normalize_ooxml_zip`, and `convert_to_pdf` selecting the normalized copy when `ns0:` is present (21 tests pass) - [x] End-to-end: pulled `Compensation Model Ideas.docx` from lustre (a known-failing file), ran `convert_to_pdf` — produced a valid PDF with the message "converted Compensation Model Ideas.docx (after ns0 normalization)" - [x] Lint + format clean (`ruff check`, `ruff format --check`) ## Notes - Out of scope: the 2 files with malformed `<Relationship>` chains in `word/_rels/document.xml.rels` (caused by an upstream URL-redaction step that left invalid XML) and the 1 `.pptx` with missing rels targets. These need separate handling. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Signed-off-by: Alex Gronskiy <agronskiy@nvidia.com>
1 parent ace12f9 commit ed190cd

2 files changed

Lines changed: 276 additions & 2 deletions

File tree

resources_servers/gdpval/preconvert.py

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,41 @@
1212
concurrent libreoffice subprocesses don't race on the shared default
1313
profile lock (``$HOME/.config/libreoffice``) — that race is the reason
1414
the previous default ``max_concurrent=1`` existed.
15+
16+
OOXML namespace normalization
17+
-----------------------------
18+
19+
Some files in the GDPVal corpus were emitted by ``python-docx`` (or
20+
similar lxml-based tools), which serialize the OPC package XML with an
21+
explicit ``ns0:`` namespace prefix:
22+
23+
<ns0:Relationships xmlns:ns0="http://schemas.openxmlformats.org/...">
24+
25+
instead of the standard default-namespace form:
26+
27+
<Relationships xmlns="http://schemas.openxmlformats.org/...">
28+
29+
The two forms are semantically identical XML, and Microsoft Word /
30+
pandoc accept both. LibreOffice 24.2, however, rejects the prefixed
31+
form with ``Error: source file could not be loaded``. The prefixing
32+
shows up in BOTH ``_rels/.rels`` and ``[Content_Types].xml``; rewriting
33+
only one of them is not enough.
34+
35+
Before invoking libreoffice we detect this shape and write a
36+
namespace-normalized copy to a tempdir, leaving the original on disk
37+
untouched.
1538
"""
1639

1740
from __future__ import annotations
1841

1942
import asyncio
2043
import logging
2144
import os
45+
import re
2246
import shutil
2347
import subprocess
2448
import tempfile
49+
import zipfile
2550
from concurrent.futures import ThreadPoolExecutor, as_completed
2651
from pathlib import Path
2752

@@ -32,6 +57,42 @@
3257

3358
DEFAULT_MAX_CONCURRENT = 4
3459

60+
# Opening tag that both uses the ``ns0:`` prefix and declares it, e.g.
# ``<ns0:Types xmlns:ns0="...">``.  Groups: 1 = local element name,
# 2 = whatever sits between the name and the declaration, 3 = the quote
# character around the URI (XML allows ``"`` or ``'``), 4 = the namespace URI.
_NS0_ROOT_RE = re.compile(r'<ns0:([A-Za-z_][\w.-]*)\b([^>]*?)\bxmlns:ns0=(["\'])([^"\']+)\3')
# Start or end tag using the ``ns0:`` prefix; group 1 is the ``<`` / ``</`` to keep.
_NS0_TAG_RE = re.compile(r"(</?)ns0:")
# Byte marker used to spot affected package parts without parsing the XML.
_NS0_SENTINEL = b'xmlns:ns0="http://schemas.openxmlformats.org/'


def _rewrite_ns0_namespace(text: str) -> str:
    """Rewrite ``ns0:``-prefixed XML in ``text`` to default-namespace form.

    The ``xmlns:ns0=<uri>`` declaration on an ``ns0:``-prefixed opening tag
    becomes ``xmlns=<uri>`` (same quote style, moved directly after the
    element name) and the ``ns0:`` prefix is dropped from every start and end
    tag.

    The prefixes are only dropped when a declaration was actually rewritten.
    Stripping them while the prefix binding is still ``xmlns:ns0`` would move
    the elements out of their namespace and change the document's meaning, so
    in that case ``text`` is returned unchanged.

    Args:
        text: Decoded XML of one package part.

    Returns:
        The rewritten XML, or ``text`` as-is when no ``ns0`` declaration on an
        ``ns0:``-prefixed tag is found.
    """
    text, declarations = _NS0_ROOT_RE.subn(r"<\1 xmlns=\3\4\3\2", text)
    if not declarations:
        return text
    return _NS0_TAG_RE.sub(r"\1", text)
69+
70+
71+
def _ooxml_has_ns0_prefix(path: Path) -> bool:
    """Report whether the package at ``path`` uses ``ns0:`` prefixing.

    Only ``_rels/.rels`` and ``[Content_Types].xml`` are inspected, in that
    order, by searching their raw bytes for ``_NS0_SENTINEL``.  A file that
    is not a readable zip archive (``zipfile.BadZipFile`` / ``OSError``)
    yields ``False``.
    """
    package_parts = ("_rels/.rels", "[Content_Types].xml")
    try:
        with zipfile.ZipFile(path) as archive:
            members = set(archive.namelist())
            # ``any`` stops at the first hit, so later parts are not read.
            return any(name in members and _NS0_SENTINEL in archive.read(name) for name in package_parts)
    except (zipfile.BadZipFile, OSError):
        return False
84+
85+
86+
def _normalize_ooxml_zip(src: Path, dst: Path) -> None:
    """Copy the zip package ``src`` to ``dst``, normalizing ``ns0:`` prefixes.

    Package XML parts (names ending in ``.rels`` and ``[Content_Types].xml``)
    that contain ``ns0:`` are decoded as UTF-8, passed through
    ``_rewrite_ns0_namespace`` and re-encoded.  Every other member, including
    package XML parts without the prefix, is copied byte-for-byte.

    Args:
        src: Existing zip package to read.
        dst: Path of the zip to create; members are written with
            ``ZIP_DEFLATED`` compression.

    Raises:
        zipfile.BadZipFile: If ``src`` is not a valid zip archive.
        UnicodeDecodeError: If a part that contains ``ns0:`` is not UTF-8.
    """
    with zipfile.ZipFile(src) as zin, zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout:
        for item in zin.namelist():
            data = zin.read(item)
            is_package_xml = item.endswith(".rels") or item == "[Content_Types].xml"
            # Only round-trip through ``str`` when the prefix is present.
            # Parts without it would come back unchanged anyway, and skipping
            # them stops an unaffected non-UTF-8 part (e.g. UTF-16) from
            # aborting the whole copy with a decode error.
            if is_package_xml and b"ns0:" in data:
                data = _rewrite_ns0_namespace(data.decode("utf-8")).encode("utf-8")
            zout.writestr(item, data)
95+
3596

3697
def needs_conversion(path: Path) -> bool:
    """Return True when ``path`` has an extension listed in
    ``OFFICE_EXTENSIONS`` (compared case-insensitively) and no sibling file
    with the same stem and a ``.pdf`` suffix exists yet."""
    if path.suffix.lower() not in OFFICE_EXTENSIONS:
        return False
    pdf_sibling = path.with_suffix(".pdf")
    return not pdf_sibling.exists()
@@ -41,7 +102,16 @@ def convert_to_pdf(path: Path) -> tuple[Path, bool, str]:
41102
"""Convert one file to PDF via host LibreOffice. Returns ``(path, ok, msg)``."""
42103
output_dir = str(path.parent)
43104
profile_dir = Path(tempfile.mkdtemp(prefix="lo-profile-"))
105+
norm_dir: Path | None = None
106+
input_path = path
107+
normalized = False
44108
try:
109+
if _ooxml_has_ns0_prefix(path):
110+
norm_dir = Path(tempfile.mkdtemp(prefix="gdpval-norm-"))
111+
input_path = norm_dir / path.name
112+
_normalize_ooxml_zip(path, input_path)
113+
normalized = True
114+
45115
result = subprocess.run(
46116
[
47117
"libreoffice",
@@ -55,15 +125,16 @@ def convert_to_pdf(path: Path) -> tuple[Path, bool, str]:
55125
"pdf",
56126
"--outdir",
57127
output_dir,
58-
str(path),
128+
str(input_path),
59129
],
60130
capture_output=True,
61131
text=True,
62132
timeout=120,
63133
)
64134
pdf_path = path.with_suffix(".pdf")
65135
if pdf_path.exists():
66-
return path, True, f"converted {path.name}"
136+
suffix = " (after ns0 normalization)" if normalized else ""
137+
return path, True, f"converted {path.name}{suffix}"
67138
return (
68139
path,
69140
False,
@@ -77,6 +148,8 @@ def convert_to_pdf(path: Path) -> tuple[Path, bool, str]:
77148
return path, False, f"error converting {path.name}: {exc!r}"
78149
finally:
79150
shutil.rmtree(profile_dir, ignore_errors=True)
151+
if norm_dir is not None:
152+
shutil.rmtree(norm_dir, ignore_errors=True)
80153

81154

82155
def find_convertible_files(root_dir: str | os.PathLike) -> list[Path]:

resources_servers/gdpval/tests/test_preconvert.py

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,3 +134,204 @@ async def test_preconvert_dir_async_propagates_results(tmp_path: Path, monkeypat
134134
ok, fail, errors = await pcv.preconvert_dir_async(str(tmp_path))
135135
assert (ok, fail) == (0, 1)
136136
assert errors == ["boom"]
137+
138+
139+
# Fixtures and tests for the ns0-namespace normalization ("Mode A" in the
# GDPVal corpus). See the module docstring of preconvert.py for background
# on why python-docx-style ns0 prefixing breaks LibreOffice.
142+
143+
NS0_RELS = (
144+
b"<?xml version='1.0' encoding='utf-8'?>\n"
145+
b'<ns0:Relationships xmlns:ns0="http://schemas.openxmlformats.org/package/2006/relationships">'
146+
b'<ns0:Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/'
147+
b'relationships/officeDocument" Target="word/document.xml" />'
148+
b"</ns0:Relationships>"
149+
)
150+
151+
NS0_CONTENT_TYPES = (
152+
b"<?xml version='1.0' encoding='utf-8'?>\n"
153+
b'<ns0:Types xmlns:ns0="http://schemas.openxmlformats.org/package/2006/content-types">'
154+
b'<ns0:Default Extension="rels" '
155+
b'ContentType="application/vnd.openxmlformats-package.relationships+xml" />'
156+
b'<ns0:Override PartName="/word/document.xml" '
157+
b'ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml" />'
158+
b"</ns0:Types>"
159+
)
160+
161+
DEFAULT_NS_RELS = (
162+
b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
163+
b'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
164+
b'<Relationship Id="rId1" Type="x" Target="word/document.xml"/></Relationships>'
165+
)
166+
167+
DEFAULT_NS_CONTENT_TYPES = (
168+
b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
169+
b'<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"></Types>'
170+
)
171+
172+
DOCUMENT_XML = (
173+
b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><w:document '
174+
b'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"></w:document>'
175+
)
176+
177+
178+
def _make_zip(path: Path, parts: dict[str, bytes]) -> Path:
    """Create a deflate-compressed zip at ``path`` whose members are the
    ``name -> raw bytes`` entries of ``parts`` (written in dict order), and
    return ``path`` so the call can be used inline."""
    from zipfile import ZIP_DEFLATED, ZipFile

    with ZipFile(path, mode="w", compression=ZIP_DEFLATED) as archive:
        for member_name, payload in parts.items():
            archive.writestr(member_name, payload)
    return path
185+
186+
187+
class TestRewriteNs0Namespace:
    """String-level checks for ``pcv._rewrite_ns0_namespace``."""

    def test_rewrites_root_to_default_namespace(self) -> None:
        rewritten = pcv._rewrite_ns0_namespace(NS0_RELS.decode("utf-8"))
        # Neither prefixed tags nor the prefix declaration may survive.
        for leftover in ("<ns0:", "</ns0:", "xmlns:ns0="):
            assert leftover not in rewritten
        assert '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"' in rewritten

    def test_rewrites_content_types(self) -> None:
        rewritten = pcv._rewrite_ns0_namespace(NS0_CONTENT_TYPES.decode("utf-8"))
        for leftover in ("<ns0:", "</ns0:"):
            assert leftover not in rewritten
        assert '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"' in rewritten
        # Override children must remain (just unprefixed).
        assert "<Override PartName=" in rewritten

    def test_idempotent_on_default_namespace(self) -> None:
        already_default = DEFAULT_NS_RELS.decode("utf-8")
        assert pcv._rewrite_ns0_namespace(already_default) == already_default
206+
207+
208+
class TestOoxmlHasNs0Prefix:
    """Detection of ``ns0:``-prefixed package XML by ``pcv._ooxml_has_ns0_prefix``."""

    def test_true_when_rels_has_ns0(self, tmp_path: Path) -> None:
        parts = {
            "[Content_Types].xml": DEFAULT_NS_CONTENT_TYPES,
            "_rels/.rels": NS0_RELS,
            "word/document.xml": DOCUMENT_XML,
        }
        package = _make_zip(tmp_path / "a.docx", parts)
        assert pcv._ooxml_has_ns0_prefix(package) is True

    def test_true_when_only_content_types_has_ns0(self, tmp_path: Path) -> None:
        parts = {
            "[Content_Types].xml": NS0_CONTENT_TYPES,
            "_rels/.rels": DEFAULT_NS_RELS,
            "word/document.xml": DOCUMENT_XML,
        }
        package = _make_zip(tmp_path / "a.docx", parts)
        assert pcv._ooxml_has_ns0_prefix(package) is True

    def test_false_when_default_namespace(self, tmp_path: Path) -> None:
        parts = {
            "[Content_Types].xml": DEFAULT_NS_CONTENT_TYPES,
            "_rels/.rels": DEFAULT_NS_RELS,
            "word/document.xml": DOCUMENT_XML,
        }
        package = _make_zip(tmp_path / "a.docx", parts)
        assert pcv._ooxml_has_ns0_prefix(package) is False

    def test_false_on_non_zip(self, tmp_path: Path) -> None:
        not_a_package = tmp_path / "a.docx"
        not_a_package.write_bytes(b"not a zip")
        assert pcv._ooxml_has_ns0_prefix(not_a_package) is False
246+
247+
248+
class TestNormalizeOoxmlZip:
    """``pcv._normalize_ooxml_zip`` rewrites package XML and nothing else."""

    def test_rewrites_rels_and_content_types_only(self, tmp_path: Path) -> None:
        from zipfile import ZipFile

        package_xml_parts = ("[Content_Types].xml", "_rels/.rels", "word/_rels/document.xml.rels")
        source = _make_zip(
            tmp_path / "in.docx",
            {
                "[Content_Types].xml": NS0_CONTENT_TYPES,
                "_rels/.rels": NS0_RELS,
                "word/_rels/document.xml.rels": NS0_RELS,
                "word/document.xml": DOCUMENT_XML,
            },
        )
        target = tmp_path / "out.docx"

        pcv._normalize_ooxml_zip(source, target)

        with ZipFile(target) as archive:
            for part in package_xml_parts:
                text = archive.read(part).decode("utf-8")
                assert "<ns0:" not in text, f"ns0 still present in {part}"
                assert "xmlns:ns0=" not in text, f"xmlns:ns0 still in {part}"
            # non-package XML must be byte-identical
            assert archive.read("word/document.xml") == DOCUMENT_XML
271+
272+
273+
class TestConvertToPdfNormalization:
    """``convert_to_pdf`` hands LibreOffice a normalized copy only when the
    source package uses ``ns0:`` prefixing."""

    @staticmethod
    def _fake_libreoffice(src: Path, captured: list[list[str]], input_had_ns0: list[bool]):
        """Build a stand-in for ``subprocess.run``.

        The stand-in records the command line in ``captured``, records in
        ``input_had_ns0`` whether the file passed as the last argument still
        carries the ``ns0:`` prefix at call time, and then writes a fake PDF
        next to ``src``, as libreoffice would using the input stem.
        """

        class _Completed:
            returncode = 0
            stdout = ""
            stderr = ""

        def _run(cmd, *_a, **_kw):
            captured.append(cmd)
            # Inspect the input while it still exists: the normalized copy
            # lives in a tempdir that convert_to_pdf removes afterwards.
            input_had_ns0.append(pcv._ooxml_has_ns0_prefix(Path(cmd[-1])))
            (src.with_suffix(".pdf")).write_bytes(b"%PDF-1.4 fake\n")
            return _Completed()

        return _run

    def test_calls_libreoffice_with_normalized_copy_when_ns0(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        src = _make_zip(
            tmp_path / "src.docx",
            {
                "[Content_Types].xml": NS0_CONTENT_TYPES,
                "_rels/.rels": NS0_RELS,
                "word/document.xml": DOCUMENT_XML,
            },
        )
        captured: list[list[str]] = []
        input_had_ns0: list[bool] = []

        monkeypatch.setattr(subprocess, "run", self._fake_libreoffice(src, captured, input_had_ns0))
        path, ok, msg = pcv.convert_to_pdf(src)

        assert path == src
        assert ok is True
        assert "(after ns0 normalization)" in msg
        # The input arg passed to libreoffice should NOT be the original file: it must come
        # from the gdpval-norm- tempdir, but with the same basename so output stem is preserved.
        assert len(captured) == 1
        input_arg = Path(captured[0][-1])
        assert input_arg.name == "src.docx"
        assert input_arg.parent.name.startswith("gdpval-norm-")
        assert input_arg != src
        # What libreoffice saw was already normalized ...
        assert input_had_ns0 == [False]
        # ... while the original on disk is untouched and the tempdir is gone.
        assert pcv._ooxml_has_ns0_prefix(src) is True
        assert not input_arg.parent.exists()

    def test_calls_libreoffice_with_original_when_not_ns0(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        src = _make_zip(
            tmp_path / "src.docx",
            {
                "[Content_Types].xml": DEFAULT_NS_CONTENT_TYPES,
                "_rels/.rels": DEFAULT_NS_RELS,
                "word/document.xml": DOCUMENT_XML,
            },
        )
        captured: list[list[str]] = []
        input_had_ns0: list[bool] = []

        monkeypatch.setattr(subprocess, "run", self._fake_libreoffice(src, captured, input_had_ns0))
        path, ok, msg = pcv.convert_to_pdf(src)

        assert path == src
        assert ok is True
        assert "(after ns0 normalization)" not in msg
        assert len(captured) == 1
        assert captured[0][-1] == str(src)

0 commit comments

Comments
 (0)