Skip to content

Commit 9603653

Browse files
authored
fix(gdpval): plumb judge_responses_create_params_overrides into create() call (#1174)
## Summary

`GDPValResourcesServerConfig.judge_responses_create_params_overrides` is documented as a way to override the kwargs passed to the rubric judge's `client.chat.completions.create(...)` call, but today only `model` and `api_key` are extracted from it (`resources_servers/gdpval/app.py:148-151`). Anything else — most importantly `max_tokens` — is silently dropped, which is unfortunate because the default `max_tokens=8192` in `scoring.py:132,265` truncates the rubric judge's JSON for ~50% of GDPVal-style rollouts.

This PR plumbs the rest of the dict through:

- `app.py` switches `model` / `api_key` extraction from `dict.get(...)` to `dict.pop(...)`, then forwards what's left as a new `create_overrides` kwarg.
- `score_with_rubric` and `score_with_rubric_visual` accept a `create_overrides: dict | None = None`, and merge it into the kwargs of `client.chat.completions.create` (user-supplied keys win over the in-function defaults).
- `_verify_comparison` is left alone — it goes through `comparison.run_trials`, which already exposes `max_output_tokens` separately and doesn't share the rubric path.

After this PR, the override config field actually does what its name promises:

```yaml
gdpval_resources_server:
  resources_servers:
    gdpval:
      judge_responses_create_params_overrides:
        max_tokens: 16384
        temperature: 0.0
```

or via Hydra at the CLI:

```
++gdpval_resources_server.resources_servers.gdpval.judge_responses_create_params_overrides.max_tokens=16384
```

## Test plan

- [x] `pytest resources_servers/gdpval/tests/test_app.py -v` — 8 passed (1 new + 7 existing). The new `test_verify_rubric_passes_create_overrides_through` builds a server with `judge_responses_create_params_overrides={"model": "custom-judge", "api_key": "sk-custom", "max_tokens": 16384, "temperature": 0.0}`, patches `score_with_rubric` to capture kwargs, and asserts `model` / `api_key` were popped and the rest reached the scoring function as `create_overrides`.
- [x] `ruff check` / `ruff format --check` clean.
- [x] Manual end-to-end: a 140-task rubric run on Ultra V3 SFT showed an `invalid_judge_response` rate of ~44% under load (vs. 10% on a 10-rollout smoke), all attributable to truncated rubric JSON at the 8192 cap. Bumping the override to 16384 takes the truncated-JSON fallback path out of play; rerun in progress.

---------

Signed-off-by: Serge Panev <spanev@nvidia.com>
1 parent 2af2d53 commit 9603653

3 files changed

Lines changed: 71 additions & 14 deletions

File tree

resources_servers/gdpval/app.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,11 @@ async def _verify_rubric(self, body: GDPValVerifyRequest) -> GDPValVerifyRespons
147147

148148
overrides = dict(self.config.judge_responses_create_params_overrides or {})
149149
judge_base_url = get_server_url(self.config.judge_model_server.name) + "/v1"
150-
judge_model_name = overrides.get("model", "judge")
151-
judge_api_key = overrides.get("api_key", "dummy")
150+
judge_model_name = overrides.pop("model", "judge")
151+
judge_api_key = overrides.pop("api_key", "dummy")
152+
# Anything left in `overrides` (max_tokens, temperature, top_p, …) is
153+
# merged into the judge's chat.completions.create kwargs.
154+
judge_create_overrides = overrides or None
152155

153156
deliverable_text = _safe_output_text(body.response)
154157
deliverable_content_blocks: Optional[List[Dict[str, Any]]] = None
@@ -185,6 +188,7 @@ async def _verify_rubric(self, body: GDPValVerifyRequest) -> GDPValVerifyRespons
185188
model_base_url=judge_base_url,
186189
model_name=judge_model_name,
187190
api_key=judge_api_key,
191+
create_overrides=judge_create_overrides,
188192
)
189193
else:
190194
from resources_servers.gdpval.scoring import score_with_rubric
@@ -198,6 +202,7 @@ async def _verify_rubric(self, body: GDPValVerifyRequest) -> GDPValVerifyRespons
198202
model_base_url=judge_base_url,
199203
model_name=judge_model_name,
200204
api_key=judge_api_key,
205+
create_overrides=judge_create_overrides,
201206
)
202207

203208
return GDPValVerifyResponse(

resources_servers/gdpval/scoring.py

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,17 @@ async def score_with_rubric(
9292
model_base_url: str,
9393
model_name: str,
9494
api_key: str = "dummy",
95+
create_overrides: dict | None = None,
9596
) -> tuple[float, dict | None]:
9697
"""Score a deliverable against a rubric using an LLM judge.
9798
9899
Returns ``(score, judge_response)`` where *score* is a float in [0, 1]
99100
and *judge_response* is the parsed JSON dict from the judge (or ``None``
100101
on failure).
102+
103+
*create_overrides* is merged into the kwargs passed to
104+
``client.chat.completions.create``; user-supplied keys win over defaults.
105+
Use it to bump ``max_tokens`` (default 8192), tweak ``temperature``, etc.
101106
"""
102107
from openai import AsyncOpenAI
103108

@@ -119,18 +124,21 @@ async def score_with_rubric(
119124
response = None
120125
for attempt in range(max_retries + 1):
121126
try:
122-
response = await client.chat.completions.create(
123-
model=model_name,
124-
messages=[
127+
create_kwargs: dict = {
128+
"model": model_name,
129+
"messages": [
125130
{
126131
"role": "system",
127132
"content": "You are an expert evaluator. You must respond with valid JSON only.",
128133
},
129134
{"role": "user", "content": judge_prompt},
130135
],
131-
temperature=0.1,
132-
max_tokens=8192,
133-
)
136+
"temperature": 0.1,
137+
"max_tokens": 8192,
138+
}
139+
if create_overrides:
140+
create_kwargs.update(create_overrides)
141+
response = await client.chat.completions.create(**create_kwargs)
134142
break
135143
except Exception as retry_err:
136144
err_str = str(retry_err)
@@ -217,6 +225,7 @@ async def score_with_rubric_visual(
217225
model_base_url: str,
218226
model_name: str,
219227
api_key: str = "dummy",
228+
create_overrides: dict | None = None,
220229
) -> tuple[float, dict | None]:
221230
"""Score deliverables visually using a multimodal judge (e.g., Gemini 3 Pro).
222231
@@ -226,6 +235,10 @@ async def score_with_rubric_visual(
226235
*deliverable_content_blocks* is a list of OpenAI-compatible content blocks
227236
(text and image_url) produced by ``file_reader.convert_deliverables_to_content_blocks()``.
228237
238+
*create_overrides* is merged into the kwargs passed to
239+
``client.chat.completions.create``; user-supplied keys win over defaults.
240+
Use it to bump ``max_tokens`` (default 8192), tweak ``temperature``, etc.
241+
229242
Returns ``(score, judge_response)`` — same contract as ``score_with_rubric``.
230243
"""
231244
from openai import AsyncOpenAI
@@ -252,18 +265,21 @@ async def score_with_rubric_visual(
252265
response = None
253266
for attempt in range(max_retries + 1):
254267
try:
255-
response = await client.chat.completions.create(
256-
model=model_name,
257-
messages=[
268+
create_kwargs: dict = {
269+
"model": model_name,
270+
"messages": [
258271
{
259272
"role": "system",
260273
"content": "You are an expert evaluator. You must respond with valid JSON only.",
261274
},
262275
{"role": "user", "content": content},
263276
],
264-
temperature=0.1,
265-
max_tokens=8192,
266-
)
277+
"temperature": 0.1,
278+
"max_tokens": 8192,
279+
}
280+
if create_overrides:
281+
create_kwargs.update(create_overrides)
282+
response = await client.chat.completions.create(**create_kwargs)
267283
break
268284
except Exception as retry_err:
269285
err_str = str(retry_err)

resources_servers/gdpval/tests/test_app.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,42 @@ async def fake_score_with_rubric(**_kwargs):
122122
assert resp.invalid_judge_response is False
123123
assert resp.judge_response == canned_result
124124

125+
@pytest.mark.asyncio
126+
async def test_verify_rubric_passes_create_overrides_through(self) -> None:
127+
"""``judge_responses_create_params_overrides`` must reach the scoring fn.
128+
129+
``model`` and ``api_key`` are pulled out as their own kwargs; everything
130+
else (e.g. ``max_tokens``, ``temperature``) flows through as
131+
``create_overrides`` and gets merged into ``client.chat.completions.create``.
132+
"""
133+
server = _server(
134+
reward_mode="rubric",
135+
judge_responses_create_params_overrides={
136+
"model": "custom-judge",
137+
"api_key": "sk-custom", # pragma: allowlist secret
138+
"max_tokens": 16384,
139+
"temperature": 0.0,
140+
},
141+
)
142+
143+
captured: dict = {}
144+
145+
async def fake_score_with_rubric(**kwargs):
146+
captured.update(kwargs)
147+
return 0.5, {"overall_score": 0.5}
148+
149+
body = _verify_request(rubric_json=[{"criterion": "clarity", "score": 1}])
150+
151+
with (
152+
patch("resources_servers.gdpval.scoring.score_with_rubric", side_effect=fake_score_with_rubric),
153+
patch("resources_servers.gdpval.app.get_server_url", return_value="http://localhost:9999"),
154+
):
155+
await server.verify(body)
156+
157+
assert captured["model_name"] == "custom-judge"
158+
assert captured["api_key"] == "sk-custom" # pragma: allowlist secret
159+
assert captured["create_overrides"] == {"max_tokens": 16384, "temperature": 0.0}
160+
125161
@pytest.mark.asyncio
126162
async def test_verify_comparison_missing_reference(self, tmp_path) -> None:
127163
server = _server(

0 commit comments

Comments
 (0)