
Commit b3c550a

Add Stirrup agent + GDPVal eval/RL environment (#1090)
## Summary

Adds a Stirrup-based agent + a GDPVal benchmark built on the NeMo-Gym benchmark convention (`ng_prepare_benchmark` + `ng_e2e_collect_rollouts`), validated on the full 220-task GDPVal set in both rubric and comparison scoring modes.

### Architecture

Split into three pieces, matching NeMo-Gym's server-type convention:

**Benchmark** — `benchmarks/gdpval/`

- `prepare.py` downloads `openai/gdpval` from HF → `data/gdpval_benchmark.jsonl`
- `config.yaml` wires `gdpval_judge_model` + `gdpval_resources_server` + `gdpval_stirrup_agent`
- Entry point: `ng_e2e_collect_rollouts +config_paths=[benchmarks/gdpval/config.yaml]`

**Resources server** — `resources_servers/gdpval/`

- Owns `verify()` and `aggregate_metrics()` with two modes via `reward_mode`:
  - `rubric` (default) — LLM-judge per-criterion score, reward in `[0, 1]`
  - `comparison` — pairwise vs `reference_deliverables_dir`, reward in `{0, 0.5, 1}`; `aggregate_metrics` reduces W/L/T → ELO anchored at `reference_elo` (default 1000)
- All scoring, pairwise comparison, and Office→PDF preconversion live here. The multimodal judge path is used whenever content blocks are available.

**Agent** — `responses_api_agents/stirrup_agent/`

- `StirrupAgentWrapper` is task-agnostic; task-specific logic lives in a `TaskStrategy` subclass (`GDPValTask`)
- `/run` executes the agent, persists deliverables, POSTs to the resources server's `/verify`, and returns the response. The agent itself is scoring-free.
- `aggregate_metrics` proxies to the resources server so ELO extras flow through; `/verify` errors are caught per-rollout so a single failure can't crash a run
- Optional: Apptainer-backed `code_exec`, Tavily web search

**Dependency:** `stirrup>=0.1.7` (Apache 2.0), declared as an extra of the stirrup_agent server, not of core.

## Validation (Ultra V3 SFT iter16k, full 220-task GDPVal, num_repeats=2)

**Rubric mode** (n=440):

- mean/reward = **0.755**, pass@1 = 0.755, pass@2 = 0.821
- 56% of rollouts score ≥ 0.8
- Pre-refactor port-v4 baseline: 0.24 → **3.1× lift** (dominant contributor: always-visual judge when content blocks are available)

**Comparison mode vs fork baseline** (4 trials per pairing, n=440):

- W/L/T = 147 / 208 / 77
- win_rate = 0.429
- **eval_elo = 950.6** (vs fork = 1000; port-v3 historical = 917 → +34 ELO)

## Running

```bash
ng_prepare_benchmark '+config_paths=[benchmarks/gdpval/config.yaml]'

ng_e2e_collect_rollouts \
    '+config_paths=[benchmarks/gdpval/config.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]' \
    '++split=benchmark' \
    '++output_jsonl_fpath=results/gdpval.jsonl' \
    "++gdpval_stirrup_agent.responses_api_agents.stirrup_agent.persist_deliverables_dir=$PWD/output/gdpval" \
    # ... policy_* overrides as usual

# Add for comparison mode:
    '++gdpval_resources_server.resources_servers.gdpval.reward_mode=comparison' \
    "++gdpval_resources_server.resources_servers.gdpval.reference_deliverables_dir=/path/to/reference"
```

## Test plan

- [x] `pytest resources_servers/gdpval/tests -x` — rubric + comparison unit tests
- [x] 10-task rubric smoke (mean/reward 0.719)
- [x] Full 220-task rubric (mean/reward 0.755)
- [x] 10-task comparison smoke (eval_elo ~1017 on small sample)
- [x] Full 220-task comparison (eval_elo 950.6)

---------

Signed-off-by: Serge Panev <spanev@nvidia.com>
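The W/L/T → ELO reduction described above can be illustrated with the standard expected-score inversion, with ties counted as half-wins. The sketch below is illustrative only — the function name and the clamping guard are made up here, not the resources server's actual code:

```python
# Hedged sketch: reduce W/L/T counts to an ELO rating anchored at reference_elo
# via the standard expected-score inversion (ties count as half-wins).
import math

def elo_from_wlt(wins: int, losses: int, ties: int, reference_elo: float = 1000.0) -> float:
    n = wins + losses + ties
    p = (wins + 0.5 * ties) / n  # expected score of the eval model vs the reference
    p = min(max(p, 1e-6), 1 - 1e-6)  # guard: the log-odds formula diverges at 0 or 1
    return reference_elo + 400.0 * math.log10(p / (1.0 - p))

# With the counts reported above (147 / 208 / 77) this gives ~950.6,
# consistent with the eval_elo from the full comparison run.
print(round(elo_from_wlt(147, 208, 77), 1))
```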
1 parent aaaf0be commit b3c550a

44 files changed

Lines changed: 5718 additions & 0 deletions

Large commits have some file contents hidden by default; only a subset of the 44 changed files is shown below.

benchmarks/gdpval/README.md

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@

# GDPVal benchmark

[GDPVal](https://huggingface.co/datasets/openai/gdpval) — 220 professional
knowledge-work tasks scored by an LLM judge against per-task rubrics. This
benchmark wires the Stirrup-based agent (`responses_api_agents/stirrup_agent`)
to the GDPVal resources server (`resources_servers/gdpval`).

## Prepare data

Downloads `openai/gdpval` from HuggingFace and writes
`data/gdpval_benchmark.jsonl`:

```bash
ng_prepare_benchmark "+config_paths=[benchmarks/gdpval/config.yaml]"
```

## Run rubric mode (default)

Each deliverable is scored 0–1 against the task rubric.

```bash
config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\
benchmarks/gdpval/config.yaml"
ng_e2e_collect_rollouts \
    "+config_paths=[${config_paths}]" \
    ++output_jsonl_fpath=results/gdpval_rubric.jsonl \
    ++split=benchmark \
    ++policy_base_url=<vllm_base_url> \
    ++policy_api_key=<vllm_api_key> \
    ++policy_model_name=<served_model_name>
```

Required environment variables for the judge:

- `JUDGE_API_KEY` — sk- key for the judge inference API (nvapi- keys 401 on
  multimodal payloads)
- `JUDGE_BASE_URL` — defaults to NVIDIA's internal inference API
- `JUDGE_MODEL_NAME` — defaults to `gcp/google/gemini-3.1-pro-preview`
- `HF_TOKEN` — for downloading reference files (avoids HF anonymous rate limits)

## Run comparison mode (pairwise ELO vs. a reference model)

Each deliverable is judged against a reference model's deliverable for the
same `task_id`; aggregate metrics include ELO relative to a configurable
anchor (default 1000).

```bash
ng_e2e_collect_rollouts \
    "+config_paths=[${config_paths}]" \
    ++output_jsonl_fpath=results/gdpval_compare.jsonl \
    ++split=benchmark \
    ++gdpval_resources_server.resources_servers.gdpval.reward_mode=comparison \
    ++gdpval_resources_server.resources_servers.gdpval.reference_deliverables_dir=/path/to/reference/output
```

The reference directory must be laid out as
`<reference_deliverables_dir>/task_<task_id>/` with `finish_params.json` and
the deliverable files (the same layout the Stirrup agent persists).

## Aggregate metrics

After `ng_e2e_collect_rollouts` returns, the resources server's
`/aggregate_metrics` endpoint emits headline scores in
`results/<output>_metrics.json`:

- Rubric mode: `mean/reward` (pass@1 equivalent)
- Comparison mode: `comparison/wins`, `comparison/losses`, `comparison/ties`,
  `comparison/win_rate`, `comparison/eval_elo`, `comparison/normalized_elo`
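For a quick look at the headline numbers after a run, a minimal sketch follows; it assumes the metrics file is a flat JSON object keyed by the metric names listed above, and the example path is hypothetical:

```python
# Illustrative only: read the aggregate metrics written next to the rollout JSONL.
# Assumes a flat {"metric_name": value} layout; adjust if the real file nests keys.
import json
from pathlib import Path

metrics_fpath = Path("results/gdpval_rubric_metrics.json")  # <output>_metrics.json
metrics = json.loads(metrics_fpath.read_text())

for key in ("mean/reward", "comparison/win_rate", "comparison/eval_elo"):
    if key in metrics:
        print(f"{key}: {metrics[key]}")
```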

benchmarks/gdpval/__init__.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

benchmarks/gdpval/config.yaml

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@

# GDPVal benchmark — Stirrup agent + GDPVal resources server.
#
# Run:
#   ng_prepare_benchmark "+config_paths=[benchmarks/gdpval/config.yaml]"
#   ng_e2e_collect_rollouts \
#     "+config_paths=[responses_api_models/vllm_model/configs/vllm_model.yaml,benchmarks/gdpval/config.yaml]" \
#     ++split=benchmark \
#     ++output_jsonl_fpath=results/gdpval.jsonl
#
# Comparison mode (pairwise ELO vs a reference model's deliverables):
#   ++gdpval_resources_server.resources_servers.gdpval.reward_mode=comparison \
#   ++gdpval_resources_server.resources_servers.gdpval.reference_deliverables_dir=/path/to/fork

# Judge model — proxy to NVIDIA inference API for Gemini 3.1 Pro.
gdpval_judge_model:
  responses_api_models:
    openai_model:
      entrypoint: app.py
      openai_base_url: ${oc.env:JUDGE_BASE_URL,https://inference-api.nvidia.com/v1}
      openai_api_key: ${oc.env:JUDGE_API_KEY,dummy}
      openai_model: ${oc.env:JUDGE_MODEL_NAME,gcp/google/gemini-3.1-pro-preview}

# GDPVal resources server (rubric scoring by default; switch to comparison via override).
gdpval_resources_server:
  resources_servers:
    gdpval:
      entrypoint: app.py
      domain: other
      verified: false
      reward_mode: rubric
      reference_deliverables_dir: null
      num_comparison_trials: 4
      reference_elo: 1000.0
      preconvert_office_to_pdf: true
      preconvert_max_concurrent: 1
      judge_model_server:
        type: responses_api_models
        name: gdpval_judge_model
      judge_responses_create_params_overrides: {}

# Stirrup agent paired with the resources server above.
gdpval_stirrup_agent:
  responses_api_agents:
    stirrup_agent:
      entrypoint: app.py
      task: gdpval
      agent_max_turns: 100
      concurrency: 32
      temperature: 1.0
      system_prompt_template: ${oc.env:SYSTEM_PROMPT_TEMPLATE,null}
      user_prompt_template: ${oc.env:USER_PROMPT_TEMPLATE,null}
      gdpval_container_path: ${oc.env:GDPVAL_CONTAINER_PATH,null}
      persist_deliverables_dir: ${oc.env:PERSIST_DELIVERABLES_DIR,output/gdpval/deliverables}
      resources_server:
        type: resources_servers
        name: gdpval_resources_server
      model_server:
        type: responses_api_models
        name: policy_model
      datasets:
        - name: gdpval
          type: benchmark
          jsonl_fpath: benchmarks/gdpval/data/gdpval_benchmark.jsonl
          prompt_config: null
          prepare_script: benchmarks/gdpval/prepare.py
          num_repeats: 2
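The `${oc.env:VAR,default}` entries above resolve lazily against the environment. A minimal sketch of how the judge-model defaults fall back when the variables are unset is shown below; it assumes the file loads cleanly with plain OmegaConf outside the NeMo-Gym launcher:

```python
# Illustrative only: show the ${oc.env:VAR,default} fallbacks for the judge model.
import os
from omegaconf import OmegaConf

os.environ.pop("JUDGE_MODEL_NAME", None)  # force the inline default for this demo
cfg = OmegaConf.load("benchmarks/gdpval/config.yaml")
judge = cfg.gdpval_judge_model.responses_api_models.openai_model
print(judge.openai_model)    # gcp/google/gemini-3.1-pro-preview
print(judge.openai_api_key)  # "dummy" unless JUDGE_API_KEY is exported
```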

benchmarks/gdpval/data/.gitignore

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@

*train.jsonl
*validation.jsonl
*benchmark.jsonl
*train_prepare.jsonl
*validation_prepare.jsonl
*example_prepare.jsonl

benchmarks/gdpval/prepare.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare the GDPVal benchmark JSONL.

Downloads the ``openai/gdpval`` HuggingFace dataset and converts it into the
NeMo-Gym benchmark JSONL format: each row has ``responses_create_params`` (an
empty input — the Stirrup agent builds the actual prompt from the top-level
``prompt`` / ``sector`` / ``occupation`` fields) plus task metadata at the
top level so the GDPVal resources server can pick them up via /verify.
"""

from __future__ import annotations

import json
import os
from pathlib import Path


BENCHMARK_DIR = Path(__file__).parent
DATA_DIR = BENCHMARK_DIR / "data"
OUTPUT_FPATH = DATA_DIR / "gdpval_benchmark.jsonl"

HF_DATASET = "openai/gdpval"
HF_SPLIT = "train"


def prepare() -> Path:
    from datasets import load_dataset

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    # Pass HF_TOKEN explicitly — ``load_dataset`` doesn't always pick it up
    # from the env, and GDPVal's bucket aggressively rate-limits anonymous IPs.
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    ds = load_dataset(HF_DATASET, split=HF_SPLIT, token=hf_token)

    with OUTPUT_FPATH.open("w") as f:
        for row in ds:
            record = {
                # Empty input: the Stirrup agent constructs the user prompt
                # from the top-level ``prompt`` field at runtime.
                "responses_create_params": {"input": []},
                "task_id": row["task_id"],
                "sector": row.get("sector", ""),
                "occupation": row.get("occupation", ""),
                "prompt": row["prompt"],
                "reference_files": row.get("reference_files", []),
                "reference_file_urls": row.get("reference_file_urls", []),
                "rubric_json": row.get("rubric_json", {}),
                "rubric_pretty": row.get("rubric_pretty", ""),
            }
            f.write(json.dumps(record) + "\n")

    print(f"Wrote {len(ds)} tasks to {OUTPUT_FPATH}")
    return OUTPUT_FPATH


if __name__ == "__main__":
    prepare()
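Once the script has run (directly or via `ng_prepare_benchmark`), each JSONL row carries the fields built in `prepare()`. A small spot-check sketch — field names come from the code above, the rest is illustrative:

```python
# Spot-check the prepared file: one JSON object per line with the task metadata
# fields written by prepare() above.
import json
from pathlib import Path

fpath = Path("benchmarks/gdpval/data/gdpval_benchmark.jsonl")
with fpath.open() as f:
    first = json.loads(f.readline())

assert first["responses_create_params"] == {"input": []}
print(first["task_id"], "|", first["sector"], "|", first["occupation"])
print(first["prompt"][:200])
```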

resources_servers/gdpval/README.md

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@

# GDPVal resources server

Scores deliverables produced by the Stirrup agent on the GDPVal benchmark.

Two modes via `reward_mode` config:

- `rubric` (default) — LLM judge scores each deliverable against a per-task
  rubric, reward in `[0.0, 1.0]`.
- `comparison` — pairwise judge compares eval deliverable vs. a reference
  rollout (`reference_deliverables_dir` must be set), reward in
  `{0.0, 0.5, 1.0}`. `aggregate_metrics` reduces to an ELO rating.

Canonical entry point is the benchmark at `benchmarks/gdpval/`:

```bash
ng_prepare_benchmark "+config_paths=[benchmarks/gdpval/config.yaml]"
ng_e2e_collect_rollouts \
    "+config_paths=[responses_api_models/vllm_model/configs/vllm_model.yaml,benchmarks/gdpval/config.yaml]" \
    ++split=benchmark
```

See `benchmarks/gdpval/README.md` for the full run recipe.
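Before a comparison run, it can help to verify the reference layout described in `benchmarks/gdpval/README.md` (`task_<task_id>/` directories containing `finish_params.json` plus the deliverable files). A minimal pre-flight sketch, with the path purely illustrative:

```python
# Illustrative pre-flight check for comparison mode: every task_<task_id>/
# directory should hold finish_params.json plus at least one deliverable file.
from pathlib import Path

reference_dir = Path("/path/to/reference/output")  # reference_deliverables_dir
for task_dir in sorted(reference_dir.glob("task_*")):
    finish = task_dir / "finish_params.json"
    deliverables = [p for p in task_dir.iterdir() if p.name != "finish_params.json"]
    if not finish.exists() or not deliverables:
        print(f"incomplete reference rollout: {task_dir.name}")
```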
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
