Address review comments (alias eval import, None-safe transcript fallback, SALM single-audio validation)

melllinia · melllinia · commit 6f4f93b10b56 · 2026-04-08T17:41:03.000+04:00
Signed-off-by: mmkrtchyan <mmkrtchyan@nvidia.com>
diff --git a/recipes/asr/run_hf_leaderboard.py b/recipes/asr/run_hf_leaderboard.py
@@ -38,7 +38,8 @@
 
 import argparse
 
-from nemo_skills.pipeline.cli import eval, wrap_arguments
+from nemo_skills.pipeline.cli import eval as run_eval
+from nemo_skills.pipeline.cli import wrap_arguments
 
 DEFAULT_SERVER_CONTAINER = "nvcr.io/nvidia/nemo:25.11"
 DEFAULT_INSTALLATION_COMMAND = "pip install -r requirements/audio.txt"
@@ -69,7 +70,7 @@ def main():
 
     args = parser.parse_args()
 
-    eval(
+    run_eval(
         ctx=wrap_arguments(
             "++prompt_format=openai "
             "++prompt_config=null "
diff --git a/recipes/multimodal/server/backends/nemo_asr_backend.py b/recipes/multimodal/server/backends/nemo_asr_backend.py
@@ -221,7 +221,9 @@ def _parse_single_hypothesis(self, hyp: Any) -> tuple[str, List[Dict[str, Any]]]
             if text is None:
                 text = hyp.get("pred_text")
             if text is None:
-                text = hyp.get("transcript", "")
+                text = hyp.get("transcript")
+            if text is None:
+                text = ""
             words = hyp.get("words")
             if words is None:
                 ts = hyp.get("timestamp")
diff --git a/recipes/multimodal/server/backends/salm_backend.py b/recipes/multimodal/server/backends/salm_backend.py
@@ -33,7 +33,6 @@ class SALMConfig(BackendConfig):
     """Configuration for SALM backend."""
 
     model_name: Optional[str] = None
-    batch_size: int = 16
     warmup: bool = True
     user_prompt: str = DEFAULT_ASR_PROMPT
 
@@ -50,7 +49,6 @@ def from_dict(cls, d: Dict[str, Any]) -> "SALMConfig":
             "temperature",
             "top_p",
             "top_k",
-            "batch_size",
             "warmup",
             "user_prompt",
         }
@@ -144,6 +142,8 @@ def validate_request(self, request: GenerationRequest) -> Optional[str]:
         )
         if not has_audio:
             return "SALM backend requires audio input"
+        if request.audio_bytes_list is not None and len(request.audio_bytes_list) > 1:
+            return "SALM backend currently supports one audio input per request"
         return None
 
     def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]:
@@ -170,9 +170,10 @@ def generate(self, requests: List[GenerationRequest]) -> List[GenerationResult]:
                     results[idx] = GenerationResult(error=str(e), request_id=req.request_id)
 
             if temp_paths:
-                first_extra = requests[valid_indices[0]].extra_params or {}
+                first_req = requests[valid_indices[0]]
+                first_extra = first_req.extra_params or {}
                 user_prompt = first_extra.get("user_prompt", self.salm_config.user_prompt)
-                max_new_tokens = int(first_extra.get("max_new_tokens", self.config.max_new_tokens))
+                max_new_tokens = first_req.max_new_tokens or self.config.max_new_tokens
                 audio_tag = self._model.audio_locator_tag
 
                 prompts = []