@@ -69,12 +69,69 @@ def _get_hf_username(hf_token: str | None = None) -> str:
     return "unknown"
 
 
+_COMPACT_PROMPT = (
+    "Please provide a concise summary of the conversation above, focusing on "
+    "key decisions, the 'why' behind the decisions, problems solved, and "
+    "important context needed for developing further. Your summary will be "
+    "given to someone who has never worked on this project before and they "
+    "will have to be filled in."
+)
+
+# Used when seeding a brand-new session from prior browser-cached messages.
+# Here we're writing a note to *ourselves*, so preserve the tool-call trail,
+# files produced, and planned next steps in first person. Optimized for
+# continuity, not brevity.
+_RESTORE_PROMPT = (
+    "You're about to be restored into a fresh session with no memory of the "
+    "conversation above. Write a first-person note to your future self so "
+    "you can continue right where you left off. Include:\n"
+    "  • What the user originally asked for and what progress you've made.\n"
+    "  • Every tool you called, with arguments and a one-line result summary.\n"
+    "  • Any code, files, scripts, or artifacts you produced (with paths).\n"
+    "  • Key decisions and the reasoning behind them.\n"
+    "  • What you were planning to do next.\n\n"
+    "Don't be cute. Be specific. This is the only context you'll have."
+)
+
+
+async def summarize_messages(
+    messages: list[Message],
+    model_name: str,
+    hf_token: str | None = None,
+    max_tokens: int = 2000,
+    tool_specs: list[dict] | None = None,
+    prompt: str = _COMPACT_PROMPT,
+) -> tuple[str, int]:
105+ """Run a summarization prompt against a list of messages.
106+
107+ ``prompt`` defaults to the compaction prompt (terse, decision-focused).
108+ Callers seeding a new session after a restart should pass ``_RESTORE_PROMPT``
109+ instead — it preserves the tool-call trail so the agent can answer
110+ follow-up questions about what it did.
111+
112+ Returns ``(summary_text, completion_tokens)``.
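+
+    A minimal sketch of a call site (``history`` and the model name are
+    illustrative, not part of this codebase)::
+
+        summary, used_tokens = await summarize_messages(
+            history,
+            model_name="gpt-4o",
+            prompt=_RESTORE_PROMPT,
+        )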
113+ """
+    from agent.core.llm_params import _resolve_llm_params
+
+    prompt_messages = list(messages) + [Message(role="user", content=prompt)]
+    llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
+    response = await acompletion(
+        messages=prompt_messages,
+        max_completion_tokens=max_tokens,
+        tools=tool_specs,
+        **llm_params,
+    )
+    summary = response.choices[0].message.content or ""
+    completion_tokens = response.usage.completion_tokens if response.usage else 0
+    return summary, completion_tokens
+
+
 class ContextManager:
     """Manages conversation context and message history for the agent"""
 
     def __init__(
         self,
-        max_context: int = 180_000,
+        model_max_tokens: int = 180_000,
         compact_size: float = 0.1,
         untouched_messages: int = 5,
         tool_specs: list[dict[str, Any]] | None = None,
@@ -88,9 +145,15 @@ def __init__(
             hf_token=hf_token,
             local_mode=local_mode,
         )
-        self.max_context = max_context - 10000
-        self.compact_size = int(max_context * compact_size)
-        self.context_length = 0  # Updated after each LLM call with actual usage
+        # The model's real input-token ceiling (from litellm.get_model_info).
+        # Compaction triggers at _COMPACT_THRESHOLD_RATIO times this ceiling;
+        # see the compaction_threshold property.
+        self.model_max_tokens = model_max_tokens
+        self.compact_size = int(model_max_tokens * compact_size)
+        # Running count of tokens reported by the last LLM call. Drives the
+        # compaction gate; updated in add_message() with each response's
+        # usage.total_tokens.
+        self.running_context_usage = 0
         self.untouched_messages = untouched_messages
         self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
 
@@ -151,7 +214,7 @@ def _load_system_prompt(
     def add_message(self, message: Message, token_count: int = None) -> None:
         """Add a message to the history"""
         if token_count:
-            self.context_length = token_count
+            self.running_context_usage = token_count
         self.items.append(message)
 
     def get_messages(self) -> list[Message]:
@@ -264,14 +327,29 @@ def truncate_to_user_message(self, user_message_index: int) -> bool:
                 count += 1
         return False
 
+    # Compaction fires at 90% of model_max_tokens so there's headroom for
+    # the next turn's prompt + response before we actually hit the ceiling.
+    _COMPACT_THRESHOLD_RATIO = 0.9
+
+    @property
+    def compaction_threshold(self) -> int:
+        """Token count at which ``compact()`` kicks in."""
+        return int(self.model_max_tokens * self._COMPACT_THRESHOLD_RATIO)
+
+    @property
+    def needs_compaction(self) -> bool:
+        """Whether the last reported usage has crossed the threshold."""
+        return self.running_context_usage > self.compaction_threshold and bool(
+            self.items
+        )
+
     async def compact(
         self,
         model_name: str,
         tool_specs: list[dict] | None = None,
         hf_token: str | None = None,
     ) -> None:
         """Remove old messages to keep history under target size"""
-        if (self.context_length <= self.max_context) or not self.items:
+        if not self.needs_compaction:
             return
 
         system_msg = (
@@ -301,34 +379,37 @@ async def compact(
         if not messages_to_summarize:
             return
 
-        messages_to_summarize.append(
-            Message(
-                role="user",
-                content="Please provide a concise summary of the conversation above, focusing on key decisions, the 'why' behind the decisions, problems solved, and important context needed for developing further. Your summary will be given to someone who has never worked on this project before and they will be have to be filled in.",
-            )
-        )
-
-        llm_params = _resolve_llm_params(
-            model_name,
-            session_hf_token=hf_token,
-            reasoning_effort="high",
-        )
-        response = await acompletion(
-            messages=messages_to_summarize,
-            max_completion_tokens=self.compact_size,
-            tools=tool_specs,
-            **llm_params,
-        )
-        summarized_message = Message(
-            role="assistant", content=response.choices[0].message.content
+        summary, completion_tokens = await summarize_messages(
+            messages_to_summarize,
+            model_name=model_name,
+            hf_token=hf_token,
+            max_tokens=self.compact_size,
+            tool_specs=tool_specs,
+            prompt=_COMPACT_PROMPT,
         )
+        summarized_message = Message(role="assistant", content=summary)
 
         # Reconstruct: system + first user msg + summary + recent messages
         head = [system_msg] if system_msg else []
         if first_user_msg:
             head.append(first_user_msg)
         self.items = head + [summarized_message] + recent_messages
 
-        self.context_length = (
-            len(self.system_prompt) // 4 + response.usage.completion_tokens
-        )
+        # Count the actual post-compact context: the system prompt, first user
+        # turn, summary, and preserved tail all contribute, not just the
+        # summary. litellm.token_counter uses the model's real tokenizer.
+        from litellm import token_counter
+
+        try:
+            self.running_context_usage = token_counter(
+                model=model_name,
+                messages=[m.model_dump() for m in self.items],
+            )
+        except Exception as e:
+            logger.warning(
+                "token_counter failed post-compact (%s); falling back to rough estimate",
+                e,
+            )
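+            # Rough fallback: ~4 characters per token for English text, plus
+            # the summary's reported completion tokens.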
+            self.running_context_usage = (
+                len(self.system_prompt) // 4 + completion_tokens
+            )