Commit fb54391

Merge upstream main into phase two provider adapters

2 parents 064c99b + f30ed48

19 files changed: 711 additions & 128 deletions

agent/__init__.py

Lines changed: 15 additions & 1 deletion
@@ -2,6 +2,20 @@
 HF Agent - Main agent module
 """
 
-from agent.core.agent_loop import submission_loop
+import litellm
+
+# Global LiteLLM behavior — set once at package import so both CLI and
+# backend entries share the same config.
+# drop_params: quietly drop unsupported params rather than raising
+# suppress_debug_info: hide the noisy "Give Feedback" banner on errors
+# modify_params: let LiteLLM patch Anthropic's tool-call requirements
+#   (synthesize a dummy tool spec when we call completion on a history
+#   that contains tool_calls but we aren't passing `tools=` — happens
+#   during summarization / session seeding).
+litellm.drop_params = True
+litellm.suppress_debug_info = True
+litellm.modify_params = True
+
+from agent.core.agent_loop import submission_loop  # noqa: E402
 
 __all__ = ["submission_loop"]
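
For context on what these module-level switches change, here is a minimal sketch (not part of the commit) of `drop_params` in action; the model id and the `frequency_penalty` parameter are purely illustrative, and a real provider key is needed to actually run it:

import litellm

litellm.drop_params = True  # silently strip params the target provider rejects

# With drop_params off, sending an OpenAI-style parameter to a provider that
# doesn't support it (Anthropic rejects frequency_penalty, for example) raises
# an error; with it on, LiteLLM removes the key and sends the rest unchanged.
response = litellm.completion(
    model="anthropic/claude-3-5-sonnet-20241022",  # illustrative model id
    messages=[{"role": "user", "content": "hello"}],
    frequency_penalty=0.5,  # dropped for providers that don't accept it
)
print(response.choices[0].message.content)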

agent/context_manager/manager.py

Lines changed: 110 additions & 29 deletions
@@ -69,12 +69,69 @@ def _get_hf_username(hf_token: str | None = None) -> str:
     return "unknown"
 
 
+_COMPACT_PROMPT = (
+    "Please provide a concise summary of the conversation above, focusing on "
+    "key decisions, the 'why' behind the decisions, problems solved, and "
+    "important context needed for developing further. Your summary will be "
+    "given to someone who has never worked on this project before and they "
+    "will have to be filled in."
+)
+
+# Used when seeding a brand-new session from prior browser-cached messages.
+# Here we're writing a note to *ourselves* — so preserve the tool-call trail,
+# files produced, and planned next steps in first person. Optimized for
+# continuity, not brevity.
+_RESTORE_PROMPT = (
+    "You're about to be restored into a fresh session with no memory of the "
+    "conversation above. Write a first-person note to your future self so "
+    "you can continue right where you left off. Include:\n"
+    " • What the user originally asked for and what progress you've made.\n"
+    " • Every tool you called, with arguments and a one-line result summary.\n"
+    " • Any code, files, scripts, or artifacts you produced (with paths).\n"
+    " • Key decisions and the reasoning behind them.\n"
+    " • What you were planning to do next.\n\n"
+    "Don't be cute. Be specific. This is the only context you'll have."
+)
+
+
+async def summarize_messages(
+    messages: list[Message],
+    model_name: str,
+    hf_token: str | None = None,
+    max_tokens: int = 2000,
+    tool_specs: list[dict] | None = None,
+    prompt: str = _COMPACT_PROMPT,
+) -> tuple[str, int]:
+    """Run a summarization prompt against a list of messages.
+
+    ``prompt`` defaults to the compaction prompt (terse, decision-focused).
+    Callers seeding a new session after a restart should pass ``_RESTORE_PROMPT``
+    instead — it preserves the tool-call trail so the agent can answer
+    follow-up questions about what it did.
+
+    Returns ``(summary_text, completion_tokens)``.
+    """
+    from agent.core.llm_params import _resolve_llm_params
+
+    prompt_messages = list(messages) + [Message(role="user", content=prompt)]
+    llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
+    response = await acompletion(
+        messages=prompt_messages,
+        max_completion_tokens=max_tokens,
+        tools=tool_specs,
+        **llm_params,
+    )
+    summary = response.choices[0].message.content or ""
+    completion_tokens = response.usage.completion_tokens if response.usage else 0
+    return summary, completion_tokens
+
+
 class ContextManager:
     """Manages conversation context and message history for the agent"""
 
     def __init__(
         self,
-        max_context: int = 180_000,
+        model_max_tokens: int = 180_000,
         compact_size: float = 0.1,
         untouched_messages: int = 5,
         tool_specs: list[dict[str, Any]] | None = None,
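
A sketch of a call site for the new helper: `summarize_messages`, `Message`, and `_RESTORE_PROMPT` come from this module, while `cached_messages`, `user_token`, `tool_router`, and the model id are illustrative stand-ins for whatever the caller holds:

async def seed_new_session(cached_messages, user_token, tool_router):
    # Restoring a session: use the note-to-self prompt so the tool-call
    # trail survives, and pass the live tool specs so tool_calls in the
    # history parse cleanly.
    summary, tokens_used = await summarize_messages(
        cached_messages,
        model_name="openai/gpt-4o",  # illustrative model id
        hf_token=user_token,
        max_tokens=2000,
        tool_specs=tool_router.get_tool_specs_for_llm(),
        prompt=_RESTORE_PROMPT,
    )
    return Message(role="assistant", content=summary), tokens_used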
@@ -88,9 +145,15 @@ def __init__(
             hf_token=hf_token,
             local_mode=local_mode,
         )
-        self.max_context = max_context - 10000
-        self.compact_size = int(max_context * compact_size)
-        self.context_length = 0  # Updated after each LLM call with actual usage
+        # The model's real input-token ceiling (from litellm.get_model_info).
+        # Compaction triggers at _COMPACT_THRESHOLD_RATIO of that ceiling — see
+        # the compaction_threshold property.
+        self.model_max_tokens = model_max_tokens
+        self.compact_size = int(model_max_tokens * compact_size)
+        # Running count of tokens the last LLM call reported. Drives the
+        # compaction gate; updated in add_message() with each response's
+        # usage.total_tokens.
+        self.running_context_usage = 0
         self.untouched_messages = untouched_messages
         self.items: list[Message] = [Message(role="system", content=self.system_prompt)]

@@ -151,7 +214,7 @@ def _load_system_prompt(
     def add_message(self, message: Message, token_count: int = None) -> None:
         """Add a message to the history"""
         if token_count:
-            self.context_length = token_count
+            self.running_context_usage = token_count
         self.items.append(message)
 
     def get_messages(self) -> list[Message]:
@@ -264,14 +327,29 @@ def truncate_to_user_message(self, user_message_index: int) -> bool:
                 count += 1
         return False
 
+    # Compaction fires at 90% of model_max_tokens so there's headroom for
+    # the next turn's prompt + response before we actually hit the ceiling.
+    _COMPACT_THRESHOLD_RATIO = 0.9
+
+    @property
+    def compaction_threshold(self) -> int:
+        """Token count at which `compact()` kicks in."""
+        return int(self.model_max_tokens * self._COMPACT_THRESHOLD_RATIO)
+
+    @property
+    def needs_compaction(self) -> bool:
+        return self.running_context_usage > self.compaction_threshold and bool(
+            self.items
+        )
+
     async def compact(
         self,
         model_name: str,
         tool_specs: list[dict] | None = None,
         hf_token: str | None = None,
     ) -> None:
         """Remove old messages to keep history under target size"""
-        if (self.context_length <= self.max_context) or not self.items:
+        if not self.needs_compaction:
             return
 
         system_msg = (
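
To make the headroom comment concrete, the gate arithmetic for the constructor default of 180k tokens (plain arithmetic, runnable on its own):

model_max_tokens = 180_000
threshold = int(model_max_tokens * 0.9)   # compaction_threshold == 162_000
headroom = model_max_tokens - threshold   # 18_000 tokens of slack for the
                                          # next turn's prompt + response
assert threshold == 162_000 and headroom == 18_000
# needs_compaction flips True once running_context_usage exceeds 162_000.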
@@ -301,34 +379,37 @@ async def compact(
         if not messages_to_summarize:
             return
 
-        messages_to_summarize.append(
-            Message(
-                role="user",
-                content="Please provide a concise summary of the conversation above, focusing on key decisions, the 'why' behind the decisions, problems solved, and important context needed for developing further. Your summary will be given to someone who has never worked on this project before and they will be have to be filled in.",
-            )
-        )
-
-        llm_params = _resolve_llm_params(
-            model_name,
-            session_hf_token=hf_token,
-            reasoning_effort="high",
-        )
-        response = await acompletion(
-            messages=messages_to_summarize,
-            max_completion_tokens=self.compact_size,
-            tools=tool_specs,
-            **llm_params,
-        )
-        summarized_message = Message(
-            role="assistant", content=response.choices[0].message.content
+        summary, completion_tokens = await summarize_messages(
+            messages_to_summarize,
+            model_name=model_name,
+            hf_token=hf_token,
+            max_tokens=self.compact_size,
+            tool_specs=tool_specs,
+            prompt=_COMPACT_PROMPT,
         )
+        summarized_message = Message(role="assistant", content=summary)
 
         # Reconstruct: system + first user msg + summary + recent messages
         head = [system_msg] if system_msg else []
         if first_user_msg:
             head.append(first_user_msg)
         self.items = head + [summarized_message] + recent_messages
 
-        self.context_length = (
-            len(self.system_prompt) // 4 + response.usage.completion_tokens
-        )
+        # Count the actual post-compact context — system prompt + first user
+        # turn + summary + the preserved tail all contribute, not just the
+        # summary. litellm.token_counter uses the model's real tokenizer.
+        from litellm import token_counter
+
+        try:
+            self.running_context_usage = token_counter(
+                model=model_name,
+                messages=[m.model_dump() for m in self.items],
+            )
+        except Exception as e:
+            logger.warning(
+                "token_counter failed post-compact (%s); falling back to rough estimate",
+                e,
+            )
+            self.running_context_usage = (
+                len(self.system_prompt) // 4 + completion_tokens
+            )
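
For reference, `litellm.token_counter` is an existing LiteLLM utility; a standalone sketch of the call shape used in the guarded block above (model id illustrative):

from litellm import token_counter

messages = [
    {"role": "system", "content": "You are a helpful agent."},
    {"role": "user", "content": "Summarize the conversation so far."},
]
# Uses the model's own tokenizer when LiteLLM knows the model, otherwise
# falls back to a default encoding; returns the total token count as an int.
n = token_counter(model="openai/gpt-4o", messages=messages)
print(n)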

agent/core/agent_loop.py

Lines changed: 25 additions & 26 deletions
@@ -196,33 +196,33 @@ def _friendly_error_message(error: Exception) -> str | None:
 
 async def _compact_and_notify(session: Session) -> None:
     """Run compaction and send event if context was reduced."""
-    old_length = session.context_manager.context_length
-    max_ctx = session.context_manager.max_context
+    cm = session.context_manager
+    old_usage = cm.running_context_usage
     logger.debug(
-        "Compaction check: context_length=%d, max_context=%d, needs_compact=%s",
-        old_length,
-        max_ctx,
-        old_length > max_ctx,
+        "Compaction check: usage=%d, max=%d, threshold=%d, needs_compact=%s",
+        old_usage,
+        cm.model_max_tokens,
+        cm.compaction_threshold,
+        cm.needs_compaction,
     )
-    tool_specs = session.tool_router.get_tool_specs_for_llm()
-    await session.context_manager.compact(
+    await cm.compact(
         model_name=session.config.model_name,
-        tool_specs=tool_specs,
+        tool_specs=session.tool_router.get_tool_specs_for_llm(),
         hf_token=session.hf_token,
     )
-    new_length = session.context_manager.context_length
-    if new_length != old_length:
+    new_usage = cm.running_context_usage
+    if new_usage != old_usage:
         logger.warning(
             "Context compacted: %d -> %d tokens (max=%d, %d messages)",
-            old_length,
-            new_length,
-            max_ctx,
-            len(session.context_manager.items),
+            old_usage,
+            new_usage,
+            cm.model_max_tokens,
+            len(cm.items),
         )
         await session.send_event(
             Event(
                 event_type="compacted",
-                data={"old_tokens": old_length, "new_tokens": new_length},
+                data={"old_tokens": old_usage, "new_tokens": new_usage},
            )
        )
 
@@ -630,13 +630,13 @@ async def run_agent(
         logger.debug(
             "Agent loop ending: no tool calls. "
             "finish_reason=%s, token_count=%d, "
-            "context_length=%d, max_context=%d, "
+            "usage=%d, model_max_tokens=%d, "
             "iteration=%d/%d, "
             "response_text=%s",
             finish_reason,
             token_count,
-            session.context_manager.context_length,
-            session.context_manager.max_context,
+            session.context_manager.running_context_usage,
+            session.context_manager.model_max_tokens,
             iteration,
             max_iterations,
             (content or "")[:500],
@@ -870,17 +870,16 @@ async def _exec_tool(
 
         except ContextWindowExceededError:
             # Force compact and retry this iteration
+            cm = session.context_manager
             logger.warning(
                 "ContextWindowExceededError at iteration %d — forcing compaction "
-                "(context_length=%d, max_context=%d, messages=%d)",
+                "(usage=%d, model_max_tokens=%d, messages=%d)",
                 iteration,
-                session.context_manager.context_length,
-                session.context_manager.max_context,
-                len(session.context_manager.items),
-            )
-            session.context_manager.context_length = (
-                session.context_manager.max_context + 1
+                cm.running_context_usage,
+                cm.model_max_tokens,
+                len(cm.items),
             )
+            cm.running_context_usage = cm.model_max_tokens + 1
             await _compact_and_notify(session)
             continue
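The force-compact-and-retry pattern above hinges on LiteLLM's typed exception. A stripped-down sketch of the same control flow; `call_llm` and `compact` here are placeholders, not this repo's API:

from litellm.exceptions import ContextWindowExceededError

async def call_llm(history: list[dict]) -> str:
    """Placeholder: a real implementation would call litellm.acompletion."""
    raise NotImplementedError

async def compact(history: list[dict]) -> list[dict]:
    """Placeholder: summarize old turns and rebuild a shorter history."""
    raise NotImplementedError

async def loop_step(history: list[dict]) -> str:
    # Mirrors the diff: on overflow, compact the history and retry the
    # same iteration instead of surfacing the error to the user.
    while True:
        try:
            return await call_llm(history)
        except ContextWindowExceededError:
            history = await compact(history)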
agent/core/llm_params.py

Lines changed: 2 additions & 1 deletion
@@ -109,7 +109,8 @@ def _resolve_llm_params(
         "api_key": api_key,
     }
     if os.environ.get("INFERENCE_TOKEN"):
-        params["extra_headers"] = {"X-HF-Bill-To": "huggingface"}
+        bill_to = os.environ.get("HF_BILL_TO", "smolagents")
+        params["extra_headers"] = {"X-HF-Bill-To": bill_to}
     if reasoning_effort:
         hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
         if hf_level in _HF_ALLOWED_EFFORTS:
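
The billing-header change in isolation: a standalone sketch that mirrors the diff's logic rather than importing this module, showing how `HF_BILL_TO` now overrides the previously hardcoded value (the helper name and token value are illustrative):

import os

def _billing_headers() -> dict[str, str]:
    # Mirrors _resolve_llm_params: only attach the header when an inference
    # token is present; the bill-to target was previously hardcoded to
    # "huggingface" and now defaults to "smolagents" unless overridden.
    if not os.environ.get("INFERENCE_TOKEN"):
        return {}
    return {"X-HF-Bill-To": os.environ.get("HF_BILL_TO", "smolagents")}

os.environ["INFERENCE_TOKEN"] = "hf_xxx"  # illustrative token value
os.environ["HF_BILL_TO"] = "my-org"       # override the default target
assert _billing_headers() == {"X-HF-Bill-To": "my-org"}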
