@@ -69,12 +69,69 @@ def _get_hf_username(hf_token: str | None = None) -> str:
     return "unknown"
 
 
+_COMPACT_PROMPT = (
+    "Please provide a concise summary of the conversation above, focusing on "
+    "key decisions, the 'why' behind the decisions, problems solved, and "
+    "important context needed for developing further. Your summary will be "
+    "given to someone who has never worked on this project before and they "
+    "will have to be filled in."
+)
+
+# Used when seeding a brand-new session from prior browser-cached messages.
+# Here we're writing a note to *ourselves*, so preserve the tool-call trail,
+# files produced, and planned next steps in first person. Optimized for
+# continuity, not brevity.
+_RESTORE_PROMPT = (
+    "You're about to be restored into a fresh session with no memory of the "
+    "conversation above. Write a first-person note to your future self so "
+    "you can continue right where you left off. Include:\n"
+    "  • What the user originally asked for and what progress you've made.\n"
+    "  • Every tool you called, with arguments and a one-line result summary.\n"
+    "  • Any code, files, scripts, or artifacts you produced (with paths).\n"
+    "  • Key decisions and the reasoning behind them.\n"
+    "  • What you were planning to do next.\n\n"
+    "Don't be cute. Be specific. This is the only context you'll have."
+)
+
+
+async def summarize_messages(
+    messages: list[Message],
+    model_name: str,
+    hf_token: str | None = None,
+    max_tokens: int = 2000,
+    tool_specs: list[dict] | None = None,
+    prompt: str = _COMPACT_PROMPT,
+) -> tuple[str, int]:
105+ """Run a summarization prompt against a list of messages.
106+
107+ ``prompt`` defaults to the compaction prompt (terse, decision-focused).
108+ Callers seeding a new session after a restart should pass ``_RESTORE_PROMPT``
109+ instead — it preserves the tool-call trail so the agent can answer
110+ follow-up questions about what it did.
111+
112+ Returns ``(summary_text, completion_tokens)``.
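+
+    A minimal sketch of a call site (``history`` and the model name are
+    illustrative, not part of this codebase)::
+
+        summary, used_tokens = await summarize_messages(
+            history,
+            model_name="gpt-4o",
+            prompt=_RESTORE_PROMPT,
+        )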
113+ """
+    from agent.core.llm_params import _resolve_llm_params
+
+    prompt_messages = list(messages) + [Message(role="user", content=prompt)]
+    llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
+    response = await acompletion(
+        messages=prompt_messages,
+        max_completion_tokens=max_tokens,
+        tools=tool_specs,
+        **llm_params,
+    )
+    summary = response.choices[0].message.content or ""
+    completion_tokens = response.usage.completion_tokens if response.usage else 0
+    return summary, completion_tokens
+
+
 class ContextManager:
     """Manages conversation context and message history for the agent"""
 
     def __init__(
         self,
-        max_context: int = 180_000,
+        model_max_tokens: int = 180_000,
         compact_size: float = 0.1,
         untouched_messages: int = 5,
         tool_specs: list[dict[str, Any]] | None = None,
@@ -88,9 +145,15 @@ def __init__(
             hf_token=hf_token,
             local_mode=local_mode,
         )
-        self.max_context = max_context - 10000
-        self.compact_size = int(max_context * compact_size)
-        self.context_length = 0  # Updated after each LLM call with actual usage
+        # The model's real input-token ceiling (from litellm.get_model_info).
+        # Compaction triggers at _COMPACT_THRESHOLD_RATIO times this ceiling;
+        # see the compaction_threshold property.
+        self.model_max_tokens = model_max_tokens
+        self.compact_size = int(model_max_tokens * compact_size)
+        # Running count of tokens reported by the last LLM call. Drives the
+        # compaction gate; updated in add_message() with each response's
+        # usage.total_tokens.
+        self.running_context_usage = 0
         self.untouched_messages = untouched_messages
         self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
 
@@ -151,7 +214,7 @@ def _load_system_prompt(
     def add_message(self, message: Message, token_count: int = None) -> None:
         """Add a message to the history"""
         if token_count:
-            self.context_length = token_count
+            self.running_context_usage = token_count
         self.items.append(message)
 
     def get_messages(self) -> list[Message]:
@@ -264,14 +327,29 @@ def truncate_to_user_message(self, user_message_index: int) -> bool:
                 count += 1
         return False
 
+    # Compaction fires at 90% of model_max_tokens so there's headroom for
+    # the next turn's prompt + response before we actually hit the ceiling.
+    _COMPACT_THRESHOLD_RATIO = 0.9
+
+    @property
+    def compaction_threshold(self) -> int:
+        """Token count at which ``compact()`` kicks in."""
+        return int(self.model_max_tokens * self._COMPACT_THRESHOLD_RATIO)
+
+    @property
+    def needs_compaction(self) -> bool:
+        """Whether the last reported usage has crossed the threshold."""
+        return self.running_context_usage > self.compaction_threshold and bool(
+            self.items
+        )
+
     async def compact(
         self,
         model_name: str,
         tool_specs: list[dict] | None = None,
         hf_token: str | None = None,
     ) -> None:
         """Remove old messages to keep history under target size"""
-        if (self.context_length <= self.max_context) or not self.items:
+        if not self.needs_compaction:
             return
 
         system_msg = (
@@ -301,34 +379,37 @@ async def compact(
         if not messages_to_summarize:
             return
 
-        messages_to_summarize.append(
-            Message(
-                role="user",
-                content="Please provide a concise summary of the conversation above, focusing on key decisions, the 'why' behind the decisions, problems solved, and important context needed for developing further. Your summary will be given to someone who has never worked on this project before and they will be have to be filled in.",
-            )
-        )
-
-        llm_params = _resolve_llm_params(
-            model_name,
-            session_hf_token=hf_token,
-            reasoning_effort="high",
-        )
-        response = await acompletion(
-            messages=messages_to_summarize,
-            max_completion_tokens=self.compact_size,
-            tools=tool_specs,
-            **llm_params,
-        )
-        summarized_message = Message(
-            role="assistant", content=response.choices[0].message.content
+        summary, completion_tokens = await summarize_messages(
+            messages_to_summarize,
+            model_name=model_name,
+            hf_token=hf_token,
+            max_tokens=self.compact_size,
+            tool_specs=tool_specs,
+            prompt=_COMPACT_PROMPT,
         )
+        summarized_message = Message(role="assistant", content=summary)
 
         # Reconstruct: system + first user msg + summary + recent messages
         head = [system_msg] if system_msg else []
         if first_user_msg:
             head.append(first_user_msg)
         self.items = head + [summarized_message] + recent_messages
 
-        self.context_length = (
-            len(self.system_prompt) // 4 + response.usage.completion_tokens
-        )
+        # Count the actual post-compact context: the system prompt, first user
+        # turn, summary, and preserved tail all contribute, not just the
+        # summary. litellm.token_counter uses the model's real tokenizer.
+        from litellm import token_counter
+
+        try:
+            self.running_context_usage = token_counter(
+                model=model_name,
+                messages=[m.model_dump() for m in self.items],
+            )
+        except Exception as e:
+            logger.warning(
+                "token_counter failed post-compact (%s); falling back to rough estimate",
+                e,
+            )
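+            # Rough fallback: ~4 characters per token for English text, plus
+            # the summary's reported completion tokens.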
+            self.running_context_usage = (
+                len(self.system_prompt) // 4 + completion_tokens
+            )