Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
96e16f2
feat: add automatic conversation compaction based on token threshold
localai-bot Feb 24, 2026
80d629b
fix: use actual usage tokens from LLM response for compaction
localai-bot Feb 24, 2026
3e6d64d
fix: capture usage tokens after sink state LLM call for compaction
localai-bot Feb 24, 2026
232f718
fix: move compaction check to beginning of tool loop
localai-bot Feb 25, 2026
8d79990
fix: update Ask to return usage tokens from LocalAIClient
localai-bot Feb 25, 2026
5be96d5
fix: set LastUsage in Ask function return fragment
localai-bot Feb 25, 2026
d74447f
refactor: Ask() updates Fragment.Status.LastUsage directly
localai-bot Feb 25, 2026
9214ddb
Apply suggestion from @mudler
mudler Feb 25, 2026
7895d8a
Apply suggestion from @mudler
mudler Feb 25, 2026
a6227bf
Apply suggestion from @mudler
mudler Feb 25, 2026
dea5c91
Apply suggestion from @mudler
mudler Feb 25, 2026
783446d
Apply suggestions from code review
mudler Feb 25, 2026
a5a0276
Apply suggestion from @mudler
mudler Feb 25, 2026
3595e1f
test: add mocked tests for compaction functionality
localai-bot Feb 25, 2026
e52126c
test: add compaction tests to tools_test.go suite
localai-bot Feb 25, 2026
1f7405a
chore: run go fmt and add compaction docs to README
localai-bot Feb 25, 2026
2fea6f4
Make CompactFragment and CheckAndCompact private
localai-bot Feb 25, 2026
1061460
chore: remove exported functions from README, keep them private
localai-bot Feb 25, 2026
9079e8a
chore: verify all changes applied - build passes
localai-bot Feb 25, 2026
4913886
chore: verify build and vet pass
localai-bot Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1109,6 +1109,45 @@ result, err := cogito.ExecuteTools(llm, fragment,
cogito.EnableStrictGuidelines)
```


### Automatic Conversation Compaction

Cogito can automatically compact conversations to prevent context overflow when token usage exceeds a threshold. This is useful for long-running conversations with LLMs that have context limits.

**How it works:**

1. After each LLM call, Cogito checks if the token count exceeds the threshold
2. If exceeded, it generates a summary of the conversation history using an LLM
3. The original messages are replaced with a condensed summary, preserving context

**Basic Usage:**

```go
// Enable automatic compaction with a token threshold of 4000
// This will trigger compaction when the conversation exceeds 4000 tokens
result, err := cogito.ExecuteTools(llm, fragment,
cogito.WithTools(searchTool),
cogito.WithCompactionThreshold(4000))
```

**Customizing Compaction:**

```go
// Set custom compaction options
result, err := cogito.ExecuteTools(llm, fragment,
cogito.WithTools(searchTool),
cogito.WithCompactionThreshold(4000), // Trigger at 4000 tokens
cogito.WithCompactionKeepMessages(5), // Keep last 5 messages (default: 10)
)
```

**Notes:**

- Compaction requires token usage data from the LLM (supported by OpenAI, and by LocalAI when token usage reporting is enabled)
- If `LastUsage` is not available, Cogito falls back to estimating tokens from message count
- The summary prompt uses the conversation compaction prompt type
- Compaction preserves `Status` fields like `LastUsage`, `ToolsCalled`, etc.

### Custom Prompts

```go
Expand Down
40 changes: 26 additions & 14 deletions clients/localai_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,17 @@ func (m *localAICompletionMessage) UnmarshalJSON(data []byte) error {

// CreateChatCompletion sends the chat completion request and parses the response,
// including LocalAI's optional "reasoning" field, into LLMReply.ReasoningContent.
func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, error) {
func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, cogito.LLMUsage, error) {
request.Model = llm.model
body, err := json.Marshal(request)
if err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: marshal request: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: marshal request: %w", err)
}

url := llm.baseURL + "/chat/completions"
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
if err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: new request: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: new request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Accept", "application/json")
Expand All @@ -101,21 +101,21 @@ func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request open

resp, err := llm.client.Do(req)
if err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: request: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: request: %w", err)
}
defer resp.Body.Close()

respBody, err := io.ReadAll(resp.Body)
if err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: read response: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: read response: %w", err)
}

if resp.StatusCode != http.StatusOK {
var errRes openai.ErrorResponse
if json.Unmarshal(respBody, &errRes) == nil && errRes.Error != nil {
return cogito.LLMReply{}, errRes.Error
return cogito.LLMReply{}, cogito.LLMUsage{}, errRes.Error
}
return cogito.LLMReply{}, &openai.RequestError{
return cogito.LLMReply{}, cogito.LLMUsage{}, &openai.RequestError{
HTTPStatus: resp.Status,
HTTPStatusCode: resp.StatusCode,
Err: fmt.Errorf("localai: %s", string(respBody)),
Expand All @@ -125,11 +125,11 @@ func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request open

var localResp localAIChatCompletionResponse
if err := json.Unmarshal(respBody, &localResp); err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: unmarshal response: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: unmarshal response: %w", err)
}

if len(localResp.Choices) == 0 {
return cogito.LLMReply{}, fmt.Errorf("localai: no choices in response")
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: no choices in response")
}

choice := localResp.Choices[0]
Expand Down Expand Up @@ -157,30 +157,42 @@ func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request open
// Ensure ReasoningContent is set for downstream (e.g. tools.go).
response.Choices[0].Message.ReasoningContent = reasoning

usage := cogito.LLMUsage{
PromptTokens: localResp.Usage.PromptTokens,
CompletionTokens: localResp.Usage.CompletionTokens,
TotalTokens: localResp.Usage.TotalTokens,
}

return cogito.LLMReply{
ChatCompletionResponse: response,
ReasoningContent: reasoning,
}, nil
}, usage, nil
}

// Ask prompts the LLM with the provided messages and returns a Fragment
// containing the response. Uses CreateChatCompletion so reasoning is preserved.
// The Fragment's Status.LastUsage is updated with the token usage.
func (llm *LocalAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fragment, error) {
messages := f.GetMessages()
request := openai.ChatCompletionRequest{
Model: llm.model,
Messages: messages,
}
reply, err := llm.CreateChatCompletion(ctx, request)
reply, usage, err := llm.CreateChatCompletion(ctx, request)
if err != nil {
return cogito.Fragment{}, err
}
if len(reply.ChatCompletionResponse.Choices) == 0 {
return cogito.Fragment{}, fmt.Errorf("localai: no choices in response")
}
return cogito.Fragment{
result := cogito.Fragment{
Messages: append(f.Messages, reply.ChatCompletionResponse.Choices[0].Message),
ParentFragment: &f,
Status: &cogito.Status{},
}, nil
Status: f.Status,
}
if result.Status == nil {
result.Status = &cogito.Status{}
}
result.Status.LastUsage = usage
return result, nil
}
39 changes: 30 additions & 9 deletions clients/openai_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ func NewOpenAILLM(model, apiKey, baseURL string) *OpenAIClient {
// and returns a Fragment containing the response.
// The Fragment.GetMessages() method automatically handles force-text-reply
// when tool calls are present in the conversation history.
// The Fragment's Status.LastUsage is updated with the token usage.
func (llm *OpenAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fragment, error) {
// Use Fragment.GetMessages() which automatically adds force-text-reply
// system message when tool calls are detected in the conversation
Expand All @@ -40,27 +41,47 @@ func (llm *OpenAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fra
},
)

if err == nil && len(resp.Choices) > 0 {
return cogito.Fragment{
if err != nil {
return cogito.Fragment{}, err
}

if len(resp.Choices) > 0 {
usage := cogito.LLMUsage{
PromptTokens: resp.Usage.PromptTokens,
CompletionTokens: resp.Usage.CompletionTokens,
TotalTokens: resp.Usage.TotalTokens,
}
result := cogito.Fragment{
Messages: append(f.Messages, resp.Choices[0].Message),
ParentFragment: &f,
Status: &cogito.Status{},
}, nil
Status: f.Status,
}
if result.Status == nil {
result.Status = &cogito.Status{}
}
result.Status.LastUsage = usage
return result, nil
}

return cogito.Fragment{}, err
return cogito.Fragment{}, nil
}

func (llm *OpenAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, error) {
func (llm *OpenAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, cogito.LLMUsage, error) {
request.Model = llm.model
response, err := llm.client.CreateChatCompletion(ctx, request)
if err != nil {
return cogito.LLMReply{}, err
return cogito.LLMReply{}, cogito.LLMUsage{}, err
}

usage := cogito.LLMUsage{
PromptTokens: response.Usage.PromptTokens,
CompletionTokens: response.Usage.CompletionTokens,
TotalTokens: response.Usage.TotalTokens,
}

return cogito.LLMReply{
ChatCompletionResponse: response,
ReasoningContent: response.Choices[0].Message.ReasoningContent,
}, nil
}, usage, nil
}

// NewOpenAIService creates a new OpenAI service instance
Expand Down
11 changes: 9 additions & 2 deletions fragment.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ type InjectedMessage struct {
}

type Status struct {
LastUsage LLMUsage // Track token usage from the last LLM call
Iterations int
ToolsCalled Tools
ToolResults []ToolStatus
Expand Down Expand Up @@ -97,6 +98,7 @@ func NewEmptyFragment() Fragment {
ReasoningLog: []string{},
ToolsCalled: Tools{},
ToolResults: []ToolStatus{},
LastUsage: LLMUsage{},
},
}
}
Expand All @@ -109,6 +111,7 @@ func NewFragment(messages ...openai.ChatCompletionMessage) Fragment {
ReasoningLog: []string{},
ToolsCalled: Tools{},
ToolResults: []ToolStatus{},
LastUsage: LLMUsage{},
},
}
}
Expand Down Expand Up @@ -210,11 +213,13 @@ func (r Fragment) ExtractStructure(ctx context.Context, llm LLM, s structures.St
},
}

resp, err := llm.CreateChatCompletion(ctx, decision)
resp, usage, err := llm.CreateChatCompletion(ctx, decision)
if err != nil {
return err
}

r.Status.LastUsage = usage

if len(resp.ChatCompletionResponse.Choices) != 1 {
return fmt.Errorf("no choices: %d", len(resp.ChatCompletionResponse.Choices))
}
Expand Down Expand Up @@ -271,11 +276,13 @@ func (f Fragment) SelectTool(ctx context.Context, llm LLM, availableTools Tools,
}
}

resp, err := llm.CreateChatCompletion(ctx, decision)
resp, usage, err := llm.CreateChatCompletion(ctx, decision)
if err != nil {
return Fragment{}, nil, err
}

f.Status.LastUsage = usage

if len(resp.ChatCompletionResponse.Choices) != 1 {
return Fragment{}, nil, fmt.Errorf("no choices: %d", len(resp.ChatCompletionResponse.Choices))
}
Expand Down
2 changes: 1 addition & 1 deletion fragment_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ var _ = Describe("Result test", Label("e2e"), func() {
Content: "What's the weather today in San Francisco?",
})

newFragment, result, err := fragment.SelectTool(context.TODO(), *defaultLLM, Tools{
newFragment, result, err := fragment.SelectTool(context.TODO(), defaultLLM, Tools{
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When we run SelectTool, we also have to make sure we update the number of tokens that have been used.

NewToolDefinition(
(&GetWeatherTool{}),
WeatherArgs{},
Expand Down
9 changes: 8 additions & 1 deletion llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,16 @@ import (
"github.com/sashabaranov/go-openai"
)

// LLMUsage represents token usage information from an LLM response
type LLMUsage struct {
	PromptTokens     int // tokens consumed by the input/prompt messages
	CompletionTokens int // tokens generated by the model in the reply
	TotalTokens      int // prompt + completion tokens, as reported by the provider
}

type LLM interface {
Ask(ctx context.Context, f Fragment) (Fragment, error)
CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (LLMReply, error)
CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (LLMReply, LLMUsage, error)
}

type LLMReply struct {
Expand Down
46 changes: 35 additions & 11 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,23 +63,29 @@ type Options struct {
todos *structures.TODOList

messagesManipulator func([]openai.ChatCompletionMessage) []openai.ChatCompletionMessage

// Compaction options - automatic conversation compaction based on token count
compactionThreshold int // Token count threshold that triggers compaction (0 = disabled)
compactionKeepMessages int // Number of recent messages to keep after compaction
}

type Option func(*Options)

func defaultOptions() *Options {
return &Options{
maxIterations: 1,
maxAttempts: 1,
maxRetries: 5,
loopDetectionSteps: 0,
forceReasoning: false,
maxAdjustmentAttempts: 5,
sinkStateTool: &defaultSinkStateTool{},
sinkState: true,
context: context.Background(),
statusCallback: func(s string) {},
reasoningCallback: func(s string) {},
maxIterations: 1,
maxAttempts: 1,
maxRetries: 5,
loopDetectionSteps: 0,
forceReasoning: false,
maxAdjustmentAttempts: 5,
sinkStateTool: &defaultSinkStateTool{},
sinkState: true,
context: context.Background(),
statusCallback: func(s string) {},
reasoningCallback: func(s string) {},
compactionThreshold: 0, // Disabled by default
compactionKeepMessages: 10, // Keep 10 recent messages by default
}
}

Expand Down Expand Up @@ -367,6 +373,24 @@ func WithMessageInjectionResultChan(ch chan MessageInjectionResult) func(o *Opti
}
}

// WithCompactionThreshold configures the token count at which automatic
// conversation compaction kicks in. Once the total tokens reported for a
// response reach or exceed this value, the conversation is compacted to
// stay within the limit. A threshold of 0 (the default) disables
// automatic compaction entirely.
func WithCompactionThreshold(threshold int) func(o *Options) {
	return func(opts *Options) {
		opts.compactionThreshold = threshold
	}
}

// WithCompactionKeepMessages configures how many of the most recent
// messages survive a compaction pass (default: 10). It has no effect
// unless compaction is enabled via WithCompactionThreshold.
func WithCompactionKeepMessages(count int) func(o *Options) {
	return func(opts *Options) {
		opts.compactionKeepMessages = count
	}
}

type defaultSinkStateTool struct{}

func (d *defaultSinkStateTool) Execute(args map[string]any) (string, any, error) {
Expand Down
Loading
Loading