From 96e16f2e0fc29d54169c09719655597764e39830 Mon Sep 17 00:00:00 2001
From: localai-bot <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 24 Feb 2026 12:46:37 +0000
Subject: [PATCH 1/3] feat: add automatic conversation compaction based on
 token threshold

This commit adds automatic conversation compaction to prevent context overflow
during long-running tool execution sessions.

Key changes:
- Added LLMUsage struct to track token usage from LLM responses
- Modified LLM interface to return token usage alongside Fragment
- Added WithCompactionThreshold option to set token count threshold
- Added WithCompactionKeepMessages option to configure recent messages to keep
- Added compaction logic in ExecuteTools after LLM calls
- Added helper functions: compactFragment, checkAndCompact, estimateTokens
- Added PromptConversationCompaction for generating conversation summaries
- Updated OpenAI and LocalAI clients to return token usage
- Updated mock client for testing

When compactionThreshold is set (> 0), the conversation is automatically
compacted once its estimated token count reaches or exceeds the threshold.
Compaction asks the LLM for a summary of the older conversation history and
keeps the most recent messages verbatim.

Signed-off-by: Autonomous Coding Agent <agent@autonomous>
---
 clients/localai_client.go |  36 ++++----
 clients/openai_client.go  |  30 +++++--
 extractors.go             |   2 +-
 fragment.go               |   4 +-
 fragment_e2e_test.go      |   4 +-
 goal.go                   |   4 +-
 guidelines.go             |   2 +-
 llm.go                    |  11 ++-
 options.go                |  24 +++++
 plan.go                   |   6 +-
 prompt/prompt.go          |  18 ++++
 reviewer.go               |   6 +-
 reviewer_e2e_test.go      |   4 +-
 tests/mock/client.go      |  50 +++++++++--
 tools.go                  | 183 +++++++++++++++++++++++++++++++++++++-
 15 files changed, 334 insertions(+), 50 deletions(-)

diff --git a/clients/localai_client.go b/clients/localai_client.go
index cab9ef7..622eb61 100644
--- a/clients/localai_client.go
+++ b/clients/localai_client.go
@@ -81,17 +81,17 @@ func (m *localAICompletionMessage) UnmarshalJSON(data []byte) error {
 
 // CreateChatCompletion sends the chat completion request and parses the response,
 // including LocalAI's optional "reasoning" field, into LLMReply.ReasoningContent.
-func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, error) {
+func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, cogito.LLMUsage, error) {
 	request.Model = llm.model
 	body, err := json.Marshal(request)
 	if err != nil {
-		return cogito.LLMReply{}, fmt.Errorf("localai: marshal request: %w", err)
+		return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: marshal request: %w", err)
 	}
 
 	url := llm.baseURL + "/chat/completions"
 	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
 	if err != nil {
-		return cogito.LLMReply{}, fmt.Errorf("localai: new request: %w", err)
+		return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: new request: %w", err)
 	}
 	req.Header.Set("Content-Type", "application/json")
 	req.Header.Set("Accept", "application/json")
@@ -101,21 +101,21 @@ func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request open
 
 	resp, err := llm.client.Do(req)
 	if err != nil {
-		return cogito.LLMReply{}, fmt.Errorf("localai: request: %w", err)
+		return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: request: %w", err)
 	}
 	defer resp.Body.Close()
 
 	respBody, err := io.ReadAll(resp.Body)
 	if err != nil {
-		return cogito.LLMReply{}, fmt.Errorf("localai: read response: %w", err)
+		return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: read response: %w", err)
 	}
 
 	if resp.StatusCode != http.StatusOK {
 		var errRes openai.ErrorResponse
 		if json.Unmarshal(respBody, &errRes) == nil && errRes.Error != nil {
-			return cogito.LLMReply{}, errRes.Error
+			return cogito.LLMReply{}, cogito.LLMUsage{}, errRes.Error
 		}
-		return cogito.LLMReply{}, &openai.RequestError{
+		return cogito.LLMReply{}, cogito.LLMUsage{}, &openai.RequestError{
 			HTTPStatus:     resp.Status,
 			HTTPStatusCode: resp.StatusCode,
 			Err:            fmt.Errorf("localai: %s", string(respBody)),
@@ -125,11 +125,11 @@ func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request open
 
 	var localResp localAIChatCompletionResponse
 	if err := json.Unmarshal(respBody, &localResp); err != nil {
-		return cogito.LLMReply{}, fmt.Errorf("localai: unmarshal response: %w", err)
+		return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: unmarshal response: %w", err)
 	}
 
 	if len(localResp.Choices) == 0 {
-		return cogito.LLMReply{}, fmt.Errorf("localai: no choices in response")
+		return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: no choices in response")
 	}
 
 	choice := localResp.Choices[0]
@@ -157,30 +157,36 @@ func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request open
 	// Ensure ReasoningContent is set for downstream (e.g. tools.go).
 	response.Choices[0].Message.ReasoningContent = reasoning
 
+	usage := cogito.LLMUsage{
+		PromptTokens:     localResp.Usage.PromptTokens,
+		CompletionTokens: localResp.Usage.CompletionTokens,
+		TotalTokens:      localResp.Usage.TotalTokens,
+	}
+
 	return cogito.LLMReply{
 		ChatCompletionResponse: response,
 		ReasoningContent:       reasoning,
-	}, nil
+	}, usage, nil
 }
 
 // Ask prompts the LLM with the provided messages and returns a Fragment
 // containing the response. Uses CreateChatCompletion so reasoning is preserved.
-func (llm *LocalAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fragment, error) {
+func (llm *LocalAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fragment, cogito.LLMUsage, error) {
 	messages := f.GetMessages()
 	request := openai.ChatCompletionRequest{
 		Model:    llm.model,
 		Messages: messages,
 	}
-	reply, err := llm.CreateChatCompletion(ctx, request)
+	reply, usage, err := llm.CreateChatCompletion(ctx, request) // keep the reported usage so callers can act on it
 	if err != nil {
-		return cogito.Fragment{}, err
+		return cogito.Fragment{}, cogito.LLMUsage{}, err
 	}
 	if len(reply.ChatCompletionResponse.Choices) == 0 {
-		return cogito.Fragment{}, fmt.Errorf("localai: no choices in response")
+		return cogito.Fragment{}, cogito.LLMUsage{}, fmt.Errorf("localai: no choices in response")
 	}
 	return cogito.Fragment{
 		Messages:       append(f.Messages, reply.ChatCompletionResponse.Choices[0].Message),
 		ParentFragment: &f,
 		Status:         &cogito.Status{},
-	}, nil
+	}, usage, nil
 }
diff --git a/clients/openai_client.go b/clients/openai_client.go
index 4dbc69e..17e7d7d 100644
--- a/clients/openai_client.go
+++ b/clients/openai_client.go
@@ -27,7 +27,7 @@ func NewOpenAILLM(model, apiKey, baseURL string) *OpenAIClient {
 // and returns a Fragment containing the response.
 // The Fragment.GetMessages() method automatically handles force-text-reply
 // when tool calls are present in the conversation history.
-func (llm *OpenAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fragment, error) {
+func (llm *OpenAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fragment, cogito.LLMUsage, error) {
 	// Use Fragment.GetMessages() which automatically adds force-text-reply
 	// system message when tool calls are detected in the conversation
 	messages := f.GetMessages()
@@ -40,27 +40,43 @@ func (llm *OpenAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fra
 		},
 	)
 
-	if err == nil && len(resp.Choices) > 0 {
+	if err != nil {
+		return cogito.Fragment{}, cogito.LLMUsage{}, err
+	}
+
+	if len(resp.Choices) > 0 {
+		usage := cogito.LLMUsage{
+			PromptTokens:      resp.Usage.PromptTokens,
+			CompletionTokens:  resp.Usage.CompletionTokens,
+			TotalTokens:       resp.Usage.TotalTokens,
+		}
 		return cogito.Fragment{
 			Messages:       append(f.Messages, resp.Choices[0].Message),
 			ParentFragment: &f,
 			Status:         &cogito.Status{},
-		}, nil
+		}, usage, nil
 	}
 
-	return cogito.Fragment{}, err
+	return cogito.Fragment{}, cogito.LLMUsage{}, nil
 }
 
-func (llm *OpenAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, error) {
+func (llm *OpenAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, cogito.LLMUsage, error) {
 	request.Model = llm.model
 	response, err := llm.client.CreateChatCompletion(ctx, request)
 	if err != nil {
-		return cogito.LLMReply{}, err
+		return cogito.LLMReply{}, cogito.LLMUsage{}, err
 	}
+
+	usage := cogito.LLMUsage{
+		PromptTokens:     response.Usage.PromptTokens,
+		CompletionTokens: response.Usage.CompletionTokens,
+		TotalTokens:      response.Usage.TotalTokens,
+	}
+
 	return cogito.LLMReply{
 		ChatCompletionResponse: response,
 		ReasoningContent:       response.Choices[0].Message.ReasoningContent,
-	}, nil
+	}, usage, nil
 }
 
 // NewOpenAIService creates a new OpenAI service instance
diff --git a/extractors.go b/extractors.go
index 5568a67..dde6c15 100644
--- a/extractors.go
+++ b/extractors.go
@@ -68,7 +68,7 @@ func ExtractKnowledgeGaps(llm LLM, f Fragment, opts ...Option) ([]string, error)
 	xlog.Debug("Analyzing knowledge gaps", "prompt", prompt)
 	newFragment := NewEmptyFragment().AddMessage("system", prompt)
 
-	f, err = llm.Ask(o.context, newFragment)
+	f, _, err = llm.Ask(o.context, newFragment)
 	if err != nil {
 		return nil, err
 	}
diff --git a/fragment.go b/fragment.go
index d136d6a..73bd1be 100644
--- a/fragment.go
+++ b/fragment.go
@@ -210,7 +210,7 @@ func (r Fragment) ExtractStructure(ctx context.Context, llm LLM, s structures.St
 		},
 	}
 
-	resp, err := llm.CreateChatCompletion(ctx, decision)
+	resp, _, err := llm.CreateChatCompletion(ctx, decision)
 	if err != nil {
 		return err
 	}
@@ -271,7 +271,7 @@ func (f Fragment) SelectTool(ctx context.Context, llm LLM, availableTools Tools,
 		}
 	}
 
-	resp, err := llm.CreateChatCompletion(ctx, decision)
+	resp, _, err := llm.CreateChatCompletion(ctx, decision)
 	if err != nil {
 		return Fragment{}, nil, err
 	}
diff --git a/fragment_e2e_test.go b/fragment_e2e_test.go
index c810241..474acaa 100644
--- a/fragment_e2e_test.go
+++ b/fragment_e2e_test.go
@@ -120,7 +120,7 @@ var _ = Describe("Result test", Label("e2e"), func() {
 
 			fragment := NewEmptyFragment().AddMessage("user", "Write a short poem about the sea in less than 20 words.")
 
-			result, err := defaultLLM.Ask(context.TODO(), fragment)
+			result, _, err := defaultLLM.Ask(context.TODO(), fragment)
 
 			Expect(err).ToNot(HaveOccurred())
 
@@ -156,7 +156,7 @@ var _ = Describe("Result test", Label("e2e"), func() {
 				Content: "What's the weather today in San Francisco?",
 			})
 
-			newFragment, result, err := fragment.SelectTool(context.TODO(), *defaultLLM, Tools{
+			newFragment, result, err := fragment.SelectTool(context.TODO(), defaultLLM, Tools{
 				NewToolDefinition(
 					(&GetWeatherTool{}),
 					WeatherArgs{},
diff --git a/goal.go b/goal.go
index 833ca0c..3336a23 100644
--- a/goal.go
+++ b/goal.go
@@ -33,7 +33,7 @@ func ExtractGoal(llm LLM, f Fragment, opts ...Option) (*structures.Goal, error)
 
 	goalConv := NewEmptyFragment().AddMessage("user", prompt)
 
-	reasoningGoal, err := llm.Ask(o.context, goalConv)
+	reasoningGoal, _, err := llm.Ask(o.context, goalConv)
 	if err != nil {
 		return nil, fmt.Errorf("failed to ask LLM for goal identification: %w", err)
 	}
@@ -91,7 +91,7 @@ func IsGoalAchieved(llm LLM, f Fragment, goal *structures.Goal, opts ...Option)
 	}
 	goalAchievedConv := NewEmptyFragment().AddMessage("user", prompt, multimedias...)
 
-	reasoningGoal, err := llm.Ask(o.context, goalAchievedConv)
+	reasoningGoal, _, err := llm.Ask(o.context, goalAchievedConv)
 	if err != nil {
 		return nil, fmt.Errorf("failed to ask LLM for goal identification: %w", err)
 	}
diff --git a/guidelines.go b/guidelines.go
index 350485c..5f02433 100644
--- a/guidelines.go
+++ b/guidelines.go
@@ -70,7 +70,7 @@ func GetRelevantGuidelines(llm LLM, guidelines Guidelines, fragment Fragment, op
 
 	guidelineConv := NewEmptyFragment().AddMessage("user", guidelinePrompt)
 
-	guidelineResult, err := llm.Ask(o.context, guidelineConv)
+	guidelineResult, _, err := llm.Ask(o.context, guidelineConv)
 	if err != nil {
 		return Guidelines{}, fmt.Errorf("failed to ask LLM for guidelines: %w", err)
 	}
diff --git a/llm.go b/llm.go
index d2b4193..039c358 100644
--- a/llm.go
+++ b/llm.go
@@ -6,9 +6,16 @@ import (
 	"github.com/sashabaranov/go-openai"
 )
 
+// LLMUsage carries the token counts reported by the backend for one LLM call.
+type LLMUsage struct {
+	PromptTokens     int // copied from the response's usage.prompt_tokens
+	CompletionTokens int // copied from the response's usage.completion_tokens
+	TotalTokens      int // copied from the response's usage.total_tokens
+}
+
 type LLM interface {
-	Ask(ctx context.Context, f Fragment) (Fragment, error)
-	CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (LLMReply, error)
+	Ask(ctx context.Context, f Fragment) (Fragment, LLMUsage, error)
+	CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (LLMReply, LLMUsage, error)
 }
 
 type LLMReply struct {
diff --git a/options.go b/options.go
index d9c5157..a788a20 100644
--- a/options.go
+++ b/options.go
@@ -63,6 +63,10 @@ type Options struct {
 	todos               *structures.TODOList
 
 	messagesManipulator func([]openai.ChatCompletionMessage) []openai.ChatCompletionMessage
+
+	// Compaction options - automatic conversation compaction based on token count
+	compactionThreshold    int // Token count threshold that triggers compaction (0 = disabled)
+	compactionKeepMessages int // Number of recent messages to keep after compaction
 }
 
 type Option func(*Options)
@@ -80,6 +84,8 @@ func defaultOptions() *Options {
 		context:               context.Background(),
 		statusCallback:        func(s string) {},
 		reasoningCallback:     func(s string) {},
+		compactionThreshold:   0,   // Disabled by default
+		compactionKeepMessages: 10, // Keep 10 recent messages by default
 	}
 }
 
@@ -367,6 +373,24 @@ func WithMessageInjectionResultChan(ch chan MessageInjectionResult) func(o *Opti
 	}
 }
 
+// WithCompactionThreshold sets the token count that triggers automatic
+// conversation compaction in ExecuteTools: once the conversation's token count
+// reaches the threshold, older messages are replaced by an LLM-written summary.
+// Values <= 0 (the default is 0) disable automatic compaction.
+func WithCompactionThreshold(threshold int) func(o *Options) {
+	return func(o *Options) {
+		o.compactionThreshold = threshold
+	}
+}
+
+// WithCompactionKeepMessages sets how many of the most recent messages compaction
+// keeps verbatim (default 10). It has no effect unless WithCompactionThreshold is set.
+func WithCompactionKeepMessages(count int) func(o *Options) {
+	return func(o *Options) {
+		o.compactionKeepMessages = count
+	}
+}
+
 type defaultSinkStateTool struct{}
 
 func (d *defaultSinkStateTool) Execute(args map[string]any) (string, any, error) {
diff --git a/plan.go b/plan.go
index fd11d00..dc194da 100644
--- a/plan.go
+++ b/plan.go
@@ -111,7 +111,7 @@ func applyPlanFromPrompt(llm LLM, o *Options, planPrompt string, feedbackConv *F
 		multimedias = feedbackConv.Multimedia
 	}
 	planConv := NewEmptyFragment().AddMessage("user", planPrompt, multimedias...)
-	reasoningPlan, err := llm.Ask(o.context, planConv)
+	reasoningPlan, _, err := llm.Ask(o.context, planConv)
 	if err != nil {
 		return nil, fmt.Errorf("failed to ask LLM for plan identification: %w", err)
 	}
@@ -165,7 +165,7 @@ func ExtractTODOs(llm LLM, plan *structures.Plan, goal *structures.Goal, opts ..
 	}
 
 	todoConv := NewEmptyFragment().AddMessage("user", promptStr)
-	reasoningTodo, err := llm.Ask(o.context, todoConv)
+	reasoningTodo, _, err := llm.Ask(o.context, todoConv)
 	if err != nil {
 		return nil, fmt.Errorf("failed to ask LLM for TODO generation: %w", err)
 	}
@@ -518,7 +518,7 @@ func executeReviewPhase(reviewerLLMs []LLM, workFragment Fragment, goal *structu
 		}
 
 		// Get the reasoning from the review
-		reviewResult, err := reviewerLLM.Ask(o.context, reviewFragment)
+		reviewResult, _, err := reviewerLLM.Ask(o.context, reviewFragment)
 		if err != nil {
 			return NewEmptyFragment(), false, fmt.Errorf("failed to get review result: %w", err)
 		}
diff --git a/prompt/prompt.go b/prompt/prompt.go
index 6006065..7ef3dc2 100644
--- a/prompt/prompt.go
+++ b/prompt/prompt.go
@@ -20,6 +20,7 @@ const (
 	PromptTODOWorkType             PromptType = iota
 	PromptTODOReviewType           PromptType = iota
 	PromptTODOTrackingType         PromptType = iota
+	PromptConversationCompactionType PromptType = iota
 )
 
 var (
@@ -41,6 +42,7 @@ var (
 		PromptTODOWorkType:             PromptTODOWork,
 		PromptTODOReviewType:           PromptTODOReview,
 		PromptTODOTrackingType:         PromptTODOTracking,
+		PromptConversationCompactionType: PromptConversationCompaction,
 	}
 
 	PromptGuidelinesExtraction = NewPrompt("What guidelines should be applied? return only the numbers of the guidelines by using the json tool with a list of integers corresponding to the guidelines.")
@@ -328,4 +330,20 @@ Use the "json" tool to return an updated TODO list with:
 - Completed TODOs marked as completed
 - Any new TODOs that were identified
 - Updated feedback for TODOs if provided`)
+
+	PromptConversationCompaction = NewPrompt(`You are an AI assistant that summarizes a conversation history to preserve important context while reducing token count.
+
+Analyze the conversation history and create a concise summary that preserves:
+1. The original user request/goal
+2. Key decisions and reasoning
+3. Important tool results
+4. Current state of the task
+
+Conversation History:
+{{.Context}}
+
+Tool Results:
+{{.ToolResults}}
+
+Provide a summary that allows continuing the task without losing critical context. Be concise but comprehensive.`)
 )
diff --git a/reviewer.go b/reviewer.go
index 3392271..62257f3 100644
--- a/reviewer.go
+++ b/reviewer.go
@@ -97,5 +97,9 @@ func improveContent(llm LLM, f Fragment, refinedMessage string, gaps []string, o
 
 	newFragment.ParentFragment = f.ParentFragment
 
-	return llm.Ask(o.context, newFragment)
+	improved, _, err := llm.Ask(o.context, newFragment) // token usage is not tracked here
+	if err != nil {
+		return Fragment{}, err
+	}
+	return improved, nil // the fragment returned by Ask carries the LLM's reply
 }
diff --git a/reviewer_e2e_test.go b/reviewer_e2e_test.go
index 0c86d9f..3df1461 100644
--- a/reviewer_e2e_test.go
+++ b/reviewer_e2e_test.go
@@ -16,7 +16,7 @@ var _ = Describe("cogito test", Label("e2e"), func() {
 
 			conv := NewEmptyFragment().AddMessage("user", "Explain how a combustion engine works in less than 100 words.")
 
-			result, err := defaultLLM.Ask(context.TODO(), conv)
+			result, _, err := defaultLLM.Ask(context.TODO(), conv)
 
 			Expect(err).ToNot(HaveOccurred())
 
@@ -30,7 +30,7 @@ var _ = Describe("cogito test", Label("e2e"), func() {
 
 			conv := NewEmptyFragment().AddMessage("user", "What are the latest news today?")
 
-			result, err := defaultLLM.Ask(context.TODO(), conv)
+			result, _, err := defaultLLM.Ask(context.TODO(), conv)
 
 			Expect(err).ToNot(HaveOccurred())
 			Expect(result.String()).ToNot(BeEmpty())
diff --git a/tests/mock/client.go b/tests/mock/client.go
index 13183d6..1e2eb19 100644
--- a/tests/mock/client.go
+++ b/tests/mock/client.go
@@ -19,23 +19,31 @@ type MockOpenAIClient struct {
 	AskError                      error
 	CreateChatCompletionError     error
 	FragmentHistory               []Fragment
+
+	// Token usage for responses
+	AskUsage                  []LLMUsage
+	AskUsageIndex             int
+	CreateChatCompletionUsage []LLMUsage
+	CreateChatCompletionUsageIndex int
 }
 
 func NewMockOpenAIClient() *MockOpenAIClient {
 	return &MockOpenAIClient{
 		AskResponses:                  []Fragment{},
 		CreateChatCompletionResponses: []openai.ChatCompletionResponse{},
+		AskUsage:                      []LLMUsage{},
+		CreateChatCompletionUsage:     []LLMUsage{},
 	}
 }
 
-func (m *MockOpenAIClient) Ask(ctx context.Context, f Fragment) (Fragment, error) {
+func (m *MockOpenAIClient) Ask(ctx context.Context, f Fragment) (Fragment, LLMUsage, error) {
 	m.FragmentHistory = append(m.FragmentHistory, f)
 	if m.AskError != nil {
-		return Fragment{}, m.AskError
+		return Fragment{}, LLMUsage{}, m.AskError
 	}
 
 	if m.AskResponseIndex >= len(m.AskResponses) {
-		return Fragment{}, fmt.Errorf("no more Ask responses configured")
+		return Fragment{}, LLMUsage{}, fmt.Errorf("no more Ask responses configured")
 	}
 
 	response := m.AskResponses[m.AskResponseIndex]
@@ -48,26 +56,41 @@ func (m *MockOpenAIClient) Ask(ctx context.Context, f Fragment) (Fragment, error
 	response.Messages = append(f.Messages, response.Messages...)
 	response.ParentFragment = &f
 
-	return response, nil
+	// Get usage if available
+	var usage LLMUsage
+	if m.AskUsageIndex < len(m.AskUsage) {
+		usage = m.AskUsage[m.AskUsageIndex]
+		m.AskUsageIndex++
+	}
+
+	return response, usage, nil
 }
 
-func (m *MockOpenAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (LLMReply, error) {
+func (m *MockOpenAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (LLMReply, LLMUsage, error) {
 	if m.CreateChatCompletionError != nil {
-		return LLMReply{}, m.CreateChatCompletionError
+		return LLMReply{}, LLMUsage{}, m.CreateChatCompletionError
 	}
 
 	if m.CreateChatCompletionIndex >= len(m.CreateChatCompletionResponses) {
-		return LLMReply{}, fmt.Errorf("no more CreateChatCompletion responses configured")
+		return LLMReply{}, LLMUsage{}, fmt.Errorf("no more CreateChatCompletion responses configured")
 	}
 
 	response := m.CreateChatCompletionResponses[m.CreateChatCompletionIndex]
 	m.CreateChatCompletionIndex++
 
 	xlog.Info("CreateChatCompletion response", "response", response)
+
+	// Get usage if available
+	var usage LLMUsage
+	if m.CreateChatCompletionUsageIndex < len(m.CreateChatCompletionUsage) {
+		usage = m.CreateChatCompletionUsage[m.CreateChatCompletionUsageIndex]
+		m.CreateChatCompletionUsageIndex++
+	}
+
 	return LLMReply{
 		ChatCompletionResponse: response,
 		ReasoningContent:       response.Choices[0].Message.ReasoningContent,
-	}, nil
+	}, usage, nil
 }
 
 // Helper methods for setting up mock responses
@@ -109,3 +132,14 @@ func (m *MockOpenAIClient) AddCreateChatCompletionFunction(name, args string) {
 func (m *MockOpenAIClient) SetCreateChatCompletionError(err error) {
 	m.CreateChatCompletionError = err
 }
+
+// SetUsage queues one usage entry for the next Ask and one for the next CreateChatCompletion.
+func (m *MockOpenAIClient) SetUsage(promptTokens, completionTokens, totalTokens int) {
+	usage := LLMUsage{
+		PromptTokens:     promptTokens,
+		CompletionTokens: completionTokens,
+		TotalTokens:      totalTokens,
+	}
+	m.AskUsage = append(m.AskUsage, usage)
+	m.CreateChatCompletionUsage = append(m.CreateChatCompletionUsage, usage)
+}
diff --git a/tools.go b/tools.go
index 1fa6f89..2850b75 100644
--- a/tools.go
+++ b/tools.go
@@ -203,7 +203,7 @@ func decision(ctx context.Context, llm LLM, conversation []openai.ChatCompletion
 
 	var lastErr error
 	for attempts := 0; attempts < maxRetries; attempts++ {
-		resp, err := llm.CreateChatCompletion(ctx, decision)
+		resp, _, err := llm.CreateChatCompletion(ctx, decision)
 		if err != nil {
 			lastErr = err
 			xlog.Warn("Attempt to make a decision failed", "attempt", attempts+1, "error", err)
@@ -602,7 +602,7 @@ func decideToPlan(llm LLM, f Fragment, tools Tools, opts ...Option) (bool, error
 		return false, fmt.Errorf("failed to render content improver prompt: %w", err)
 	}
 
-	planDecision, err := llm.Ask(o.context, NewEmptyFragment().AddMessage("user", prompt))
+	planDecision, _, err := llm.Ask(o.context, NewEmptyFragment().AddMessage("user", prompt))
 	if err != nil {
 		return false, fmt.Errorf("failed to ask LLM for plan decision: %w", err)
 	}
@@ -886,12 +886,23 @@ TOOL_LOOP:
 
 			// Preserve the status before calling Ask
 			status := f.Status
-			f, err := llm.Ask(o.context, f)
+			f, _, err := llm.Ask(o.context, f)
 			if err != nil {
 				return f, fmt.Errorf("failed to ask LLM: %w", err)
 			}
 			// Restore the status
 			f.Status = status
+
+			// Check and compact if threshold exceeded
+			if o.compactionThreshold > 0 {
+				var compacted bool // declared on its own so the call below assigns to the enclosing f instead of shadowing it
+				if f, compacted, err = checkAndCompact(o.context, llm, f, o.compactionThreshold, o.compactionKeepMessages, o.prompts); err != nil {
+					return f, fmt.Errorf("failed to compact: %w", err)
+				}
+				if compacted {
+					xlog.Debug("Fragment compacted successfully after max iterations")
+				}
+			}
 			return f, nil
 		}
 
@@ -1288,10 +1299,21 @@ Please provide revised tool call based on this feedback.`,
 	// If sink state was found, stop execution after processing all tools
 	if hasSinkState {
 		xlog.Debug("Sink state was found, stopping execution after processing tools")
-		f, err = llm.Ask(o.context, f)
+		f, _, err = llm.Ask(o.context, f)
 		if err != nil {
 			return f, fmt.Errorf("failed to ask LLM: %w", err)
 		}
+
+		// Check and compact if threshold exceeded
+		if o.compactionThreshold > 0 {
+			var compacted bool // declared on its own so the call below assigns to the enclosing f instead of shadowing it
+			if f, compacted, err = checkAndCompact(o.context, llm, f, o.compactionThreshold, o.compactionKeepMessages, o.prompts); err != nil {
+				return f, fmt.Errorf("failed to compact: %w", err)
+			}
+			if compacted {
+				xlog.Debug("Fragment compacted successfully after sink state")
+			}
+		}
 	}
 
 	if len(f.Status.ToolsCalled) == 0 {
@@ -1313,3 +1335,156 @@ Please provide revised tool call based on this feedback.`,
 
 	return f, nil
 }
+
+// compactFragment shrinks a conversation by replacing its older messages with an
+// LLM-generated summary while keeping the most recent messages verbatim.
+//
+// The last keepMessages messages are retained; everything before them is rendered
+// into the PromptConversationCompactionType prompt and summarized with llm.Ask.
+// The result holds a system notice, the summary as an assistant message, then the
+// retained messages, and carries over ParentFragment and Status. f is returned
+// unchanged when there is nothing to summarize or the summary comes back empty,
+// and together with a wrapped error when rendering or the LLM call fails.
+func compactFragment(ctx context.Context, llm LLM, f Fragment, keepMessages int, prompts prompt.PromptMap) (Fragment, error) {
+	xlog.Debug("[compactFragment] Starting conversation compaction", "currentMessages", len(f.Messages), "keepMessages", keepMessages)
+
+	// A negative count would yield out-of-range slice bounds below.
+	if keepMessages < 0 {
+		keepMessages = 0
+	}
+
+	// cut is the index of the first message kept verbatim. Move it back past any
+	// leading tool results so they stay together with the assistant message that
+	// issued the matching tool calls.
+	cut := len(f.Messages) - keepMessages
+	for cut > 0 && cut < len(f.Messages) && f.Messages[cut].Role == "tool" {
+		cut--
+	}
+	if cut <= 0 {
+		// Nothing precedes the retained window; a summary would only duplicate it.
+		xlog.Debug("[compactFragment] Nothing to compact", "currentMessages", len(f.Messages))
+		return f, nil
+	}
+	contextMessages := f.Messages[:cut]
+	recentMessages := f.Messages[cut:]
+
+	// Render the history for the prompt; tool outputs are additionally listed in
+	// their own numbered section.
+	contextStr := ""
+	toolResultsStr := ""
+	toolResultCount := 0
+	for _, msg := range contextMessages {
+		if msg.Role == "system" {
+			continue // Skip system messages in summary
+		}
+		contextStr += fmt.Sprintf("%s: %s\n", msg.Role, msg.Content)
+		if msg.Role == "tool" {
+			toolResultCount++
+			toolResultsStr += fmt.Sprintf("Tool result %d: %s\n", toolResultCount, msg.Content)
+		}
+	}
+
+	// Render the compaction prompt
+	prompter := prompts.GetPrompt(prompt.PromptConversationCompactionType)
+	compactionData := struct {
+		Context     string
+		ToolResults string
+	}{
+		Context:     contextStr,
+		ToolResults: toolResultsStr,
+	}
+
+	compactionPrompt, err := prompter.Render(compactionData)
+	if err != nil {
+		return f, fmt.Errorf("failed to render compaction prompt: %w", err)
+	}
+
+	// Ask the LLM to generate a summary
+	summaryFragment := NewEmptyFragment().AddMessage("user", compactionPrompt)
+	summaryFragment, _, err = llm.Ask(ctx, summaryFragment)
+	if err != nil {
+		return f, fmt.Errorf("failed to generate compaction summary: %w", err)
+	}
+
+	// The reply is the last message of the returned fragment.
+	var summary string
+	if len(summaryFragment.Messages) > 0 {
+		summary = summaryFragment.Messages[len(summaryFragment.Messages)-1].Content
+	}
+	if summary == "" {
+		// Dropping history in exchange for an empty summary would lose context.
+		xlog.Warn("[compactFragment] Empty summary returned, skipping compaction")
+		return f, nil
+	}
+
+	xlog.Debug("[compactFragment] Generated summary", "summaryLength", len(summary))
+
+	// Build new fragment with summary + recent messages
+	newFragment := NewEmptyFragment()
+
+	// Add system message indicating compaction
+	newFragment = newFragment.AddMessage("system", "[This conversation has been compacted to reduce token count. The following is a summary of previous context:]")
+
+	// Add the summary
+	newFragment = newFragment.AddMessage("assistant", summary)
+
+	// Append the retained messages as-is so tool calls, tool call IDs and any
+	// other message fields survive compaction.
+	newFragment.Messages = append(newFragment.Messages, recentMessages...)
+
+	// Preserve parent fragment and status
+	newFragment.ParentFragment = f.ParentFragment
+	if f.Status != nil {
+		newFragment.Status = &Status{
+			ReasoningLog:     f.Status.ReasoningLog,
+			ToolsCalled:      f.Status.ToolsCalled,
+			ToolResults:      f.Status.ToolResults,
+			PastActions:      f.Status.PastActions,
+			InjectedMessages: f.Status.InjectedMessages,
+			Iterations:       f.Status.Iterations,
+		}
+	}
+
+	xlog.Debug("[compactFragment] Compaction complete", "newMessages", len(newFragment.Messages))
+
+	return newFragment, nil
+}
+
+// checkAndCompact checks if estimated token count exceeds threshold and performs compaction if needed
+// Returns the (potentially compacted) fragment and whether compaction was performed
+func checkAndCompact(ctx context.Context, llm LLM, f Fragment, threshold int, keepMessages int, prompts prompt.PromptMap) (Fragment, bool, error) {
+	if threshold <= 0 {
+		return f, false, nil // Compaction disabled
+	}
+
+	// Estimate token count based on message content
+	estimatedTokens := estimateTokens(f.Messages)
+
+	if estimatedTokens >= threshold {
+		xlog.Debug("[checkAndCompact] Token threshold exceeded", "estimatedTokens", estimatedTokens, "threshold", threshold)
+		compacted, err := compactFragment(ctx, llm, f, keepMessages, prompts)
+		if err != nil {
+			return f, false, err
+		}
+		return compacted, true, nil
+	}
+
+	return f, false, nil
+}
+
+// estimateTokens provides a rough estimate of token count based on message content
+func estimateTokens(messages []openai.ChatCompletionMessage) int {
+	// Rough estimate: ~4 characters per token on average
+	total := 0
+	for _, msg := range messages {
+		// Add content length
+		total += len(msg.Content) / 4
+		// Add role overhead
+		total += 10
+		// Add tool call overhead if present
+		for _, tc := range msg.ToolCalls {
+			total += len(tc.Function.Name) + len(tc.Function.Arguments)
+		}
+	}
+	return total
+}

From 80d629b227529b693a0b63223f6d3eb3857ab14a Mon Sep 17 00:00:00 2001
From: localai-bot <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 24 Feb 2026 22:38:34 +0000
Subject: [PATCH 2/3] fix: use actual usage tokens from LLM response for
 compaction

- Store LastUsage in Status struct from LLM responses
- checkAndCompact now uses actual TotalTokens from LLM response
- Removed estimateTokens function (no longer needed)
- Fallback estimate only used on first iteration when no usage data available
---
 fragment.go |  1 +
 tools.go    | 53 ++++++++++++++++++++++++++++++-----------------------
 2 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/fragment.go b/fragment.go
index 73bd1be..01b579c 100644
--- a/fragment.go
+++ b/fragment.go
@@ -32,6 +32,7 @@ type InjectedMessage struct {
 }
 
 type Status struct {
+	LastUsage       LLMUsage             // Track token usage from the last LLM call
 	Iterations       int
 	ToolsCalled      Tools
 	ToolResults      []ToolStatus
diff --git a/tools.go b/tools.go
index 2850b75..44925ff 100644
--- a/tools.go
+++ b/tools.go
@@ -886,11 +886,16 @@ TOOL_LOOP:
 
 			// Preserve the status before calling Ask
 			status := f.Status
-			f, _, err := llm.Ask(o.context, f)
+			f, usage, err := llm.Ask(o.context, f)
 			if err != nil {
 				return f, fmt.Errorf("failed to ask LLM: %w", err)
 			}
-			// Restore the status
+			// Store usage tokens
+			if f.Status != nil {
+				f.Status.LastUsage = usage
+			}
+			// Restore the status (preserving LastUsage)
+			status.LastUsage = usage
 			f.Status = status
 
 			// Check and compact if threshold exceeded
@@ -1450,18 +1455,36 @@ func compactFragment(ctx context.Context, llm LLM, f Fragment, keepMessages int,
 	return newFragment, nil
 }
 
-// checkAndCompact checks if estimated token count exceeds threshold and performs compaction if needed
+// checkAndCompact checks if actual token count from LLM response exceeds threshold and performs compaction if needed
 // Returns the (potentially compacted) fragment and whether compaction was performed
 func checkAndCompact(ctx context.Context, llm LLM, f Fragment, threshold int, keepMessages int, prompts prompt.PromptMap) (Fragment, bool, error) {
 	if threshold <= 0 {
 		return f, false, nil // Compaction disabled
 	}
 
-	// Estimate token count based on message content
-	estimatedTokens := estimateTokens(f.Messages)
+	// Use the actual usage tokens from the last LLM call stored in Status
+	totalUsedTokens := 0
+	if f.Status != nil && f.Status.LastUsage.TotalTokens > 0 {
+		totalUsedTokens = f.Status.LastUsage.TotalTokens
+		xlog.Debug("[checkAndCompact] Using actual usage tokens from LLM response", "totalUsedTokens", totalUsedTokens, "threshold", threshold)
+	} else {
+		// Fallback to rough estimate if no usage data available (first iteration)
+		for _, msg := range f.Messages {
+			if msg.Role == "assistant" || msg.Role == "tool" {
+				totalUsedTokens += len(msg.Content) / 4 // Rough estimate
+			}
+		}
+		// Also count tool call arguments
+		for _, msg := range f.Messages {
+			for _, tc := range msg.ToolCalls {
+				totalUsedTokens += len(tc.Function.Name) + len(tc.Function.Arguments)
+			}
+		}
+		xlog.Debug("[checkAndCompact] Using rough estimate (no usage data)", "totalUsedTokens", totalUsedTokens, "threshold", threshold)
+	}
 
-	if estimatedTokens >= threshold {
-		xlog.Debug("[checkAndCompact] Token threshold exceeded", "estimatedTokens", estimatedTokens, "threshold", threshold)
+	if totalUsedTokens >= threshold {
+		xlog.Debug("[checkAndCompact] Token threshold exceeded", "totalUsedTokens", totalUsedTokens, "threshold", threshold)
 		compacted, err := compactFragment(ctx, llm, f, keepMessages, prompts)
 		if err != nil {
 			return f, false, err
@@ -1472,19 +1495,3 @@ func checkAndCompact(ctx context.Context, llm LLM, f Fragment, threshold int, ke
 	return f, false, nil
 }
 
-// estimateTokens provides a rough estimate of token count based on message content
-func estimateTokens(messages []openai.ChatCompletionMessage) int {
-	// Rough estimate: ~4 characters per token on average
-	total := 0
-	for _, msg := range messages {
-		// Add content length
-		total += len(msg.Content) / 4
-		// Add role overhead
-		total += 10
-		// Add tool call overhead if present
-		for _, tc := range msg.ToolCalls {
-			total += len(tc.Function.Name) + len(tc.Function.Arguments)
-		}
-	}
-	return total
-}

From 3e6d64d611852ee64f6716962b30d09342c94268 Mon Sep 17 00:00:00 2001
From: localai-bot <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 24 Feb 2026 23:02:57 +0000
Subject: [PATCH 3/3] fix: capture usage tokens after sink state LLM call for
 compaction

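The sink state handling was not capturing usage tokens from the LLM response,
which meant the compaction check would use the rough estimate instead of
actual usage tokens. This change stores LastUsage (when f.Status is non-nil)
after the llm.Ask call in the hasSinkState block, allowing proper token-based
compaction.

The remaining changes in this patch are gofmt alignment fixes to code added
by the earlier patches in this series; they do not alter behavior.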
---
 clients/openai_client.go |  6 ++--
 fragment.go              |  2 +-
 llm.go                   |  6 ++--
 options.go               | 24 +++++++-------
 prompt/prompt.go         | 68 ++++++++++++++++++++--------------------
 tests/mock/client.go     | 12 +++----
 tools.go                 | 21 +++++++------
 7 files changed, 71 insertions(+), 68 deletions(-)

diff --git a/clients/openai_client.go b/clients/openai_client.go
index 17e7d7d..e1585f3 100644
--- a/clients/openai_client.go
+++ b/clients/openai_client.go
@@ -46,9 +46,9 @@ func (llm *OpenAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fra
 
 	if len(resp.Choices) > 0 {
 		usage := cogito.LLMUsage{
-			PromptTokens:      resp.Usage.PromptTokens,
-			CompletionTokens:  resp.Usage.CompletionTokens,
-			TotalTokens:       resp.Usage.TotalTokens,
+			PromptTokens:     resp.Usage.PromptTokens,
+			CompletionTokens: resp.Usage.CompletionTokens,
+			TotalTokens:      resp.Usage.TotalTokens,
 		}
 		return cogito.Fragment{
 			Messages:       append(f.Messages, resp.Choices[0].Message),
diff --git a/fragment.go b/fragment.go
index 01b579c..fd1619b 100644
--- a/fragment.go
+++ b/fragment.go
@@ -32,7 +32,7 @@ type InjectedMessage struct {
 }
 
 type Status struct {
-	LastUsage       LLMUsage             // Track token usage from the last LLM call
+	LastUsage        LLMUsage // Track token usage from the last LLM call
 	Iterations       int
 	ToolsCalled      Tools
 	ToolResults      []ToolStatus
diff --git a/llm.go b/llm.go
index 039c358..5443c3e 100644
--- a/llm.go
+++ b/llm.go
@@ -8,9 +8,9 @@ import (
 
 // LLMUsage represents token usage information from an LLM response
 type LLMUsage struct {
-	PromptTokens      int
-	CompletionTokens  int
-	TotalTokens       int
+	PromptTokens     int
+	CompletionTokens int
+	TotalTokens      int
 }
 
 type LLM interface {
diff --git a/options.go b/options.go
index a788a20..76907b5 100644
--- a/options.go
+++ b/options.go
@@ -73,18 +73,18 @@ type Option func(*Options)
 
 func defaultOptions() *Options {
 	return &Options{
-		maxIterations:         1,
-		maxAttempts:           1,
-		maxRetries:            5,
-		loopDetectionSteps:    0,
-		forceReasoning:        false,
-		maxAdjustmentAttempts: 5,
-		sinkStateTool:         &defaultSinkStateTool{},
-		sinkState:             true,
-		context:               context.Background(),
-		statusCallback:        func(s string) {},
-		reasoningCallback:     func(s string) {},
-		compactionThreshold:   0,   // Disabled by default
+		maxIterations:          1,
+		maxAttempts:            1,
+		maxRetries:             5,
+		loopDetectionSteps:     0,
+		forceReasoning:         false,
+		maxAdjustmentAttempts:  5,
+		sinkStateTool:          &defaultSinkStateTool{},
+		sinkState:              true,
+		context:                context.Background(),
+		statusCallback:         func(s string) {},
+		reasoningCallback:      func(s string) {},
+		compactionThreshold:    0,  // Disabled by default
 		compactionKeepMessages: 10, // Keep 10 recent messages by default
 	}
 }
diff --git a/prompt/prompt.go b/prompt/prompt.go
index 7ef3dc2..aa7e712 100644
--- a/prompt/prompt.go
+++ b/prompt/prompt.go
@@ -3,45 +3,45 @@ package prompt
 type PromptType uint
 
 const (
-	GapAnalysisType                PromptType = iota
-	ContentImproverType            PromptType = iota
-	PromptBooleanType              PromptType = iota
-	PromptIdentifyGoalType         PromptType = iota
-	PromptGoalAchievedType         PromptType = iota
-	PromptPlanType                 PromptType = iota
-	PromptReEvaluatePlanType       PromptType = iota
-	PromptSubtaskExtractionType    PromptType = iota
-	PromptPlanExecutionType        PromptType = iota
-	PromptGuidelinesType           PromptType = iota
-	PromptGuidelinesExtractionType PromptType = iota
-	PromptPlanDecisionType         PromptType = iota
-	PromptParameterReasoningType   PromptType = iota
-	PromptTODOGenerationType       PromptType = iota
-	PromptTODOWorkType             PromptType = iota
-	PromptTODOReviewType           PromptType = iota
-	PromptTODOTrackingType         PromptType = iota
+	GapAnalysisType                  PromptType = iota
+	ContentImproverType              PromptType = iota
+	PromptBooleanType                PromptType = iota
+	PromptIdentifyGoalType           PromptType = iota
+	PromptGoalAchievedType           PromptType = iota
+	PromptPlanType                   PromptType = iota
+	PromptReEvaluatePlanType         PromptType = iota
+	PromptSubtaskExtractionType      PromptType = iota
+	PromptPlanExecutionType          PromptType = iota
+	PromptGuidelinesType             PromptType = iota
+	PromptGuidelinesExtractionType   PromptType = iota
+	PromptPlanDecisionType           PromptType = iota
+	PromptParameterReasoningType     PromptType = iota
+	PromptTODOGenerationType         PromptType = iota
+	PromptTODOWorkType               PromptType = iota
+	PromptTODOReviewType             PromptType = iota
+	PromptTODOTrackingType           PromptType = iota
 	PromptConversationCompactionType PromptType = iota
 )
 
 var (
 	defaultPromptMap PromptMap = map[PromptType]Prompt{
-		GapAnalysisType:                PromptGapsAnalysis,
-		ContentImproverType:            PromptContentImprover,
-		PromptBooleanType:              PromptExtractBoolean,
-		PromptIdentifyGoalType:         PromptIdentifyGoal,
-		PromptGoalAchievedType:         PromptGoalAchieved,
-		PromptPlanType:                 PromptPlan,
-		PromptReEvaluatePlanType:       PromptReEvaluatePlan,
-		PromptSubtaskExtractionType:    PromptSubtaskExtraction,
-		PromptPlanExecutionType:        PromptPlanExecution,
-		PromptGuidelinesType:           PromptGuidelines,
-		PromptGuidelinesExtractionType: PromptGuidelinesExtraction,
-		PromptPlanDecisionType:         DecideIfPlanningIsNeeded,
-		PromptParameterReasoningType:   PromptParameterReasoning,
-		PromptTODOGenerationType:       PromptTODOGeneration,
-		PromptTODOWorkType:             PromptTODOWork,
-		PromptTODOReviewType:           PromptTODOReview,
-		PromptTODOTrackingType:         PromptTODOTracking,
+		GapAnalysisType:                  PromptGapsAnalysis,
+		ContentImproverType:              PromptContentImprover,
+		PromptBooleanType:                PromptExtractBoolean,
+		PromptIdentifyGoalType:           PromptIdentifyGoal,
+		PromptGoalAchievedType:           PromptGoalAchieved,
+		PromptPlanType:                   PromptPlan,
+		PromptReEvaluatePlanType:         PromptReEvaluatePlan,
+		PromptSubtaskExtractionType:      PromptSubtaskExtraction,
+		PromptPlanExecutionType:          PromptPlanExecution,
+		PromptGuidelinesType:             PromptGuidelines,
+		PromptGuidelinesExtractionType:   PromptGuidelinesExtraction,
+		PromptPlanDecisionType:           DecideIfPlanningIsNeeded,
+		PromptParameterReasoningType:     PromptParameterReasoning,
+		PromptTODOGenerationType:         PromptTODOGeneration,
+		PromptTODOWorkType:               PromptTODOWork,
+		PromptTODOReviewType:             PromptTODOReview,
+		PromptTODOTrackingType:           PromptTODOTracking,
 		PromptConversationCompactionType: PromptConversationCompaction,
 	}
 
diff --git a/tests/mock/client.go b/tests/mock/client.go
index 1e2eb19..a6b1df2 100644
--- a/tests/mock/client.go
+++ b/tests/mock/client.go
@@ -21,9 +21,9 @@ type MockOpenAIClient struct {
 	FragmentHistory               []Fragment
 
 	// Token usage for responses
-	AskUsage                  []LLMUsage
-	AskUsageIndex             int
-	CreateChatCompletionUsage []LLMUsage
+	AskUsage                       []LLMUsage
+	AskUsageIndex                  int
+	CreateChatCompletionUsage      []LLMUsage
 	CreateChatCompletionUsageIndex int
 }
 
@@ -136,9 +136,9 @@ func (m *MockOpenAIClient) SetCreateChatCompletionError(err error) {
 // SetUsage sets token usage for the next responses
 func (m *MockOpenAIClient) SetUsage(promptTokens, completionTokens, totalTokens int) {
 	usage := LLMUsage{
-		PromptTokens:      promptTokens,
-		CompletionTokens:  completionTokens,
-		TotalTokens:       totalTokens,
+		PromptTokens:     promptTokens,
+		CompletionTokens: completionTokens,
+		TotalTokens:      totalTokens,
 	}
 	m.AskUsage = append(m.AskUsage, usage)
 	m.CreateChatCompletionUsage = append(m.CreateChatCompletionUsage, usage)
diff --git a/tools.go b/tools.go
index 44925ff..03a0e69 100644
--- a/tools.go
+++ b/tools.go
@@ -1300,15 +1300,19 @@ Please provide revised tool call based on this feedback.`,
 
 	}
 
-	var err error
 	// If sink state was found, stop execution after processing all tools
 	if hasSinkState {
 		xlog.Debug("Sink state was found, stopping execution after processing tools")
-		f, _, err = llm.Ask(o.context, f)
+		f, usage, err := llm.Ask(o.context, f)
 		if err != nil {
 			return f, fmt.Errorf("failed to ask LLM: %w", err)
 		}
 
+		// Store usage tokens for compaction check
+		if f.Status != nil {
+			f.Status.LastUsage = usage
+		}
+
 		// Check and compact if threshold exceeded
 		if o.compactionThreshold > 0 {
 			f, compacted, err := checkAndCompact(o.context, llm, f, o.compactionThreshold, o.compactionKeepMessages, o.prompts)
@@ -1382,7 +1386,7 @@ func compactFragment(ctx context.Context, llm LLM, f Fragment, keepMessages int,
 	// Render the compaction prompt
 	prompter := prompts.GetPrompt(prompt.PromptConversationCompactionType)
 	compactionData := struct {
-		Context    string
+		Context     string
 		ToolResults string
 	}{
 		Context:     contextStr,
@@ -1441,12 +1445,12 @@ func compactFragment(ctx context.Context, llm LLM, f Fragment, keepMessages int,
 	newFragment.ParentFragment = f.ParentFragment
 	if f.Status != nil {
 		newFragment.Status = &Status{
-			ReasoningLog:    f.Status.ReasoningLog,
-			ToolsCalled:     f.Status.ToolsCalled,
-			ToolResults:     f.Status.ToolResults,
-			PastActions:     f.Status.PastActions,
+			ReasoningLog:     f.Status.ReasoningLog,
+			ToolsCalled:      f.Status.ToolsCalled,
+			ToolResults:      f.Status.ToolResults,
+			PastActions:      f.Status.PastActions,
 			InjectedMessages: f.Status.InjectedMessages,
-			Iterations:      f.Status.Iterations,
+			Iterations:       f.Status.Iterations,
 		}
 	}
 
@@ -1494,4 +1498,3 @@ func checkAndCompact(ctx context.Context, llm LLM, f Fragment, threshold int, ke
 
 	return f, false, nil
 }
-