Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
96e16f2
feat: add automatic conversation compaction based on token threshold
localai-bot Feb 24, 2026
80d629b
fix: use actual usage tokens from LLM response for compaction
localai-bot Feb 24, 2026
3e6d64d
fix: capture usage tokens after sink state LLM call for compaction
localai-bot Feb 24, 2026
232f718
fix: move compaction check to beginning of tool loop
localai-bot Feb 25, 2026
8d79990
fix: update Ask to return usage tokens from LocalAIClient
localai-bot Feb 25, 2026
5be96d5
fix: set LastUsage in Ask function return fragment
localai-bot Feb 25, 2026
d74447f
refactor: Ask() updates Fragment.Status.LastUsage directly
localai-bot Feb 25, 2026
9214ddb
Apply suggestion from @mudler
mudler Feb 25, 2026
7895d8a
Apply suggestion from @mudler
mudler Feb 25, 2026
a6227bf
Apply suggestion from @mudler
mudler Feb 25, 2026
dea5c91
Apply suggestion from @mudler
mudler Feb 25, 2026
783446d
Apply suggestions from code review
mudler Feb 25, 2026
a5a0276
Apply suggestion from @mudler
mudler Feb 25, 2026
3595e1f
test: add mocked tests for compaction functionality
localai-bot Feb 25, 2026
e52126c
test: add compaction tests to tools_test.go suite
localai-bot Feb 25, 2026
1f7405a
chore: run go fmt and add compaction docs to README
localai-bot Feb 25, 2026
2fea6f4
Make CompactFragment and CheckAndCompact private
localai-bot Feb 25, 2026
1061460
chore: remove exported functions from README, keep them private
localai-bot Feb 25, 2026
9079e8a
chore: verify all changes applied - build passes
localai-bot Feb 25, 2026
4913886
chore: verify build and vet pass
localai-bot Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1109,6 +1109,45 @@ result, err := cogito.ExecuteTools(llm, fragment,
cogito.EnableStrictGuidelines)
```


### Automatic Conversation Compaction

Cogito can automatically compact conversations to prevent context overflow when token usage exceeds a threshold. This is useful for long-running conversations with LLMs that have context limits.

**How it works:**

1. After each LLM call, Cogito checks if the token count exceeds the threshold
2. If exceeded, it generates a summary of the conversation history using an LLM
3. The original messages are replaced with a condensed summary, preserving context

**Basic Usage:**

```go
// Enable automatic compaction with a token threshold of 4000
// This will trigger compaction when the conversation exceeds 4000 tokens
result, err := cogito.ExecuteTools(llm, fragment,
cogito.WithTools(searchTool),
cogito.WithCompactionThreshold(4000))
```

**Customizing Compaction:**

```go
// Set custom compaction options
result, err := cogito.ExecuteTools(llm, fragment,
cogito.WithTools(searchTool),
cogito.WithCompactionThreshold(4000), // Trigger at 4000 tokens
cogito.WithCompactionKeepMessages(5), // Keep last 5 messages (default: 10)
)
```

**Notes:**

- Compaction requires token usage data from the LLM (supported by OpenAI, and by LocalAI when token usage reporting is enabled)
- If `LastUsage` is not available, Cogito falls back to estimating tokens from message count
- The summary prompt uses the conversation compaction prompt type
- Compaction preserves `Status` fields like `LastUsage`, `ToolsCalled`, etc.

### Custom Prompts

```go
Expand Down
40 changes: 26 additions & 14 deletions clients/localai_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,17 @@ func (m *localAICompletionMessage) UnmarshalJSON(data []byte) error {

// CreateChatCompletion sends the chat completion request and parses the response,
// including LocalAI's optional "reasoning" field, into LLMReply.ReasoningContent.
func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, error) {
func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, cogito.LLMUsage, error) {
request.Model = llm.model
body, err := json.Marshal(request)
if err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: marshal request: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: marshal request: %w", err)
}

url := llm.baseURL + "/chat/completions"
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
if err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: new request: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: new request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Accept", "application/json")
Expand All @@ -101,21 +101,21 @@ func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request open

resp, err := llm.client.Do(req)
if err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: request: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: request: %w", err)
}
defer resp.Body.Close()

respBody, err := io.ReadAll(resp.Body)
if err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: read response: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: read response: %w", err)
}

if resp.StatusCode != http.StatusOK {
var errRes openai.ErrorResponse
if json.Unmarshal(respBody, &errRes) == nil && errRes.Error != nil {
return cogito.LLMReply{}, errRes.Error
return cogito.LLMReply{}, cogito.LLMUsage{}, errRes.Error
}
return cogito.LLMReply{}, &openai.RequestError{
return cogito.LLMReply{}, cogito.LLMUsage{}, &openai.RequestError{
HTTPStatus: resp.Status,
HTTPStatusCode: resp.StatusCode,
Err: fmt.Errorf("localai: %s", string(respBody)),
Expand All @@ -125,11 +125,11 @@ func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request open

var localResp localAIChatCompletionResponse
if err := json.Unmarshal(respBody, &localResp); err != nil {
return cogito.LLMReply{}, fmt.Errorf("localai: unmarshal response: %w", err)
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: unmarshal response: %w", err)
}

if len(localResp.Choices) == 0 {
return cogito.LLMReply{}, fmt.Errorf("localai: no choices in response")
return cogito.LLMReply{}, cogito.LLMUsage{}, fmt.Errorf("localai: no choices in response")
}

choice := localResp.Choices[0]
Expand Down Expand Up @@ -157,30 +157,42 @@ func (llm *LocalAIClient) CreateChatCompletion(ctx context.Context, request open
// Ensure ReasoningContent is set for downstream (e.g. tools.go).
response.Choices[0].Message.ReasoningContent = reasoning

usage := cogito.LLMUsage{
PromptTokens: localResp.Usage.PromptTokens,
CompletionTokens: localResp.Usage.CompletionTokens,
TotalTokens: localResp.Usage.TotalTokens,
}

return cogito.LLMReply{
ChatCompletionResponse: response,
ReasoningContent: reasoning,
}, nil
}, usage, nil
}

// Ask prompts the LLM with the provided messages and returns a Fragment
// containing the response. Uses CreateChatCompletion so reasoning is preserved.
// The Fragment's Status.LastUsage is updated with the token usage.
func (llm *LocalAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fragment, error) {
messages := f.GetMessages()
request := openai.ChatCompletionRequest{
Model: llm.model,
Messages: messages,
}
reply, err := llm.CreateChatCompletion(ctx, request)
reply, usage, err := llm.CreateChatCompletion(ctx, request)
if err != nil {
return cogito.Fragment{}, err
}
if len(reply.ChatCompletionResponse.Choices) == 0 {
return cogito.Fragment{}, fmt.Errorf("localai: no choices in response")
}
return cogito.Fragment{
result := cogito.Fragment{
Messages: append(f.Messages, reply.ChatCompletionResponse.Choices[0].Message),
ParentFragment: &f,
Status: &cogito.Status{},
}, nil
Status: f.Status,
}
if result.Status == nil {
result.Status = &cogito.Status{}
}
result.Status.LastUsage = usage
return result, nil
}
39 changes: 30 additions & 9 deletions clients/openai_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ func NewOpenAILLM(model, apiKey, baseURL string) *OpenAIClient {
// and returns a Fragment containing the response.
// The Fragment.GetMessages() method automatically handles force-text-reply
// when tool calls are present in the conversation history.
// The Fragment's Status.LastUsage is updated with the token usage.
func (llm *OpenAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fragment, error) {
// Use Fragment.GetMessages() which automatically adds force-text-reply
// system message when tool calls are detected in the conversation
Expand All @@ -40,27 +41,47 @@ func (llm *OpenAIClient) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fra
},
)

if err == nil && len(resp.Choices) > 0 {
return cogito.Fragment{
if err != nil {
return cogito.Fragment{}, err
}

if len(resp.Choices) > 0 {
usage := cogito.LLMUsage{
PromptTokens: resp.Usage.PromptTokens,
CompletionTokens: resp.Usage.CompletionTokens,
TotalTokens: resp.Usage.TotalTokens,
}
result := cogito.Fragment{
Messages: append(f.Messages, resp.Choices[0].Message),
ParentFragment: &f,
Status: &cogito.Status{},
}, nil
Status: f.Status,
}
if result.Status == nil {
result.Status = &cogito.Status{}
}
result.Status.LastUsage = usage
return result, nil
}

return cogito.Fragment{}, err
return cogito.Fragment{}, nil
}

func (llm *OpenAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, error) {
func (llm *OpenAIClient) CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (cogito.LLMReply, cogito.LLMUsage, error) {
request.Model = llm.model
response, err := llm.client.CreateChatCompletion(ctx, request)
if err != nil {
return cogito.LLMReply{}, err
return cogito.LLMReply{}, cogito.LLMUsage{}, err
}

usage := cogito.LLMUsage{
PromptTokens: response.Usage.PromptTokens,
CompletionTokens: response.Usage.CompletionTokens,
TotalTokens: response.Usage.TotalTokens,
}

return cogito.LLMReply{
ChatCompletionResponse: response,
ReasoningContent: response.Choices[0].Message.ReasoningContent,
}, nil
}, usage, nil
}

// NewOpenAIService creates a new OpenAI service instance
Expand Down
11 changes: 9 additions & 2 deletions fragment.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ type InjectedMessage struct {
}

type Status struct {
LastUsage LLMUsage // Track token usage from the last LLM call
Iterations int
ToolsCalled Tools
ToolResults []ToolStatus
Expand Down Expand Up @@ -97,6 +98,7 @@ func NewEmptyFragment() Fragment {
ReasoningLog: []string{},
ToolsCalled: Tools{},
ToolResults: []ToolStatus{},
LastUsage: LLMUsage{},
},
}
}
Expand All @@ -109,6 +111,7 @@ func NewFragment(messages ...openai.ChatCompletionMessage) Fragment {
ReasoningLog: []string{},
ToolsCalled: Tools{},
ToolResults: []ToolStatus{},
LastUsage: LLMUsage{},
},
}
}
Expand Down Expand Up @@ -210,11 +213,13 @@ func (r Fragment) ExtractStructure(ctx context.Context, llm LLM, s structures.St
},
}

resp, err := llm.CreateChatCompletion(ctx, decision)
resp, usage, err := llm.CreateChatCompletion(ctx, decision)
if err != nil {
return err
}

r.Status.LastUsage = usage

if len(resp.ChatCompletionResponse.Choices) != 1 {
return fmt.Errorf("no choices: %d", len(resp.ChatCompletionResponse.Choices))
}
Expand Down Expand Up @@ -271,11 +276,13 @@ func (f Fragment) SelectTool(ctx context.Context, llm LLM, availableTools Tools,
}
}

resp, err := llm.CreateChatCompletion(ctx, decision)
resp, usage, err := llm.CreateChatCompletion(ctx, decision)
if err != nil {
return Fragment{}, nil, err
}

f.Status.LastUsage = usage

if len(resp.ChatCompletionResponse.Choices) != 1 {
return Fragment{}, nil, fmt.Errorf("no choices: %d", len(resp.ChatCompletionResponse.Choices))
}
Expand Down
2 changes: 1 addition & 1 deletion fragment_e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ var _ = Describe("Result test", Label("e2e"), func() {
Content: "What's the weather today in San Francisco?",
})

newFragment, result, err := fragment.SelectTool(context.TODO(), *defaultLLM, Tools{
newFragment, result, err := fragment.SelectTool(context.TODO(), defaultLLM, Tools{
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When we run SelectTool, we also have to make sure we update the number of tokens that have been used.

NewToolDefinition(
(&GetWeatherTool{}),
WeatherArgs{},
Expand Down
9 changes: 8 additions & 1 deletion llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,16 @@ import (
"github.com/sashabaranov/go-openai"
)

// LLMUsage represents token usage information from an LLM response
type LLMUsage struct {
	PromptTokens     int // tokens consumed by the input/prompt messages
	CompletionTokens int // tokens generated by the model in the reply
	TotalTokens      int // prompt + completion tokens, as reported by the provider
}

type LLM interface {
Ask(ctx context.Context, f Fragment) (Fragment, error)
CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (LLMReply, error)
CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (LLMReply, LLMUsage, error)
}

type LLMReply struct {
Expand Down
46 changes: 35 additions & 11 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,23 +63,29 @@ type Options struct {
todos *structures.TODOList

messagesManipulator func([]openai.ChatCompletionMessage) []openai.ChatCompletionMessage

// Compaction options - automatic conversation compaction based on token count
compactionThreshold int // Token count threshold that triggers compaction (0 = disabled)
compactionKeepMessages int // Number of recent messages to keep after compaction
}

type Option func(*Options)

func defaultOptions() *Options {
return &Options{
maxIterations: 1,
maxAttempts: 1,
maxRetries: 5,
loopDetectionSteps: 0,
forceReasoning: false,
maxAdjustmentAttempts: 5,
sinkStateTool: &defaultSinkStateTool{},
sinkState: true,
context: context.Background(),
statusCallback: func(s string) {},
reasoningCallback: func(s string) {},
maxIterations: 1,
maxAttempts: 1,
maxRetries: 5,
loopDetectionSteps: 0,
forceReasoning: false,
maxAdjustmentAttempts: 5,
sinkStateTool: &defaultSinkStateTool{},
sinkState: true,
context: context.Background(),
statusCallback: func(s string) {},
reasoningCallback: func(s string) {},
compactionThreshold: 0, // Disabled by default
compactionKeepMessages: 10, // Keep 10 recent messages by default
}
}

Expand Down Expand Up @@ -367,6 +373,24 @@ func WithMessageInjectionResultChan(ch chan MessageInjectionResult) func(o *Opti
}
}

// WithCompactionThreshold configures the token count at which automatic
// conversation compaction kicks in. Once the total tokens reported for a
// response reach or exceed this value, the conversation is compacted to
// stay within the limit. A threshold of 0 (the default) disables
// automatic compaction entirely.
func WithCompactionThreshold(threshold int) func(o *Options) {
	return func(opts *Options) {
		opts.compactionThreshold = threshold
	}
}

// WithCompactionKeepMessages configures how many of the most recent
// messages survive a compaction pass (default: 10). It has no effect
// unless compaction is enabled via WithCompactionThreshold.
func WithCompactionKeepMessages(count int) func(o *Options) {
	return func(opts *Options) {
		opts.compactionKeepMessages = count
	}
}

type defaultSinkStateTool struct{}

func (d *defaultSinkStateTool) Execute(args map[string]any) (string, any, error) {
Expand Down
Loading
Loading