From 71412a13ff6befb4c8858e71320a50f79b189a9e Mon Sep 17 00:00:00 2001
From: wittekin <6079900+wittekin@users.noreply.github.com>
Date: Sat, 7 Mar 2026 17:50:33 -0800
Subject: [PATCH 1/4] fix: show MCP provider info instead of local path in
 startup header (#56)

When an MCP tasklist provider is configured, the startup banner now
displays the provider name and labels instead of the local file path.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/millstone/runtime/orchestrator.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/millstone/runtime/orchestrator.py b/src/millstone/runtime/orchestrator.py
index 2d9a0f8..a3033bc 100755
--- a/src/millstone/runtime/orchestrator.py
+++ b/src/millstone/runtime/orchestrator.py
@@ -680,7 +680,14 @@ def __init__(
             else:
                 if self.roadmap:
                     print(f"Roadmap: {self.roadmap}")
-                print(f"Tasklist: {self.tasklist}")
+                from millstone.artifact_providers.mcp import MCPTasklistProvider
+
+                tl_provider = self._outer_loop_manager.tasklist_provider
+                if isinstance(tl_provider, MCPTasklistProvider):
+                    label_str = ", ".join(tl_provider._labels) if tl_provider._labels else "none"
+                    print(f"Tasklist: {tl_provider._mcp_server} (labels: {label_str})")
+                else:
+                    print(f"Tasklist: {self.tasklist}")
                 print(f"Max tasks: {self.max_tasks}")
                 if self.compact_threshold > 0:
                     print(f"Compact threshold: {self.compact_threshold}")

From c1f1978ccb56288d63fbc0a6784ea14d2054b260 Mon Sep 17 00:00:00 2001
From: wittekin <6079900+wittekin@users.noreply.github.com>
Date: Sat, 7 Mar 2026 18:02:18 -0800
Subject: [PATCH 2/4] feat: show live task counts in --status for MCP providers

Fetch open/completed task counts from remote MCP providers in
analyze_tasklist() instead of returning zeros. Degrades gracefully
when the remote fetch fails. Also add explicit DO NOT COMMIT
instruction to the builder prompt.

Generated with millstone orchestrator
---
 src/millstone/prompts/tasklist_prompt.md |  1 +
 src/millstone/runtime/orchestrator.py    | 32 +++++++++++--
 tests/test_orchestrator.py               | 61 +++++++++++++++++++++++-
 3 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/src/millstone/prompts/tasklist_prompt.md b/src/millstone/prompts/tasklist_prompt.md
index 754c515..c46a2b0 100644
--- a/src/millstone/prompts/tasklist_prompt.md
+++ b/src/millstone/prompts/tasklist_prompt.md
@@ -20,6 +20,7 @@ You are an autonomous developer working in: {{WORKING_DIRECTORY}}
 - Allowed edits:
   - Code/tests needed for the chosen task.
   - The tasklist only to (a) check off the chosen task and (b) text-only coherence updates to future tasks if strictly necessary.
+- **DO NOT COMMIT**: Do not run `git commit` or `git push`. Leave changes staged or unstaged — the orchestrator commits after review.
 - **FORBIDDEN**:
   - Completing, partially completing, or reordering any other tasks.
   - Implementing future tasks.
diff --git a/src/millstone/runtime/orchestrator.py b/src/millstone/runtime/orchestrator.py
index a3033bc..cb06cdb 100755
--- a/src/millstone/runtime/orchestrator.py
+++ b/src/millstone/runtime/orchestrator.py
@@ -1990,13 +1990,37 @@ def analyze_tasklist(self) -> dict:
             label_str = ", ".join(provider._labels) if provider._labels else "none"
             print(f"Remote tasklist provider: {provider._mcp_server}")
             print(f"  Labels: {label_str}")
+
+            # Fetch live task counts from the remote provider
+            pending_count = 0
+            completed_count = 0
+            total_count = 0
+            try:
+                if provider._agent_callback is None:
+                    provider.set_agent_callback(
+                        lambda p, **k: self.run_agent(p, role="author", **k)
+                    )
+                provider.invalidate_cache()
+                tasks = provider.list_tasks()
+                from millstone.artifacts.models import TaskStatus
+
+                for t in tasks:
+                    if t.status == TaskStatus.done:
+                        completed_count += 1
+                    else:
+                        pending_count += 1
+                total_count = len(tasks)
+                print(f"  Open tasks: {pending_count}")
+            except Exception:
+                print("  Open tasks: unable to fetch task count")
+
             print()
-            print("Task analysis is not available for remote providers.")
+            print("Detailed task analysis is not available for remote providers.")
             print("Use --dry-run to inspect prompts that would be sent.")
             return {
-                "pending_count": 0,
-                "completed_count": 0,
-                "total_count": 0,
+                "pending_count": pending_count,
+                "completed_count": completed_count,
+                "total_count": total_count,
                 "tasks": [],
                 "dependencies": [],
                 "suggested_order": [],
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 314a466..44cb2c4 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -15631,18 +15631,77 @@ def test_analyze_tasklist_shows_no_file_message(self, temp_repo, capsys):
 
     def test_analyze_tasklist_with_mcp_provider(self, temp_repo, capsys):
         """--analyze-tasklist with MCP provider shows remote info, not file error."""
+        from unittest.mock import patch
+
         from millstone.artifact_providers.mcp import MCPTasklistProvider
 
         orch = Orchestrator(dry_run=False, quiet=True)
         try:
             provider = MCPTasklistProvider("github", labels=["millstone"])
             orch._outer_loop_manager.tasklist_provider = provider
-            result = orch.analyze_tasklist()
+            with patch.object(provider, "list_tasks", return_value=[]):
+                with patch.object(provider, "invalidate_cache"):
+                    result = orch.analyze_tasklist()
 
             captured = capsys.readouterr()
             assert "Remote tasklist provider: github" in captured.out
             assert "Labels: millstone" in captured.out
             assert "Tasklist not found" not in captured.out
+            assert "Open tasks: 0" in captured.out
+            assert result["pending_count"] == 0
+            assert result["total_count"] == 0
+        finally:
+            orch.cleanup()
+
+    def test_analyze_tasklist_with_mcp_provider_shows_open_count(self, temp_repo, capsys):
+        """--analyze-tasklist with MCP provider fetches and displays live task counts."""
+        from unittest.mock import patch
+
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+        from millstone.artifacts.models import TasklistItem, TaskStatus
+
+        orch = Orchestrator(dry_run=False, quiet=True)
+        try:
+            provider = MCPTasklistProvider("github", labels=["millstone"])
+            mock_tasks = [
+                TasklistItem(task_id="1", title="Open task A", status=TaskStatus.todo),
+                TasklistItem(task_id="2", title="Open task B", status=TaskStatus.in_progress),
+                TasklistItem(task_id="3", title="Done task", status=TaskStatus.done),
+                TasklistItem(task_id="4", title="Blocked task", status=TaskStatus.blocked),
+            ]
+            orch._outer_loop_manager.tasklist_provider = provider
+            with patch.object(provider, "list_tasks", return_value=mock_tasks):
+                with patch.object(provider, "invalidate_cache"):
+                    result = orch.analyze_tasklist()
+
+            captured = capsys.readouterr()
+            assert "Remote tasklist provider: github" in captured.out
+            assert "Open tasks: 3" in captured.out
+            assert result["pending_count"] == 3
+            assert result["completed_count"] == 1
+            assert result["total_count"] == 4
+        finally:
+            orch.cleanup()
+
+    def test_analyze_tasklist_with_mcp_provider_fetch_error(self, temp_repo, capsys):
+        """--analyze-tasklist degrades gracefully when MCP fetch fails."""
+        from unittest.mock import patch
+
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+
+        orch = Orchestrator(dry_run=False, quiet=True)
+        try:
+            provider = MCPTasklistProvider("github", labels=["millstone"])
+            orch._outer_loop_manager.tasklist_provider = provider
+            with patch.object(
+                provider, "list_tasks", side_effect=RuntimeError("connection failed")
+            ):
+                with patch.object(provider, "invalidate_cache"):
+                    result = orch.analyze_tasklist()
+
+            captured = capsys.readouterr()
+            assert "Remote tasklist provider: github" in captured.out
+            assert "unable to fetch task count" in captured.out
             assert result["pending_count"] == 0
             assert result["total_count"] == 0
         finally:

From 59718dd41c1d4c76a4b29b954820d0da7fa81bda Mon Sep 17 00:00:00 2001
From: wittekin <6079900+wittekin@users.noreply.github.com>
Date: Sat, 7 Mar 2026 18:15:23 -0800
Subject: [PATCH 3/4] fix: use MCP task ID and close remote tasks in research
 mode

When the tasklist is MCP-backed, run_single_task() now derives the
current task title and ID from the remote provider's cached task list
instead of reading a local file that may not exist or be stale. After
research mode completes, the remote task is closed via
update_task_status() so MCP-backed tasks don't remain open.

Generated with millstone orchestrator

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/millstone/runtime/orchestrator.py | 61 ++++++++++++++++++++-----
 tests/test_orchestrator.py            | 66 +++++++++++++++++++++++++++
 2 files changed, 116 insertions(+), 11 deletions(-)

diff --git a/src/millstone/runtime/orchestrator.py b/src/millstone/runtime/orchestrator.py
index cb06cdb..bf573d7 100755
--- a/src/millstone/runtime/orchestrator.py
+++ b/src/millstone/runtime/orchestrator.py
@@ -2892,6 +2892,7 @@ def run_single_task(self) -> bool:
         self._task_previous_diff = None
 
         # Determine task text and metadata
+        _mcp_task_item: Any = None  # Set when MCP provider supplies the current task
         if self.task:
             task_display = self.task[:50] + "..." if len(self.task) > 50 else self.task
             self.current_task_title = task_display
@@ -2901,23 +2902,52 @@ def run_single_task(self) -> bool:
             self.apply_risk_settings(task_metadata.get("risk"))
             self.current_task_group = None
         else:
-            self.current_task_title = self.extract_current_task_title() or "task"
-            task_text = self.current_task_title
-            self.apply_risk_settings(self.extract_current_task_risk())
-            self.current_task_group = self.extract_current_task_group()
+            # When the tasklist is MCP-backed, derive task title and ID from the
+            # remote provider's cached task list instead of reading a local file
+            # that may not exist or may be stale.
+            from millstone.artifact_providers.mcp import MCPTasklistProvider
+            from millstone.artifacts.models import TaskStatus
+
+            provider = self._outer_loop_manager.tasklist_provider
+            if isinstance(provider, MCPTasklistProvider):
+                cached = provider.list_tasks()
+                pending = [
+                    t for t in cached if t.status in (TaskStatus.todo, TaskStatus.in_progress)
+                ]
+                if pending:
+                    _mcp_task_item = pending[0]
+                    self.current_task_title = _mcp_task_item.title or "task"
+                    task_text = self.current_task_title
+                    self.apply_risk_settings(_mcp_task_item.risk)
+                    self.current_task_group = None
+                else:
+                    self.current_task_title = "task"
+                    task_text = self.current_task_title
+                    self.apply_risk_settings(None)
+                    self.current_task_group = None
+            else:
+                self.current_task_title = self.extract_current_task_title() or "task"
+                task_text = self.current_task_title
+                self.apply_risk_settings(self.extract_current_task_risk())
+                self.current_task_group = self.extract_current_task_group()
 
         self._current_task_text = task_text
         # Prefer canonical task_id from metadata (stable, ontology-compliant identity).
-        # In tasklist mode, use the raw block (title + indented metadata) so that
-        # "- ID: my-task" lines are visible to the parser.
-        # In --task mode, task_text already contains the full metadata block.
-        if self.task:
+        # When an MCP provider supplied the task, use its remote ID directly.
+        if _mcp_task_item is not None:
+            self._current_task_id = _mcp_task_item.task_id
+        elif self.task:
             _task_meta = self._tasklist_manager._parse_task_metadata(task_text)
+            self._current_task_id = _task_meta.get("task_id") or (
+                re.sub(r"[^a-z0-9]+", "-", task_text.lower()).strip("-")[:30] or None
+            )
         else:
+            # In tasklist mode, use the raw block (title + indented metadata) so that
+            # "- ID: my-task" lines are visible to the parser.
             _task_meta = self._tasklist_manager.extract_current_task_metadata()
-        self._current_task_id = _task_meta.get("task_id") or (
-            re.sub(r"[^a-z0-9]+", "-", task_text.lower()).strip("-")[:30] or None
-        )
+            self._current_task_id = _task_meta.get("task_id") or (
+                re.sub(r"[^a-z0-9]+", "-", task_text.lower()).strip("-")[:30] or None
+            )
 
         # Worker mode: when --shared-state-dir is set, emit result.json + heartbeats
         # for the worktree control plane to consume.
@@ -3032,6 +3062,15 @@ def _worker_finish(
             self.log("research_completed", task=task_text[:200], output_file=str(output_file))
             if not self.task:
                 self.mark_task_complete()
+                # Close remote task for MCP providers — mark_task_complete()
+                # only updates the local tasklist file, which is a no-op when
+                # the task lives on a remote backend (GitHub Issues, Linear, …).
+                from millstone.artifact_providers.mcp import MCPTasklistProvider
+                from millstone.artifacts.models import TaskStatus
+
+                provider = self._outer_loop_manager.tasklist_provider
+                if isinstance(provider, MCPTasklistProvider) and self._current_task_id:
+                    provider.update_task_status(self._current_task_id, TaskStatus.done)
             progress(f"{self._task_prefix()} Research completed -> {output_file}")
             _worker_finish("success", review_summary=builder_output[:4000])
             return True
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index 44cb2c4..4e26724 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -1869,6 +1869,72 @@ def test_run_single_task_research_mode_with_direct_task(self, temp_repo):
             finally:
                 orch.cleanup()
 
+    def test_run_single_task_research_mode_closes_mcp_task(self, temp_repo):
+        """Research mode closes remote task via update_task_status for MCP providers.
+
+        Verifies that the real remote task ID (not a local-file-derived fallback)
+        is passed to update_task_status.
+        """
+        from unittest.mock import MagicMock, patch
+
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+        from millstone.artifacts.models import TasklistItem, TaskStatus
+
+        tasklist = temp_repo / "docs" / "tasklist.md"
+        tasklist.write_text("# Tasks\n\n- [ ] Investigate auth flow\n")
+
+        with patch("subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(
+                returncode=0,
+                stdout="## FINDINGS\n\nAuth uses OAuth2\n",
+                stderr="",
+            )
+
+            orch = Orchestrator(tasklist="docs/tasklist.md", research=True, cli="claude")
+            try:
+                # Inject a mock MCP tasklist provider with a known remote task ID
+                mock_mcp = MagicMock(spec=MCPTasklistProvider)
+                mock_mcp.get_prompt_placeholders.return_value = {}
+                mock_mcp.list_tasks.return_value = [
+                    TasklistItem(
+                        task_id="GH-42",
+                        title="Investigate auth flow",
+                        status=TaskStatus.todo,
+                    ),
+                ]
+                orch._outer_loop_manager.tasklist_provider = mock_mcp
+
+                result = orch.run_single_task()
+                assert result is True
+
+                # MCP provider should have been asked to close the exact remote task
+                mock_mcp.update_task_status.assert_called_once_with("GH-42", TaskStatus.done)
+            finally:
+                orch.cleanup()
+
+    def test_run_single_task_research_mode_direct_task_skips_mcp_close(self, temp_repo):
+        """Research mode with --task (direct task) does NOT attempt MCP close."""
+        from unittest.mock import MagicMock, patch
+
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+
+        with patch("subprocess.run") as mock_run:
+            mock_run.return_value = MagicMock(returncode=0, stdout="Research output", stderr="")
+
+            orch = Orchestrator(task="Check perf", research=True, cli="claude")
+            try:
+                mock_mcp = MagicMock(spec=MCPTasklistProvider)
+                mock_mcp.get_prompt_placeholders.return_value = {}
+                orch._outer_loop_manager.tasklist_provider = mock_mcp
+
+                result = orch.run_single_task()
+                assert result is True
+
+                # With --task, mark_task_complete + MCP close should be skipped
+                mock_mcp.update_task_status.assert_not_called()
+            finally:
+                orch.cleanup()
+
 
 class TestIsApproved:
     """Tests for approval detection."""

From aeeb37c8e3586979efcb9fdc65cc8d812de46165 Mon Sep 17 00:00:00 2001
From: wittekin <6079900+wittekin@users.noreply.github.com>
Date: Sat, 7 Mar 2026 18:38:40 -0800
Subject: [PATCH 4/4] fix: detect and handle builder early commits

When the builder agent commits its own changes (advancing HEAD), the
orchestrator now detects this and uses the committed diff for review
instead of expecting unstaged changes. Skips redundant commit delegation
and auto-commits the tasklist checkbox if the builder left it unstaged.

Handles edge cases: review-fix cycles with uncommitted changes after an
early commit, LoC baseline updates, and git failures during tasklist
auto-commit.

Generated with millstone orchestrator

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/millstone/runtime/orchestrator.py | 151 +++++++++++++-
 tests/test_integration.py             | 289 ++++++++++++++++++++++++++
 2 files changed, 438 insertions(+), 2 deletions(-)

diff --git a/src/millstone/runtime/orchestrator.py b/src/millstone/runtime/orchestrator.py
index bf573d7..66ee3c2 100755
--- a/src/millstone/runtime/orchestrator.py
+++ b/src/millstone/runtime/orchestrator.py
@@ -79,6 +79,7 @@ class BuilderArtifact:
     output: str
     git_status: str
     git_diff: str
+    builder_committed: bool = False
 
 
 @dataclass
@@ -1689,6 +1690,71 @@ def delegate_commit(self) -> bool:
             self.last_commit_failure = failure_info
         return success
 
+    def _auto_commit_tasklist_if_needed(self) -> bool:
+        """Auto-commit the tasklist checkbox if the builder left it unstaged.
+
+        When the builder commits code early but forgets to include the tasklist
+        tick, this ensures the task is still marked complete so it won't be
+        re-selected on the next run. Only applies to file-backed tasklists.
+
+        Returns True if no action was needed or the commit succeeded.
+        Returns False if the commit failed (task not fully completed).
+        """
+        import subprocess
+
+        from millstone.artifact_providers.mcp import MCPTasklistProvider
+
+        provider = self._outer_loop_manager.tasklist_provider
+        if isinstance(provider, MCPTasklistProvider):
+            return True  # MCP tasks are marked done via API, not file commits
+
+        status = self.git("status", "--porcelain").strip()
+        if not status:
+            return True
+
+        remaining_files = [line.split()[-1] for line in status.split("\n") if line.strip()]
+        tasklist_path = str(self.tasklist)
+
+        if len(remaining_files) == 1 and remaining_files[0] == tasklist_path:
+            self.log(
+                "auto_commit_tasklist",
+                reason="builder_early_commit_forgot_tasklist",
+                file=tasklist_path,
+            )
+            add_result = subprocess.run(
+                ["git", "add", tasklist_path],
+                cwd=self.repo_dir,
+                capture_output=True,
+            )
+            if add_result.returncode != 0:
+                progress(
+                    f"{self._task_prefix()} ERROR: git add for tasklist failed "
+                    f"(rc={add_result.returncode})"
+                )
+                return False
+            commit_result = subprocess.run(
+                [
+                    "git",
+                    "commit",
+                    "-m",
+                    "Mark task complete in tasklist\n\nGenerated with millstone orchestrator",
+                ],
+                cwd=self.repo_dir,
+                capture_output=True,
+            )
+            if commit_result.returncode != 0:
+                progress(
+                    f"{self._task_prefix()} ERROR: git commit for tasklist failed "
+                    f"(rc={commit_result.returncode})"
+                )
+                return False
+            progress(
+                f"{self._task_prefix()} Auto-committed tasklist tick "
+                "(builder committed code but forgot to stage tasklist)"
+            )
+
+        return True
+
     def load_prompt(self, name: str) -> str:
         """Load a prompt file from custom dir or package resources."""
         if self._custom_prompts_dir:
@@ -3082,6 +3148,9 @@ def _worker_finish(
         # Track empty-diff retry state
         empty_diff_retried = False
 
+        # Record HEAD before builder runs so we can detect early commits
+        pre_build_head = self.git("rev-parse", "HEAD").strip()
+
         # Define Loop components
         def builder_producer(feedback: str | None = None) -> BuilderArtifact:
             if feedback:
@@ -3104,7 +3173,38 @@ def builder_producer(feedback: str | None = None) -> BuilderArtifact:
             # Ensure untracked files are included in diff (for reviewer visibility)
             self._stage_untracked_files()
 
-            return BuilderArtifact(output, self.git("status", "--short"), self.git("diff", "HEAD"))
+            # Detect if builder committed its own changes (HEAD advanced)
+            nonlocal pre_build_head
+            current_head = self.git("rev-parse", "HEAD").strip()
+            builder_committed = current_head != pre_build_head
+
+            if builder_committed:
+                progress(
+                    f"{self._task_prefix()} Builder committed changes directly "
+                    f"(HEAD advanced from {pre_build_head[:8]} to {current_head[:8]}). "
+                    "Using committed diff for review."
+                )
+                self.log(
+                    "builder_early_commit",
+                    old_head=pre_build_head,
+                    new_head=current_head,
+                )
+                # Use the diff from the builder's commit(s) for review.
+                # Also include any uncommitted changes on top of the builder's commits.
+                git_diff = self.git("diff", pre_build_head)
+                git_status = self.git("diff", "--stat", pre_build_head)
+            else:
+                git_diff = self.git("diff", "HEAD")
+                git_status = self.git("status", "--short")
+
+            # Update pre_build_head so the next cycle (if reviewer rejects)
+            # correctly detects whether the builder commits again.
+            if builder_committed:
+                pre_build_head = current_head
+
+            return BuilderArtifact(
+                output, git_status, git_diff, builder_committed=builder_committed
+            )
 
         def builder_validator(artifact: BuilderArtifact) -> tuple[bool, str | None]:
             nonlocal empty_diff_retried
@@ -3121,7 +3221,12 @@ def builder_validator(artifact: BuilderArtifact) -> tuple[bool, str | None]:
 
             # Retry once on empty diff before sanity check fails
             # This catches cases where the builder didn't make changes but should have
-            if not artifact.git_status.strip() and not empty_diff_retried:
+            # Skip nudge if builder already committed (diff extracted from commits)
+            if (
+                not artifact.git_status.strip()
+                and not empty_diff_retried
+                and not artifact.builder_committed
+            ):
                 empty_diff_retried = True
                 progress(f"{self._task_prefix()} No changes detected, nudging builder to retry...")
                 nudge_msg = (
@@ -3226,6 +3331,48 @@ def builder_on_success(artifact: BuilderArtifact, verdict: BuilderVerdict) -> bo
                     loop_state["failure_reason"] = "eval_gate_failed"
                     return False
 
+            if artifact.builder_committed:
+                # Builder committed during this cycle. Check if there are
+                # remaining uncommitted changes (e.g. from a review-fix cycle
+                # where the builder edited files without committing again).
+                worktree_status = self.git("status", "--porcelain").strip()
+                tasklist_path = str(self.tasklist)
+                non_tasklist_dirty = any(
+                    line.split()[-1] != tasklist_path
+                    for line in worktree_status.split("\n")
+                    if line.strip()
+                )
+
+                if non_tasklist_dirty:
+                    # There are uncommitted changes beyond the tasklist —
+                    # delegate a commit for the remaining edits.
+                    progress(
+                        f"{self._task_prefix()} Builder committed earlier but "
+                        "uncommitted changes remain — delegating commit."
+                    )
+                    if not self.delegate_commit():
+                        loop_state["failure_reason"] = "commit_failed"
+                        return False
+                else:
+                    progress(
+                        f"{self._task_prefix()} Skipping commit delegation (builder already committed)."
+                    )
+                    # Auto-commit the tasklist checkbox if it's the only
+                    # remaining dirty file (file-backed tasklist only).
+                    if not self._auto_commit_tasklist_if_needed():
+                        loop_state["failure_reason"] = "tasklist_commit_failed"
+                        return False
+
+                # Update baseline so next task measures LoC from this commit
+                self._update_loc_baseline()
+
+                if self.eval_on_commit and not self._run_eval_on_commit(task_text=task_text):
+                    loop_state["failure_reason"] = "eval_regression"
+                    return False
+
+                self.accumulate_group_context(task_text, git_diff=artifact.git_diff)
+                return True
+
             if self.delegate_commit():
                 if self.eval_on_commit and not self._run_eval_on_commit(task_text=task_text):
                     loop_state["failure_reason"] = "eval_regression"
diff --git a/tests/test_integration.py b/tests/test_integration.py
index dfef638..644dbf3 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -426,6 +426,295 @@ def test_no_changes_detected(self, temp_repo, capsys):
         finally:
             orch.cleanup()
 
+    def test_builder_early_commit_detected(self, temp_repo, capsys):
+        """
+        Scenario: Builder commits its own changes (HEAD advances).
+        Expected: Orchestrator detects the early commit, uses the committed
+        diff for review, skips delegate_commit, and succeeds.
+        """
+
+        def make_change_and_commit(repo):
+            """Simulate builder creating a file AND committing it."""
+            (repo / "feature.py").write_text("def new_feature():\n    pass\n")
+            _original_subprocess_run(["git", "add", "."], cwd=repo, capture_output=True)
+            _original_subprocess_run(
+                ["git", "commit", "-m", "feat: builder committed early"],
+                cwd=repo,
+                capture_output=True,
+            )
+
+        # Builder commits directly; no commit delegation needed
+        responses = (
+            ResponseBuilder()
+            .on_builder(change_fn=make_change_and_commit, output="Implemented feature.")
+            .on_reviewer(approve=True)
+            .build()
+        )
+
+        orch = Orchestrator(max_tasks=1)
+        try:
+            with patch("subprocess.run", side_effect=make_mock_runner(temp_repo, responses)):
+                exit_code = orch.run()
+
+            assert exit_code == 0
+            captured = capsys.readouterr()
+            # Should detect the early commit
+            assert "Builder committed changes directly" in captured.out
+            # Should skip commit delegation
+            assert "Skipping commit delegation" in captured.out
+            assert orch.cycle == 1
+        finally:
+            orch.cleanup()
+
+    def test_builder_early_commit_auto_commits_tasklist(self, temp_repo, capsys):
+        """
+        Scenario: Builder commits code but leaves the tasklist checkbox unstaged.
+        Expected: Orchestrator detects the early commit, auto-commits the
+        tasklist tick so the task won't be re-selected on the next run.
+
+        Uses docs/tasklist.md (git-tracked) to exercise the fallback path.
+        """
+
+        def commit_code_but_leave_tasklist(repo):
+            """Builder commits code but modifies tasklist without staging it."""
+            (repo / "feature.py").write_text("def new_feature():\n    pass\n")
+            _original_subprocess_run(["git", "add", "feature.py"], cwd=repo, capture_output=True)
+            _original_subprocess_run(
+                ["git", "commit", "-m", "feat: builder committed early"],
+                cwd=repo,
+                capture_output=True,
+            )
+            # Modify tasklist (check off the task) but don't stage/commit it
+            tasklist = repo / "docs" / "tasklist.md"
+            tasklist.write_text(
+                "# Tasklist\n\n- [x] Task 1: Do something\n- [ ] Task 2: Do another thing\n"
+            )
+
+        responses = (
+            ResponseBuilder()
+            .on_builder(change_fn=commit_code_but_leave_tasklist, output="Implemented feature.")
+            .on_reviewer(approve=True)
+            .build()
+        )
+
+        orch = Orchestrator(max_tasks=1, tasklist="docs/tasklist.md")
+        try:
+            with patch("subprocess.run", side_effect=make_mock_runner(temp_repo, responses)):
+                exit_code = orch.run()
+
+            assert exit_code == 0
+            captured = capsys.readouterr()
+            assert "Builder committed changes directly" in captured.out
+            assert "Skipping commit delegation" in captured.out
+            # Should auto-commit the tasklist tick
+            assert "Auto-committed tasklist tick" in captured.out
+
+            # Verify no uncommitted changes remain
+            status = _original_subprocess_run(
+                ["git", "status", "--porcelain"],
+                cwd=temp_repo,
+                capture_output=True,
+                text=True,
+            )
+            assert status.stdout.strip() == ""
+
+            # Verify the tasklist was actually committed
+            log = _original_subprocess_run(
+                ["git", "log", "--oneline", "-3"],
+                cwd=temp_repo,
+                capture_output=True,
+                text=True,
+            )
+            assert "Mark task complete" in log.stdout
+        finally:
+            orch.cleanup()
+
+    def test_builder_early_commit_then_review_fix_uncommitted(self, temp_repo, capsys):
+        """
+        Scenario: Builder commits in cycle 1, reviewer rejects, builder fixes
+        in cycle 2 WITHOUT committing. The orchestrator must detect the dirty
+        worktree and delegate a commit for the remaining edits.
+
+        Regression test for: builder_on_success skipping delegate_commit when
+        builder_committed was True, leaving uncommitted changes.
+        """
+        cycle_count = [0]
+
+        def make_change_and_commit(repo):
+            """Cycle 1: builder creates file AND commits it."""
+            (repo / "feature.py").write_text("def new_feature():\n    return 1\n")
+            _original_subprocess_run(["git", "add", "."], cwd=repo, capture_output=True)
+            _original_subprocess_run(
+                ["git", "commit", "-m", "feat: builder committed early"],
+                cwd=repo,
+                capture_output=True,
+            )
+            cycle_count[0] += 1
+
+        def make_fix_without_commit(repo):
+            """Cycle 2: builder edits file but does NOT commit."""
+            (repo / "feature.py").write_text("def new_feature():\n    return 2  # fixed\n")
+            _original_subprocess_run(["git", "add", "."], cwd=repo, capture_output=True)
+            cycle_count[0] += 1
+
+        build_count = [0]
+
+        def responses(prompt, counts):
+            if is_builder_prompt(prompt) or "address this review feedback" in prompt.lower():
+                build_count[0] += 1
+                if build_count[0] == 1:
+                    return ("Implemented feature.", make_change_and_commit)
+                else:
+                    return ("Fixed review feedback.", make_fix_without_commit)
+            elif is_reviewer_prompt(prompt):
+                if build_count[0] <= 1:
+                    return (
+                        '{"status": "REQUEST_CHANGES", "review": "Return value should be 2", '
+                        '"summary": "Needs fix", "findings": ["Wrong return value"], '
+                        '"findings_by_severity": {"critical": [], "high": ["Wrong return value"], '
+                        '"medium": [], "low": [], "nit": []}}',
+                        None,
+                    )
+                return (
+                    '{"status": "APPROVED", "review": "Looks good", "summary": "LGTM", '
+                    '"findings": [], "findings_by_severity": {"critical": [], "high": [], '
+                    '"medium": [], "low": [], "nit": []}}',
+                    None,
+                )
+            elif is_commit_prompt(prompt):
+                return ("Committed changes.", lambda repo: do_commit(repo))
+            elif is_sanity_check(prompt):
+                return ('{"status": "OK"}', None)
+            return ('{"status": "OK"}', None)
+
+        orch = Orchestrator(max_tasks=1, max_cycles=3)
+        try:
+            with patch("subprocess.run", side_effect=make_mock_runner(temp_repo, responses)):
+                exit_code = orch.run()
+
+            assert exit_code == 0
+            captured = capsys.readouterr()
+            # Cycle 1 should detect early commit
+            assert "Builder committed changes directly" in captured.out
+            # Cycle 2: builder didn't commit, so normal delegate_commit runs
+            assert "Delegating commit to builder" in captured.out
+
+            # Verify no uncommitted changes remain
+            status = _original_subprocess_run(
+                ["git", "status", "--porcelain"],
+                cwd=temp_repo,
+                capture_output=True,
+                text=True,
+            )
+            assert status.stdout.strip() == "", (
+                "Working directory should be clean — all changes committed"
+            )
+        finally:
+            orch.cleanup()
+
+    def test_builder_early_commit_updates_loc_baseline(self, temp_repo, capsys):
+        """
+        Scenario: Builder commits directly (early commit) in a multi-task run.
+        Expected: loc_baseline_ref is updated after the early-commit path so the
+        next task's LoC measurement starts from the new HEAD, not the old baseline.
+
+        Regression test for: builder_on_success skipping _update_loc_baseline()
+        when builder_committed was True, causing stale baselines in multi-task runs.
+        """
+
+        def make_change_and_commit(repo):
+            (repo / "feature.py").write_text("def new_feature():\n    pass\n")
+            _original_subprocess_run(["git", "add", "."], cwd=repo, capture_output=True)
+            _original_subprocess_run(
+                ["git", "commit", "-m", "feat: builder committed early"],
+                cwd=repo,
+                capture_output=True,
+            )
+
+        responses = (
+            ResponseBuilder()
+            .on_builder(change_fn=make_change_and_commit, output="Implemented feature.")
+            .on_reviewer(approve=True)
+            .build()
+        )
+
+        orch = Orchestrator(max_tasks=1)
+        try:
+            with patch("subprocess.run", side_effect=make_mock_runner(temp_repo, responses)):
+                exit_code = orch.run()
+
+            assert exit_code == 0
+            # After the early-commit path, loc_baseline_ref should point to HEAD
+            current_head = _original_subprocess_run(
+                ["git", "rev-parse", "HEAD"],
+                cwd=temp_repo,
+                capture_output=True,
+                text=True,
+            ).stdout.strip()
+            assert orch.loc_baseline_ref == current_head, (
+                f"loc_baseline_ref should be updated to HEAD ({current_head}) "
+                f"but was {orch.loc_baseline_ref}"
+            )
+        finally:
+            orch.cleanup()
+
+    def test_builder_early_commit_tasklist_autocommit_git_failure(self, temp_repo, capsys):
+        """
+        Scenario: Builder commits code early but the tasklist auto-commit fails.
+        Expected: A warning is printed and the failure is not silently swallowed.
+
+        Regression test for: _auto_commit_tasklist_if_needed ignoring git failures.
+        """
+
+        def commit_code_but_leave_tasklist(repo):
+            (repo / "feature.py").write_text("def new_feature():\n    pass\n")
+            _original_subprocess_run(["git", "add", "feature.py"], cwd=repo, capture_output=True)
+            _original_subprocess_run(
+                ["git", "commit", "-m", "feat: builder committed early"],
+                cwd=repo,
+                capture_output=True,
+            )
+            # Modify tasklist but don't stage it
+            tasklist = repo / "docs" / "tasklist.md"
+            tasklist.write_text(
+                "# Tasklist\n\n- [x] Task 1: Do something\n- [ ] Task 2: Do another thing\n"
+            )
+
+        responses = (
+            ResponseBuilder()
+            .on_builder(change_fn=commit_code_but_leave_tasklist, output="Implemented feature.")
+            .on_reviewer(approve=True)
+            .build()
+        )
+
+        orch = Orchestrator(max_tasks=1, tasklist="docs/tasklist.md")
+        try:
+            # Patch subprocess.run to fail on git commit for tasklist
+            real_mock = make_mock_runner(temp_repo, responses)
+
+            def mock_with_commit_failure(cmd, **kwargs):
+                # Intercept the tasklist auto-commit
+                if (
+                    cmd[0] == "git"
+                    and cmd[1] == "commit"
+                    and len(cmd) > 3
+                    and "Mark task complete" in cmd[3]
+                ):
+                    return MagicMock(returncode=1, stdout=b"", stderr=b"commit failed")
+                return real_mock(cmd, **kwargs)
+
+            with patch("subprocess.run", side_effect=mock_with_commit_failure):
+                exit_code = orch.run()
+
+            # Task must fail — tasklist not marked complete means task can be
+            # re-selected on the next run, breaking completion semantics.
+            assert exit_code == 1
+            captured = capsys.readouterr()
+            # Should report the failed tasklist commit
+            assert "ERROR: git commit for tasklist failed" in captured.out
+        finally:
+            orch.cleanup()
+
     def test_sensitive_file_no_longer_halts_flow(self, temp_repo, capsys):
         """
         Scenario: Builder modifies a sensitive file.