diff --git a/.gitignore b/.gitignore index 5ff8d5c..7e83e6f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ __pycache__/ *.pyc +.DS_Store +.serena/ .venv/ .pytest_cache/ docs/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..f9396e8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,12 @@ +[submodule "ccb-multi"] + path = ccb-multi + url = https://github.com/daniellee2015/ccb-multi.git +[submodule "ccb-status"] + path = ccb-status + url = https://github.com/daniellee2015/ccb-status.git +[submodule "ccb-worktree"] + path = ccb-worktree + url = https://github.com/daniellee2015/ccb-worktree.git +[submodule "ccb-shared-context"] + path = ccb-shared-context + url = https://github.com/daniellee2015/ccb-shared-context.git diff --git a/CHANGELOG.md b/CHANGELOG.md index a6f03e2..3bb3616 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,126 +1,40 @@ # Changelog -## Unreleased - -## v5.2.5 (2026-02-15) - -### 🔧 Bug Fixes - -- **Async Guardrail**: Added global mandatory turn-stop rule to `claude-md-ccb.md` to prevent Claude from polling after async `ask` submission -- **Marker Consistency**: `bin/ask` now emits `[CCB_ASYNC_SUBMITTED provider=xxx]` matching all other provider scripts -- **SKILL.md DRY**: Ask skill rules reference global guardrail with local fallback, eliminating duplicate maintenance -- **Command References**: Fixed `/ping` → `/cping` and `ping` → `ccb-ping` in docs - -## v5.2.4 (2026-02-11) - -### 🔧 Bug Fixes - -- **Explicit CCB_CALLER**: `bin/ask` no longer defaults to `"claude"` when `CCB_CALLER` is unset; exits with an error instead -- **SKILL.md template**: Ask skill execution template now explicitly passes `CCB_CALLER=claude` - -## v5.2.3 (2026-02-09) - -### 🚀 Project-Local History + Legacy Compatibility - -- **Local History**: Context exports now save to `./.ccb/history/` per project -- **CWD Scope**: Auto transfer runs only for the current working directory -- **Legacy Migration**: Auto-detect `.ccb_config` and upgrade to 
`.ccb` when possible -- **Claude /continue**: Attach the latest history file with a single skill - -## v5.2.2 (2026-02-04) - -### 🚀 Session Switch Capture - -- **Old Session Fields**: `.claude-session` now records `old_claude_session_id` / `old_claude_session_path` with `old_updated_at` -- **Auto Context Export**: Previous Claude session is extracted to `./.ccb/history/claude--.md` -- **Transfer Cleanup**: Improved noise filtering while preserving tool-only actions - -## v5.1.2 (2026-01-29) - -### 🔧 Bug Fixes & Improvements +All notable changes to CCB Multi are documented here. +This project uses its own version line, independent from upstream CCB. -- **Claude Completion Hook**: Unified askd now triggers completion hook for Claude -- **askd Lifecycle**: askd is bound to CCB lifecycle to avoid stale daemons -- **Mounted Detection**: `ccb-mounted` now uses ping-based detection across all platforms -- **State File Lookup**: `askd_client` falls back to `CCB_RUN_DIR` for daemon state files - -## v5.1.1 (2025-01-28) - -### 🔧 Bug Fixes & Improvements - -- **Unified Daemon**: All providers now use unified askd daemon architecture -- **Install/Uninstall**: Fixed installation and uninstallation bugs -- **Process Management**: Fixed kill/termination issues - -### 🔧 ask Foreground Defaults - -- `bin/ask`: Foreground mode available via `--foreground`; `--background` forces legacy async -- Managed Codex sessions default to foreground to avoid background cleanup -- Environment overrides: `CCB_ASK_FOREGROUND=1` / `CCB_ASK_BACKGROUND=1` -- Foreground runs sync and suppresses completion hook unless `CCB_COMPLETION_HOOK_ENABLED` is set -- `CCB_CALLER` now defaults to `codex` in Codex sessions when unset - -## v5.1.0 (2025-01-26) - -### 🚀 Major Changes: Unified Command System - -**New unified commands replace provider-specific commands:** - -| Old Commands | New Unified Command | -|--------------|---------------------| -| `cask`, `gask`, `oask`, `dask`, `lask` | `ask ` | -| `cping`, 
`gping`, `oping`, `dping`, `lping` | `ccb-ping ` (skill: `/cping`) | -| `cpend`, `gpend`, `opend`, `dpend`, `lpend` | `pend [N]` | - -**Supported providers:** `gemini`, `codex`, `opencode`, `droid`, `claude` - -### 🪟 Windows WezTerm + PowerShell Support - -- Full support for Windows native environment with WezTerm terminal -- `install.ps1` now generates wrappers for `ask`, `ccb-ping`, `pend`, `ccb-completion-hook` -- Background execution uses PowerShell scripts with `DETACHED_PROCESS` flag -- WezTerm CLI integration with stdin for large payloads (avoids command line length limits) -- UTF-8 BOM handling for PowerShell-generated session files - -### 🔧 Technical Improvements +## Unreleased -- `completion_hook.py`: Uses `sys.executable` for cross-platform script execution -- `ccb-completion-hook`: - - Added `find_wezterm_cli()` with PATH lookup and common install locations - - Support `CCB_WEZTERM_BIN` environment variable - - Uses stdin for WezTerm send-text to handle large payloads -- `bin/ask`: - - Unix: Uses `nohup` for true background execution - - Windows: Uses PowerShell script + message file to avoid escaping issues -- Added `SKILL.md.powershell` for `cping` and `pend` skills +## v1.0.0 (2026-02-18) -### 📦 Skills System +Initial release as independent fork. Based on upstream CCB v5.2.6. -New unified skills: -- `/ask ` - Async request to AI provider -- `/cping ` - Test provider connectivity -- `/pend [N]` - View latest provider reply +### 🚀 Multi-Instance Support -### ⚠️ Breaking Changes +- **ccb-multi**: Launch multiple CCB instances in the same project with independent contexts +- **ccb-multi-status**: Real-time status monitoring for all instances +- **ccb-multi-history**: View instance execution history +- **ccb-multi-clean**: Clean up stale instance directories +- **Collision-Free Naming**: Instance dirs use `inst--N` format (8-char SHA-256 of project root) -- Old provider-specific commands (`cask`, `gask`, etc.) 
are deprecated -- Old skills (`/cask`, `/gask`, etc.) are removed -- Use new unified commands instead +### 🔧 LLM Communication Fixes (upstream-unmerged) -### 🔄 Migration Guide +- **Gemini CLI 0.29.0 Deadlock**: Dual-format session scanning (basename + SHA-256 hash) with auto-adoption +- **Hash Persistence**: `_all_known_hashes` set survives hash format transitions +- **Daemon work_dir Decoupling**: `--work-dir` parameter and `CCB_WORK_DIR` env for `bin/askd` +- **State Validation**: `bin/ask` validates daemon's `work_dir` with fallback to `cwd` +- **Cross-Hash Guard**: Instance mode blocks cross-hash session override to prevent contamination -```bash -# Old way -cask "What is 1+1?" -gping -cpend +### 🔧 Inherited from Upstream CCB v5.2.5 -# New way -ask codex "What is 1+1?" -ccb-ping gemini -pend codex -``` +- Async Guardrail hardening (global turn-stop rule) +- Marker consistency for `[CCB_ASYNC_SUBMITTED]` +- Project-local history (`.ccb/history/`) +- Session switch capture and context transfer +- Unified command system (`ask`, `ccb-ping`, `pend`) +- Windows WezTerm + PowerShell support +- Email-to-AI gateway (mail system) --- -For older versions, see [CHANGELOG_4.0.md](CHANGELOG_4.0.md) +For upstream CCB changelog prior to this fork, see [CHANGELOG_4.0.md](CHANGELOG_4.0.md) or the [upstream repo](https://github.com/bfly123/claude_code_bridge). 
diff --git a/ISSUE_ANALYSIS.md b/ISSUE_ANALYSIS.md new file mode 100644 index 0000000..e2f364d --- /dev/null +++ b/ISSUE_ANALYSIS.md @@ -0,0 +1,596 @@ +# CCB Multi 异步通信问题分析报告 + +## 问题概述 + +给其他 LLM (gemini/opencode) 发出需求后,对方完成了但没有返回完成信息,导致一直显示 'processing'。 + +- **OpenCode**: 第二次调用一定没有返回 +- **Gemini**: 有时候会出现这个问题 +- **Codex**: 当前测试中也没有返回(验证了问题存在) + +## 根本原因分析(综合三个模型的发现) + +### 关键 Bug 列表(按严重程度) + +#### Critical: Daemon 启动崩溃 +**位置**: `lib/askd_server.py:221`, `lib/askd_server.py:235` +**问题**: `_parent_monitor` 条件定义但无条件启动,当没有 parent PID 时会崩溃 +**影响**: 直接导致 "daemon 无法启动" 的历史问题 + +#### High: OpenCode 会话 ID 固定问题(第二次调用失败的主因) +**位置**: +- `lib/askd/adapters/opencode.py:133` - 从会话文件传递 `session_id_filter` +- `lib/opencode_comm.py:673` - DB 会话查找强制使用该过滤器 +- `lib/opencode_comm.py:651` - DB 路径优先于文件路径 +- `lib/opencode_comm.py:788` - 只有文件查找有 "新会话覆盖" 逻辑 + +**问题**: OpenCode 会话 ID 被固定,第二次请求轮询错误的会话并超时 +**影响**: 第二次调用一定失败 + +#### High: 完成回调严格依赖 done_seen +**位置**: +- `lib/completion_hook.py:132` - 硬性检查 `if not done_seen: return` +- `lib/askd/adapters/opencode.py:196` - 只在严格标记匹配时设置 done +- `lib/askd/adapters/gemini.py:230` - 同上 +- `bin/ask:255` - 默认超时 3600 秒 +- `lib/askd/daemon.py:186` - daemon 等待窗口 + +**问题**: 如果回复完成但标记缺失/错位,不会发送完成通知 +**影响**: UI 显示 "processing forever" + +#### Medium: Gemini 会话绑定风险 +**位置**: +- `lib/gemini_comm.py:235`, `lib/gemini_comm.py:293` - 扫描 basename/sha hash 文件夹 +- `lib/gemini_comm.py:337` - 跨 hash 保护仅在首选会话存在时应用 +- `lib/gemini_comm.py:355` - 首次绑定直接接受扫描结果 + +**问题**: 实例模式下可能附加到错误的会话 +**影响**: Gemini 有时会出现问题 + +#### Medium: notify_completion 阻塞 worker +**位置**: `lib/completion_hook.py:100`, `lib/completion_hook.py:102` +**问题**: 名为 async 但实际阻塞最多 65 秒(`join(timeout=65)`) +**影响**: 降低每会话吞吐量,负载下后续任务看起来卡住 + +#### Medium/Low: 取消/错误处理不完整 +**位置**: +- `lib/askd/adapters/opencode.py:34` - 取消检测辅助函数存在但未连接 +- `lib/opencode_comm.py:33` - 取消 req-id 正则仍假设旧的 32-hex ID +- `lib/ccb_protocol.py:56` - 当前 req ID 是 datetime/pid/counter 格式 + +**问题**: 中止的任务倾向于退化为长超时 +**影响**: 错误处理不友好 + +### 1. 
req_id 不匹配问题 + +**症状**: +- OpenCode 返回: `CCB_DONE: 20260219-210049-399-57397-2` +- 期望的 req_id: `20260219-224825-969-86134` + +**原因**: +- LLM 没有正确解析提示中的 `CCB_REQ_ID: {req_id}` +- LLM 可能使用了之前请求的 req_id(状态污染) +- LLM 可能自己生成了一个 req_id + +**影响**: +```python +# lib/ccb_protocol.py:76-82 +def is_done_text(text: str, req_id: str) -> bool: + # 使用严格的正则匹配 + return bool(done_line_re(req_id).match(lines[i])) + # 如果 req_id 不匹配,返回 False +``` + +当 `is_done_text()` 返回 False 时: +- `done_seen` 保持为 False +- `notify_completion()` 不会被调用(因为检查 `if not done_seen: return`) +- 用户永远不会收到完成通知 + +### 2. 状态管理问题(第二次调用失败) + +**OpenCode 的状态跟踪**: +```python +state = { + "session_id": "...", + "session_updated": timestamp, + "assistant_count": N, + "last_assistant_id": "...", + "last_assistant_completed": timestamp, + "last_assistant_has_done": bool +} +``` + +**问题**: +- 第二次调用时,状态可能没有正确重置 +- `_read_since()` 可能错误地认为新消息是重复的 +- 重复检测逻辑可能过滤掉合法的新回复 + +### 3. LLM 提示解析问题 + +**当前提示格式** (lib/oaskd_protocol.py): +``` +CCB_REQ_ID: {req_id} + +{user_message} + +IMPORTANT: +- Reply normally, in English. +- End your reply with this exact final line (verbatim, on its own line): +CCB_DONE: {req_id} +``` + +**可能的问题**: +- LLM 可能忽略了 `CCB_REQ_ID:` 行 +- LLM 可能没有理解需要原样输出 req_id +- LLM 可能在多轮对话中混淆了不同请求的 req_id + +## 解决方案 + +### 方案 1: 修复 OpenCode 状态更新不完整问题(最关键) + +**问题定位** (来自 OpenCode 的深度分析): + +在 `lib/opencode_comm.py` 的 `_read_since()` 方法中,Line 1132-1134: +```python +# Update state baseline even if reply isn't ready yet. +state = dict(state) +state["session_updated"] = updated_i +``` + +**缺陷**: 当 `session_updated` 变化但没有检测到新回复时,只更新了 `session_updated`,但**没有更新** `assistant_count`、`last_assistant_id`、`last_assistant_completed`、`last_assistant_has_done`。 + +**第二次调用失败的场景**: +1. 第一次调用成功,状态为 `assistant_count=2` +2. 第二次调用时,`capture_state()` 返回 `assistant_count=2` +3. 发送新消息,OpenCode 开始创建新的 assistant message +4. 如果在 polling 周期内 `session_updated` 变化但 `_find_new_assistant_reply_with_state` 返回 `None`(消息未完成) +5. 
此时 `session_updated` 被更新,但 `assistant_count` 仍是旧值 2 +6. 下一轮循环时,由于 `session_updated` 已是最新值,`should_scan` 为 False +7. 即使 force read 触发,使用旧的 `assistant_count=2` 进行比较会导致检测失败 + +**修复方案**: + +```python +# lib/opencode_comm.py, around line 1132-1134 +# Replace: +# state = dict(state) +# state["session_updated"] = updated_i + +# With: +state = dict(state) +state["session_updated"] = updated_i +# Also update assistant state baseline to avoid stale comparisons +current_assistants = [m for m in self._read_messages(current_session_id) + if m.get("role") == "assistant" and isinstance(m.get("id"), str)] +state["assistant_count"] = len(current_assistants) +if current_assistants: + latest = current_assistants[-1] + state["last_assistant_id"] = latest.get("id") + completed = (latest.get("time") or {}).get("completed") + try: + state["last_assistant_completed"] = int(completed) if completed is not None else None + except Exception: + state["last_assistant_completed"] = None + # Update has_done flag + parts = self._read_parts(str(latest.get("id"))) + text = self._extract_text(parts, allow_reasoning_fallback=True) + state["last_assistant_has_done"] = bool(text) and ("CCB_DONE:" in text) +``` + +### 方案 2: 增强 req_id 检测容错性 + +**目标**: 即使 req_id 不完全匹配,也能检测到完成信号 + +**实现**: + +```python +# lib/ccb_protocol.py - 添加宽松匹配模式 +def is_done_text_relaxed(text: str, req_id: str) -> bool: + """ + 检测 CCB_DONE 标记,允许部分 req_id 匹配 + 用于处理 LLM 可能修改或截断 req_id 的情况 + """ + lines = [ln.rstrip() for ln in (text or "").splitlines()] + + # 首先尝试严格匹配 + for i in range(len(lines) - 1, -1, -1): + if _is_trailing_noise_line(lines[i]): + continue + if done_line_re(req_id).match(lines[i]): + return True + break + + # 如果严格匹配失败,尝试宽松匹配 + # 检查是否有任何 CCB_DONE: 行 + for i in range(len(lines) - 1, -1, -1): + if _is_trailing_noise_line(lines[i]): + continue + line = lines[i] + if line.strip().startswith("CCB_DONE:"): + # 提取 req_id 并检查日期部分是否匹配 + # req_id 格式: YYYYMMDD-HHMMSS-mmm-PID-counter + parts = line.split(":", 1) + if len(parts) 
== 2: + found_req_id = parts[1].strip() + # 至少检查日期部分 (YYYYMMDD) 是否匹配 + if req_id[:8] == found_req_id[:8]: + return True + break + + return False + +# 在 lib/askd/adapters/opencode.py 和 gemini.py 中使用 +# Line 189 (opencode.py): +if is_done_text_relaxed(combined, task.req_id): + done_seen = True + done_ms = _now_ms() - started_ms + break +``` + +### 方案 3: 改进 LLM 提示格式 + +**目标**: 让 LLM 更容易理解和遵循 req_id 要求 + +**实现**: + +```python +# lib/oaskd_protocol.py - 改进提示格式 +def wrap_opencode_prompt(message: str, req_id: str) -> str: + message = (message or "").rstrip() + return ( + f"[SYSTEM] Request ID: {req_id}\n\n" + f"{message}\n\n" + "CRITICAL INSTRUCTIONS:\n" + "1. Process the request normally and reply in English\n" + "2. At the very end of your response, add this EXACT line (copy it verbatim):\n" + f" CCB_DONE: {req_id}\n" + "3. Do NOT modify the Request ID in any way\n" + "4. The CCB_DONE line must be the last line of your response\n" + ) +``` + +### 方案 4: 添加超时和重试机制 + +**目标**: 当检测失败时,提供降级方案 + +**实现**: + +```python +# lib/askd/adapters/opencode.py - 添加降级检测 +def _handle_task_locked(self, task: QueuedTask, session: Any, session_key: str, started_ms: int) -> ProviderResult: + # ... existing code ... + + # 添加降级检测:如果超时但有回复内容,检查是否包含任何 CCB_DONE + if not done_seen and chunks: + combined = "\n".join(chunks) + # 检查是否有任何 CCB_DONE 标记(即使 req_id 不匹配) + if "CCB_DONE:" in combined: + _write_log(f"[WARN] Found CCB_DONE but req_id mismatch for req_id={task.req_id}") + # 可以选择: + # 1. 设置 done_seen=True(宽松模式) + # 2. 返回特殊错误码让用户知道 + done_seen = True # 宽松模式 + done_ms = _now_ms() - started_ms + + # ... rest of the code ... +``` + +### 方案 5: 增强日志和调试 + +**目标**: 便于诊断问题 + +**实现**: + +```python +# 添加环境变量控制的调试日志 +# lib/opencode_comm.py +def _read_since(self, state: Dict[str, Any], timeout: float, block: bool): + debug = os.environ.get("CCB_DEBUG_OPENCODE_STATE", "").lower() in ("1", "true", "yes") + + # ... existing code ... 
+ + if debug: + print(f"[DEBUG] OpenCode state: session_id={session_id}, " + f"updated={updated_i}, count={state.get('assistant_count')}, " + f"last_id={state.get('last_assistant_id')}", file=sys.stderr) + + # ... rest of the code ... +``` + +## 推荐实施顺序(基于风险和影响) + +### 阶段 1: 关键修复(立即实施) + +**1.1 修复 Daemon 启动崩溃** +```python +# lib/askd_server.py +# 确保 _parent_monitor 只在有 parent PID 时启动 +if self._parent_pid: + self._parent_monitor = threading.Thread(target=self._monitor_parent, daemon=True) + self._parent_monitor.start() +``` + +**1.2 修复 OpenCode 会话 ID 固定问题** +```python +# lib/opencode_comm.py +# 在 _get_latest_session_from_db 中添加新会话检测 +def _get_latest_session_from_db(self) -> Optional[Dict[str, Any]]: + # ... existing code ... + + # 如果有 session_id_filter,检查是否有更新的会话 + if self._session_id_filter: + # 也查询没有过滤器的最新会话 + all_sessions = self._fetch_opencode_db_rows( + "SELECT * FROM session ORDER BY time_updated DESC LIMIT 1", + [] + ) + if all_sessions and all_sessions[0].get("id") != self._session_id_filter: + # 发现更新的会话,更新过滤器 + self._session_id_filter = all_sessions[0].get("id") + return all_sessions[0] + + # ... rest of existing code ... 
+``` + +**1.3 修复 OpenCode 状态更新不完整** +```python +# lib/opencode_comm.py, line ~1132 +state = dict(state) +state["session_updated"] = updated_i +# 同步更新所有状态字段 +current_assistants = [m for m in self._read_messages(current_session_id) + if m.get("role") == "assistant" and isinstance(m.get("id"), str)] +state["assistant_count"] = len(current_assistants) +if current_assistants: + latest = current_assistants[-1] + state["last_assistant_id"] = latest.get("id") + completed = (latest.get("time") or {}).get("completed") + try: + state["last_assistant_completed"] = int(completed) if completed is not None else None + except Exception: + state["last_assistant_completed"] = None +``` + +### 阶段 2: 高优先级修复(短期内实施) + +**2.1 添加降级完成检测** +```python +# lib/askd/adapters/opencode.py, after line ~196 +# 如果超时但有回复,检查是否有任何 CCB_DONE 标记 +if not done_seen and chunks: + combined = "\n".join(chunks) + if "CCB_DONE:" in combined: + _write_log(f"[WARN] Found CCB_DONE but req_id mismatch for req_id={task.req_id}") + # 降级模式:接受任何 CCB_DONE + done_seen = True + done_ms = _now_ms() - started_ms +``` + +**2.2 改进 Gemini 会话绑定** +```python +# lib/gemini_comm.py, line ~355 +# 首次绑定时也检查 hash 匹配 +def _scan_latest_session(self) -> Optional[Path]: + # ... existing code ... + + # 如果在实例模式下,验证 hash 匹配 + if self._instance_mode and latest_path: + expected_hash = self._get_project_hash() + if expected_hash and expected_hash not in str(latest_path): + _debug(f"[WARN] Session hash mismatch, skipping {latest_path}") + return None + + return latest_path +``` + +**2.3 修复 notify_completion 阻塞** +```python +# lib/completion_hook.py +# 移除 join,让线程真正异步运行 +def _run_hook_async(...): + # ... existing code ... 
+ + thread = threading.Thread(target=_run, daemon=False) + thread.start() + # 移除: thread.join(timeout=65) + # 让线程真正在后台运行 +``` + +### 阶段 3: 中期改进 + +**3.1 添加宽松 req_id 匹配** +```python +# lib/ccb_protocol.py +def is_done_text_relaxed(text: str, req_id: str) -> bool: + # 首先尝试严格匹配 + if is_done_text(text, req_id): + return True + + # 宽松匹配:检查日期部分 + lines = [ln.rstrip() for ln in (text or "").splitlines()] + for i in range(len(lines) - 1, -1, -1): + if _is_trailing_noise_line(lines[i]): + continue + line = lines[i] + if line.strip().startswith("CCB_DONE:"): + parts = line.split(":", 1) + if len(parts) == 2: + found_req_id = parts[1].strip() + # 检查日期部分 (YYYYMMDD) + if len(req_id) >= 8 and len(found_req_id) >= 8: + if req_id[:8] == found_req_id[:8]: + return True + break + return False +``` + +**3.2 改进 LLM 提示格式** +```python +# lib/oaskd_protocol.py +def wrap_opencode_prompt(message: str, req_id: str) -> str: + message = (message or "").rstrip() + return ( + f"[SYSTEM] Request ID: {req_id}\n\n" + f"{message}\n\n" + "CRITICAL INSTRUCTIONS:\n" + "1. Process the request and reply in English\n" + "2. At the END of your response, add this EXACT line:\n" + f" CCB_DONE: {req_id}\n" + "3. Do NOT modify the Request ID\n" + "4. The CCB_DONE line must be the LAST line\n" + ) +``` + +**3.3 修复取消检测** +```python +# lib/opencode_comm.py, line ~33 +# 更新正则以匹配新的 req_id 格式 +_REQ_ID_RE = re.compile(r"\d{8}-\d{6}-\d{3}-\d+-\d+") +``` + +### 阶段 4: 长期优化 + +**4.1 增强错误处理** +- 将 `except Exception: pass` 替换为具体的异常处理和日志 +- 添加错误状态返回,而不是静默失败 + +**4.2 添加调试日志** +```python +# 添加环境变量控制的调试模式 +CCB_DEBUG_OPENCODE_STATE=1 # OpenCode 状态跟踪 +CCB_DEBUG_GEMINI_SESSION=1 # Gemini 会话绑定 +CCB_DEBUG_COMPLETION=1 # 完成检测 +``` + +**4.3 添加监控指标** +- 完成率 +- 超时率 +- req_id 不匹配率 +- 平均响应时间 + +## 推荐实施顺序(基于风险和影响) + +1. **立即修复**: 方案 1 (OpenCode 状态更新) - 解决第二次调用失败的根本原因 +2. **短期修复**: 方案 2 (宽松 req_id 匹配) - 提高容错性 +3. **中期改进**: 方案 3 (改进提示) - 减少 LLM 错误 +4. 
**长期优化**: 方案 4 (降级机制) + 方案 5 (调试日志) + +## 测试验证计划 + +### 回归测试列表 + +**测试 1: Daemon 启动稳定性** +```bash +# 测试在没有 parent PID 的情况下启动 daemon +unset PPID +ccb -r # 应该成功启动而不崩溃 +``` + +**测试 2: OpenCode 第二次调用** +```bash +# 第一次调用 +CCB_CALLER=claude ask opencode "Test 1" +pend opencode # 应该成功返回 + +# 第二次调用(关键测试) +CCB_CALLER=claude ask opencode "Test 2" +pend opencode # 应该成功返回,不应该超时 +``` + +**测试 3: Gemini 稳定性** +```bash +# 多次调用测试 +for i in {1..5}; do + CCB_CALLER=claude ask gemini "Test $i" + sleep 2 + pend gemini +done +# 所有调用都应该成功返回 +``` + +**测试 4: req_id 不匹配降级** +```bash +# 手动测试:让 LLM 返回错误的 req_id +# 应该在日志中看到 WARN 但仍然完成 +CCB_DEBUG_COMPLETION=1 CCB_CALLER=claude ask opencode "Reply with CCB_DONE: 12345678-000000-000-00000-0" +``` + +**测试 5: 并发请求** +```bash +# 测试多个并发请求 +CCB_CALLER=claude ask gemini "Task 1" & +CCB_CALLER=claude ask opencode "Task 2" & +CCB_CALLER=claude ask codex "Task 3" & +wait +# 所有任务都应该完成 +``` + +### 性能基准 + +修复前后对比: +- **完成率**: 目标 > 95%(当前 < 50% for OpenCode 第二次调用) +- **平均响应时间**: 目标 < 30 秒(当前可能超时 3600 秒) +- **第二次调用成功率**: 目标 100%(当前 0%) +- **Daemon 启动成功率**: 目标 100%(当前有崩溃) + +## 四个报告症状的映射 + +基于综合分析,四个症状的根本原因: + +1. **"完成了但没有返回完成信息"** + - 根本原因: 严格的 `done_seen` 检查 + 没有降级路径 + - 修复: 阶段 2.1 (降级完成检测) + +2. **"OpenCode 第二次调用一定没有返回"** + - 根本原因: 会话 ID 固定 + DB 优先查找 + - 修复: 阶段 1.2 (会话 ID 更新) + 阶段 1.3 (状态同步) + +3. **"Gemini 有时候会出现这个问题"** + - 根本原因: 会话绑定风险 + 严格标记要求 + - 修复: 阶段 2.2 (会话绑定) + 阶段 2.1 (降级检测) + +4. **"之前还有 daemon 无法启动的问题"** + - 根本原因: `_parent_monitor` 无条件启动 + - 修复: 阶段 1.1 (条件启动) + +## 协作分析总结 + +### Gemini 的贡献 +- 识别了 `done_seen` 检测机制 +- 分析了 `is_done_text` 的严格匹配要求 +- 提出了 req_id 不匹配的可能原因 + +### OpenCode 的贡献 +- 发现了 `_read_since` 状态更新不完整的关键缺陷 +- 详细分析了第二次调用失败的场景 +- 提供了状态同步的具体修复方案 + +### Codex 的贡献 +- 进行了端到端的代码审查 +- 识别了 6 个具体的 bug 及其位置 +- 评估了状态管理、并发安全、错误处理和超时机制 +- 提供了按严重程度排序的问题列表 + +### Claude 的贡献 +- 协调多模型协作分析 +- 整合所有发现到统一报告 +- 提供分阶段的修复计划 +- 设计测试验证方案 + +## 下一步行动建议 + +1. **立即**: 实施阶段 1 的三个关键修复 +2. **本周**: 实施阶段 2 的高优先级修复 +3. **本月**: 完成阶段 3 的中期改进 +4. 
**持续**: 添加阶段 4 的监控和日志 + +## 相关文件(按修改优先级) + +- `lib/opencode_comm.py` - OpenCode 日志读取器 +- `lib/gemini_comm.py` - Gemini 日志读取器 +- `lib/ccb_protocol.py` - 协议定义和检测函数 +- `lib/oaskd_protocol.py` - OpenCode 提示包装 +- `lib/askd/adapters/opencode.py` - OpenCode 适配器 +- `lib/askd/adapters/gemini.py` - Gemini 适配器 +- `lib/completion_hook.py` - 完成通知钩子 + diff --git a/PR_MINIMAL_FIX.md b/PR_MINIMAL_FIX.md new file mode 100644 index 0000000..317d6d5 --- /dev/null +++ b/PR_MINIMAL_FIX.md @@ -0,0 +1,85 @@ +# Pull Request: 修复异步通信卡住问题 + +## 问题描述 + +在使用 CCB Multi 时,发现以下问题: +1. OpenCode 第二次调用一定失败,一直显示 "processing" +2. Gemini 有时会出现类似问题 +3. 当 LLM 返回的 `CCB_DONE:` 标记中的 req_id 不匹配时,永远不会触发完成通知 + +## 根本原因 + +经过多模型协作分析(Claude + Gemini + OpenCode + Codex),发现三个关键问题: + +1. **OpenCode 会话 ID 固定**: `_get_latest_session_from_db()` 在设置了 `session_id_filter` 后,会跳过所有新会话,导致第二次调用轮询错误的会话 +2. **状态更新不完整**: `_read_since()` 只更新 `session_updated` 时间戳,但不更新 `assistant_count` 等状态字段,导致使用过时状态进行比较 +3. **完成检测过于严格**: 当 req_id 不完全匹配时,`done_seen` 永远为 False,不会触发完成通知 + +## 修复内容 + +### 1. 修复 OpenCode 会话 ID 固定问题 + +**文件**: `lib/opencode_comm.py` +**位置**: `_get_latest_session_from_db()` 方法 + +**修改**: +- 跟踪最新的未过滤会话 +- 如果发现比过滤会话更新的会话,使用新会话 +- 这允许检测到第二次调用时创建的新会话 + +### 2. 修复状态更新不完整问题 + +**文件**: `lib/opencode_comm.py` +**位置**: `_read_since()` 方法,line ~1132-1134 + +**修改**: +- 在更新 `session_updated` 时,同时更新所有状态字段 +- 包括 `assistant_count`, `last_assistant_id`, `last_assistant_completed`, `last_assistant_has_done` +- 防止第二次调用使用过时的状态进行比较 + +### 3. 添加降级完成检测 + +**文件**: +- `lib/askd/adapters/opencode.py` +- `lib/askd/adapters/gemini.py` + +**修改**: +- 在超时后,如果回复中包含任何 `CCB_DONE:` 标记,接受为完成 +- 记录 WARN 日志,显示期望的和实际的 req_id +- 这提供了一个降级路径,即使 req_id 不匹配也能完成 + +## 测试 + +运行测试脚本: +```bash +./test_minimal_fix.sh +``` + +预期结果: +1. OpenCode 第二次调用成功 +2. Gemini 稳定返回 +3. 
即使 req_id 不匹配,也能完成(会有 WARN 日志) + +## 影响范围 + +- **最小化修改**: 只修改了关键的三个位置 +- **向后兼容**: 不影响现有功能 +- **降级安全**: 降级检测只在严格匹配失败后才触发 + +## 相关 Issue + +解决了以下问题: +- OpenCode 第二次调用失败 +- Gemini 间歇性失败 +- req_id 不匹配导致的永久 "processing" 状态 + +## 后续工作 + +这是最小修复集。完整的修复计划包括: +- 修复 daemon 启动崩溃问题 +- 改进 Gemini 会话绑定 +- 修复 notify_completion 阻塞 +- 改进错误处理和日志 +- 添加监控指标 + +详细分析报告见:`ISSUE_ANALYSIS.md` diff --git a/README.md b/README.md index 4eb5d7d..c5d1a12 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,16 @@
-# Claude Code Bridge (ccb) v5.2.6 +# CCB Multi -**Multi-Model Collaboration via Split-Pane Terminal** -**Claude · Codex · Gemini · OpenCode · Droid** -**Lightweight async messaging — full CLI power, every interaction visible** +**Claude Code Bridge — Multi-Instance Edition** -

- Every Interaction Visible - Every Model Controllable -

+Run multiple CCB instances in parallel, with LLM communication fixes included. -[![Version](https://img.shields.io/badge/version-5.2.6-orange.svg)]() +[![Version](https://img.shields.io/badge/version-1.0.0-orange.svg)]() [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) -[![CI](https://github.com/bfly123/claude_code_bridge/actions/workflows/test.yml/badge.svg)](https://github.com/bfly123/claude_code_bridge/actions/workflows/test.yml) [![Platform](https://img.shields.io/badge/platform-Linux%20%7C%20macOS%20%7C%20Windows-lightgrey.svg)]() -**English** | [Chinese](README_zh.md) - ![Showcase](assets/show.png)
@@ -34,924 +26,209 @@ --- -**Introduction:** Multi-model collaboration avoids model bias, cognitive blind spots, and context limits. Unlike MCP or API-based approaches, ccb gives you a WYSIWYG split-pane terminal where every interaction is visible and every model is controllable. - -## ⚡ Why ccb? +## What is CCB Multi? -| Feature | Benefit | -| :--- | :--- | -| **🖥️ Visual & Controllable** | Multiple AI models in split-pane CLI. See everything, control everything. | -| **🧠 Persistent Context** | Each AI maintains its own memory. Close and resume anytime (`-r` flag). | -| **📉 Token Savings** | Sends lightweight prompts instead of full file history. | -| **🪟 Native Workflow** | Integrates directly into **WezTerm** (recommended) or tmux. No complex servers required. | +An enhanced fork of [Claude Code Bridge (CCB)](https://github.com/bfly123/claude_code_bridge) that adds **multi-instance concurrent execution** and includes several upstream-unmerged LLM communication fixes. If you need to run multiple CCB sessions in the same project simultaneously, this is for you. --- -

🚀 What's New

- -
-v5.2.6 - Async Communication & Gemini 0.29 Compatibility - -**🔧 Gemini CLI 0.29.0 Support:** -- **Dual Hash Strategy**: Session path discovery now supports both basename and SHA-256 formats -- **Autostart**: `ccb-ping` and `ccb-mounted` gain `--autostart` flag to launch offline provider daemons -- **Cleanup Tool**: New `ccb-cleanup` utility for removing zombie daemons and stale state files - -**🔗 Async Communication Fixes:** -- **OpenCode Deadlock**: Fixed session ID pinning that caused second async call to always fail -- **Degraded Completion**: Adapters now accept `CCB_DONE` even when req_id doesn't match exactly -- **req_id Regex**: `opencode_comm.py` now matches both old hex and new timestamp-based formats -- **Gemini Idle Timeout**: Auto-detect reply completion when Gemini omits `CCB_DONE` marker (15s idle, configurable via `CCB_GEMINI_IDLE_TIMEOUT`) -- **Gemini Prompt Hardening**: Stronger instructions to reduce `CCB_DONE` omission rate - -**🛠 Other Fixes:** -- **lpend**: Prefers fresh Claude session path when registry is stale -- **mail setup**: Unblocked `ccb mail setup` import on config v3 - -
- -
-v5.2.5 - Async Guardrail Hardening - -**🔧 Async Turn-Stop Fix:** -- **Global Guardrail**: Added mandatory `Async Guardrail` rule to `claude-md-ccb.md` — covers both `/ask` skill and direct `Bash(ask ...)` calls -- **Marker Consistency**: `bin/ask` now emits `[CCB_ASYNC_SUBMITTED provider=xxx]` matching all other provider scripts -- **DRY Skills**: Ask skill rules reference global guardrail with local fallback, single source of truth - -This fix prevents Claude from polling/sleeping after submitting async tasks. - -
- -
-v5.2.3 - Project-Local History & Legacy Compatibility - -**📂 Project-Local History:** -- **Local Storage**: Auto context exports now save to `./.ccb/history/` per project -- **Safe Scope**: Auto transfer runs only for the current working directory -- **Claude /continue**: New skill to attach the latest history file via `@` - -**🧩 Legacy Compatibility:** -- **Auto Migration**: `.ccb_config` is detected and upgraded to `.ccb` when possible -- **Fallback Lookup**: Legacy sessions still resolve cleanly during transition - -These changes keep handoff artifacts scoped to the project and make upgrades smoother. - -
- -
-v5.2.2 - Session Switch Capture & Context Transfer +## Differences from Upstream CCB -**🔁 Session Switch Tracking:** -- **Old Session Fields**: `.claude-session` now records `old_claude_session_id` / `old_claude_session_path` with `old_updated_at` -- **Auto Context Export**: Previous Claude session is automatically extracted to `./.ccb/history/claude--.md` -- **Cleaner Transfers**: Noise filtering removes protocol markers and guardrails while keeping tool-only actions +| Feature | Upstream CCB | CCB Multi | +| :--- | :---: | :---: | +| Multi-instance concurrent execution | ❌ | ✅ | +| Gemini CLI 0.29.0 compatibility | ❌ | ✅ | +| Daemon `work_dir` decoupling | ❌ | ✅ | +| Dead-thread detection | ❌ | ✅ | +| Instance dir collision prevention | ❌ | ✅ | -These updates make session handoff more reliable and easier to audit. - -
- -
-v5.2.1 - Enhanced Ask Command Stability - -**🔧 Stability Improvements:** -- **Watchdog File Monitoring**: Real-time session updates with efficient file watching -- **Mandatory Caller Field**: Improved request tracking and routing reliability -- **Unified Execution Model**: Simplified ask skill execution across all platforms -- **Auto-Dependency Installation**: Watchdog library installed automatically during setup -- **Session Registry**: Enhanced Claude adapter with automatic session monitoring - -These improvements significantly enhance the reliability of cross-AI communication and reduce session binding failures. - -
- -
-v5.2.0 - Email Integration for Remote AI Access - -**📧 New Feature: Mail Service** -- **Email-to-AI Gateway**: Send emails to interact with AI providers remotely -- **Multi-Provider Support**: Gmail, Outlook, QQ, 163 mail presets -- **Provider Routing**: Use body prefix to target specific AI (e.g., `CLAUDE: your question`) -- **Real-time Polling**: IMAP IDLE support for instant email detection -- **Secure Credentials**: System keyring integration for password storage -- **Mail Daemon**: Background service (`maild`) for continuous email monitoring - -See [Mail System Configuration](#-mail-system-configuration) for setup instructions. - -
- -
-v5.1.3 - Tmux Claude Ask Stability - -**🔧 Fixes & Improvements:** -- **tmux Claude ask**: read replies from pane output with automatic pipe-pane logging for more reliable completion - -See [CHANGELOG.md](CHANGELOG.md) for full details. - -
- -
-v5.1.2 - Daemon & Hooks Reliability - -**🔧 Fixes & Improvements:** -- **Claude Completion Hook**: Unified askd now triggers completion hook for Claude -- **askd Lifecycle**: askd is bound to CCB lifecycle to avoid stale daemons -- **Mounted Detection**: `ccb-mounted` uses ping-based detection across all platforms -- **State File Lookup**: `askd_client` falls back to `CCB_RUN_DIR` for daemon state files - -See [CHANGELOG.md](CHANGELOG.md) for full details. - -
- -
-v5.1.1 - Unified Daemon + Bug Fixes - -**🔧 Bug Fixes & Improvements:** -- **Unified Daemon**: All providers now use unified askd daemon architecture -- **Install/Uninstall**: Fixed installation and uninstallation bugs -- **Process Management**: Fixed kill/termination issues - -See [CHANGELOG.md](CHANGELOG.md) for full details. - -
- -
-v5.1.0 - Unified Command System + Windows WezTerm Support - -**🚀 Unified Commands** - Replace provider-specific commands with unified interface: +--- -| Old Commands | New Unified Command | -|--------------|---------------------| -| `cask`, `gask`, `oask`, `dask`, `lask` | `ask ` | -| `cping`, `gping`, `oping`, `dping`, `lping` | `ccb-ping ` | -| `cpend`, `gpend`, `opend`, `dpend`, `lpend` | `pend [N]` | +## What's Fixed -**Supported providers:** `gemini`, `codex`, `opencode`, `droid`, `claude` +### Gemini CLI 0.29.0 Deadlock +Gemini CLI 0.29.0 changed session storage from SHA-256 hash to directory basename (`~/.gemini/tmp//`). CCB Multi scans both formats and auto-adopts the active one, preventing session hangs. -**🪟 Windows WezTerm + PowerShell Support:** -- Full native Windows support with WezTerm terminal -- Background execution using PowerShell + `DETACHED_PROCESS` -- WezTerm CLI integration with stdin for large payloads -- UTF-8 BOM handling for PowerShell compatibility +### Daemon work_dir Decoupling +`bin/askd` now accepts `--work-dir` (or `CCB_WORK_DIR` env) to decouple the daemon's project root from the launch directory. `bin/ask` validates the daemon's `work_dir` and falls back to `cwd` with a warning if missing. -**📦 New Skills:** -- `/ask ` - Request to AI provider (background by default) -- `/cping ` - Test provider connectivity -- `/pend [N]` - View latest provider reply +### Worker Pool Robustness +- `GeminiLogReader` maintains `_all_known_hashes` set that survives hash format transitions +- Instance mode blocks cross-hash session override to prevent contamination between projects -See [CHANGELOG.md](CHANGELOG.md) for full details. +### Instance Directory Basename Collision +Changed from `instance-N` to `inst--N` format (8-char SHA-256 of project root) to prevent cross-project collisions in Gemini CLI's basename-based storage. Old `instance-N` directories are still recognized for backward compatibility. -
+--- -
-v5.0.6 - Zombie session cleanup + mounted skill optimization +## Multi-Instance Usage -- **Zombie Cleanup**: `ccb kill -f` now cleans up orphaned tmux sessions globally (sessions whose parent process has exited) -- **Mounted Skill**: Optimized to use `pgrep` for daemon detection (~4x faster), extracted to standalone `ccb-mounted` script -- **Droid Skills**: Added full skill set (cask/gask/lask/oask + ping/pend variants) to `droid_skills/` -- **Install**: Added `install_droid_skills()` to install Droid skills to `~/.droid/skills/` +### Quick Start -
+```bash +# Start instance 1 with Gemini +ccb-multi 1 gemini -
-v5.0.5 - Droid delegation tools + setup +# Start instance 2 with Codex (in another terminal) +ccb-multi 2 codex -- **Droid**: Adds delegation tools (`ccb_ask_*` plus `cask/gask/lask/oask` aliases). -- **Setup**: New `ccb droid setup-delegation` command for MCP registration. -- **Installer**: Auto-registers Droid delegation when `droid` is detected (opt-out via env). +# Start instance 3 with Claude (in another terminal) +ccb-multi 3 claude -
-Details & usage +# Check all instance status +ccb-multi-status -Usage: -``` -/all-plan -``` +# View history +ccb-multi-history -Example: +# Clean up stale instances +ccb-multi-clean ``` -/all-plan Design a caching layer for the API with Redis -``` - -Highlights: -- Socratic Ladder + Superpowers Lenses + Anti-pattern analysis. -- Availability-gated dispatch (use only mounted CLIs). -- Two-round reviewer refinement with merged design. - -
-
- -
-v5.0.0 - Any AI as primary driver - -- **Claude Independence**: No need to start Claude first; Codex can act as the primary CLI. -- **Unified Control**: Single entry point controls Claude/OpenCode/Gemini. -- **Simplified Launch**: Dropped `ccb up`; use `ccb ...` or the default `ccb.config`. -- **Flexible Mounting**: More flexible pane mounting and session binding. -- **Default Config**: Auto-create `ccb.config` when missing. -- **Daemon Autostart**: `caskd`/`laskd` auto-start in WezTerm/tmux when needed. -- **Session Robustness**: PID liveness checks prevent stale sessions. - -
- -
-v4.0 - tmux-first refactor - -- **Full Refactor**: Cleaner structure, better stability, and easier extension. -- **Terminal Backend Abstraction**: Unified terminal layer (`TmuxBackend` / `WeztermBackend`) with auto-detection and WSL path handling. -- **Perfect tmux Experience**: Stable layouts + pane titles/borders + session-scoped theming. -- **Works in Any Terminal**: If your terminal can run tmux, CCB can provide the full multi-model split experience (except native Windows; WezTerm recommended; otherwise just use tmux). - -
- -
-v3.0 - Smart daemons - -- **True Parallelism**: Submit multiple tasks to Codex, Gemini, or OpenCode simultaneously. -- **Cross-AI Orchestration**: Claude and Codex can now drive OpenCode agents together. -- **Bulletproof Stability**: Daemons auto-start on first request and stop after idle. -- **Chained Execution**: Codex can delegate to OpenCode for multi-step workflows. -- **Smart Interruption**: Gemini tasks handle interruption safely. - -
-Details - -
- -![Parallel](https://img.shields.io/badge/Strategy-Parallel_Queue-blue?style=flat-square) -![Stability](https://img.shields.io/badge/Daemon-Auto_Managed-green?style=flat-square) -![Interruption](https://img.shields.io/badge/Gemini-Interruption_Aware-orange?style=flat-square) - -
- -

✨ Key Features

-- **🔄 True Parallelism**: Submit multiple tasks to Codex, Gemini, or OpenCode simultaneously. The new daemons (`caskd`, `gaskd`, `oaskd`) automatically queue and execute them serially, ensuring no context pollution. -- **🤝 Cross-AI Orchestration**: Claude and Codex can now simultaneously drive OpenCode agents. All requests are arbitrated by the unified daemon layer. -- **🛡️ Bulletproof Stability**: Daemons are self-managing—they start automatically on the first request and shut down after 60s of idleness to save resources. -- **⚡ Chained Execution**: Advanced workflows supported! Codex can autonomously call `oask` to delegate sub-tasks to OpenCode models. -- **🛑 Smart Interruption**: Gemini tasks now support intelligent interruption detection, automatically handling stops and ensuring workflow continuity. +### Instance Directory Format -

🧩 Feature Support Matrix

+Instances are created under `.ccb-instances/` in the project root: -| Feature | `caskd` (Codex) | `gaskd` (Gemini) | `oaskd` (OpenCode) | -| :--- | :---: | :---: | :---: | -| **Parallel Queue** | ✅ | ✅ | ✅ | -| **Interruption Awareness** | ✅ | ✅ | - | -| **Response Isolation** | ✅ | ✅ | ✅ | - -
-📊 View Real-world Stress Test Results - -
- -**Scenario 1: Claude & Codex Concurrent Access to OpenCode** -*Both agents firing requests simultaneously, perfectly coordinated by the daemon.* - -| Source | Task | Result | Status | -| :--- | :--- | :--- | :---: | -| 🤖 Claude | `CLAUDE-A` | **CLAUDE-A** | 🟢 | -| 🤖 Claude | `CLAUDE-B` | **CLAUDE-B** | 🟢 | -| 💻 Codex | `CODEX-A` | **CODEX-A** | 🟢 | -| 💻 Codex | `CODEX-B` | **CODEX-B** | 🟢 | - -**Scenario 2: Recursive/Chained Calls** -*Codex autonomously driving OpenCode for a 5-step workflow.* - -| Request | Exit Code | Response | -| :--- | :---: | :--- | -| **ONE** | `0` | `CODEX-ONE` | -| **TWO** | `0` | `CODEX-TWO` | -| **THREE** | `0` | `CODEX-THREE` | -| **FOUR** | `0` | `CODEX-FOUR` | -| **FIVE** | `0` | `CODEX-FIVE` | - -
-
-
- ---- - -## 🚀 Quick Start - -**Step 1:** Install [WezTerm](https://wezfurlong.org/wezterm/) (native `.exe` for Windows) - -**Step 2:** Choose installer based on your environment: - -
-Linux - -```bash -git clone https://github.com/bfly123/claude_code_bridge.git -cd claude_code_bridge -./install.sh install ``` - -
- -
-macOS - -```bash -git clone https://github.com/bfly123/claude_code_bridge.git -cd claude_code_bridge -./install.sh install +.ccb-instances/ + inst-a1b2c3d4-1/ # inst-- + inst-a1b2c3d4-2/ + instance-3/ # Old format: still recognized ``` -> **Note:** If commands not found after install, see [macOS Troubleshooting](#-macos-installation-guide). +The `` is an 8-char SHA-256 of the project root path, ensuring globally unique basenames across projects. -
+### Environment Variables -
-WSL (Windows Subsystem for Linux) - -> Use this if your Claude/Codex/Gemini runs in WSL. +| Variable | Description | +| :--- | :--- | +| `CCB_INSTANCE_ID` | Instance number (1, 2, 3, ...) | +| `CCB_PROJECT_ROOT` | Original project root path | +| `CCB_WORK_DIR` | Override daemon's working directory | -> **⚠️ WARNING:** Do NOT install or run ccb as root/administrator. Switch to a normal user first (`su - username` or create one with `adduser`). +### Concurrent LLM Requests Within an Instance ```bash -# Run inside WSL terminal (as normal user, NOT root) -git clone https://github.com/bfly123/claude_code_bridge.git -cd claude_code_bridge -./install.sh install -``` - -
+# Send async requests to multiple LLMs +CCB_CALLER=claude ask gemini "task 1" & +CCB_CALLER=claude ask codex "task 2" & +CCB_CALLER=claude ask opencode "task 3" & +wait -
-Windows Native - -> Use this if your Claude/Codex/Gemini runs natively on Windows. - -```powershell -git clone https://github.com/bfly123/claude_code_bridge.git -cd claude_code_bridge -powershell -ExecutionPolicy Bypass -File .\install.ps1 install +# Check results +pend gemini +pend codex +pend opencode ``` -- The installer prefers `pwsh.exe` (PowerShell 7+) when available, otherwise `powershell.exe`. -- If a WezTerm config exists, the installer will try to set `config.default_prog` to PowerShell (adds a `-- CCB_WEZTERM_*` block and will prompt before overriding an existing `default_prog`). - -
+--- -### Run -```bash -ccb # Start providers from ccb.config (default: all four) -ccb codex gemini # Start both -ccb codex gemini opencode claude # Start all four (spaces) -ccb codex,gemini,opencode,claude # Start all four (commas) -ccb -r codex gemini # Resume last session for Codex + Gemini -ccb -a codex gemini opencode # Auto-approval mode with multiple providers -ccb -a -r codex gemini opencode claude # Auto + resume for all providers - -tmux tip: CCB's tmux status/pane theming is enabled only while CCB is running. - -Layout rule: the last provider runs in the current pane. Extras are ordered as `[cmd?, reversed providers]`; the first extra goes to the top-right, then the left column fills top-to-bottom, then the right column fills top-to-bottom. Examples: 4 panes = left2/right2, 5 panes = left2/right3. -Note: `ccb up` is removed; use `ccb ...` or configure `ccb.config`. -``` +## Process Management -### Flags -| Flag | Description | Example | -| :--- | :--- | :--- | -| `-r` | Resume previous session context | `ccb -r` | -| `-a` | Auto-mode, skip permission prompts | `ccb -a` | -| `-h` | Show help information | `ccb -h` | -| `-v` | Show version and check for updates | `ccb -v` | - -### ccb.config -Default lookup order: -- `.ccb/ccb.config` (project) -- `~/.ccb/ccb.config` (global) - -Simple format (recommended): -```text -codex,gemini,opencode,claude -``` +### List Running Daemons -Enable cmd pane (default title/command): -```text -codex,gemini,opencode,claude,cmd -``` +```bash +# Simple list +ccb-cleanup --list -Advanced JSON (optional, for flags or custom cmd pane): -```json -{ - "providers": ["codex", "gemini", "opencode", "claude"], - "cmd": { "enabled": true, "title": "CCB-Cmd", "start_cmd": "bash" }, - "flags": { "auto": false, "resume": false } -} +# Detailed info (work_dir, port, host) +ccb-cleanup --list -v ``` -Cmd pane participates in the layout as the first extra pane and does not change which AI runs in the current pane. 
-### Update -```bash -ccb update # Update ccb to the latest version -ccb update 4 # Update to the highest v4.x.x version -ccb update 4.1 # Update to the highest v4.1.x version -ccb update 4.1.2 # Update to specific version v4.1.2 -ccb uninstall # Uninstall ccb and clean configs -ccb reinstall # Clean then reinstall ccb +Example output: ``` - ---- - -
-🪟 Windows Installation Guide (WSL vs Native) - -> **Key Point:** `ccb/cask/cping/cpend` must run in the **same environment** as `codex/gemini`. The most common issue is environment mismatch causing `cping` to fail. - -Note: The installers also install OS-specific `SKILL.md` variants for Claude/Codex skills: -- Linux/macOS/WSL: bash heredoc templates (`SKILL.md.bash`) -- Native Windows: PowerShell here-string templates (`SKILL.md.powershell`) - -### 1) Prerequisites: Install Native WezTerm - -- Install Windows native WezTerm (`.exe` from official site or via winget), not the Linux version inside WSL. -- Reason: `ccb` in WezTerm mode relies on `wezterm cli` to manage panes. - -### 2) How to Identify Your Environment - -Determine based on **how you installed/run Claude Code/Codex**: - -- **WSL Environment** - - You installed/run via WSL terminal (Ubuntu/Debian) using `bash` (e.g., `curl ... | bash`, `apt`, `pip`, `npm`) - - Paths look like: `/home//...` and you may see `/mnt/c/...` - - Verify: `cat /proc/version | grep -i microsoft` has output, or `echo $WSL_DISTRO_NAME` is non-empty - -- **Native Windows Environment** - - You installed/run via Windows Terminal / WezTerm / PowerShell / CMD (e.g., `winget`, PowerShell scripts) - - Paths look like: `C:\Users\\...` - -### 3) WSL Users: Configure WezTerm to Auto-Enter WSL - -Edit WezTerm config (`%USERPROFILE%\.wezterm.lua`): - -```lua -local wezterm = require 'wezterm' -return { - default_domain = 'WSL:Ubuntu', -- Replace with your distro name -} +=== Running askd daemons === + PID 26639 (parent 26168) - OK + Project: ad4f88fa5c5269a3 + Started: 2026-02-19 10:05:35 + Work Dir: /Users/user/project/.ccb-instances/instance-1 + Port: 65108 + Host: 127.0.0.1 ``` -Check distro name with `wsl -l -v` in PowerShell. 
- -### 4) Troubleshooting: `cping` Not Working - -- **Most common:** Environment mismatch (ccb in WSL but codex in native Windows, or vice versa) -- **Codex session not running:** Run `ccb codex` (or add codex to ccb.config) first -- **WezTerm CLI not found:** Ensure `wezterm` is in PATH -- **Terminal not refreshed:** Restart WezTerm after installation -- **Text sent but not submitted (no Enter) on Windows WezTerm:** Set `CCB_WEZTERM_ENTER_METHOD=key` and ensure your WezTerm supports `wezterm cli send-key` - -
- -
-🍎 macOS Installation Guide - -### Command Not Found After Installation - -If `ccb`, `cask`, `cping` commands are not found after running `./install.sh install`: - -**Cause:** The install directory (`~/.local/bin`) is not in your PATH. - -**Solution:** +### Kill Specific Daemon ```bash -# 1. Check if install directory exists -ls -la ~/.local/bin/ - -# 2. Check if PATH includes the directory -echo $PATH | tr ':' '\n' | grep local - -# 3. Check shell config (macOS defaults to zsh) -cat ~/.zshrc | grep local - -# 4. If not configured, add manually -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.zshrc +# Kill by PID +ccb-cleanup --kill-pid 26639 -# 5. Reload config -source ~/.zshrc +# Interactive selection +ccb-cleanup -i ``` -### WezTerm Not Detecting Commands +Interactive mode shows a numbered list of daemons and prompts for selection with confirmation. -If WezTerm cannot find ccb commands but regular Terminal can: - -- WezTerm may use a different shell config -- Add PATH to `~/.zprofile` as well: +### Cleanup Operations ```bash -echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.zprofile -``` - -Then restart WezTerm completely (Cmd+Q, reopen). - -
- ---- - -## 🗣️ Usage - -Once started, collaborate naturally. Claude will detect when to delegate tasks. - -**Common Scenarios:** - -- **Code Review:** *"Have Codex review the changes in `main.py`."* -- **Second Opinion:** *"Ask Gemini for alternative implementation approaches."* -- **Pair Programming:** *"Codex writes the backend logic, I'll handle the frontend."* -- **Architecture:** *"Let Codex design the module structure first."* -- **Info Exchange:** *"Fetch 3 rounds of Codex conversation and summarize."* - -### 🎴 Fun & Creative: AI Poker Night! - -> *"Let Claude, Codex and Gemini play Dou Di Zhu! You deal the cards, everyone plays open hand!"* -> -> 🃏 Claude (Landlord) vs 🎯 Codex + 💎 Gemini (Farmers) - -> **Note:** Manual commands (like `cask`, `cping`) are usually invoked by Claude automatically. See Command Reference for details. - ---- - -## 🛠️ Unified Command System - -### Legacy Commands (Deprecated) -- `cask/gask/oask/dask/lask` - Independent ask commands per provider -- `cping/gping/oping/dping/lping` - Independent ping commands -- `cpend/gpend/opend/dpend/lpend` - Independent pend commands - -### Unified Commands -- **`ask `** - Unified request (background by default) - - Supports: `gemini`, `codex`, `opencode`, `droid`, `claude` - - Defaults to background; managed Codex sessions prefer foreground to avoid cleanup - - Override with `--foreground` / `--background` or `CCB_ASK_FOREGROUND=1` / `CCB_ASK_BACKGROUND=1` - - Foreground uses sync send and disables completion hook unless `CCB_COMPLETION_HOOK_ENABLED` is set - - Supports `--notify` for short synchronous notifications - - Supports `CCB_CALLER` (default: `codex` in Codex sessions, otherwise `claude`) - -- **`ccb-ping `** - Unified connectivity test - - Checks if the specified provider's daemon is online - -- **`pend [N]`** - Unified reply fetch - - Fetches latest N replies from the provider - - Optional N specifies number of recent messages - -### Skills System -- `/ask ` - Request skill 
(background by default; foreground in managed Codex sessions) -- `/cping ` - Connectivity test skill -- `/pend ` - Reply fetch skill - -### Cross-Platform Support -- **Linux/macOS/WSL**: Uses `tmux` as terminal backend -- **Windows WezTerm**: Uses **PowerShell** as terminal backend -- **Windows PowerShell**: Native support via `DETACHED_PROCESS` background execution - -### Completion Hook -- Notifies caller upon task completion -- Supports `CCB_CALLER` targeting (`claude`/`codex`/`droid`) -- Compatible with both tmux and WezTerm backends - - Foreground ask suppresses the hook unless `CCB_COMPLETION_HOOK_ENABLED` is set - ---- +# Kill zombie daemons (parent process dead) +ccb-cleanup --kill-zombies -## 🧩 Skills - -- **/all-plan**: Collaborative multi-AI design with Superpowers brainstorming. - -
-/all-plan details & usage - -Usage: -``` -/all-plan -``` - -Example: -``` -/all-plan Design a caching layer for the API with Redis +# Clean stale state files and locks +ccb-cleanup --clean ``` -How it works: -1. **Requirement Refinement** - Socratic questioning to uncover hidden needs -2. **Parallel Independent Design** - Each AI designs independently (no groupthink) -3. **Comparative Analysis** - Merge insights, detect anti-patterns -4. **Iterative Refinement** - Cross-AI review and critique -5. **Final Output** - Actionable implementation plan - -Key features: -- **Socratic Ladder**: 7 structured questions for deep requirement mining -- **Superpowers Lenses**: Systematic alternative exploration (10x scale, remove dependency, invert flow) -- **Anti-pattern Detection**: Proactive risk identification across all designs - -When to use: -- Complex features requiring diverse perspectives -- Architectural decisions with multiple valid approaches -- High-stakes implementations needing thorough validation - -
+**Note**: The shell alias `ccb-kill` kills ALL CCB processes indiscriminately. Use `ccb-cleanup --kill-pid` for precise control. --- -## 📧 Mail System Configuration - -The mail system allows you to interact with AI providers via email, enabling remote access when you're away from your terminal. - -### How It Works - -1. **Send an email** to your CCB service mailbox -2. **Specify the AI provider** using a prefix in the email body (e.g., `CLAUDE: your question`) -3. **CCB routes the request** to the specified AI provider via the ASK system -4. **Receive the response** via email reply - -### Quick Setup +## Installation -**Step 1: Run the configuration wizard** -```bash -maild setup -``` - -**Step 2: Choose your email provider** -- Gmail -- Outlook -- QQ Mail -- 163 Mail -- Custom IMAP/SMTP - -**Step 3: Enter credentials** -- Service email address (CCB's mailbox) -- App password (not your regular password - see provider-specific instructions below) -- Target email (where to send replies) +### Option 1: Full Install (clone this repo) -**Step 4: Start the mail daemon** ```bash -maild start -``` - -### Configuration File - -Configuration is stored in `~/.ccb/mail/config.json`: - -```json -{ - "version": 3, - "enabled": true, - "service_account": { - "provider": "gmail", - "email": "your-ccb-service@gmail.com", - "imap": {"host": "imap.gmail.com", "port": 993, "ssl": true}, - "smtp": {"host": "smtp.gmail.com", "port": 587, "starttls": true} - }, - "target_email": "your-phone@example.com", - "default_provider": "claude", - "polling": { - "use_idle": true, - "idle_timeout": 300 - } -} +git clone https://github.com/daniellee2015/claude_code_bridge_multi.git +cd claude_code_bridge_multi +./install.sh install ``` -### Provider-Specific Setup - -
-Gmail - -1. Enable 2-Step Verification in your Google Account -2. Go to [App Passwords](https://myaccount.google.com/apppasswords) -3. Generate a new app password for "Mail" -4. Use this 16-character password (not your Google password) - -
+This installs the full CCB + multi-instance tooling. -
-Outlook / Office 365 - -1. Enable 2-Step Verification in your Microsoft Account -2. Go to [Security > App Passwords](https://account.live.com/proofs/AppPassword) -3. Generate a new app password -4. Use this password for CCB mail configuration +### Option 2: npm Package Only (with existing upstream CCB) -
+If you already have upstream CCB installed and only want the multi-instance CLI: -
-QQ Mail - -1. Log in to QQ Mail web interface -2. Go to Settings > Account -3. Enable IMAP/SMTP service -4. Generate an authorization code (授权码) -5. Use this authorization code as the password - -
- -
-163 Mail - -1. Log in to 163 Mail web interface -2. Go to Settings > POP3/SMTP/IMAP -3. Enable IMAP service -4. Set an authorization password (客户端授权密码) -5. Use this authorization password for CCB - -
- -### Email Format - -**Basic format:** -``` -Subject: Any subject (ignored) -Body: -CLAUDE: What is the weather like today? +```bash +npm install -g ccb-multi ``` -**Supported provider prefixes:** -- `CLAUDE:` or `claude:` - Route to Claude -- `CODEX:` or `codex:` - Route to Codex -- `GEMINI:` or `gemini:` - Route to Gemini -- `OPENCODE:` or `opencode:` - Route to OpenCode -- `DROID:` or `droid:` - Route to Droid +This installs `ccb-multi`, `ccb-multi-status`, `ccb-multi-history`, and `ccb-multi-clean` globally. +Source: [github.com/daniellee2015/ccb-multi](https://github.com/daniellee2015/ccb-multi) -If no prefix is specified, the request goes to the `default_provider` (default: `claude`). - -### Mail Daemon Commands +### Update & Uninstall ```bash -maild start # Start the mail daemon -maild stop # Stop the mail daemon -maild status # Check daemon status -maild config # Show current configuration -maild setup # Run configuration wizard -maild test # Test email connectivity +ccb update # Update to latest version +ccb uninstall # Uninstall +ccb reinstall # Clean reinstall ``` --- -Neovim integration with multi-AI code review - -> Combine with editors like **Neovim** for seamless code editing and multi-model review workflow. Edit in your favorite editor while AI assistants review and suggest improvements in real-time. - ---- +## Base CCB Documentation -## 📋 Requirements +For core CCB usage, command reference, skills system, mail service, and platform-specific guides, see the [upstream CCB README](https://github.com/bfly123/claude_code_bridge#readme). 
-- **Python 3.10+** -- **Terminal:** [WezTerm](https://wezfurlong.org/wezterm/) (Highly Recommended) or tmux +Key topics covered there: +- `ccb` launch flags (`-r`, `-a`, `-h`, `-v`) +- `ccb.config` format +- Unified command system (`ask`, `ccb-ping`, `pend`) +- Skills (`/all-plan`, `/ask`, `/cping`, `/pend`) +- Mail system configuration +- Windows / WSL / macOS installation guides --- -## 🗑️ Uninstall +## Version -```bash -ccb uninstall -ccb reinstall +**1.0.0** — Independent version line, forked from upstream CCB v5.2.6. -# Fallback: -./install.sh uninstall -``` +See [CHANGELOG.md](CHANGELOG.md) for details. ---
-**Windows fully supported** (WSL + Native via WezTerm) - ---- - -**Join our community** - -📧 Email: bfly123@126.com -💬 WeChat: seemseam-com - -WeChat Group +**[Upstream CCB](https://github.com/bfly123/claude_code_bridge)** · **[Issues](https://github.com/daniellee2015/claude_code_bridge_multi/issues)**
- ---- - -
-Version History - -### v5.0.6 -- **Zombie Cleanup**: `ccb kill -f` cleans up orphaned tmux sessions globally -- **Mounted Skill**: Optimized with `pgrep`, extracted to `ccb-mounted` script -- **Droid Skills**: Full skill set added to `droid_skills/` - -### v5.0.5 -- **Droid**: Add delegation tools (`ccb_ask_*` and `cask/gask/lask/oask`) plus `ccb droid setup-delegation` for MCP install - -### v5.0.4 -- **OpenCode**: 修复 `-r` 恢复在多项目切换后失效的问题 - -### v5.0.3 -- **Daemons**: 全新的稳定守护进程设计 - -### v5.0.1 -- **Skills**: New `/all-plan` with Superpowers brainstorming + availability gating; Codex `lping/lpend` added; `gask` keeps brief summaries with `CCB_DONE`. -- **Status Bar**: Role label now reads role name from `.autoflow/roles.json` (supports `_meta.name`) and caches per path. -- **Installer**: Copy skill subdirectories (e.g., `references/`) for Claude/Codex installs. -- **CLI**: Added `ccb uninstall` / `ccb reinstall` with Claude config cleanup. -- **Routing**: Tighter project/session resolution (prefer `.ccb` anchor; avoid cross-project Claude session mismatches). 
- -### v5.0.0 -- **Claude Independence**: No need to start Claude first; Codex (or any agent) can be the primary CLI -- **Unified Control**: Single entry point controls Claude/OpenCode/Gemini equally -- **Simplified Launch**: Removed `ccb up`; default `ccb.config` is auto-created when missing -- **Flexible Mounting**: More flexible pane mounting and session binding -- **Daemon Autostart**: `caskd`/`laskd` auto-start in WezTerm/tmux when needed -- **Session Robustness**: PID liveness checks prevent stale sessions - -### v4.1.3 -- **Codex Config**: Automatically migrate deprecated `sandbox_mode = "full-auto"` to `"danger-full-access"` to fix Codex startup -- **Stability**: Fixed race conditions where fast-exiting commands could close panes before `remain-on-exit` was set -- **Tmux**: More robust pane detection (prefer stable `$TMUX_PANE` env var) and better fallback when split targets disappear - -### v4.1.2 -- **Performance**: Added caching for tmux status bar (git branch & ccb status) to reduce system load -- **Strict Tmux**: Explicitly require `tmux` for auto-launch; removed error-prone auto-attach logic -- **CLI**: Added `--print-version` flag for fast version checks - -### v4.1.1 -- **CLI Fix**: Improved flag preservation (e.g., `-a`) when relaunching `ccb` in tmux -- **UX**: Better error messages when running in non-interactive sessions -- **Install**: Force update skills to ensure latest versions are applied - -### v4.1.0 -- **Async Guardrail**: `cask/gask/oask` prints a post-submit guardrail reminder for Claude -- **Sync Mode**: add `--sync` to suppress guardrail prompts for Codex callers -- **Codex Skills**: update `oask/gask` skills to wait silently with `--sync` - -### v4.0.9 -- **Project_ID Simplification**: `ccb_project_id` uses current-directory `.ccb/` anchor (no ancestor traversal, no git dependency) -- **Codex Skills Stability**: Codex `oask/gask` skills default to waiting (`--timeout -1`) to avoid sending the next task too early - -### v4.0.8 -- 
**Daemon Log Binding Refresh**: `caskd` daemon now periodically refreshes `.codex-session` log paths by parsing `start_cmd` and scanning latest logs -- **Tmux Clipboard Enhancement**: Added `xsel` support and `update-environment` for better clipboard integration across GUI/remote sessions - -### v4.0.7 -- **Tmux Status Bar Redesign**: Dual-line status bar with modern dot indicators (●/○), git branch, and CCB version display -- **Session Freshness**: Always scan logs for latest session instead of using cached session file -- **Simplified Auto Mode**: `ccb -a` now purely uses `--dangerously-skip-permissions` - -### v4.0.6 -- **Session Overrides**: `cping/gping/oping/cpend/opend` support `--session-file` / `CCB_SESSION_FILE` to bypass wrong `cwd` - -### v4.0.5 -- **Gemini Reliability**: Retry reading Gemini session JSON to avoid transient partial-write failures -- **Claude Code Reliability**: `gpend` supports `--session-file` / `CCB_SESSION_FILE` to bypass wrong `cwd` - -### v4.0.4 -- **Fix**: Auto-repair duplicate `[projects.\"...\"]` entries in `~/.codex/config.toml` before starting Codex - -### v4.0.3 -- **Project Cleanliness**: Store session files under `.ccb/` (fallback to legacy root dotfiles) -- **Claude Code Reliability**: `cask/gask/oask` support `--session-file` / `CCB_SESSION_FILE` to bypass wrong `cwd` -- **Codex Config Safety**: Write auto-approval settings into a CCB-marked block to avoid config conflicts - -### v4.0.2 -- **Clipboard Paste**: Cross-platform support (xclip/wl-paste/pbpaste) in tmux config -- **Install UX**: Auto-reload tmux config after installation -- **Stability**: Default TMUX_ENTER_DELAY set to 0.5s for better reliability - -### v4.0.1 -- **Tokyo Night Theme**: Switch tmux status bar and pane borders to Tokyo Night color palette - -### v4.0 -- **Full Refactor**: Rebuilt from the ground up with a cleaner architecture -- **Perfect tmux Support**: First-class splits, pane labels, borders and statusline -- **Works in Any Terminal**: 
Recommended to run everything in tmux (except native Windows) - -### v3.0.0 -- **Smart Daemons**: `caskd`/`gaskd`/`oaskd` with 60s idle timeout & parallel queue support -- **Cross-AI Collaboration**: Support multiple agents (Claude/Codex) calling one agent (OpenCode) simultaneously -- **Interruption Detection**: Gemini now supports intelligent interruption handling -- **Chained Execution**: Codex can call `oask` to drive OpenCode -- **Stability**: Robust queue management and lock files - -### v2.3.9 -- Fix oask session tracking bug - follow new session when OpenCode creates one - -### v2.3.8 -- Plan mode enabled for autoflow projects regardless of `-a` flag - -### v2.3.7 -- Per-directory lock: different working directories can run cask/gask/oask independently - -### v2.3.6 -- Add non-blocking lock for cask/gask/oask to prevent concurrent requests -- Unify oask with cask/gask logic (use _wait_for_complete_reply) - -### v2.3.5 -- Fix plan mode conflict with auto mode (--dangerously-skip-permissions) -- Fix oask returning stale reply when OpenCode still processing - -### v2.3.4 -- Auto-enable plan mode when autoflow is installed - -### v2.3.3 -- Simplify cping.md to match oping/gping style (~65% token reduction) - -### v2.3.2 -- Optimize skill files: extract common patterns to docs/async-ask-pattern.md (~60% token reduction) - -### v2.3.1 -- Fix race condition in gask/cask: pre-check for existing messages before wait loop - -
diff --git a/bin/ask b/bin/ask index 21df402..228f76c 100755 --- a/bin/ask +++ b/bin/ask @@ -22,11 +22,9 @@ Examples: from __future__ import annotations import os -import shutil import subprocess import sys import tempfile -import time from datetime import datetime from pathlib import Path @@ -100,6 +98,64 @@ def _use_unified_daemon() -> bool: return True # Default to unified daemon +def _maybe_start_unified_daemon() -> bool: + """Try to start unified askd daemon if not running.""" + import shutil + import time + import sys + from askd_runtime import state_file_path + from askd.daemon import ping_daemon + + # Check if already running + state_file = state_file_path("askd.json") + if ping_daemon(timeout_s=0.5, state_file=state_file): + return True + + # Find askd binary + candidates: list[str] = [] + local = (Path(__file__).resolve().parent / "askd") + if local.exists(): + candidates.append(str(local)) + found = shutil.which("askd") + if found: + candidates.append(found) + if not candidates: + return False + + # Prepare command with cross-platform handling + entry = candidates[0] + lower = entry.lower() + if lower.endswith((".cmd", ".bat", ".exe")): + argv = [entry] + else: + argv = [sys.executable, entry] + + # Start daemon in background with platform-specific flags + try: + kwargs = { + "stdin": subprocess.DEVNULL, + "stdout": subprocess.DEVNULL, + "stderr": subprocess.DEVNULL, + "close_fds": True, + } + if os.name == "nt": + kwargs["creationflags"] = getattr(subprocess, "DETACHED_PROCESS", 0) | getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0) + else: + kwargs["start_new_session"] = True + subprocess.Popen(argv, **kwargs) + except Exception: + return False + + # Wait for daemon to be ready + deadline = time.time() + 2.0 + while time.time() < deadline: + if ping_daemon(timeout_s=0.2, state_file=state_file): + return True + time.sleep(0.1) + + return False + + def _send_via_unified_daemon( provider: str, message: str, @@ -107,27 +163,25 @@ def 
_send_via_unified_daemon(
     no_wrap: bool,
     caller: str,
 ) -> int:
-    """Send request via unified askd daemon."""
+    """Send request via unified askd daemon with auto-start retry."""
     import json
     import socket
     from askd_runtime import state_file_path
     import askd_rpc
 
-    ready_timeout = min(timeout, 2.0) if timeout and timeout > 0 else 2.0
-    if not _ensure_unified_daemon_ready(timeout_s=ready_timeout):
-        print("[ERROR] Unified askd daemon not running", file=sys.stderr)
-        print("Start it with `askd` (or enable autostart via CCB_ASKD_AUTOSTART=1).", file=sys.stderr)
-        return EXIT_ERROR
-
     # Use CCB_RUN_DIR (set by CCB startup) to locate the state file.
     # This already contains the correct project-specific path.
     state_file = state_file_path("askd.json")
 
     state = askd_rpc.read_state(state_file)
     if not state:
-        print("[ERROR] Unified askd daemon not running", file=sys.stderr)
-        return EXIT_ERROR
+        # Try to start daemon and retry once; re-check unconditionally so a
+        # failed autostart cannot fall through to state.get() with state=None
+        if _maybe_start_unified_daemon():
+            state = askd_rpc.read_state(state_file)
+        if not state:
+            print("[ERROR] Unified askd daemon not running", file=sys.stderr)
+            return EXIT_ERROR
 
     host = state.get("connect_host") or state.get("host") or "127.0.0.1"
     port = int(state.get("port") or 0)
@@ -164,31 +218,74 @@ def _send_via_unified_daemon(
     req["email_msg_id"] = os.environ.get("CCB_EMAIL_MSG_ID", "")
     req["email_from"] = os.environ.get("CCB_EMAIL_FROM", "")
 
-    try:
-        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        sock.settimeout(timeout + 10 if timeout > 0 else 3610)
-        sock.connect((host, port))
-        sock.sendall((json.dumps(req) + "\n").encode("utf-8"))
-
-        data = b""
-        while True:
-            chunk = sock.recv(4096)
-            if not chunk:
-                break
-            data += chunk
-            if b"\n" in data:
-                break
-
-        sock.close()
-        resp = json.loads(data.decode("utf-8").strip())
-        exit_code = int(resp.get("exit_code") or 0)
-        reply = resp.get("reply") or ""
-        if reply:
-            print(reply)
-        return exit_code
-    except Exception as e:
-        print(f"[ERROR] {e}", file=sys.stderr)
-        return EXIT_ERROR
+    
# Try to send request, with one retry on connection failure only + request_sent = False + for attempt in range(2): + sock = None + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(timeout + 10 if timeout > 0 else 3610) + sock.connect((host, port)) + + # Mark that connection succeeded - no retry after this point + request_sent = True + + sock.sendall((json.dumps(req) + "\n").encode("utf-8")) + + data = b"" + while True: + chunk = sock.recv(4096) + if not chunk: + break + data += chunk + if b"\n" in data: + break + + resp = json.loads(data.decode("utf-8").strip()) + exit_code = int(resp.get("exit_code") or 0) + reply = resp.get("reply") or "" + if reply: + print(reply) + return exit_code + except (ConnectionRefusedError, ConnectionResetError) as e: + # Only retry if connection failed before request was sent + if attempt == 0 and not request_sent: + if _maybe_start_unified_daemon(): + # Re-read state for new daemon + state = askd_rpc.read_state(state_file) + if state: + host = state.get("connect_host") or state.get("host") or "127.0.0.1" + port = int(state.get("port") or 0) + token = state.get("token") or "" + req["token"] = token + continue # Retry with new connection info + print(f"[ERROR] {e}", file=sys.stderr) + return EXIT_ERROR + except OSError as e: + # For other OS errors, only retry if connection not yet established + if attempt == 0 and not request_sent: + if _maybe_start_unified_daemon(): + state = askd_rpc.read_state(state_file) + if state: + host = state.get("connect_host") or state.get("host") or "127.0.0.1" + port = int(state.get("port") or 0) + token = state.get("token") or "" + req["token"] = token + continue + print(f"[ERROR] {e}", file=sys.stderr) + return EXIT_ERROR + except Exception as e: + print(f"[ERROR] {e}", file=sys.stderr) + return EXIT_ERROR + finally: + # Always close socket to prevent leaks + if sock: + try: + sock.close() + except Exception: + pass + + return EXIT_ERROR def _env_bool(name: str, default: 
bool = False) -> bool: @@ -198,96 +295,6 @@ def _env_bool(name: str, default: bool = False) -> bool: return val not in ("0", "false", "no", "off") -def _is_pid_alive(pid: int) -> bool: - if pid <= 0: - return False - try: - os.kill(pid, 0) - return True - except OSError: - return False - except Exception: - return True - - -def _askd_start_argv() -> list[str] | None: - local = script_dir / "askd" - candidates: list[str] = [] - if local.exists(): - candidates.append(str(local)) - found = shutil.which("askd") - if found: - candidates.append(found) - if not candidates: - return None - - entry = candidates[0] - lower = entry.lower() - if lower.endswith((".cmd", ".bat", ".exe")): - return [entry] - return [sys.executable, entry] - - -def _ensure_unified_daemon_ready(timeout_s: float = 2.0) -> bool: - if not _use_unified_daemon(): - return True - - from askd_runtime import state_file_path - import askd_rpc - - state_file = state_file_path("askd.json") - try: - if askd_rpc.ping_daemon("ask", 0.2, state_file): - return True - except Exception: - pass - - if not _env_bool("CCB_ASKD_AUTOSTART", True): - return False - - argv = _askd_start_argv() - if not argv: - return False - - env = os.environ.copy() - parent_raw = (env.get("CCB_PARENT_PID") or "").strip() - if parent_raw: - try: - parent_pid = int(parent_raw) - except Exception: - parent_pid = 0 - if parent_pid <= 0 or not _is_pid_alive(parent_pid): - env.pop("CCB_PARENT_PID", None) - env.pop("CCB_MANAGED", None) - - kwargs = { - "stdin": subprocess.DEVNULL, - "stdout": subprocess.DEVNULL, - "stderr": subprocess.DEVNULL, - "close_fds": True, - "env": env, - } - if os.name == "nt": - kwargs["creationflags"] = getattr(subprocess, "DETACHED_PROCESS", 0) | getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0) - else: - kwargs["start_new_session"] = True - - try: - subprocess.Popen(argv, **kwargs) - except Exception: - return False - - deadline = time.time() + max(0.2, float(timeout_s)) - while time.time() < deadline: - try: - if 
askd_rpc.ping_daemon("ask", 0.2, state_file): - return True - except Exception: - pass - time.sleep(0.1) - return False - - def _default_foreground() -> bool: # Allow explicit override if _env_bool("CCB_ASK_BACKGROUND", False): @@ -394,10 +401,15 @@ def main(argv: list[str]) -> int: return EXIT_ERROR # Notify mode: sync send, no wait for reply (used for hook notifications) - # MUST be checked before unified daemon path to avoid full request-response cycle - # which would cause reply-to-self loops via notify_completion -> ask -> daemon -> notify_completion if notify_mode: _require_caller() + if _use_unified_daemon(): + # TODO: Add fire-and-forget RPC mode for unified daemon + # For now, disable unified mode for notify and use legacy path + print("[WARN] Notify mode not yet supported with unified daemon, using legacy", file=sys.stderr) + # Fall through to legacy path below + + # Legacy daemon path for notify mode cmd = [daemon_cmd, "--sync"] if no_wrap: cmd.append("--no-wrap") @@ -416,33 +428,25 @@ def main(argv: list[str]) -> int: print(f"[ERROR] {e}", file=sys.stderr) return EXIT_ERROR - # Use unified daemon if enabled (default: True) - if _use_unified_daemon(): - caller = _require_caller() - return _send_via_unified_daemon(provider, message, timeout, no_wrap, caller) - - # Foreground mode: run provider directly (avoid background cleanup in managed envs) + # Foreground mode: run provider directly via unified daemon if foreground_mode: - cmd = [daemon_cmd, "--sync", "--timeout", str(timeout)] - if no_wrap and provider == "claude": - cmd.append("--no-wrap") - env = os.environ.copy() - env["CCB_CALLER"] = _require_caller() - try: - result = subprocess.run(cmd, input=message, text=True, env=env) - return result.returncode - except Exception as e: - print(f"[ERROR] {e}", file=sys.stderr) - return EXIT_ERROR - - # Default async mode: background task via nohup, using unified askd daemon - if _use_unified_daemon(): - ready_timeout = min(timeout, 2.0) if timeout and timeout 
> 0 else 2.0 - if not _ensure_unified_daemon_ready(timeout_s=ready_timeout): - print("[ERROR] Unified askd daemon not running", file=sys.stderr) - print("Start it with `askd` (or enable autostart via CCB_ASKD_AUTOSTART=1).", file=sys.stderr) - return EXIT_ERROR + if _use_unified_daemon(): + caller = _require_caller() + return _send_via_unified_daemon(provider, message, timeout, no_wrap, caller) + else: + cmd = [daemon_cmd, "--sync", "--timeout", str(timeout)] + if no_wrap and provider == "claude": + cmd.append("--no-wrap") + env = os.environ.copy() + env["CCB_CALLER"] = _require_caller() + try: + result = subprocess.run(cmd, input=message, text=True, env=env) + return result.returncode + except Exception as e: + print(f"[ERROR] {e}", file=sys.stderr) + return EXIT_ERROR + # Default async mode: background task via nohup task_id = make_task_id() log_dir = Path(tempfile.gettempdir()) / "ccb-tasks" log_dir.mkdir(parents=True, exist_ok=True) diff --git a/bin/ccb-cleanup b/bin/ccb-cleanup index 74a1e1b..93026f3 100755 --- a/bin/ccb-cleanup +++ b/bin/ccb-cleanup @@ -65,7 +65,28 @@ def cleanup_stale_locks(): return removed -def list_running_daemons(): +def get_tmux_pane_for_workdir(work_dir: str) -> str: + """Find tmux pane ID for a given work directory.""" + try: + import subprocess + result = subprocess.run( + ["tmux", "list-panes", "-a", "-F", "#{pane_id}\t#{pane_current_path}"], + capture_output=True, + text=True, + timeout=2.0 + ) + if result.returncode == 0: + for line in result.stdout.strip().split('\n'): + if '\t' in line: + pane_id, pane_path = line.split('\t', 1) + if pane_path == work_dir: + return pane_id + except Exception: + pass + return "unknown" + + +def list_running_daemons(verbose=False): """List all running askd daemons.""" cache_dir = Path.home() / ".cache" / "ccb" / "projects" if not cache_dir.exists(): @@ -82,32 +103,110 @@ def list_running_daemons(): if pid > 0 and is_pid_alive(pid): parent_alive = is_pid_alive(parent_pid) if parent_pid > 0 else 
False project_hash = state_file.parent.name - daemons.append({ + daemon_info = { "pid": pid, "parent_pid": parent_pid, "parent_alive": parent_alive, "project_hash": project_hash, "started_at": data.get("started_at", "unknown"), - }) + } + + if verbose: + daemon_info.update({ + "work_dir": data.get("work_dir", "unknown"), + "port": data.get("port", "unknown"), + "host": data.get("host", "unknown"), + "managed": data.get("managed", False), + "tmux_pane": get_tmux_pane_for_workdir(data.get("work_dir", "")), + }) + + daemons.append(daemon_info) except Exception: pass return daemons +def kill_daemon_by_pid(pid: int) -> bool: + """Kill a specific daemon by PID.""" + if not is_pid_alive(pid): + print(f"PID {pid} is not running", file=sys.stderr) + return False + + try: + os.kill(pid, 15) # SIGTERM + print(f"✅ Killed daemon PID {pid}") + return True + except Exception as e: + print(f"❌ Failed to kill PID {pid}: {e}", file=sys.stderr) + return False + + +def interactive_kill(): + """Interactive mode to select and kill daemons.""" + daemons = list_running_daemons(verbose=True) + if not daemons: + print("No running daemons found") + return + + print("=== Running askd daemons ===\n") + for idx, d in enumerate(daemons, 1): + status = "ZOMBIE (parent dead)" if not d["parent_alive"] else "OK" + tmux_pane = d.get('tmux_pane', 'unknown') + print(f"{idx}. PID {d['pid']} (parent {d['parent_pid']}) - {status}") + print(f" Tmux Pane: {tmux_pane}") + print(f" Project: {d['project_hash']}") + print(f" Work Dir: {d.get('work_dir', 'unknown')}") + print(f" Started: {d['started_at']}") + print() + + try: + choice = input("Enter daemon number to kill (or 'q' to quit): ").strip() + if choice.lower() == 'q': + print("Cancelled") + return + + idx = int(choice) + if 1 <= idx <= len(daemons): + daemon = daemons[idx - 1] + confirm = input(f"Kill PID {daemon['pid']}? 
(y/N): ").strip().lower() + if confirm == 'y': + kill_daemon_by_pid(daemon['pid']) + else: + print("Cancelled") + else: + print(f"Invalid choice: {idx}", file=sys.stderr) + except (ValueError, KeyboardInterrupt): + print("\nCancelled") + + def main(): import argparse parser = argparse.ArgumentParser(description="Clean up CCB zombie daemons and stale files") parser.add_argument("--list", action="store_true", help="List running daemons") + parser.add_argument("-v", "--verbose", action="store_true", help="Show detailed daemon info") parser.add_argument("--clean", action="store_true", help="Clean stale files") parser.add_argument("--kill-zombies", action="store_true", help="Kill zombie daemons (parent dead)") + parser.add_argument("--kill-pid", type=int, metavar="PID", help="Kill specific daemon by PID") + parser.add_argument("-i", "--interactive", action="store_true", help="Interactive mode to select daemon to kill") args = parser.parse_args() + # Interactive mode + if args.interactive: + interactive_kill() + return + + # Kill specific PID + if args.kill_pid: + success = kill_daemon_by_pid(args.kill_pid) + sys.exit(0 if success else 1) + + # List daemons if args.list or not (args.clean or args.kill_zombies): print("=== Running askd daemons ===") - daemons = list_running_daemons() + daemons = list_running_daemons(verbose=args.verbose) if not daemons: print("No running daemons found") else: @@ -116,6 +215,13 @@ def main(): print(f" PID {d['pid']} (parent {d['parent_pid']}) - {status}") print(f" Project: {d['project_hash']}") print(f" Started: {d['started_at']}") + if args.verbose: + print(f" Work Dir: {d.get('work_dir', 'unknown')}") + print(f" Port: {d.get('port', 'unknown')}") + print(f" Host: {d.get('host', 'unknown')}") + print(f" Managed: {d.get('managed', False)}") + tmux_pane = d.get('tmux_pane', 'unknown') + print(f" Tmux Pane: {tmux_pane}") if args.clean: print("\n=== Cleaning stale files ===") diff --git a/ccb b/ccb index 40d01dc..60bc0b5 100755 --- a/ccb +++ 
b/ccb @@ -21,6 +21,7 @@ import re import shutil import posixpath import shlex +import threading from pathlib import Path script_dir = Path(__file__).resolve().parent @@ -51,9 +52,9 @@ backend_env = get_backend_env() if backend_env and not os.environ.get("CCB_BACKEND_ENV"): os.environ["CCB_BACKEND_ENV"] = backend_env -VERSION = "5.2.6" -GIT_COMMIT = "v5.2.6" -GIT_DATE = "2026-02-24" +VERSION = "5.2.4" +GIT_COMMIT = "c539e79" +GIT_DATE = "2026-02-25" _WIN_DRIVE_RE = re.compile(r"^[A-Za-z]:([/\\\\]|$)") _MNT_DRIVE_RE = re.compile(r"^/mnt/([A-Za-z])/(.*)$") @@ -384,7 +385,14 @@ def _is_pid_alive(pid: int) -> bool: try: os.kill(int(pid), 0) return True + except ProcessLookupError: + # Process doesn't exist + return False + except PermissionError: + # Process exists but no permission to check + return True except Exception: + # Other errors - assume dead for safety return False @@ -570,6 +578,10 @@ class AILauncher: self.runtime_dir.mkdir(parents=True, exist_ok=True) self._cleaned = False self._askd_checked = False + self._watchdog_thread = None + self._watchdog_stop_event = None + self._daemon_proc = None # Track daemon Popen object for reaping + self._daemon_proc_lock = threading.Lock() # Protect daemon_proc access self.terminal_type = self._detect_terminal_type() self.tmux_sessions = {} self.tmux_panes = {} @@ -737,6 +749,24 @@ class AILauncher: def _maybe_start_caskd(self) -> None: self._maybe_start_provider_daemon("codex") + def _maybe_start_unified_askd(self) -> None: + """Start unified askd daemon (provider-agnostic).""" + # Try to start for any enabled provider that uses askd (including claude) + for provider in ["codex", "gemini", "opencode", "droid", "claude"]: + if provider in [p.lower() for p in self.providers]: + # Try to start and check if successful + self._maybe_start_provider_daemon(provider) + # Verify daemon actually started by pinging + try: + from askd_runtime import state_file_path + from askd.daemon import ping_daemon + state_file = 
state_file_path("askd.json") + if ping_daemon(timeout_s=0.5, state_file=state_file): + return # Successfully started + except Exception: + pass + # If not successful, continue to next provider + def _maybe_start_provider_daemon(self, provider: str) -> None: def _bool_from_env(name: str): raw = os.environ.get(name) @@ -804,12 +834,45 @@ class AILauncher: if spec.daemon_bin_name == "askd" and not self._askd_checked: self._askd_checked = True if not _owned_by_ccb(st): - print("⚠️ askd already running but not managed by this CCB; restarting to bind lifecycle.") - if callable(shutdown_daemon_fn): + # Check if forced rebind is enabled (case-insensitive) + force_rebind = _env_bool("CCB_FORCE_REBIND", False) + + # Check if foreign parent is still alive before forcing rebind + # Safely normalize parent_pid to int (handle None, non-int, etc.) + try: + foreign_parent_pid = int((st or {}).get("parent_pid") or 0) + except Exception: + foreign_parent_pid = 0 + + foreign_parent_alive = False + if not force_rebind and foreign_parent_pid > 0: try: - shutdown_daemon_fn(timeout_s=1.0, state_file=state_file) + foreign_parent_alive = _is_pid_alive(foreign_parent_pid) except Exception: - pass + foreign_parent_alive = False + + if force_rebind or not foreign_parent_alive: + # Safe to rebind: either forced or foreign parent is dead/stale + if force_rebind: + print(f"⚠️ CCB_FORCE_REBIND=1 set, forcing askd rebind despite live parent (PID {foreign_parent_pid})...") + else: + print(f"⚠️ askd owned by dead parent (PID {foreign_parent_pid}), restarting to bind lifecycle...") + if callable(shutdown_daemon_fn): + try: + shutdown_daemon_fn(timeout_s=1.0, state_file=state_file) + except Exception: + pass + else: + # Foreign parent is still alive, don't force rebind + print(f"⚠️ askd owned by live parent (PID {foreign_parent_pid}), skipping rebind to avoid disruption") + print(f" Set CCB_FORCE_REBIND=1 to override this safety check") + host = st.get("host") if isinstance(st, dict) else None + port 
= st.get("port") if isinstance(st, dict) else None + if host and port: + print(f"✅ {spec.daemon_bin_name} already running at {host}:{port}") + else: + print(f"✅ {spec.daemon_bin_name} already running") + return deadline = time.time() + 2.0 while time.time() < deadline: if not ping_daemon(timeout_s=0.2, state_file=state_file): @@ -860,7 +923,11 @@ class AILauncher: try: env = os.environ.copy() env["CCB_PARENT_PID"] = str(os.getpid()) - subprocess.Popen([sys.executable, str(daemon_script)], env=env, **kwargs) + proc = subprocess.Popen([sys.executable, str(daemon_script)], env=env, **kwargs) + # Track daemon process for reaping (thread-safe) + if spec.daemon_bin_name == "askd": + with self._daemon_proc_lock: + self._daemon_proc = proc except Exception as exc: print(f"⚠️ Failed to start {spec.daemon_bin_name}: {exc}") return @@ -879,6 +946,144 @@ class AILauncher: time.sleep(0.1) print(f"⚠️ {spec.daemon_bin_name} start requested, but daemon not reachable yet") + def _start_daemon_watchdog(self) -> None: + """Start watchdog thread to monitor askd daemon health.""" + import threading + + # Restart if thread exists but is dead + if self._watchdog_thread is not None: + if not self._watchdog_thread.is_alive(): + self._watchdog_thread = None + self._watchdog_stop_event = None + else: + return # Already running + + self._watchdog_stop_event = threading.Event() + self._watchdog_thread = threading.Thread( + target=self._daemon_watchdog_loop, + daemon=True, + name="askd-watchdog" + ) + self._watchdog_thread.start() + + def _daemon_watchdog_loop(self) -> None: + """Watchdog loop to monitor and restart askd daemon if needed.""" + from askd_runtime import state_file_path + from askd.daemon import ping_daemon, read_state + + # Validate and clamp check interval + try: + check_interval = float(os.environ.get("CCB_WATCHDOG_INTERVAL_S", "10")) + check_interval = max(1.0, min(check_interval, 300.0)) # Clamp to 1-300 seconds + except (ValueError, TypeError): + check_interval = 10.0 + + 
consecutive_failures = 0 + max_failures = 3 + ownership_mismatch_count = 0 + max_ownership_mismatches = 3 + + while not self._watchdog_stop_event.wait(check_interval): + try: + # Reap zombie child processes + self._reap_zombie_children() + + # Check if tracked daemon process has exited + with self._daemon_proc_lock: + if self._daemon_proc is not None: + exit_code = self._daemon_proc.poll() + if exit_code is not None: + # Daemon process has exited + print(f"⚠️ askd daemon process exited with code {exit_code}", file=sys.stderr) + self._daemon_proc = None + # Will be restarted by health check below + + # Check askd daemon health + state_file = state_file_path("askd.json") + if not ping_daemon(timeout_s=0.5, state_file=state_file): + consecutive_failures += 1 + if consecutive_failures >= max_failures: + # Daemon is unhealthy, try to restart + print(f"⚠️ askd daemon unhealthy (failed {consecutive_failures} checks), attempting restart...", file=sys.stderr) + self._maybe_start_unified_askd() + consecutive_failures = 0 + else: + # Daemon is healthy, verify ownership + state = read_state(state_file=state_file) + if isinstance(state, dict): + parent_pid = int(state.get("parent_pid") or 0) + managed = bool(state.get("managed")) + if managed and parent_pid != self.ccb_pid: + ownership_mismatch_count += 1 + print(f"⚠️ askd daemon ownership mismatch (parent_pid={parent_pid}, expected={self.ccb_pid}, count={ownership_mismatch_count})", file=sys.stderr) + if ownership_mismatch_count >= max_ownership_mismatches: + # Check if forced rebind is enabled (case-insensitive) + force_rebind = _env_bool("CCB_FORCE_REBIND", False) + + # Check if foreign parent is still alive before forcing rebind + foreign_parent_alive = False + if not force_rebind: + # Use existing _is_pid_alive for consistent cross-platform check + try: + foreign_parent_alive = _is_pid_alive(parent_pid) + except Exception: + foreign_parent_alive = False + + if force_rebind or not foreign_parent_alive: + # Safe to rebind: 
either forced or foreign parent is dead/stale + if force_rebind: + print(f"⚠️ CCB_FORCE_REBIND=1 set, forcing rebind despite live parent (PID {parent_pid})...", file=sys.stderr) + else: + print(f"⚠️ Foreign parent (PID {parent_pid}) is dead, attempting to rebind daemon...", file=sys.stderr) + try: + from askd_rpc import shutdown_daemon + shutdown_daemon("ask", timeout_s=2.0, state_file=state_file) + time.sleep(1.0) # Wait for shutdown + except Exception: + pass + self._maybe_start_unified_askd() + ownership_mismatch_count = 0 + else: + # Foreign parent is still alive, don't force rebind + print(f"⚠️ Foreign parent (PID {parent_pid}) is still alive, skipping forced rebind to avoid disruption", file=sys.stderr) + print(f" Set CCB_FORCE_REBIND=1 to override this safety check", file=sys.stderr) + ownership_mismatch_count = 0 # Reset to avoid repeated warnings + else: + ownership_mismatch_count = 0 # Reset on correct ownership + consecutive_failures = 0 + except Exception as e: + # Silently continue on watchdog errors + pass + + def _reap_zombie_children(self) -> None: + """Reap tracked daemon process if it has exited.""" + # Only reap the tracked daemon process, not all children + with self._daemon_proc_lock: + if self._daemon_proc is not None: + try: + exit_code = self._daemon_proc.poll() + if exit_code is not None: + # Process has exited, reap it by calling wait() + try: + self._daemon_proc.wait(timeout=0.1) + except Exception: + pass + except Exception: + pass + + def _stop_daemon_watchdog(self) -> None: + """Stop watchdog thread.""" + if self._watchdog_stop_event: + self._watchdog_stop_event.set() + if self._watchdog_thread: + self._watchdog_thread.join(timeout=1.0) + # Only clear if thread actually stopped + if not self._watchdog_thread.is_alive(): + self._watchdog_thread = None + else: + # Thread still running, log warning + pass # Silently continue, daemon thread will exit with process + def _detect_terminal_type(self): # Forced by environment variable forced = 
(os.environ.get("CCB_TERMINAL") or os.environ.get("CODEX_TERMINAL") or "").strip().lower() @@ -1130,34 +1335,6 @@ class AILauncher: def _claude_session_file(self) -> Path: return self._project_session_file(".claude-session") - def _backfill_existing_claude_session_work_dir_fields(self) -> None: - """Backfill work_dir/work_dir_norm for an existing .claude-session on startup.""" - path = self._claude_session_file() - if not path.exists(): - return - - try: - raw = path.read_text(encoding="utf-8-sig") - data = json.loads(raw) - except Exception: - return - if not isinstance(data, dict): - return - - changed = False - if not isinstance(data.get("work_dir"), str) or not str(data.get("work_dir") or "").strip(): - data["work_dir"] = str(self.project_root) - changed = True - if not isinstance(data.get("work_dir_norm"), str) or not str(data.get("work_dir_norm") or "").strip(): - data["work_dir_norm"] = _normalize_path_for_match(str(self.project_root)) - changed = True - - if not changed: - return - - payload = json.dumps(data, ensure_ascii=False, indent=2) - safe_write_session(path, payload) - def _read_local_claude_session_id(self) -> str | None: data = self._read_json_file(self._claude_session_file()) sid = data.get("claude_session_id") @@ -2953,6 +3130,10 @@ class AILauncher: if self._cleaned: return self._cleaned = True + + # Stop watchdog thread first + self._stop_daemon_watchdog() + if not quiet: print(f"\n🧹 {t('cleaning_up')}") @@ -3011,10 +3192,47 @@ class AILauncher: except Exception: pass - # Ensure unified askd daemon exits when CCB exits. + # Ensure unified askd daemon exits when CCB exits (with ownership safety). 
try: askd_state = state_file_path("askd.json") - shutdown_daemon("ask", 1.0, askd_state) + + # Read askd state to check ownership + askd_st = {} + if askd_state and askd_state.exists(): + try: + with open(askd_state, "r", encoding="utf-8") as f: + askd_st = json.load(f) + except Exception: + pass + + # Check if we own this daemon or if forced shutdown is enabled + force_rebind = _env_bool("CCB_FORCE_REBIND", False) + owned_by_us = False + try: + parent_pid = int((askd_st or {}).get("parent_pid") or 0) + owned_by_us = (parent_pid == self.ccb_pid) + except Exception: + pass + + # Only shutdown if we own it, or force flag is set, or owner is dead + should_shutdown = False + if force_rebind: + should_shutdown = True + elif owned_by_us: + should_shutdown = True + else: + # Check if foreign owner is still alive + try: + parent_pid = int((askd_st or {}).get("parent_pid") or 0) + if parent_pid > 0: + foreign_alive = _is_pid_alive(parent_pid) + if not foreign_alive: + should_shutdown = True # Owner is dead, safe to cleanup + except Exception: + pass + + if should_shutdown: + shutdown_daemon("ask", 1.0, askd_state) except Exception: pass @@ -3057,11 +3275,6 @@ class AILauncher: if not self._require_project_config_dir(): return 2 - try: - self._backfill_existing_claude_session_work_dir_fields() - except Exception: - pass - if not self.providers: print("❌ No providers configured. 
Define providers in ccb.config or pass them on the command line.", file=sys.stderr) return 2 @@ -3186,6 +3399,9 @@ class AILauncher: if "codex" in self.providers and self.anchor_provider != "codex": self._maybe_start_caskd() + # Start watchdog thread to monitor daemon health (provider-agnostic) + self._start_daemon_watchdog() + try: try: self._sync_cend_registry() diff --git a/ccb-multi b/ccb-multi new file mode 160000 index 0000000..f26c0ba --- /dev/null +++ b/ccb-multi @@ -0,0 +1 @@ +Subproject commit f26c0ba0b25ad5381bb0f9bbc7d3b001a4d27d18 diff --git a/ccb-shared-context b/ccb-shared-context new file mode 160000 index 0000000..bcaf4c8 --- /dev/null +++ b/ccb-shared-context @@ -0,0 +1 @@ +Subproject commit bcaf4c837a83835f2a0981c1c2b61f56946bb0fd diff --git a/ccb-status b/ccb-status new file mode 160000 index 0000000..e38dee7 --- /dev/null +++ b/ccb-status @@ -0,0 +1 @@ +Subproject commit e38dee775585a48feb40971326d2ca86462df34c diff --git a/ccb-worktree b/ccb-worktree new file mode 160000 index 0000000..95ebe9e --- /dev/null +++ b/ccb-worktree @@ -0,0 +1 @@ +Subproject commit 95ebe9e3b8809774b3cc83672a670f24db688058 diff --git a/install.sh b/install.sh index 56bc0cf..31fe8bf 100755 --- a/install.sh +++ b/install.sh @@ -123,6 +123,10 @@ SCRIPTS_TO_LINK=( bin/maild bin/ctx-transfer ccb + ccb-multi/bin/ccb-multi + ccb-multi/bin/ccb-multi-clean + ccb-multi/bin/ccb-multi-history + ccb-multi/bin/ccb-multi-status ) CLAUDE_MARKDOWN=( @@ -598,6 +602,61 @@ copy_project() { fi } +install_ccb_multi_deps() { + local multi_dir="$INSTALL_PREFIX/ccb-multi" + + if [[ ! -d "$multi_dir" ]]; then + echo "WARN: ccb-multi directory not found, skipping npm install" + return + fi + + # Check if npm is available + if ! command -v npm >/dev/null 2>&1; then + echo "WARN: npm not found, skipping ccb-multi dependencies installation" + echo " ccb-multi requires Node.js and npm to be installed" + return + fi + + echo "Installing ccb-multi dependencies..." 
+ if (cd "$multi_dir" && npm install --production --silent >/dev/null 2>&1); then + echo "OK: ccb-multi dependencies installed" + # Build TypeScript if needed + if [[ -f "$multi_dir/tsconfig.json" ]]; then + echo "Building ccb-multi..." + if (cd "$multi_dir" && npm run build >/dev/null 2>&1); then + echo "OK: ccb-multi built successfully" + else + echo "WARN: Failed to build ccb-multi" + fi + fi + else + echo "WARN: Failed to install ccb-multi dependencies" + echo " You can manually run: cd $multi_dir && npm install" + fi + + # Install and build other subpackages + for subpkg in ccb-status ccb-worktree ccb-shared-context; do + local pkg_dir="$INSTALL_PREFIX/$subpkg" + if [[ -d "$pkg_dir" ]]; then + echo "Installing $subpkg dependencies..." + if (cd "$pkg_dir" && npm install --production --silent >/dev/null 2>&1); then + echo "OK: $subpkg dependencies installed" + # Build TypeScript if needed + if [[ -f "$pkg_dir/tsconfig.json" ]]; then + echo "Building $subpkg..." + if (cd "$pkg_dir" && npm run build >/dev/null 2>&1); then + echo "OK: $subpkg built successfully" + else + echo "WARN: Failed to build $subpkg" + fi + fi + else + echo "WARN: Failed to install $subpkg dependencies" + fi + fi + done +} + install_bin_links() { mkdir -p "$BIN_DIR" @@ -1509,6 +1568,7 @@ install_all() { cleanup_legacy_files save_wezterm_config copy_project + install_ccb_multi_deps install_bin_links ensure_path_configured install_claude_commands @@ -1529,6 +1589,7 @@ install_all() { echo " AGENTS.md configured with review rubrics" echo " .clinerules configured with role assignments" echo " Global settings.json permissions added" + echo " ccb-multi tools installed" } uninstall_claude_md_config() { diff --git a/lib/askd_server.py b/lib/askd_server.py index 932a759..e64e996 100644 --- a/lib/askd_server.py +++ b/lib/askd_server.py @@ -86,6 +86,10 @@ def __init__( self.managed = env_managed if managed is None else bool(managed) if self.parent_pid: self.managed = True + self._heartbeat_thread = 
None + self._heartbeat_stop_event = None + self._started_at = None + self._state_write_lock = threading.Lock() # Serialize persistent state writes def serve_forever(self) -> int: run_dir().mkdir(parents=True, exist_ok=True) @@ -236,14 +240,32 @@ def _parent_monitor() -> None: actual_host, actual_port = httpd.server_address self._write_state(str(actual_host), int(actual_port)) + self._started_at = time.strftime("%Y-%m-%d %H:%M:%S") + self._write_persistent_state("running") + self._start_heartbeat_thread() write_log( log_path(self.spec.log_file_name), f"[INFO] {self.spec.daemon_key} started pid={os.getpid()} addr={actual_host}:{actual_port}", ) + + crashed = False + crash_reason = "" try: httpd.serve_forever(poll_interval=0.2) + except Exception as e: + # Unexpected crash during serve + crashed = True + crash_reason = f"Exception: {e}" + write_log(log_path(self.spec.log_file_name), f"[ERROR] {self.spec.daemon_key} crashed: {e}") + self._stop_heartbeat_thread() + self._write_persistent_state("crashed", crash_reason) + raise finally: write_log(log_path(self.spec.log_file_name), f"[INFO] {self.spec.daemon_key} stopped") + self._stop_heartbeat_thread() + # Only write stopped if not crashed + if not crashed: + self._write_persistent_state("stopped", "graceful shutdown") if self.on_stop: try: self.on_stop() @@ -277,3 +299,71 @@ def _write_state(self, host: str, port: int) -> None: os.chmod(self.state_file, 0o600) except Exception: pass + + def _write_persistent_state(self, status: str, exit_reason: str = "", exit_code: int = 0) -> None: + """Write persistent state to askd.last.json for debugging and observability.""" + with self._state_write_lock: # Serialize writes + last_state_file = self.state_file.parent / f"{self.state_file.stem}.last.json" + payload = { + "status": status, # running, stopping, stopped, crashed + "pid": os.getpid(), + "started_at": self._started_at, + "heartbeat_at": time.strftime("%Y-%m-%d %H:%M:%S"), + "work_dir": self.work_dir, + "parent_pid": 
int(self.parent_pid or 0) or None, + "managed": bool(self.managed), + } + if status in ("stopped", "crashed"): + payload["stopped_at"] = time.strftime("%Y-%m-%d %H:%M:%S") + if exit_reason: + payload["exit_reason"] = exit_reason + if exit_code: + payload["exit_code"] = exit_code + try: + last_state_file.parent.mkdir(parents=True, exist_ok=True) + safe_write_session(last_state_file, json.dumps(payload, ensure_ascii=False, indent=2) + "\n") + except Exception: + pass + + def _start_heartbeat_thread(self) -> None: + """Start heartbeat thread to periodically update state file.""" + # Restart if thread exists but is dead + if self._heartbeat_thread is not None: + if not self._heartbeat_thread.is_alive(): + self._heartbeat_thread = None + self._heartbeat_stop_event = None + else: + return # Already running + + self._heartbeat_stop_event = threading.Event() + self._heartbeat_thread = threading.Thread( + target=self._heartbeat_loop, + daemon=True, + name="askd-heartbeat" + ) + self._heartbeat_thread.start() + + def _heartbeat_loop(self) -> None: + """Periodically update heartbeat_at in state file.""" + # Validate and clamp interval + try: + interval = float(os.environ.get("CCB_HEARTBEAT_INTERVAL_S", "2")) + interval = max(0.5, min(interval, 60.0)) # Clamp to 0.5-60 seconds + except (ValueError, TypeError): + interval = 2.0 + + while not self._heartbeat_stop_event.wait(interval): + try: + self._write_persistent_state("running") + except Exception: + pass + + def _stop_heartbeat_thread(self) -> None: + """Stop heartbeat thread.""" + if self._heartbeat_stop_event: + self._heartbeat_stop_event.set() + if self._heartbeat_thread: + self._heartbeat_thread.join(timeout=0.5) + # Only clear if thread actually stopped + if not self._heartbeat_thread.is_alive(): + self._heartbeat_thread = None diff --git a/lib/gemini_comm.py b/lib/gemini_comm.py index bb00bc4..ebc71d2 100755 --- a/lib/gemini_comm.py +++ b/lib/gemini_comm.py @@ -8,7 +8,6 @@ import hashlib import json import os -import 
re import sys import time import threading @@ -34,101 +33,72 @@ _GEMINI_HASH_CACHE_TS = 0.0 -def _slugify_project_hash(name: str) -> str: - """Return Gemini-compatible slug for a project directory name.""" - text = (name or "").strip().lower() - text = re.sub(r"[^a-z0-9]+", "-", text) - return text.strip("-") +def _is_ccb_instance_dir(work_dir: Path) -> bool: + """Detect ccb-multi instance directories. -def _compute_project_hashes(work_dir: Optional[Path] = None) -> tuple[str, str]: - """Return ``(slug_hash, sha256_hash)`` for *work_dir*. + These have generic basenames that collide across projects, so SHA-256 + should be preferred over basename for Gemini session lookup. - Gemini CLI >= 0.29.0 uses a slugified basename for project directories; - older versions used a SHA-256 hash of the absolute path. We compute both - so the caller can try each one. + Detection (any match = True): + 1) CCB_INSTANCE_ID env var is set (ccb-multi always sets this) + 2) Parent directory is named '.ccb-instances' """ - path = work_dir or Path.cwd() + if os.environ.get("CCB_INSTANCE_ID", "").strip(): + return True try: - abs_path = path.expanduser().absolute() + abs_path = work_dir.expanduser().absolute() except Exception: - abs_path = path - basename_hash = _slugify_project_hash(abs_path.name) - sha256_hash = hashlib.sha256(str(abs_path).encode()).hexdigest() - return basename_hash, sha256_hash + abs_path = work_dir + return abs_path.parent.name == ".ccb-instances" -def _project_hash_candidates(work_dir: Optional[Path] = None, *, root: Optional[Path] = None) -> list[str]: - """Return ordered project-hash candidates for this work directory. +def _compute_project_hashes(work_dir: Optional[Path] = None) -> tuple[str, str]: + """Return ``(basename_hash, sha256_hash)`` for *work_dir*. - Supports Gemini's historical SHA-256 layout and modern slug-based layouts, - including collision-suffixed directories like ``name-1``. 
+ Gemini CLI >= 0.29.0 uses the directory basename; older versions used + SHA-256 of the absolute path. Callers that need to probe both formats + should use this helper. """ path = work_dir or Path.cwd() try: abs_path = path.expanduser().absolute() except Exception: abs_path = path - - raw_base = (abs_path.name or "").strip() - slug_base, sha256_hash = _compute_project_hashes(abs_path) - suffix_re = re.compile(rf"^{re.escape(slug_base)}-\d+$") if slug_base else None - - candidates: list[str] = [] - seen: set[str] = set() - - def _add(value: str) -> None: - token = (value or "").strip() - if not token or token in seen: - return - seen.add(token) - candidates.append(token) - - root_path = Path(root).expanduser() if root else None - discovered: list[tuple[float, str]] = [] - if root_path and root_path.is_dir() and slug_base: - try: - for child in root_path.iterdir(): - if not child.is_dir(): - continue - chats = child / "chats" - if not chats.is_dir(): - continue - name = child.name - if name == slug_base or name == raw_base or (suffix_re and suffix_re.match(name)): - try: - latest_mtime = max( - (p.stat().st_mtime for p in chats.glob("session-*.json") if p.is_file()), - default=chats.stat().st_mtime, - ) - except OSError: - latest_mtime = 0.0 - discovered.append((latest_mtime, name)) - except OSError: - pass - - for _mtime, name in sorted(discovered, key=lambda item: item[0], reverse=True): - _add(name) - _add(slug_base) - _add(raw_base) - _add(sha256_hash) - return candidates + basename_hash = abs_path.name + sha256_hash = hashlib.sha256(str(abs_path).encode()).hexdigest() + return basename_hash, sha256_hash def _get_project_hash(work_dir: Optional[Path] = None) -> str: - """Return the Gemini session directory name for *work_dir*. + """Return the *primary* project hash for *work_dir*. - Prefers discovered slug-based directories (including collision suffixes), - falls back to SHA-256 (older versions), and defaults to slug basename for - forward compatibility. 
+ Prefers the new basename format when its ``chats/`` directory exists, + falls back to SHA-256, and defaults to basename for forward compat. + + For ccb-multi instance directories (e.g. instance-1), prefer SHA-256 + to avoid cross-project basename collisions. """ path = work_dir or Path.cwd() + basename_hash, sha256_hash = _compute_project_hashes(path) root = Path(os.environ.get("GEMINI_ROOT") or (Path.home() / ".gemini" / "tmp")).expanduser() - candidates = _project_hash_candidates(path, root=root) - for project_hash in candidates: - if (root / project_hash / "chats").is_dir(): - return project_hash - return candidates[0] if candidates else "" + is_instance = _is_ccb_instance_dir(path) + + if is_instance: + # Old instance dirs (instance-1, ...) have generic basenames that + # collide across projects. New format (inst--N) is unique, but + # we still prefer SHA-256 for backward compat with old instances. + if (root / sha256_hash / "chats").is_dir(): + return sha256_hash + if (root / basename_hash / "chats").is_dir(): + return basename_hash + return sha256_hash + else: + if (root / basename_hash / "chats").is_dir(): + return basename_hash + if (root / sha256_hash / "chats").is_dir(): + return sha256_hash + return basename_hash def _iter_registry_work_dirs() -> list[Path]: @@ -164,10 +134,9 @@ def _work_dirs_for_hash(project_hash: str) -> list[Path]: _GEMINI_HASH_CACHE = {} for wd in _iter_registry_work_dirs(): try: - # Register all known hash candidates so the watchdog can match - # slug, slug-suffixed, and legacy SHA-256 layouts. 
- hashes = _project_hash_candidates(wd, root=GEMINI_ROOT) - for h in hashes: + # Register both hash formats so the watchdog can match either + bn, sha = _compute_project_hashes(wd) + for h in (bn, sha): _GEMINI_HASH_CACHE.setdefault(h, []).append(wd) except Exception: continue @@ -260,13 +229,13 @@ def __init__(self, root: Path = GEMINI_ROOT, work_dir: Optional[Path] = None): forced_hash = os.environ.get("GEMINI_PROJECT_HASH", "").strip() if forced_hash: self._project_hash = forced_hash - self._all_known_hashes = {forced_hash} + self._all_known_hashes: set[str] = {forced_hash} else: self._project_hash = _get_project_hash(self.work_dir) - # Store all known hashes so they survive hash adoption and Gemini - # hash-format changes. - self._all_known_hashes = set(_project_hash_candidates(self.work_dir, root=self.root)) - self._all_known_hashes.add(self._project_hash) + bn, sha = _compute_project_hashes(self.work_dir) + # Store all known hashes so they survive hash adoption + self._all_known_hashes = {bn, sha} + self._is_instance = _is_ccb_instance_dir(self.work_dir) self._preferred_session: Optional[Path] = None try: poll = float(os.environ.get("GEMINI_POLL_INTERVAL", "0.05")) @@ -309,51 +278,49 @@ def _scan_latest_session_any_project(self) -> Optional[Path]: return sessions[-1] if sessions else None def _scan_latest_session(self) -> Optional[Path]: - # Build scan order: primary hash first, then all known alternatives - scan_order = [self._project_hash] - if hasattr(self, "_all_known_hashes"): - for h in sorted(self._all_known_hashes - {self._project_hash}): - scan_order.append(h) - # Deduplicate while preserving order + """Scan for the latest session file across all known hash dirs.""" + best: Optional[Path] = None + best_mtime: float = 0.0 + + # De-duplicate: primary first, then remaining known hashes seen: set[str] = set() - unique_order: list[str] = [] - for project_hash in scan_order: - if project_hash not in seen: - seen.add(project_hash) - 
unique_order.append(project_hash) + scan_order: list[str] = [] + for h in [self._project_hash] + sorted(self._all_known_hashes - {self._project_hash}): + if h not in seen: + seen.add(h) + scan_order.append(h) - best: Optional[Path] = None - best_mtime = 0.0 - winning_hash = self._project_hash - for project_hash in unique_order: + for project_hash in scan_order: chats = self.root / project_hash / "chats" if not chats.is_dir(): continue try: - for p in chats.iterdir(): + for p in chats.glob("session-*.json"): if not p.is_file() or p.name.startswith("."): continue - if not (p.suffix == ".json" and p.name.startswith("session-")): - continue try: mt = p.stat().st_mtime except OSError: continue if mt > best_mtime: - best_mtime = mt best = p - winning_hash = project_hash + best_mtime = mt except OSError: continue - if best: - # Auto-adopt the winning hash if it changed - if winning_hash != self._project_hash: - self._project_hash = winning_hash - self._debug(f"Adopted project hash: {winning_hash}") - return best + # If the winning session lives under a different hash, adopt it + # and add it to known hashes so future scans still cover all formats + if best is not None: + try: + winning_hash = best.parent.parent.name + if winning_hash: + self._all_known_hashes.add(winning_hash) + if winning_hash != self._project_hash: + self._project_hash = winning_hash + except Exception: + pass - return None + return best def _latest_session(self) -> Optional[Path]: preferred = self._preferred_session @@ -364,6 +331,16 @@ def _latest_session(self) -> Optional[Path]: if preferred and preferred.exists(): if scanned and scanned.exists(): try: + # For ccb-multi instance dirs, only accept scanned session + # from the SAME hash directory to prevent cross-project + # contamination via shared basename (old "instance-N" format). 
+ if self._is_instance: + pref_hash = preferred.parent.parent.name + scan_hash = scanned.parent.parent.name + if pref_hash != scan_hash: + self._debug(f"Instance mode: ignoring cross-hash scan {scanned}") + return preferred + pref_mtime = preferred.stat().st_mtime scan_mtime = scanned.stat().st_mtime if scan_mtime > pref_mtime: