diff --git a/SWEBENCH_MODULE_README.md b/SWEBENCH_MODULE_README.md new file mode 100644 index 0000000..bb39657 --- /dev/null +++ b/SWEBENCH_MODULE_README.md @@ -0,0 +1,166 @@ +# TProfiler SWE-bench 性能分析和评测模块 + +## 概述 + +SWE-bench模块是TProfiler的一个扩展功能,用于评测AI模型在软件工程任务上的性能表现。该模块实现了对AI模型解决真实GitHub issues能力的自动化评测。 + +## 主要功能 + +1. **任务管理** + - 支持从GitHub加载真实的软件工程任务 + - 支持自定义任务创建 + - 任务难度分级和分类 + +2. **模型评测** + - 支持多种主流AI模型(GPT-4、Claude、Llama等) + - 自动调用模型API生成解决方案 + - 在Docker容器中安全执行代码 + +3. **性能分析** + - 执行时间统计 + - 资源使用监控(CPU、内存) + - API调用和Token使用统计 + - 成本估算 + +4. **测试验证** + - 自动应用生成的补丁 + - 运行项目测试套件 + - 解析测试结果 + +5. **报告生成** + - 多格式报告(文本、HTML、JSON、CSV) + - 详细的性能指标 + - 可视化结果展示 + +## 使用方法 + +### 1. 配置 + +编辑 `swebench.properties` 文件: + +```properties +# 基本配置(swebench.task.timeout 的单位为分钟) +swebench.parallel.tasks=4 +swebench.task.timeout=30 +swebench.max.retry=3 + +# 模型API配置 +swebench.model.api.url=https://api.openai.com/v1/completions +swebench.model.api.key=your-api-key-here +swebench.model.max.tokens=4096 + +# Docker配置 +swebench.docker.image=swebench/eval:latest + +# 数据集类型:sample, csv, json(其他取值按 sample 处理) +swebench.dataset.type=sample +``` + +### 2. 启动评测 + +#### 命令行模式 + +```bash +# 开始评测 +./swebench-client start GPT-4 + +# 停止评测 +./swebench-client stop + +# 查看状态 +./swebench-client status + +# 列出支持的模型 +./swebench-client list + +# 查看帮助 +./swebench-client help +``` + +#### 交互模式 + +直接运行 `./swebench-client` 进入交互式菜单。 + +### 3. 
查看结果 + +评测完成后,报告会保存在配置的报告路径下(默认为 `~/swebench-reports`): + +- `swebench_<模型名称>_<时间戳>.txt` - 文本报告 +- `swebench_<模型名称>_<时间戳>.html` - HTML报告(可在浏览器中查看) +- `swebench_<模型名称>_<时间戳>.json` - JSON格式(便于程序处理) +- `swebench_<模型名称>_<时间戳>.csv` - CSV格式(可导入Excel) +- `swebench_summary.txt` - 汇总报告 + +## 架构设计 + +``` +com.taobao.profile.swebench/ +├── SWEBenchManager.java # 核心管理器 +├── SWEBenchConfig.java # 配置管理 +├── task/ +│ ├── SWEBenchTask.java # 任务定义 +│ └── TaskResult.java # 任务结果 +├── loader/ +│ └── TaskLoader.java # 任务加载器 +├── evaluator/ +│ ├── ModelEvaluator.java # 模型评估器 +│ ├── DockerEnvironment.java # Docker环境管理 +│ ├── ModelInterface.java # 模型接口 +│ └── TestExecutor.java # 测试执行器 +├── reporter/ +│ └── BenchmarkReporter.java # 报告生成器 +└── client/ + └── SWEBenchClient.java # 客户端程序 +``` + +## 性能指标 + +评测报告包含以下关键指标: + +1. **成功率**:成功解决的任务占比 +2. **执行时间**:每个任务的执行耗时 +3. **测试通过率**:应用补丁后通过的测试占全部测试的比例 +4. **资源使用**:CPU、内存使用情况 +5. **API调用**:模型API调用次数 +6. **Token使用**:总Token消耗量 +7. **成本估算**:基于Token使用的成本 + +## 集成TProfiler + +SWE-bench模块与TProfiler深度集成: + +1. 使用TProfiler的性能分析功能监控评测过程 +2. 利用TProfiler的线程分析追踪并发任务执行 +3. 通过TProfiler的慢查询分析优化Docker操作 + +## 扩展性 + +该模块设计为易于扩展: + +1. **添加新模型**:扩展 `ModelInterface` 类中的模型调用逻辑 +2. **自定义任务源**:扩展任务加载逻辑 +3. **新的报告格式**:在 `BenchmarkReporter` 中添加新方法 +4. **测试框架支持**:扩展 `TestExecutor` 的解析逻辑 + +## 依赖要求 + +- Java 8+(代码使用了 try-with-resources、`String.join` 和 `Process.waitFor(long, TimeUnit)`) +- Docker +- 网络连接(用于调用模型API和下载GitHub仓库) + +## 注意事项 + +1. 确保Docker已正确安装和配置 +2. 模型API密钥请妥善保管 +3. 评测过程可能耗时较长,建议在服务器上运行 +4. 注意API调用成本,合理设置并行任务数 + +## 未来计划 + +1. 支持更多编程语言(目前主要支持Python) +2. 增加更多模型支持 +3. 实现分布式评测 +4. 添加实时监控界面 +5. 支持自定义评测指标 + +## 贡献 + +欢迎提交Issue和Pull Request来改进这个模块! \ No newline at end of file diff --git a/pkg/TProfiler/bin/swebench-client b/pkg/TProfiler/bin/swebench-client new file mode 100644 index 0000000..6ec656b --- /dev/null +++ b/pkg/TProfiler/bin/swebench-client @@ -0,0 +1,7 @@ +#!/bin/sh + +. 
$(dirname $0)/common-env + +MAIN_CLASS=com.taobao.profile.swebench.client.SWEBenchClient + +exec "$JAVACMD" -classpath $CLASS_PATH $MAIN_CLASS "$@" \ No newline at end of file diff --git a/pkg/TProfiler/bin/swebench-client.bat b/pkg/TProfiler/bin/swebench-client.bat new file mode 100644 index 0000000..920db6c --- /dev/null +++ b/pkg/TProfiler/bin/swebench-client.bat @@ -0,0 +1,2 @@ +@echo off +call startup.bat com.taobao.profile.swebench.client.SWEBenchClient \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/Manager.java b/src/main/java/com/taobao/profile/Manager.java index 4644152..8a935f7 100644 --- a/src/main/java/com/taobao/profile/Manager.java +++ b/src/main/java/com/taobao/profile/Manager.java @@ -47,6 +47,18 @@ public class Manager { * 远程刷出方法数据 */ public static final String FLUSHMETHOD = "flushmethod"; + /** + * 启动SWE-bench评测 + */ + public static final String SWEBENCH_START = "swebench_start"; + /** + * 停止SWE-bench评测 + */ + public static final String SWEBENCH_STOP = "swebench_stop"; + /** + * 查询SWE-bench状态 + */ + public static final String SWEBENCH_STATUS = "swebench_status"; /** * 是否用纳秒采集 */ diff --git a/src/main/java/com/taobao/profile/swebench/SWEBenchConfig.java b/src/main/java/com/taobao/profile/swebench/SWEBenchConfig.java new file mode 100644 index 0000000..072473b --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/SWEBenchConfig.java @@ -0,0 +1,241 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + */ +package com.taobao.profile.swebench; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * SWE-bench评测配置 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class SWEBenchConfig { + + /** + * 默认配置文件名 + */ + private static final String CONFIG_FILE = "swebench.properties"; + + /** + * 并行任务数 + */ + private int parallelTaskCount = 4; + + /** + * 单个任务超时时间(分钟) + */ + private int taskTimeoutMinutes = 30; + + /** + * 最大重试次数 + */ + private int maxRetryCount = 3; + + /** + * 报告输出路径 + */ + private String reportPath = System.getProperty("user.home") + "/swebench-reports"; + + /** + * 任务数据路径 + */ + private String taskDataPath = System.getProperty("user.home") + "/swebench-tasks"; + + /** + * 是否启用性能分析 + */ + private boolean enableProfiling = true; + + /** + * 是否保存中间结果 + */ + private boolean saveIntermediateResults = true; + + /** + * Docker镜像名称 + */ + private String dockerImage = "swebench/eval:latest"; + + /** + * 评测数据集类型 + */ + private String datasetType = "lite"; // full, lite, verified + + /** + * 模型API配置 + */ + private String modelApiUrl; + private String modelApiKey; + private int modelMaxTokens = 4096; + + public SWEBenchConfig() { + loadConfig(); + } + + /** + * 从配置文件加载配置 + */ + private void loadConfig() { + Properties props = new Properties(); + + // 尝试从多个位置加载配置文件 + File[] configLocations = { + new File(CONFIG_FILE), + new File(System.getProperty("user.home") + "/.tprofiler/" + CONFIG_FILE), + new File("conf/" + CONFIG_FILE) + }; + + for (File configFile : configLocations) { + if (configFile.exists()) { + try (FileReader reader = new FileReader(configFile)) { + props.load(reader); + parseProperties(props); + System.out.println("加载SWE-bench配置文件: " + configFile.getAbsolutePath()); + return; + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + // 使用默认配置 + System.out.println("未找到SWE-bench配置文件,使用默认配置"); + } + + /** + * 解析配置属性 + */ + private void parseProperties(Properties 
props) { + // 基本配置 + parallelTaskCount = Integer.parseInt(props.getProperty("swebench.parallel.tasks", "4")); + taskTimeoutMinutes = Integer.parseInt(props.getProperty("swebench.task.timeout", "30")); + maxRetryCount = Integer.parseInt(props.getProperty("swebench.max.retry", "3")); + + // 路径配置 + reportPath = props.getProperty("swebench.report.path", reportPath); + taskDataPath = props.getProperty("swebench.task.path", taskDataPath); + + // 功能开关 + enableProfiling = Boolean.parseBoolean(props.getProperty("swebench.enable.profiling", "true")); + saveIntermediateResults = Boolean.parseBoolean(props.getProperty("swebench.save.intermediate", "true")); + + // Docker配置 + dockerImage = props.getProperty("swebench.docker.image", dockerImage); + + // 数据集配置 + datasetType = props.getProperty("swebench.dataset.type", "lite"); + + // 模型API配置 + modelApiUrl = props.getProperty("swebench.model.api.url"); + modelApiKey = props.getProperty("swebench.model.api.key"); + modelMaxTokens = Integer.parseInt(props.getProperty("swebench.model.max.tokens", "4096")); + } + + // Getters and setters + + public int getParallelTaskCount() { + return parallelTaskCount; + } + + public void setParallelTaskCount(int parallelTaskCount) { + this.parallelTaskCount = parallelTaskCount; + } + + public int getTaskTimeoutMinutes() { + return taskTimeoutMinutes; + } + + public void setTaskTimeoutMinutes(int taskTimeoutMinutes) { + this.taskTimeoutMinutes = taskTimeoutMinutes; + } + + public int getMaxRetryCount() { + return maxRetryCount; + } + + public void setMaxRetryCount(int maxRetryCount) { + this.maxRetryCount = maxRetryCount; + } + + public String getReportPath() { + return reportPath; + } + + public void setReportPath(String reportPath) { + this.reportPath = reportPath; + } + + public String getTaskDataPath() { + return taskDataPath; + } + + public void setTaskDataPath(String taskDataPath) { + this.taskDataPath = taskDataPath; + } + + public boolean isEnableProfiling() { + return enableProfiling; + } + 
+ public void setEnableProfiling(boolean enableProfiling) { + this.enableProfiling = enableProfiling; + } + + public boolean isSaveIntermediateResults() { + return saveIntermediateResults; + } + + public void setSaveIntermediateResults(boolean saveIntermediateResults) { + this.saveIntermediateResults = saveIntermediateResults; + } + + public String getDockerImage() { + return dockerImage; + } + + public void setDockerImage(String dockerImage) { + this.dockerImage = dockerImage; + } + + public String getDatasetType() { + return datasetType; + } + + public void setDatasetType(String datasetType) { + this.datasetType = datasetType; + } + + public String getModelApiUrl() { + return modelApiUrl; + } + + public void setModelApiUrl(String modelApiUrl) { + this.modelApiUrl = modelApiUrl; + } + + public String getModelApiKey() { + return modelApiKey; + } + + public void setModelApiKey(String modelApiKey) { + this.modelApiKey = modelApiKey; + } + + public int getModelMaxTokens() { + return modelMaxTokens; + } + + public void setModelMaxTokens(int modelMaxTokens) { + this.modelMaxTokens = modelMaxTokens; + } +} \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/swebench/SWEBenchManager.java b/src/main/java/com/taobao/profile/swebench/SWEBenchManager.java new file mode 100644 index 0000000..5faf412 --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/SWEBenchManager.java @@ -0,0 +1,234 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + */ +package com.taobao.profile.swebench; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +import com.taobao.profile.Manager; +import com.taobao.profile.swebench.task.SWEBenchTask; +import com.taobao.profile.swebench.task.TaskResult; +import com.taobao.profile.swebench.evaluator.ModelEvaluator; +import com.taobao.profile.swebench.reporter.BenchmarkReporter; +import com.taobao.profile.swebench.loader.TaskLoader; + +/** + * SWE-bench评测管理器 + * 负责协调AI模型在软件工程任务上的性能评测 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class SWEBenchManager { + + private static SWEBenchManager instance = new SWEBenchManager(); + + /** + * 线程池用于并行执行评测任务 + */ + private ExecutorService executorService; + + /** + * 任务列表 + */ + private List tasks; + + /** + * 模型评估器 + */ + private ModelEvaluator evaluator; + + /** + * 报告生成器 + */ + private BenchmarkReporter reporter; + + /** + * 是否正在运行 + */ + private volatile boolean isRunning = false; + + /** + * 评测配置 + */ + private SWEBenchConfig config; + + private SWEBenchManager() { + this.tasks = new ArrayList<>(); + this.config = new SWEBenchConfig(); + } + + /** + * 获取单例实例 + */ + public static SWEBenchManager getInstance() { + return instance; + } + + /** + * 初始化评测环境 + */ + public void initialize() { + if (Manager.instance().isDebugMode()) { + System.out.println("初始化SWE-bench评测环境..."); + } + + // 创建线程池 + int threadCount = config.getParallelTaskCount(); + executorService = Executors.newFixedThreadPool(threadCount); + + // 初始化评估器和报告器 + evaluator = new ModelEvaluator(config); + reporter = new BenchmarkReporter(config); + + // 加载任务 + loadTasks(); + } + + /** + * 加载评测任务 + */ + private void loadTasks() { + tasks.clear(); + + try { + // 根据配置的数据集类型加载任务 + String datasetType = config.getDatasetType(); + + if ("sample".equals(datasetType)) { + // 加载示例任务 + tasks.addAll(TaskLoader.loadSampleTasks()); + } else if 
("csv".equals(datasetType)) { + // 从CSV文件加载 + String csvPath = config.getTaskDataPath() + "/swebench_tasks.csv"; + tasks.addAll(TaskLoader.loadFromCsv(csvPath)); + } else if ("json".equals(datasetType)) { + // 从JSON文件加载 + String jsonPath = config.getTaskDataPath() + "/swebench_tasks.json"; + tasks.addAll(TaskLoader.loadFromJson(jsonPath)); + } else { + // 默认加载示例任务 + tasks.addAll(TaskLoader.loadSampleTasks()); + } + + if (Manager.instance().isDebugMode()) { + System.out.println("成功加载SWE-bench任务,任务数: " + tasks.size()); + for (SWEBenchTask task : tasks) { + System.out.println(" - " + task.getTaskId() + ": " + task.getIssueTitle()); + } + } + } catch (Exception e) { + System.err.println("加载任务失败: " + e.getMessage()); + e.printStackTrace(); + // 加载失败时使用示例任务 + tasks.addAll(TaskLoader.loadSampleTasks()); + } + } + + /** + * 开始评测 + * + * @param modelName 要评测的模型名称 + * @return 是否成功开始 + */ + public boolean startBenchmark(String modelName) { + if (isRunning) { + System.err.println("评测已在运行中"); + return false; + } + + isRunning = true; + System.out.println("开始SWE-bench评测,模型: " + modelName); + + // 记录开始时间 + long startTime = System.currentTimeMillis(); + + List results = new ArrayList<>(); + + try { + // 执行所有任务 + for (SWEBenchTask task : tasks) { + TaskResult result = evaluator.evaluateTask(task, modelName); + results.add(result); + + // 实时输出进度 + if (Manager.instance().isDebugMode()) { + System.out.println("完成任务: " + task.getTaskId() + + ", 成功: " + result.isSuccess()); + } + } + + // 生成报告 + reporter.generateReport(modelName, results, startTime); + + } catch (Exception e) { + e.printStackTrace(); + return false; + } finally { + isRunning = false; + } + + return true; + } + + /** + * 停止评测 + */ + public void stopBenchmark() { + if (!isRunning) { + return; + } + + System.out.println("停止SWE-bench评测..."); + isRunning = false; + + if (executorService != null) { + executorService.shutdownNow(); + try { + executorService.awaitTermination(30, TimeUnit.SECONDS); + } catch 
(InterruptedException e) { + e.printStackTrace(); + } + } + } + + /** + * 获取评测状态 + */ + public String getStatus() { + return isRunning ? "运行中" : "已停止"; + } + + /** + * 添加自定义任务 + */ + public void addTask(SWEBenchTask task) { + tasks.add(task); + } + + /** + * 获取配置 + */ + public SWEBenchConfig getConfig() { + return config; + } + + /** + * 清理资源 + */ + public void shutdown() { + stopBenchmark(); + if (executorService != null && !executorService.isShutdown()) { + executorService.shutdown(); + } + } +} \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/swebench/client/SWEBenchClient.java b/src/main/java/com/taobao/profile/swebench/client/SWEBenchClient.java new file mode 100644 index 0000000..45b7c1f --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/client/SWEBenchClient.java @@ -0,0 +1,319 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + */ +package com.taobao.profile.swebench.client; + +import java.io.*; +import java.net.Socket; +import java.util.Scanner; + +import com.taobao.profile.swebench.SWEBenchManager; +import com.taobao.profile.swebench.task.SWEBenchTask; + +/** + * SWE-bench客户端 + * 用于启动和管理SWE-bench评测 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class SWEBenchClient { + + private static final String VERSION = "1.0.0"; + + public static void main(String[] args) { + SWEBenchClient client = new SWEBenchClient(); + + if (args.length == 0) { + client.interactiveMode(); + } else { + client.commandMode(args); + } + } + + /** + * 命令行模式 + */ + private void commandMode(String[] args) { + String command = args[0].toLowerCase(); + + switch (command) { + case "start": + if (args.length < 2) { + System.err.println("用法: swebench-client start "); + System.exit(1); + } + startBenchmark(args[1]); + break; + + case "stop": + stopBenchmark(); + break; + + case "status": + getStatus(); + break; + + case "help": + case "-h": + case "--help": + printHelp(); + break; + + case "version": + case "-v": + case "--version": + System.out.println("SWE-bench Client " + VERSION); + break; + + case "list": + listModels(); + break; + + case "config": + showConfig(); + break; + + default: + System.err.println("未知命令: " + command); + System.err.println("使用 'swebench-client help' 查看帮助"); + System.exit(1); + } + } + + /** + * 交互模式 + */ + private void interactiveMode() { + Scanner scanner = new Scanner(System.in); + + System.out.println("====================================="); + System.out.println("SWE-bench 评测客户端 v" + VERSION); + System.out.println("====================================="); + System.out.println(); + + printMenu(); + + while (true) { + System.out.print("\n请选择操作: "); + String input = scanner.nextLine().trim(); + + switch (input) { + case "1": + System.out.print("请输入模型名称: "); + String modelName = scanner.nextLine().trim(); + startBenchmark(modelName); + break; + + case "2": + stopBenchmark(); 
+ break; + + case "3": + getStatus(); + break; + + case "4": + listModels(); + break; + + case "5": + showConfig(); + break; + + case "6": + addCustomTask(scanner); + break; + + case "0": + case "q": + case "quit": + case "exit": + System.out.println("退出程序"); + System.exit(0); + break; + + default: + System.out.println("无效的选择,请重试"); + } + + printMenu(); + } + } + + /** + * 打印菜单 + */ + private void printMenu() { + System.out.println("\n----- 菜单 -----"); + System.out.println("1. 开始评测"); + System.out.println("2. 停止评测"); + System.out.println("3. 查看状态"); + System.out.println("4. 列出支持的模型"); + System.out.println("5. 查看配置"); + System.out.println("6. 添加自定义任务"); + System.out.println("0. 退出"); + System.out.println("----------------"); + } + + /** + * 开始评测 + */ + private void startBenchmark(String modelName) { + try { + System.out.println("正在启动SWE-bench评测..."); + System.out.println("模型: " + modelName); + + // 初始化评测管理器 + SWEBenchManager manager = SWEBenchManager.getInstance(); + manager.initialize(); + + // 启动评测 + boolean success = manager.startBenchmark(modelName); + + if (success) { + System.out.println("评测已完成"); + } else { + System.err.println("评测失败"); + } + + } catch (Exception e) { + System.err.println("启动评测时出错: " + e.getMessage()); + e.printStackTrace(); + } + } + + /** + * 停止评测 + */ + private void stopBenchmark() { + try { + System.out.println("正在停止评测..."); + + SWEBenchManager manager = SWEBenchManager.getInstance(); + manager.stopBenchmark(); + + System.out.println("评测已停止"); + + } catch (Exception e) { + System.err.println("停止评测时出错: " + e.getMessage()); + } + } + + /** + * 获取状态 + */ + private void getStatus() { + try { + SWEBenchManager manager = SWEBenchManager.getInstance(); + String status = manager.getStatus(); + + System.out.println("当前状态: " + status); + + } catch (Exception e) { + System.err.println("获取状态时出错: " + e.getMessage()); + } + } + + /** + * 列出支持的模型 + */ + private void listModels() { + System.out.println("\n支持的模型:"); + System.out.println("- GPT-4"); + 
System.out.println("- GPT-3.5-turbo"); + System.out.println("- Claude-2"); + System.out.println("- Claude-instant"); + System.out.println("- Llama-2-70b"); + System.out.println("- CodeLlama-34b"); + System.out.println("- StarCoder"); + System.out.println("- Custom (需要配置API)"); + } + + /** + * 显示配置 + */ + private void showConfig() { + try { + SWEBenchManager manager = SWEBenchManager.getInstance(); + manager.initialize(); + + System.out.println("\n当前配置:"); + System.out.println("并行任务数: " + manager.getConfig().getParallelTaskCount()); + System.out.println("任务超时: " + manager.getConfig().getTaskTimeoutMinutes() + " 分钟"); + System.out.println("最大重试: " + manager.getConfig().getMaxRetryCount() + " 次"); + System.out.println("报告路径: " + manager.getConfig().getReportPath()); + System.out.println("数据集类型: " + manager.getConfig().getDatasetType()); + System.out.println("Docker镜像: " + manager.getConfig().getDockerImage()); + System.out.println("启用性能分析: " + manager.getConfig().isEnableProfiling()); + + } catch (Exception e) { + System.err.println("显示配置时出错: " + e.getMessage()); + } + } + + /** + * 添加自定义任务 + */ + private void addCustomTask(Scanner scanner) { + System.out.println("\n添加自定义任务:"); + + try { + System.out.print("任务ID: "); + String taskId = scanner.nextLine().trim(); + + System.out.print("仓库所有者: "); + String repoOwner = scanner.nextLine().trim(); + + System.out.print("仓库名称: "); + String repoName = scanner.nextLine().trim(); + + System.out.print("Issue编号: "); + String issueNumber = scanner.nextLine().trim(); + + System.out.print("Issue标题: "); + String issueTitle = scanner.nextLine().trim(); + + System.out.print("Issue描述: "); + String issueDescription = scanner.nextLine().trim(); + + // 创建任务 + SWEBenchTask task = new SWEBenchTask(taskId, repoOwner, repoName); + task.setIssueNumber(issueNumber); + task.setIssueTitle(issueTitle); + task.setIssueDescription(issueDescription); + + // 添加到管理器 + SWEBenchManager manager = SWEBenchManager.getInstance(); + manager.addTask(task); + + 
System.out.println("任务已添加: " + taskId); + + } catch (Exception e) { + System.err.println("添加任务时出错: " + e.getMessage()); + } + } + + /** + * 打印帮助信息 + */ + private void printHelp() { + System.out.println("用法: swebench-client [命令] [参数]"); + System.out.println(); + System.out.println("命令:"); + System.out.println(" start 开始评测指定模型"); + System.out.println(" stop 停止当前评测"); + System.out.println(" status 查看评测状态"); + System.out.println(" list 列出支持的模型"); + System.out.println(" config 显示当前配置"); + System.out.println(" help 显示此帮助信息"); + System.out.println(" version 显示版本信息"); + System.out.println(); + System.out.println("如果不提供命令,将进入交互模式"); + } +} \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/swebench/evaluator/DockerEnvironment.java b/src/main/java/com/taobao/profile/swebench/evaluator/DockerEnvironment.java new file mode 100644 index 0000000..07cb3a9 --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/evaluator/DockerEnvironment.java @@ -0,0 +1,213 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + */ +package com.taobao.profile.swebench.evaluator; + +import java.io.*; +import java.util.ArrayList; +import java.util.List; + +import com.taobao.profile.swebench.SWEBenchConfig; +import com.taobao.profile.swebench.task.SWEBenchTask; + +/** + * Docker环境管理 + * 负责创建和管理任务执行的Docker容器 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class DockerEnvironment { + + private SWEBenchConfig config; + private static final String CONTAINER_PREFIX = "swebench-"; + + public DockerEnvironment(SWEBenchConfig config) { + this.config = config; + } + + /** + * 准备Docker容器 + */ + public void prepareContainer(SWEBenchTask task, String repoPath) throws IOException { + String containerName = getContainerName(task); + + // 检查容器是否已存在 + if (containerExists(containerName)) { + // 停止并删除旧容器 + stopContainer(containerName); + removeContainer(containerName); + } + + // 创建新容器 + createContainer(task, containerName, repoPath); + } + + /** + * 创建容器 + */ + private void createContainer(SWEBenchTask task, String containerName, String repoPath) throws IOException { + List command = new ArrayList<>(); + command.add("docker"); + command.add("run"); + command.add("-d"); + command.add("--name"); + command.add(containerName); + command.add("-v"); + command.add(repoPath + ":/workspace"); + command.add("-w"); + command.add("/workspace"); + + // 设置资源限制 + command.add("--memory=4g"); + command.add("--cpus=2"); + + // 使用配置的镜像 + command.add(config.getDockerImage()); + command.add("sleep"); + command.add("infinity"); + + executeDockerCommand(command); + } + + /** + * 在容器中执行命令 + */ + public String executeInContainer(String containerName, String cmd) throws IOException { + List command = new ArrayList<>(); + command.add("docker"); + command.add("exec"); + command.add(containerName); + command.add("bash"); + command.add("-c"); + command.add(cmd); + + return executeDockerCommand(command); + } + + /** + * 复制文件到容器 + */ + public void copyToContainer(String containerName, String sourcePath, String destPath) 
throws IOException { + List command = new ArrayList<>(); + command.add("docker"); + command.add("cp"); + command.add(sourcePath); + command.add(containerName + ":" + destPath); + + executeDockerCommand(command); + } + + /** + * 从容器复制文件 + */ + public void copyFromContainer(String containerName, String sourcePath, String destPath) throws IOException { + List command = new ArrayList<>(); + command.add("docker"); + command.add("cp"); + command.add(containerName + ":" + sourcePath); + command.add(destPath); + + executeDockerCommand(command); + } + + /** + * 清理容器 + */ + public void cleanupContainer(SWEBenchTask task) { + String containerName = getContainerName(task); + try { + stopContainer(containerName); + removeContainer(containerName); + } catch (Exception e) { + // 忽略清理错误 + } + } + + /** + * 检查容器是否存在 + */ + private boolean containerExists(String containerName) { + try { + List command = new ArrayList<>(); + command.add("docker"); + command.add("ps"); + command.add("-a"); + command.add("--format"); + command.add("{{.Names}}"); + + String output = executeDockerCommand(command); + return output.contains(containerName); + } catch (Exception e) { + return false; + } + } + + /** + * 停止容器 + */ + private void stopContainer(String containerName) throws IOException { + List command = new ArrayList<>(); + command.add("docker"); + command.add("stop"); + command.add(containerName); + + executeDockerCommand(command); + } + + /** + * 删除容器 + */ + private void removeContainer(String containerName) throws IOException { + List command = new ArrayList<>(); + command.add("docker"); + command.add("rm"); + command.add(containerName); + + executeDockerCommand(command); + } + + /** + * 执行Docker命令 + */ + private String executeDockerCommand(List command) throws IOException { + ProcessBuilder pb = new ProcessBuilder(command); + pb.redirectErrorStream(true); + + Process process = pb.start(); + StringBuilder output = new StringBuilder(); + + try (BufferedReader reader = new BufferedReader( + new 
InputStreamReader(process.getInputStream()))) { + String line; + while ((line = reader.readLine()) != null) { + output.append(line).append("\n"); + } + } + + try { + int exitCode = process.waitFor(); + if (exitCode != 0) { + throw new IOException("Docker命令执行失败: " + String.join(" ", command) + + "\n输出: " + output.toString()); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException("Docker命令被中断"); + } + + return output.toString(); + } + + /** + * 获取容器名称 + */ + public String getContainerName(SWEBenchTask task) { + return CONTAINER_PREFIX + task.getTaskId().toLowerCase().replaceAll("[^a-z0-9-]", "-"); + } +} \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/swebench/evaluator/ModelEvaluator.java b/src/main/java/com/taobao/profile/swebench/evaluator/ModelEvaluator.java new file mode 100644 index 0000000..689ffe4 --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/evaluator/ModelEvaluator.java @@ -0,0 +1,228 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + */ +package com.taobao.profile.swebench.evaluator; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadMXBean; +import java.util.ArrayList; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import com.taobao.profile.Manager; +import com.taobao.profile.Profiler; +import com.taobao.profile.swebench.SWEBenchConfig; +import com.taobao.profile.swebench.task.SWEBenchTask; +import com.taobao.profile.swebench.task.TaskResult; + +/** + * 模型评估器 + * 负责调用AI模型解决任务并评估结果 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class ModelEvaluator { + + private SWEBenchConfig config; + private DockerEnvironment dockerEnv; + private ModelInterface modelInterface; + + public ModelEvaluator(SWEBenchConfig config) { + this.config = config; + this.dockerEnv = new DockerEnvironment(config); + this.modelInterface = new ModelInterface(config); + } + + /** + * 评估单个任务 + */ + public TaskResult evaluateTask(SWEBenchTask task, String modelName) { + TaskResult result = new TaskResult(task.getTaskId(), modelName); + result.getPerformanceMetrics().setStartTime(System.currentTimeMillis()); + + // 如果启用了性能分析,开始记录 + int profileMethodId = -1; + if (config.isEnableProfiling() && Manager.instance().canProfile()) { + profileMethodId = task.getTaskId().hashCode(); + Profiler.Start(profileMethodId); + } + + try { + // 1. 准备执行环境 + if (Manager.instance().isDebugMode()) { + System.out.println("准备任务环境: " + task.getTaskId()); + } + + prepareEnvironment(task); + + // 2. 
调用模型生成解决方案 + long startCpuTime = getCpuTime(); + String generatedPatch = modelInterface.generateSolution(task, modelName); + long cpuTime = getCpuTime() - startCpuTime; + + result.setGeneratedPatch(generatedPatch); + result.getPerformanceMetrics().setCpuTimeMillis(cpuTime); + result.getPerformanceMetrics().setApiCallCount(modelInterface.getLastApiCallCount()); + result.getPerformanceMetrics().setTokenCount(modelInterface.getLastTokenCount()); + + // 3. 应用补丁并运行测试 + TestExecutor testExecutor = new TestExecutor(dockerEnv); + TaskResult.TestResult testResult = testExecutor.runTests(task, generatedPatch); + result.setTestResult(testResult); + + // 4. 判断是否成功 + result.setSuccess(testResult.getFailedTests() == 0 && testResult.getTotalTests() > 0); + + // 5. 收集性能数据 + collectPerformanceData(result); + + } catch (TimeoutException e) { + result.setSuccess(false); + result.setErrorMessage("任务执行超时: " + e.getMessage()); + } catch (Exception e) { + result.setSuccess(false); + result.setErrorMessage("任务执行失败: " + e.getMessage()); + e.printStackTrace(); + } finally { + // 记录结束时间 + result.getPerformanceMetrics().recordEnd(); + + // 结束性能分析 + if (profileMethodId != -1 && Manager.instance().canProfile()) { + Profiler.End(profileMethodId); + } + + // 清理环境 + cleanupEnvironment(task); + } + + return result; + } + + /** + * 准备执行环境 + */ + private void prepareEnvironment(SWEBenchTask task) throws IOException { + // 创建工作目录 + File workDir = new File(config.getTaskDataPath(), task.getTaskId()); + if (!workDir.exists()) { + workDir.mkdirs(); + } + + // 克隆或更新仓库 + String repoPath = cloneRepository(task, workDir); + + // 准备Docker容器 + dockerEnv.prepareContainer(task, repoPath); + } + + /** + * 克隆仓库 + */ + private String cloneRepository(SWEBenchTask task, File workDir) throws IOException { + File repoDir = new File(workDir, task.getRepoName()); + + if (!repoDir.exists()) { + // 克隆仓库 + String cloneCmd = String.format("git clone %s %s", + task.getRepoUrl(), repoDir.getAbsolutePath()); + 
executeCommand(cloneCmd, workDir); + } + + // 切换到指定分支 + if (task.getRepoBranch() != null) { + String checkoutCmd = "git checkout " + task.getRepoBranch(); + executeCommand(checkoutCmd, repoDir); + } + + return repoDir.getAbsolutePath(); + } + + /** + * 执行命令 + */ + private void executeCommand(String command, File workDir) throws IOException { + Process process = Runtime.getRuntime().exec(command, null, workDir); + try { + boolean finished = process.waitFor(5, TimeUnit.MINUTES); + if (!finished) { + process.destroyForcibly(); + throw new IOException("命令执行超时: " + command); + } + + if (process.exitValue() != 0) { + throw new IOException("命令执行失败: " + command); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException("命令执行被中断: " + command); + } + } + + /** + * 收集性能数据 + */ + private void collectPerformanceData(TaskResult result) { + // 收集内存使用 + Runtime runtime = Runtime.getRuntime(); + long memoryUsed = runtime.totalMemory() - runtime.freeMemory(); + result.getPerformanceMetrics().setMemoryUsedBytes(memoryUsed); + + // 估算成本(基于token数量) + double costPerToken = 0.00002; // 示例成本 + double cost = result.getPerformanceMetrics().getTokenCount() * costPerToken; + result.getPerformanceMetrics().setCostEstimate(cost); + } + + /** + * 清理环境 + */ + private void cleanupEnvironment(SWEBenchTask task) { + try { + dockerEnv.cleanupContainer(task); + + // 如果不保存中间结果,删除工作目录 + if (!config.isSaveIntermediateResults()) { + File workDir = new File(config.getTaskDataPath(), task.getTaskId()); + deleteDirectory(workDir); + } + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * 递归删除目录 + */ + private void deleteDirectory(File dir) { + if (dir.exists()) { + File[] files = dir.listFiles(); + if (files != null) { + for (File file : files) { + if (file.isDirectory()) { + deleteDirectory(file); + } else { + file.delete(); + } + } + } + dir.delete(); + } + } + + /** + * 获取CPU时间 + */ + private long getCpuTime() { + ThreadMXBean bean = 
ManagementFactory.getThreadMXBean(); + return bean.isCurrentThreadCpuTimeSupported() ? + bean.getCurrentThreadCpuTime() / 1000000L : 0L; + } +} \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/swebench/evaluator/ModelInterface.java b/src/main/java/com/taobao/profile/swebench/evaluator/ModelInterface.java new file mode 100644 index 0000000..70cd892 --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/evaluator/ModelInterface.java @@ -0,0 +1,267 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + */ +package com.taobao.profile.swebench.evaluator; + +import java.io.*; +import java.net.*; +import java.util.HashMap; +import java.util.Map; + +import com.taobao.profile.swebench.SWEBenchConfig; +import com.taobao.profile.swebench.task.SWEBenchTask; + +/** + * 模型接口 + * 负责与AI模型进行交互,生成解决方案 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class ModelInterface { + + private SWEBenchConfig config; + private int lastApiCallCount = 0; + private int lastTokenCount = 0; + + public ModelInterface(SWEBenchConfig config) { + this.config = config; + } + + /** + * 调用模型生成解决方案 + */ + public String generateSolution(SWEBenchTask task, String modelName) throws IOException { + // 重置计数器 + lastApiCallCount = 0; + lastTokenCount = 0; + + String prompt = buildPrompt(task); + String response = callModel(modelName, prompt); + + // 从响应中提取补丁 + return extractPatch(response); + } + + /** + * 构建提示词 + */ + private String buildPrompt(SWEBenchTask task) { + StringBuilder prompt = new StringBuilder(); + + // 系统提示 + prompt.append("You are an expert software engineer. 
"); + prompt.append("Your task is to solve the following GitHub issue by generating a patch.\n\n"); + + // 任务描述 + prompt.append(task.generateTaskPrompt()); + + // 指导说明 + prompt.append("\nInstructions:\n"); + prompt.append("1. Analyze the issue carefully\n"); + prompt.append("2. Identify the root cause\n"); + prompt.append("3. Generate a minimal patch that fixes the issue\n"); + prompt.append("4. Make sure the patch follows the project's coding style\n"); + prompt.append("5. The patch should be in unified diff format\n\n"); + + prompt.append("Please provide your solution as a patch:\n"); + + return prompt.toString(); + } + + /** + * 调用模型API + */ + private String callModel(String modelName, String prompt) throws IOException { + lastApiCallCount++; + + // 这里是一个简化的实现,实际应该根据不同的模型调用相应的API + if (config.getModelApiUrl() == null || config.getModelApiUrl().isEmpty()) { + // 如果没有配置API,返回模拟响应 + return generateMockResponse(modelName, prompt); + } + + // 调用真实API + return callRealAPI(modelName, prompt); + } + + /** + * 调用真实的模型API + */ + private String callRealAPI(String modelName, String prompt) throws IOException { + URL url = new URL(config.getModelApiUrl()); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + + try { + conn.setRequestMethod("POST"); + conn.setRequestProperty("Content-Type", "application/json"); + conn.setRequestProperty("Authorization", "Bearer " + config.getModelApiKey()); + conn.setDoOutput(true); + + // 构建请求体 + Map requestBody = new HashMap<>(); + requestBody.put("model", modelName); + requestBody.put("prompt", prompt); + requestBody.put("max_tokens", config.getModelMaxTokens()); + + // 发送请求 + try (OutputStreamWriter writer = new OutputStreamWriter(conn.getOutputStream())) { + writer.write(toJson(requestBody)); + } + + // 读取响应 + StringBuilder response = new StringBuilder(); + try (BufferedReader reader = new BufferedReader( + new InputStreamReader(conn.getInputStream()))) { + String line; + while ((line = reader.readLine()) != null) { + 
response.append(line).append("\n"); + } + } + + // 解析响应并更新token计数 + Map responseData = parseJson(response.toString()); + if (responseData.containsKey("usage")) { + Map usage = (Map) responseData.get("usage"); + lastTokenCount = ((Number) usage.get("total_tokens")).intValue(); + } + + return (String) responseData.get("content"); + + } finally { + conn.disconnect(); + } + } + + /** + * 生成模拟响应(用于测试) + */ + private String generateMockResponse(String modelName, String prompt) { + lastTokenCount = prompt.length() / 4; // 粗略估算token数 + + StringBuilder response = new StringBuilder(); + response.append("Based on the issue description, here is the patch:\n\n"); + response.append("```diff\n"); + response.append("--- a/example.py\n"); + response.append("+++ b/example.py\n"); + response.append("@@ -10,7 +10,7 @@\n"); + response.append(" def example_function():\n"); + response.append("- return \"old value\"\n"); + response.append("+ return \"new value\"\n"); + response.append(" \n"); + response.append("```\n"); + + return response.toString(); + } + + /** + * 从响应中提取补丁 + */ + private String extractPatch(String response) { + // 查找diff代码块 + int startIndex = response.indexOf("```diff"); + if (startIndex == -1) { + startIndex = response.indexOf("```patch"); + } + + if (startIndex != -1) { + startIndex = response.indexOf('\n', startIndex) + 1; + int endIndex = response.indexOf("```", startIndex); + if (endIndex != -1) { + return response.substring(startIndex, endIndex).trim(); + } + } + + // 如果没有找到代码块,尝试查找diff格式 + if (response.contains("--- ") && response.contains("+++ ")) { + return extractDiffFormat(response); + } + + // 返回整个响应作为补丁 + return response; + } + + /** + * 提取diff格式的补丁 + */ + private String extractDiffFormat(String response) { + StringBuilder patch = new StringBuilder(); + String[] lines = response.split("\n"); + boolean inDiff = false; + + for (String line : lines) { + if (line.startsWith("--- ") || line.startsWith("+++ ") || + line.startsWith("@@ ") || line.startsWith("+") 
|| + line.startsWith("-") || line.startsWith(" ")) { + inDiff = true; + patch.append(line).append("\n"); + } else if (inDiff && !line.trim().isEmpty() && + !line.startsWith("+") && !line.startsWith("-")) { + // 结束diff部分 + break; + } + } + + return patch.toString().trim(); + } + + /** + * 简单的JSON序列化 + */ + private String toJson(Map map) { + // 这里应该使用真正的JSON库,这只是一个简化示例 + StringBuilder json = new StringBuilder("{"); + boolean first = true; + + for (Map.Entry entry : map.entrySet()) { + if (!first) json.append(","); + json.append("\"").append(entry.getKey()).append("\":"); + + if (entry.getValue() instanceof String) { + json.append("\"").append(escapeJson((String) entry.getValue())).append("\""); + } else { + json.append(entry.getValue()); + } + first = false; + } + + json.append("}"); + return json.toString(); + } + + /** + * 转义JSON字符串 + */ + private String escapeJson(String value) { + return value.replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t"); + } + + /** + * 简单的JSON解析 + */ + private Map parseJson(String json) { + // 这里应该使用真正的JSON库,这只是一个简化示例 + Map result = new HashMap<>(); + // TODO: 实现JSON解析 + return result; + } + + // Getters + + public int getLastApiCallCount() { + return lastApiCallCount; + } + + public int getLastTokenCount() { + return lastTokenCount; + } +} \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/swebench/evaluator/TestExecutor.java b/src/main/java/com/taobao/profile/swebench/evaluator/TestExecutor.java new file mode 100644 index 0000000..62428e1 --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/evaluator/TestExecutor.java @@ -0,0 +1,224 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + */ +package com.taobao.profile.swebench.evaluator; + +import java.io.*; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.taobao.profile.swebench.task.SWEBenchTask; +import com.taobao.profile.swebench.task.TaskResult; + +/** + * 测试执行器 + * 负责应用补丁并执行测试 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class TestExecutor { + + private DockerEnvironment dockerEnv; + + public TestExecutor(DockerEnvironment dockerEnv) { + this.dockerEnv = dockerEnv; + } + + /** + * 运行测试 + */ + public TaskResult.TestResult runTests(SWEBenchTask task, String patch) throws IOException { + TaskResult.TestResult result = new TaskResult.TestResult(); + String containerName = dockerEnv.getContainerName(task); + + try { + // 1. 应用补丁 + applyPatch(containerName, patch); + + // 2. 运行测试命令 + List testOutputs = new ArrayList<>(); + + if (task.getTestCommands() != null && !task.getTestCommands().isEmpty()) { + for (String testCommand : task.getTestCommands()) { + String output = dockerEnv.executeInContainer(containerName, testCommand); + testOutputs.add(output); + } + } else { + // 使用默认测试命令 + String output = runDefaultTests(containerName, task); + testOutputs.add(output); + } + + // 3. 解析测试结果 + parseTestResults(testOutputs, result); + + // 4. 
检查失败的测试 + if (task.getFailingTests() != null) { + checkFailingTests(containerName, task.getFailingTests(), result); + } + + } catch (Exception e) { + result.setTestOutput("测试执行失败: " + e.getMessage()); + result.setTotalTests(1); + result.setFailedTests(1); + } + + return result; + } + + /** + * 应用补丁 + */ + private void applyPatch(String containerName, String patch) throws IOException { + // 将补丁保存到临时文件 + File patchFile = File.createTempFile("patch", ".diff"); + try (FileWriter writer = new FileWriter(patchFile)) { + writer.write(patch); + } + + // 复制补丁到容器 + dockerEnv.copyToContainer(containerName, patchFile.getAbsolutePath(), "/tmp/patch.diff"); + + // 应用补丁 + String applyCommand = "cd /workspace && git apply /tmp/patch.diff"; + String output = dockerEnv.executeInContainer(containerName, applyCommand); + + // 清理临时文件 + patchFile.delete(); + + // 检查补丁是否应用成功 + if (output.contains("error") || output.contains("failed")) { + throw new IOException("补丁应用失败: " + output); + } + } + + /** + * 运行默认测试 + */ + private String runDefaultTests(String containerName, SWEBenchTask task) throws IOException { + // 尝试常见的测试命令 + String[] testCommands = { + "python -m pytest", + "python -m unittest discover", + "npm test", + "mvn test", + "gradle test", + "make test" + }; + + for (String command : testCommands) { + try { + String output = dockerEnv.executeInContainer(containerName, + "cd /workspace && " + command + " 2>&1 || true"); + if (!output.contains("command not found")) { + return output; + } + } catch (Exception e) { + // 忽略错误,尝试下一个命令 + } + } + + return "No test command found"; + } + + /** + * 解析测试结果 + */ + private void parseTestResults(List outputs, TaskResult.TestResult result) { + int totalTests = 0; + int passedTests = 0; + int failedTests = 0; + List failedTestNames = new ArrayList<>(); + StringBuilder fullOutput = new StringBuilder(); + + for (String output : outputs) { + fullOutput.append(output).append("\n"); + + // 解析pytest输出 + if (output.contains("passed") || 
output.contains("failed")) { + Pattern pytestPattern = Pattern.compile("(\\d+) passed.*?(\\d+) failed"); + Matcher matcher = pytestPattern.matcher(output); + if (matcher.find()) { + passedTests += Integer.parseInt(matcher.group(1)); + failedTests += Integer.parseInt(matcher.group(2)); + } + } + + // 解析unittest输出 + if (output.contains("Ran") && output.contains("tests")) { + Pattern unittestPattern = Pattern.compile("Ran (\\d+) tests?"); + Matcher matcher = unittestPattern.matcher(output); + if (matcher.find()) { + totalTests = Integer.parseInt(matcher.group(1)); + } + + if (output.contains("OK")) { + passedTests = totalTests; + } else if (output.contains("FAILED")) { + Pattern failPattern = Pattern.compile("failures=(\\d+)"); + matcher = failPattern.matcher(output); + if (matcher.find()) { + failedTests = Integer.parseInt(matcher.group(1)); + passedTests = totalTests - failedTests; + } + } + } + + // 提取失败的测试名称 + String[] lines = output.split("\n"); + for (String line : lines) { + if (line.contains("FAILED") || line.contains("FAIL:")) { + failedTestNames.add(line.trim()); + } + } + } + + // 如果没有解析到总测试数,根据已知数据计算 + if (totalTests == 0) { + totalTests = passedTests + failedTests; + } + + result.setTotalTests(totalTests); + result.setPassedTests(passedTests); + result.setFailedTests(failedTests); + result.setFailedTestNames(failedTestNames); + result.setTestOutput(fullOutput.toString()); + } + + /** + * 检查特定的失败测试 + */ + private void checkFailingTests(String containerName, List failingTests, + TaskResult.TestResult result) throws IOException { + List stillFailing = new ArrayList<>(); + + for (String testName : failingTests) { + // 运行单个测试 + String command = String.format("cd /workspace && python -m pytest %s -v 2>&1 || true", testName); + String output = dockerEnv.executeInContainer(containerName, command); + + if (output.contains("FAILED") || output.contains("ERROR")) { + stillFailing.add(testName); + } + } + + // 更新失败的测试列表 + if (!stillFailing.isEmpty()) { + 
result.setFailedTestNames(stillFailing); + result.setFailedTests(stillFailing.size()); + + // 调整通过的测试数 + if (result.getTotalTests() > 0) { + result.setPassedTests(result.getTotalTests() - stillFailing.size()); + } + } + } +} \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/swebench/loader/TaskLoader.java b/src/main/java/com/taobao/profile/swebench/loader/TaskLoader.java new file mode 100644 index 0000000..4053b6f --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/loader/TaskLoader.java @@ -0,0 +1,163 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + */ +package com.taobao.profile.swebench.loader; + +import java.io.*; +import java.util.*; + +import com.taobao.profile.swebench.task.SWEBenchTask; + +/** + * 任务加载器 + * 负责从各种数据源加载SWE-bench任务 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class TaskLoader { + + /** + * 加载示例任务 + * 这些是一些典型的SWE-bench任务示例 + */ + public static List loadSampleTasks() { + List tasks = new ArrayList<>(); + + // 示例任务1:简单的bug修复 + SWEBenchTask task1 = new SWEBenchTask("sample-001", "example", "calculator"); + task1.setIssueNumber("123"); + task1.setIssueTitle("Division by zero error in calculate method"); + task1.setIssueDescription( + "When calling calculate(10, 0, '/'), the method throws an unhandled exception.\n" + + "Expected behavior: Should return an error message instead of throwing exception." 
+ ); + task1.setRepoBranch("main"); + task1.setDifficultyLevel(2); + task1.setExpectedTimeMinutes(15); + + List failingTests1 = new ArrayList<>(); + failingTests1.add("test_division_by_zero"); + task1.setFailingTests(failingTests1); + + tasks.add(task1); + + // 示例任务2:功能增强 + SWEBenchTask task2 = new SWEBenchTask("sample-002", "example", "string-utils"); + task2.setIssueNumber("456"); + task2.setIssueTitle("Add support for case-insensitive string comparison"); + task2.setIssueDescription( + "The current compare() method is case-sensitive only.\n" + + "Please add an optional parameter to enable case-insensitive comparison." + ); + task2.setRepoBranch("develop"); + task2.setDifficultyLevel(3); + task2.setExpectedTimeMinutes(30); + task2.setTaskType(SWEBenchTask.TaskType.FEATURE); + + tasks.add(task2); + + // 示例任务3:性能优化 + SWEBenchTask task3 = new SWEBenchTask("sample-003", "example", "data-processor"); + task3.setIssueNumber("789"); + task3.setIssueTitle("Optimize large file processing performance"); + task3.setIssueDescription( + "Processing files larger than 100MB takes too long.\n" + + "Current implementation loads entire file into memory.\n" + + "Please implement streaming processing to improve performance." 
+ ); + task3.setRepoBranch("performance"); + task3.setDifficultyLevel(4); + task3.setExpectedTimeMinutes(60); + task3.setTaskType(SWEBenchTask.TaskType.REFACTOR); + + tasks.add(task3); + + return tasks; + } + + /** + * 从JSON文件加载任务 + */ + public static List loadFromJson(String filePath) throws IOException { + List tasks = new ArrayList<>(); + + // 简化的JSON解析实现 + // 实际应该使用JSON库如Jackson或Gson + try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) { + // TODO: 实现JSON解析逻辑 + // 这里只是示例框架 + } + + return tasks; + } + + /** + * 从CSV文件加载任务 + */ + public static List loadFromCsv(String filePath) throws IOException { + List tasks = new ArrayList<>(); + + try (BufferedReader reader = new BufferedReader(new FileReader(filePath))) { + String line; + boolean isHeader = true; + + while ((line = reader.readLine()) != null) { + if (isHeader) { + isHeader = false; + continue; + } + + String[] parts = line.split(","); + if (parts.length >= 6) { + SWEBenchTask task = new SWEBenchTask( + parts[0].trim(), // taskId + parts[1].trim(), // repoOwner + parts[2].trim() // repoName + ); + task.setIssueNumber(parts[3].trim()); + task.setIssueTitle(parts[4].trim()); + task.setIssueDescription(parts[5].trim()); + + tasks.add(task); + } + } + } + + return tasks; + } + + /** + * 从GitHub API加载任务 + * 注意:需要配置GitHub API token + */ + public static List loadFromGitHub(String owner, String repo, String label) { + List tasks = new ArrayList<>(); + + // TODO: 实现GitHub API调用 + // 1. 获取指定标签的issues + // 2. 转换为SWEBenchTask对象 + // 3. 
获取相关的测试信息 + + return tasks; + } + + /** + * 从Hugging Face数据集加载 + * 这是官方SWE-bench数据集的来源 + */ + public static List loadFromHuggingFace(String datasetType) { + List tasks = new ArrayList<>(); + + // TODO: 实现Hugging Face数据集加载 + // 使用datasets库或REST API + // 数据集名称:princeton-nlp/SWE-bench + + return tasks; + } +} \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/swebench/reporter/BenchmarkReporter.java b/src/main/java/com/taobao/profile/swebench/reporter/BenchmarkReporter.java new file mode 100644 index 0000000..fea7fa8 --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/reporter/BenchmarkReporter.java @@ -0,0 +1,379 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + */ +package com.taobao.profile.swebench.reporter; + +import java.io.*; +import java.text.SimpleDateFormat; +import java.util.*; + +import com.taobao.profile.swebench.SWEBenchConfig; +import com.taobao.profile.swebench.task.TaskResult; +import com.taobao.profile.utils.DailyRollingFileWriter; + +/** + * 基准测试报告生成器 + * 负责生成评测结果报告 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class BenchmarkReporter { + + private SWEBenchConfig config; + private SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + private SimpleDateFormat fileFormat = new SimpleDateFormat("yyyyMMdd_HHmmss"); + + public BenchmarkReporter(SWEBenchConfig config) { + this.config = config; + } + + /** + * 生成报告 + */ + public void generateReport(String modelName, List results, long startTime) { + try { + // 创建报告目录 + File reportDir = new File(config.getReportPath()); + if (!reportDir.exists()) { + reportDir.mkdirs(); + } + + // 生成多种格式的报告 + generateTextReport(modelName, results, startTime); + generateHtmlReport(modelName, results, startTime); + generateJsonReport(modelName, 
results, startTime); + generateCsvReport(modelName, results, startTime); + + // 生成汇总报告 + generateSummaryReport(modelName, results, startTime); + + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * 生成文本报告 + */ + private void generateTextReport(String modelName, List results, long startTime) + throws IOException { + String fileName = String.format("swebench_%s_%s.txt", + modelName.replaceAll("[^a-zA-Z0-9]", "_"), fileFormat.format(new Date())); + File reportFile = new File(config.getReportPath(), fileName); + + try (PrintWriter writer = new PrintWriter(new FileWriter(reportFile))) { + writer.println("====================================="); + writer.println("SWE-bench 评测报告"); + writer.println("====================================="); + writer.println(); + writer.println("模型: " + modelName); + writer.println("开始时间: " + dateFormat.format(new Date(startTime))); + writer.println("结束时间: " + dateFormat.format(new Date())); + writer.println("总耗时: " + formatDuration(System.currentTimeMillis() - startTime)); + writer.println(); + + // 统计信息 + generateStatistics(writer, results); + + // 详细结果 + writer.println("\n详细结果:"); + writer.println("-------------------------------------"); + + for (TaskResult result : results) { + writer.println("\n任务ID: " + result.getTaskId()); + writer.println("状态: " + (result.isSuccess() ? 
"成功" : "失败")); + writer.println("执行时间: " + result.getPerformanceMetrics().getExecutionTimeMillis() + "ms"); + writer.println("测试通过率: " + String.format("%.2f%%", result.getTestResult().getPassRate())); + writer.println("测试结果: " + result.getTestResult().getPassedTests() + "/" + + result.getTestResult().getTotalTests()); + + if (!result.isSuccess() && result.getErrorMessage() != null) { + writer.println("错误信息: " + result.getErrorMessage()); + } + + writer.println("-------------------------------------"); + } + } + + System.out.println("文本报告已生成: " + reportFile.getAbsolutePath()); + } + + /** + * 生成HTML报告 + */ + private void generateHtmlReport(String modelName, List results, long startTime) + throws IOException { + String fileName = String.format("swebench_%s_%s.html", + modelName.replaceAll("[^a-zA-Z0-9]", "_"), fileFormat.format(new Date())); + File reportFile = new File(config.getReportPath(), fileName); + + try (PrintWriter writer = new PrintWriter(new FileWriter(reportFile))) { + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println("SWE-bench 评测报告 - " + modelName + ""); + writer.println(""); + writer.println(""); + writer.println(""); + + writer.println("

SWE-bench 评测报告

"); + writer.println("
"); + writer.println("

模型: " + modelName + "

"); + writer.println("

开始时间: " + dateFormat.format(new Date(startTime)) + "

"); + writer.println("

结束时间: " + dateFormat.format(new Date()) + "

"); + writer.println("

总耗时: " + formatDuration(System.currentTimeMillis() - startTime) + "

"); + + // 统计信息 + int totalTasks = results.size(); + int successTasks = 0; + double totalCost = 0; + long totalTokens = 0; + + for (TaskResult result : results) { + if (result.isSuccess()) successTasks++; + totalCost += result.getPerformanceMetrics().getCostEstimate(); + totalTokens += result.getPerformanceMetrics().getTokenCount(); + } + + writer.println("

总任务数: " + totalTasks + "

"); + writer.println("

成功数: " + successTasks + "

"); + writer.println("

成功率: " + String.format("%.2f%%", (double)successTasks/totalTasks*100) + "

"); + writer.println("

总成本: $" + String.format("%.4f", totalCost) + "

"); + writer.println("

总Token数: " + totalTokens + "

"); + writer.println("
"); + + // 结果表格 + writer.println("

详细结果

"); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + + for (TaskResult result : results) { + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + writer.println(""); + } + + writer.println("
任务ID状态执行时间(ms)测试通过率API调用Token数成本
" + result.getTaskId() + "" + + (result.isSuccess() ? "成功" : "失败") + "" + result.getPerformanceMetrics().getExecutionTimeMillis() + "" + String.format("%.2f%%", result.getTestResult().getPassRate()) + "" + result.getPerformanceMetrics().getApiCallCount() + "" + result.getPerformanceMetrics().getTokenCount() + "$" + String.format("%.4f", result.getPerformanceMetrics().getCostEstimate()) + "
"); + writer.println(""); + writer.println(""); + } + + System.out.println("HTML报告已生成: " + reportFile.getAbsolutePath()); + } + + /** + * 生成JSON报告 + */ + private void generateJsonReport(String modelName, List results, long startTime) + throws IOException { + String fileName = String.format("swebench_%s_%s.json", + modelName.replaceAll("[^a-zA-Z0-9]", "_"), fileFormat.format(new Date())); + File reportFile = new File(config.getReportPath(), fileName); + + try (PrintWriter writer = new PrintWriter(new FileWriter(reportFile))) { + writer.println("{"); + writer.println(" \"model\": \"" + modelName + "\","); + writer.println(" \"startTime\": \"" + dateFormat.format(new Date(startTime)) + "\","); + writer.println(" \"endTime\": \"" + dateFormat.format(new Date()) + "\","); + writer.println(" \"duration\": " + (System.currentTimeMillis() - startTime) + ","); + writer.println(" \"results\": ["); + + for (int i = 0; i < results.size(); i++) { + TaskResult result = results.get(i); + writer.println(" {"); + writer.println(" \"taskId\": \"" + result.getTaskId() + "\","); + writer.println(" \"success\": " + result.isSuccess() + ","); + writer.println(" \"executionTime\": " + result.getPerformanceMetrics().getExecutionTimeMillis() + ","); + writer.println(" \"testPassRate\": " + result.getTestResult().getPassRate() + ","); + writer.println(" \"apiCalls\": " + result.getPerformanceMetrics().getApiCallCount() + ","); + writer.println(" \"tokens\": " + result.getPerformanceMetrics().getTokenCount() + ","); + writer.println(" \"cost\": " + result.getPerformanceMetrics().getCostEstimate()); + writer.print(" }"); + if (i < results.size() - 1) writer.print(","); + writer.println(); + } + + writer.println(" ]"); + writer.println("}"); + } + + System.out.println("JSON报告已生成: " + reportFile.getAbsolutePath()); + } + + /** + * 生成CSV报告 + */ + private void generateCsvReport(String modelName, List results, long startTime) + throws IOException { + String fileName = 
String.format("swebench_%s_%s.csv", + modelName.replaceAll("[^a-zA-Z0-9]", "_"), fileFormat.format(new Date())); + File reportFile = new File(config.getReportPath(), fileName); + + try (PrintWriter writer = new PrintWriter(new FileWriter(reportFile))) { + // CSV头 + writer.println("TaskID,Model,Success,ExecutionTime(ms),TestPassRate(%),PassedTests,TotalTests,APIcalls,Tokens,Cost($)"); + + // 数据行 + for (TaskResult result : results) { + writer.printf("%s,%s,%s,%d,%.2f,%d,%d,%d,%d,%.4f\n", + result.getTaskId(), + modelName, + result.isSuccess(), + result.getPerformanceMetrics().getExecutionTimeMillis(), + result.getTestResult().getPassRate(), + result.getTestResult().getPassedTests(), + result.getTestResult().getTotalTests(), + result.getPerformanceMetrics().getApiCallCount(), + result.getPerformanceMetrics().getTokenCount(), + result.getPerformanceMetrics().getCostEstimate() + ); + } + } + + System.out.println("CSV报告已生成: " + reportFile.getAbsolutePath()); + } + + /** + * 生成汇总报告 + */ + private void generateSummaryReport(String modelName, List results, long startTime) + throws IOException { + File summaryFile = new File(config.getReportPath(), "swebench_summary.txt"); + + // 追加模式写入 + try (PrintWriter writer = new PrintWriter(new FileWriter(summaryFile, true))) { + int successCount = 0; + double totalCost = 0; + long totalTime = 0; + + for (TaskResult result : results) { + if (result.isSuccess()) successCount++; + totalCost += result.getPerformanceMetrics().getCostEstimate(); + totalTime += result.getPerformanceMetrics().getExecutionTimeMillis(); + } + + writer.printf("%s | %s | 任务数: %d | 成功: %d (%.2f%%) | 总耗时: %s | 总成本: $%.4f\n", + dateFormat.format(new Date()), + modelName, + results.size(), + successCount, + (double)successCount/results.size()*100, + formatDuration(totalTime), + totalCost + ); + } + } + + /** + * 生成统计信息 + */ + private void generateStatistics(PrintWriter writer, List results) { + int totalTasks = results.size(); + int successTasks = 0; + int 
failedTasks = 0; + long totalExecutionTime = 0; + long totalCpuTime = 0; + long totalMemory = 0; + int totalApiCalls = 0; + int totalTokens = 0; + double totalCost = 0; + + Map difficultyDistribution = new HashMap<>(); + + for (TaskResult result : results) { + if (result.isSuccess()) { + successTasks++; + } else { + failedTasks++; + } + + totalExecutionTime += result.getPerformanceMetrics().getExecutionTimeMillis(); + totalCpuTime += result.getPerformanceMetrics().getCpuTimeMillis(); + totalMemory += result.getPerformanceMetrics().getMemoryUsedBytes(); + totalApiCalls += result.getPerformanceMetrics().getApiCallCount(); + totalTokens += result.getPerformanceMetrics().getTokenCount(); + totalCost += result.getPerformanceMetrics().getCostEstimate(); + } + + writer.println("统计信息:"); + writer.println("-------------------------------------"); + writer.println("总任务数: " + totalTasks); + writer.println("成功数: " + successTasks); + writer.println("失败数: " + failedTasks); + writer.println("成功率: " + String.format("%.2f%%", (double)successTasks/totalTasks*100)); + writer.println(); + writer.println("性能指标:"); + writer.println("平均执行时间: " + (totalTasks > 0 ? totalExecutionTime/totalTasks : 0) + "ms"); + writer.println("平均CPU时间: " + (totalTasks > 0 ? totalCpuTime/totalTasks : 0) + "ms"); + writer.println("平均内存使用: " + formatBytes(totalTasks > 0 ? totalMemory/totalTasks : 0)); + writer.println(); + writer.println("API使用:"); + writer.println("总API调用: " + totalApiCalls); + writer.println("总Token数: " + totalTokens); + writer.println("总成本: $" + String.format("%.4f", totalCost)); + writer.println("平均成本: $" + String.format("%.4f", totalTasks > 0 ? 
totalCost/totalTasks : 0)); + } + + /** + * 格式化时长 + */ + private String formatDuration(long millis) { + long seconds = millis / 1000; + long minutes = seconds / 60; + long hours = minutes / 60; + + if (hours > 0) { + return String.format("%d小时%d分钟%d秒", hours, minutes % 60, seconds % 60); + } else if (minutes > 0) { + return String.format("%d分钟%d秒", minutes, seconds % 60); + } else { + return String.format("%d秒", seconds); + } + } + + /** + * 格式化字节数 + */ + private String formatBytes(long bytes) { + if (bytes < 1024) { + return bytes + " B"; + } else if (bytes < 1024 * 1024) { + return String.format("%.2f KB", bytes / 1024.0); + } else if (bytes < 1024 * 1024 * 1024) { + return String.format("%.2f MB", bytes / (1024.0 * 1024)); + } else { + return String.format("%.2f GB", bytes / (1024.0 * 1024 * 1024)); + } + } +} \ No newline at end of file diff --git a/src/main/java/com/taobao/profile/swebench/task/SWEBenchTask.java b/src/main/java/com/taobao/profile/swebench/task/SWEBenchTask.java new file mode 100644 index 0000000..ca5c379 --- /dev/null +++ b/src/main/java/com/taobao/profile/swebench/task/SWEBenchTask.java @@ -0,0 +1,277 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
/**
 * Definition of a single SWE-bench task: one software-engineering problem
 * (a GitHub issue in a repository) that an AI model is asked to solve.
 *
 * <p>This is a plain mutable bean; it performs no synchronization and no
 * validation of the values passed to its setters.
 *
 * @author TProfiler Team
 * @since 2025-1
 */
public class SWEBenchTask {

    /** Identifier of the task. */
    private String taskId;

    /** GitHub repository coordinates. */
    private String repoOwner;
    private String repoName;
    private String repoBranch;

    /** Issue information. */
    private String issueNumber;
    private String issueTitle;
    private String issueDescription;

    /**
     * Test-related data. {@code failingTests} is iterated as {@code String}
     * in {@link #generateTaskPrompt()}.
     * NOTE(review): {@code String} elements are assumed for
     * {@code testCommands} - confirm against callers.
     */
    private List<String> testCommands;
    private List<String> failingTests;

    /**
     * Files that are expected to be changed by the solution.
     * NOTE(review): {@code String} elements assumed - confirm against callers.
     */
    private List<String> expectedFiles;

    /** Task difficulty level (1-5); not range-checked by this class. */
    private int difficultyLevel;

    /** Category of the task. */
    private TaskType taskType;

    /**
     * Additional metadata.
     * NOTE(review): String keys / Object values are assumed - confirm
     * against callers.
     */
    private Map<String, Object> metadata;

    /** Baseline (reference) patch, kept for comparison. */
    private String baselinePatch;

    /** Creation time in epoch milliseconds; set when the object is built. */
    private long createTime;

    /** Expected completion time in minutes. */
    private int expectedTimeMinutes;

    /** Category of a task, each carrying a lower-case string value. */
    public enum TaskType {
        BUG_FIX("bug_fix"),
        FEATURE("feature"),
        REFACTOR("refactor"),
        TEST("test"),
        DOCUMENTATION("documentation");

        private final String value;

        TaskType(String value) {
            this.value = value;
        }

        /**
         * @return the string value associated with this type
         */
        public String getValue() {
            return value;
        }
    }

    /**
     * Creates a task with defaults: creation time = now, type
     * {@link TaskType#BUG_FIX}, difficulty level 3.
     */
    public SWEBenchTask() {
        this.createTime = System.currentTimeMillis();
        this.taskType = TaskType.BUG_FIX;
        this.difficultyLevel = 3;
    }

    /**
     * Creates a task with the default values plus the given identity.
     *
     * @param taskId    task identifier
     * @param repoOwner owner of the GitHub repository
     * @param repoName  name of the GitHub repository
     */
    public SWEBenchTask(String taskId, String repoOwner, String repoName) {
        this();
        this.taskId = taskId;
        this.repoOwner = repoOwner;
        this.repoName = repoName;
    }

    /**
     * Builds the full textual description of the task: repository, branch,
     * issue number/title, description and, when present, the list of
     * failing tests (one "- name" line each).
     *
     * <p>Unset (null) string fields are rendered as the text "null" by
     * {@link StringBuilder#append(String)}.
     *
     * @return the prompt text
     */
    public String generateTaskPrompt() {
        StringBuilder prompt = new StringBuilder();
        prompt.append("Repository: ").append(repoOwner).append("/").append(repoName).append("\n");
        prompt.append("Branch: ").append(repoBranch).append("\n");
        prompt.append("Issue #").append(issueNumber).append(": ").append(issueTitle).append("\n\n");
        prompt.append("Description:\n").append(issueDescription).append("\n\n");

        // The failing-test section is optional and skipped for null/empty lists.
        if (failingTests != null && !failingTests.isEmpty()) {
            prompt.append("Failing tests:\n");
            for (String test : failingTests) {
                prompt.append("- ").append(test).append("\n");
            }
        }

        return prompt.toString();
    }

    /**
     * @return the GitHub URL of the repository, built from owner and name
     */
    public String getRepoUrl() {
        return String.format("https://github.com/%s/%s", repoOwner, repoName);
    }

    /**
     * @return the URL of the issue, i.e. {@link #getRepoUrl()} followed by
     *         "/issues/" and the issue number
     */
    public String getIssueUrl() {
        return String.format("%s/issues/%s", getRepoUrl(), issueNumber);
    }

    // Getters and setters

    public String getTaskId() {
        return taskId;
    }

    public void setTaskId(String taskId) {
        this.taskId = taskId;
    }

    public String getRepoOwner() {
        return repoOwner;
    }

    public void setRepoOwner(String repoOwner) {
        this.repoOwner = repoOwner;
    }

    public String getRepoName() {
        return repoName;
    }

    public void setRepoName(String repoName) {
        this.repoName = repoName;
    }

    public String getRepoBranch() {
        return repoBranch;
    }

    public void setRepoBranch(String repoBranch) {
        this.repoBranch = repoBranch;
    }

    public String getIssueNumber() {
        return issueNumber;
    }

    public void setIssueNumber(String issueNumber) {
        this.issueNumber = issueNumber;
    }

    public String getIssueTitle() {
        return issueTitle;
    }

    public void setIssueTitle(String issueTitle) {
        this.issueTitle = issueTitle;
    }

    public String getIssueDescription() {
        return issueDescription;
    }

    public void setIssueDescription(String issueDescription) {
        this.issueDescription = issueDescription;
    }

    public List<String> getTestCommands() {
        return testCommands;
    }

    public void setTestCommands(List<String> testCommands) {
        this.testCommands = testCommands;
    }

    public List<String> getFailingTests() {
        return failingTests;
    }

    public void setFailingTests(List<String> failingTests) {
        this.failingTests = failingTests;
    }

    public List<String> getExpectedFiles() {
        return expectedFiles;
    }

    public void setExpectedFiles(List<String> expectedFiles) {
        this.expectedFiles = expectedFiles;
    }

    public int getDifficultyLevel() {
        return difficultyLevel;
    }

    public void setDifficultyLevel(int difficultyLevel) {
        this.difficultyLevel = difficultyLevel;
    }

    public TaskType getTaskType() {
        return taskType;
    }

    public void setTaskType(TaskType taskType) {
        this.taskType = taskType;
    }

    public Map<String, Object> getMetadata() {
        return metadata;
    }

    public void setMetadata(Map<String, Object> metadata) {
        this.metadata = metadata;
    }

    public String getBaselinePatch() {
        return baselinePatch;
    }

    public void setBaselinePatch(String baselinePatch) {
        this.baselinePatch = baselinePatch;
    }

    public long getCreateTime() {
        return createTime;
    }

    public void setCreateTime(long createTime) {
        this.createTime = createTime;
    }

    public int getExpectedTimeMinutes() {
        return expectedTimeMinutes;
    }

    public void setExpectedTimeMinutes(int expectedTimeMinutes) {
        this.expectedTimeMinutes = expectedTimeMinutes;
    }
}
/**
 * Result of running one SWE-bench task with one model.
 *
 * <p>Plain mutable bean holding the generated patch, the test outcome, the
 * performance metrics and optional diagnostic data. It performs no
 * synchronization.
 *
 * @author TProfiler Team
 * @since 2025-1
 */
public class TaskResult {

    /** Identifier of the task this result belongs to. */
    private String taskId;

    /** Name of the model that produced the result. */
    private String modelName;

    /** Whether the task was solved successfully. */
    private boolean success;

    /** Content of the generated patch. */
    private String generatedPatch;

    /** Test outcome; initialised to an empty {@link TestResult}. */
    private TestResult testResult;

    /** Performance metrics; initialised to a fresh {@link PerformanceMetrics}. */
    private PerformanceMetrics performanceMetrics;

    /** Error message, reported by {@link #generateSummary()} on failure. */
    private String errorMessage;

    /**
     * Execution log lines.
     * NOTE(review): {@code String} elements assumed - confirm against callers.
     */
    private List<String> executionLogs;

    /**
     * Additional result data.
     * NOTE(review): String keys / Object values assumed - confirm against
     * callers.
     */
    private Map<String, Object> additionalData;

    /**
     * Outcome of the test run for a task.
     */
    public static class TestResult {
        private int totalTests;
        private int passedTests;
        private int failedTests;
        // NOTE(review): String elements assumed - confirm against callers.
        private List<String> failedTestNames;
        private String testOutput;

        /** Creates a result with all counters set to zero. */
        public TestResult() {
            this.totalTests = 0;
            this.passedTests = 0;
            this.failedTests = 0;
        }

        /**
         * @return the percentage (0-100 scale) of passed tests, or 0 when
         *         {@code totalTests} is not positive
         */
        public double getPassRate() {
            return totalTests > 0 ? (double) passedTests / totalTests * 100 : 0;
        }

        // Getters and setters
        public int getTotalTests() {
            return totalTests;
        }

        public void setTotalTests(int totalTests) {
            this.totalTests = totalTests;
        }

        public int getPassedTests() {
            return passedTests;
        }

        public void setPassedTests(int passedTests) {
            this.passedTests = passedTests;
        }

        public int getFailedTests() {
            return failedTests;
        }

        public void setFailedTests(int failedTests) {
            this.failedTests = failedTests;
        }

        public List<String> getFailedTestNames() {
            return failedTestNames;
        }

        public void setFailedTestNames(List<String> failedTestNames) {
            this.failedTestNames = failedTestNames;
        }

        public String getTestOutput() {
            return testOutput;
        }

        public void setTestOutput(String testOutput) {
            this.testOutput = testOutput;
        }
    }

    /**
     * Performance figures collected while a task runs.
     */
    public static class PerformanceMetrics {
        private long startTime;
        private long endTime;
        private long executionTimeMillis;
        private long cpuTimeMillis;
        private long memoryUsedBytes;
        private int apiCallCount;
        private int tokenCount;
        private double costEstimate;

        /** Records the current time as the start time. */
        public PerformanceMetrics() {
            this.startTime = System.currentTimeMillis();
        }

        /**
         * Records the current time as the end time and derives the
         * execution time as {@code endTime - startTime}.
         */
        public void recordEnd() {
            this.endTime = System.currentTimeMillis();
            this.executionTimeMillis = endTime - startTime;
        }

        // Getters and setters
        public long getStartTime() {
            return startTime;
        }

        public void setStartTime(long startTime) {
            this.startTime = startTime;
        }

        public long getEndTime() {
            return endTime;
        }

        public void setEndTime(long endTime) {
            this.endTime = endTime;
        }

        public long getExecutionTimeMillis() {
            return executionTimeMillis;
        }

        public void setExecutionTimeMillis(long executionTimeMillis) {
            this.executionTimeMillis = executionTimeMillis;
        }

        public long getCpuTimeMillis() {
            return cpuTimeMillis;
        }

        public void setCpuTimeMillis(long cpuTimeMillis) {
            this.cpuTimeMillis = cpuTimeMillis;
        }

        public long getMemoryUsedBytes() {
            return memoryUsedBytes;
        }

        public void setMemoryUsedBytes(long memoryUsedBytes) {
            this.memoryUsedBytes = memoryUsedBytes;
        }

        public int getApiCallCount() {
            return apiCallCount;
        }

        public void setApiCallCount(int apiCallCount) {
            this.apiCallCount = apiCallCount;
        }

        public int getTokenCount() {
            return tokenCount;
        }

        public void setTokenCount(int tokenCount) {
            this.tokenCount = tokenCount;
        }

        public double getCostEstimate() {
            return costEstimate;
        }

        public void setCostEstimate(double costEstimate) {
            this.costEstimate = costEstimate;
        }
    }

    /** Creates a result with an empty test result and fresh metrics. */
    public TaskResult() {
        this.testResult = new TestResult();
        this.performanceMetrics = new PerformanceMetrics();
    }

    /**
     * Creates a result for the given task and model.
     *
     * @param taskId    identifier of the task
     * @param modelName name of the evaluated model
     */
    public TaskResult(String taskId, String modelName) {
        this();
        this.taskId = taskId;
        this.modelName = modelName;
    }

    /**
     * Builds a multi-line, human-readable summary of this result.
     *
     * <p>The execution-time line and the test lines are emitted only when
     * the corresponding object is non-null (both can be cleared through
     * their setters). The error line is emitted only for unsuccessful
     * results that carry an error message.
     *
     * @return the summary text, one "Key: value" entry per line
     */
    public String generateSummary() {
        StringBuilder summary = new StringBuilder();
        summary.append("Task: ").append(taskId).append("\n");
        summary.append("Model: ").append(modelName).append("\n");
        summary.append("Success: ").append(success).append("\n");

        // Guarded like testResult below: setPerformanceMetrics(null) is allowed.
        if (performanceMetrics != null) {
            summary.append("Execution Time: ").append(performanceMetrics.getExecutionTimeMillis()).append("ms\n");
        }

        if (testResult != null) {
            summary.append("Test Pass Rate: ").append(String.format("%.2f%%", testResult.getPassRate())).append("\n");
            summary.append("Tests: ").append(testResult.getPassedTests()).append("/").append(testResult.getTotalTests()).append("\n");
        }

        if (!success && errorMessage != null) {
            summary.append("Error: ").append(errorMessage).append("\n");
        }

        return summary.toString();
    }

    // Getters and setters

    public String getTaskId() {
        return taskId;
    }

    public void setTaskId(String taskId) {
        this.taskId = taskId;
    }

    public String getModelName() {
        return modelName;
    }

    public void setModelName(String modelName) {
        this.modelName = modelName;
    }

    public boolean isSuccess() {
        return success;
    }

    public void setSuccess(boolean success) {
        this.success = success;
    }

    public String getGeneratedPatch() {
        return generatedPatch;
    }

    public void setGeneratedPatch(String generatedPatch) {
        this.generatedPatch = generatedPatch;
    }

    public TestResult getTestResult() {
        return testResult;
    }

    public void setTestResult(TestResult testResult) {
        this.testResult = testResult;
    }

    public PerformanceMetrics getPerformanceMetrics() {
        return performanceMetrics;
    }

    public void setPerformanceMetrics(PerformanceMetrics performanceMetrics) {
        this.performanceMetrics = performanceMetrics;
    }

    public String getErrorMessage() {
        return errorMessage;
    }

    public void setErrorMessage(String errorMessage) {
        this.errorMessage = errorMessage;
    }

    public List<String> getExecutionLogs() {
        return executionLogs;
    }

    public void setExecutionLogs(List<String> executionLogs) {
        this.executionLogs = executionLogs;
    }

    public Map<String, Object> getAdditionalData() {
        return additionalData;
    }

    public void setAdditionalData(Map<String, Object> additionalData) {
        this.additionalData = additionalData;
    }
}
writeResponse(child.getOutputStream(), response); } else { Manager.instance().setSwitchFlag(false); } @@ -111,6 +116,20 @@ private void write(OutputStream os) throws IOException { out.write('\r'); out.flush(); } + + /** + * 输出响应 + * + * @param os + * @param response + * @throws IOException + */ + private void writeResponse(OutputStream os, String response) throws IOException { + BufferedOutputStream out = new BufferedOutputStream(os); + out.write(response.getBytes()); + out.write('\r'); + out.flush(); + } /** * 调试使用 diff --git a/src/main/java/com/taobao/profile/thread/SWEBenchThread.java b/src/main/java/com/taobao/profile/thread/SWEBenchThread.java new file mode 100644 index 0000000..66dd8d1 --- /dev/null +++ b/src/main/java/com/taobao/profile/thread/SWEBenchThread.java @@ -0,0 +1,120 @@ +/** + * (C) 2011-2012 Alibaba Group Holding Limited. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. 
+ * + */ +package com.taobao.profile.thread; + +import com.taobao.profile.Manager; +import com.taobao.profile.swebench.SWEBenchManager; + +/** + * SWE-bench集成线程 + * 负责处理来自InnerSocketThread的SWE-bench相关命令 + * + * @author TProfiler Team + * @since 2025-1 + */ +public class SWEBenchThread { + + private static SWEBenchThread instance = new SWEBenchThread(); + + private SWEBenchThread() { + } + + public static SWEBenchThread getInstance() { + return instance; + } + + /** + * 处理SWE-bench命令 + * + * @param command 命令 + * @return 响应结果 + */ + public String handleCommand(String command) { + if (command == null) { + return "ERROR: 命令为空"; + } + + String[] parts = command.split(":"); + String action = parts[0]; + + try { + if (Manager.SWEBENCH_START.equals(action)) { + if (parts.length < 2) { + return "ERROR: 缺少模型名称参数"; + } + return startSWEBench(parts[1]); + + } else if (Manager.SWEBENCH_STOP.equals(action)) { + return stopSWEBench(); + + } else if (Manager.SWEBENCH_STATUS.equals(action)) { + return getSWEBenchStatus(); + + } else { + return "ERROR: 未知的SWE-bench命令: " + action; + } + } catch (Exception e) { + return "ERROR: " + e.getMessage(); + } + } + + /** + * 启动SWE-bench评测 + */ + private String startSWEBench(String modelName) { + try { + SWEBenchManager manager = SWEBenchManager.getInstance(); + manager.initialize(); + + // 在新线程中启动评测,避免阻塞 + Thread benchmarkThread = new Thread(new Runnable() { + @Override + public void run() { + manager.startBenchmark(modelName); + } + }); + benchmarkThread.setName("TProfiler-SWEBench-" + modelName); + benchmarkThread.setDaemon(true); + benchmarkThread.start(); + + return "OK: SWE-bench评测已启动,模型: " + modelName; + + } catch (Exception e) { + return "ERROR: 启动失败 - " + e.getMessage(); + } + } + + /** + * 停止SWE-bench评测 + */ + private String stopSWEBench() { + try { + SWEBenchManager manager = SWEBenchManager.getInstance(); + manager.stopBenchmark(); + return "OK: SWE-bench评测已停止"; + + } catch (Exception e) { + return "ERROR: 停止失败 - " + 
e.getMessage(); + } + } + + /** + * 获取SWE-bench状态 + */ + private String getSWEBenchStatus() { + try { + SWEBenchManager manager = SWEBenchManager.getInstance(); + String status = manager.getStatus(); + return "OK: SWE-bench状态 - " + status; + + } catch (Exception e) { + return "ERROR: 获取状态失败 - " + e.getMessage(); + } + } +} \ No newline at end of file diff --git a/src/main/resources/swebench.properties b/src/main/resources/swebench.properties new file mode 100644 index 0000000..f33a823 --- /dev/null +++ b/src/main/resources/swebench.properties @@ -0,0 +1,27 @@ +# SWE-bench 配置文件 + +# 基本配置 +swebench.parallel.tasks=4 +swebench.task.timeout=30 +swebench.max.retry=3 + +# 路径配置 +swebench.report.path=${user.home}/swebench-reports +swebench.task.path=${user.home}/swebench-tasks + +# 功能开关 +swebench.enable.profiling=true +swebench.save.intermediate=true + +# Docker配置 +swebench.docker.image=swebench/eval:latest + +# 数据集配置 +# 可选值: full, lite, verified +swebench.dataset.type=lite + +# 模型API配置 +# 如果使用云端模型,请配置以下参数 +# swebench.model.api.url=https://api.openai.com/v1/completions +# swebench.model.api.key=your-api-key-here +swebench.model.max.tokens=4096 \ No newline at end of file