diff --git a/.github/workflows/accuracy-embench.yml b/.github/workflows/accuracy-embench.yml new file mode 100644 index 0000000..648a395 --- /dev/null +++ b/.github/workflows/accuracy-embench.yml @@ -0,0 +1,113 @@ +name: Accuracy - EmBench + +on: + push: + branches: [main] + paths: + - 'benchmarks/aha-mont64-m2sim/**' + - 'benchmarks/crc32-m2sim/**' + - 'benchmarks/edn-m2sim/**' + - 'benchmarks/huffbench-m2sim/**' + - 'benchmarks/matmult-int-m2sim/**' + - 'benchmarks/statemate-m2sim/**' + - 'benchmarks/primecount-m2sim/**' + - 'benchmarks/embench_test.go' + - 'timing/**' + workflow_dispatch: + +concurrency: + group: accuracy-embench-${{ github.ref }} + cancel-in-progress: false + +jobs: + embench-accuracy: + name: EmBench Accuracy + runs-on: macos-14 + timeout-minutes: 30 + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: Verify EmBench ELFs + run: | + echo "Checking EmBench ELF files..." + ls -la benchmarks/aha-mont64-m2sim/*.elf 2>/dev/null || echo "aha-mont64 missing" + ls -la benchmarks/crc32-m2sim/*.elf 2>/dev/null || echo "crc32 missing" + ls -la benchmarks/edn-m2sim/*.elf 2>/dev/null || echo "edn missing" + ls -la benchmarks/huffbench-m2sim/*.elf 2>/dev/null || echo "huffbench missing" + ls -la benchmarks/matmult-int-m2sim/*.elf 2>/dev/null || echo "matmult-int missing" + ls -la benchmarks/statemate-m2sim/*.elf 2>/dev/null || echo "statemate missing" + ls -la benchmarks/primecount-m2sim/*.elf 2>/dev/null || echo "primecount missing" + + - name: Run EmBench tests + run: | + TESTS=( + TestEmbenchAhaMont64 + TestEmbenchCRC32 + TestEmbenchEDN + TestEmbenchHuffbench + TestEmbenchMatmultInt + TestEmbenchStatemate + TestEmbenchPrimecount + ) + + > embench_output.txt + for TEST in "${TESTS[@]}"; do + echo "--- $TEST ---" + go test -v -run "^${TEST}$" -count=1 -timeout 5m ./benchmarks/ 2>&1 | tee -a embench_output.txt || true + done + + - name: Extract CPI results + if: always() + run: | + python3 - 
<<'PYEOF' + import json, re + + results = {} + with open("embench_output.txt") as f: + for line in f: + if "CPI=" not in line: + continue + # Try to extract benchmark name and CPI + match = re.search(r'(\w+):\s+.*CPI=([\d.]+)', line) + if match: + name = match.group(1) + cpi = float(match.group(2)) + results[name] = {"cpi": cpi} + + output = {"benchmarks_run": len(results), "results": results} + with open("embench_results.json", "w") as f: + json.dump(output, f, indent=2) + print(json.dumps(output, indent=2)) + PYEOF + + - name: Post summary + if: always() + run: | + echo "## EmBench Accuracy Results" >> $GITHUB_STEP_SUMMARY + if [ -f embench_results.json ]; then + python3 -c " + import json + d = json.load(open('embench_results.json')) + print(f'**Benchmarks measured:** {d[\"benchmarks_run\"]}/7') + if d['results']: + print() + print('| Benchmark | CPI |') + print('|-----------|-----|') + for name, r in sorted(d['results'].items()): + print(f'| {name} | {r[\"cpi\"]:.3f} |') + " >> $GITHUB_STEP_SUMMARY + fi + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: embench-accuracy + path: | + embench_results.json + embench_output.txt + retention-days: 90 diff --git a/.github/workflows/accuracy-microbench.yml b/.github/workflows/accuracy-microbench.yml new file mode 100644 index 0000000..552a56a --- /dev/null +++ b/.github/workflows/accuracy-microbench.yml @@ -0,0 +1,82 @@ +name: Accuracy - Microbenchmarks + +on: + push: + branches: [main] + paths: + - 'benchmarks/**' + - 'timing/**' + - 'emu/**' + workflow_dispatch: + +concurrency: + group: accuracy-microbench-${{ github.ref }} + cancel-in-progress: false + +jobs: + microbench-accuracy: + name: Microbenchmark Accuracy + runs-on: macos-14 + timeout-minutes: 15 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 
'3.12' + + - name: Install Python dependencies + run: pip install matplotlib numpy scipy + + - name: Run microbenchmark CPI tests + run: | + cd benchmarks + echo "=== Running microbenchmark CPI tests ===" + + # Without D-cache (ALU, branch, throughput benchmarks) + go test -v -run TestTimingPredictions_CPIBounds -count=1 -timeout 5m ./ 2>&1 | tee micro_no_cache.txt + + # With D-cache (memory-latency benchmarks) + go test -v -run TestAccuracyCPI_WithDCache -count=1 -timeout 5m ./ 2>&1 | tee micro_dcache.txt + + - name: Generate accuracy report + run: | + python3 benchmarks/native/accuracy_report.py --suite microbench 2>&1 || true + # If the script doesn't support --suite yet, run it and it will + # naturally process microbenchmarks from test output + + - name: Post summary + if: always() + run: | + echo "## Microbenchmark Accuracy Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + if [ -f benchmarks/native/accuracy_results.json ]; then + python3 -c " + import json + d = json.load(open('benchmarks/native/accuracy_results.json')) + print(f\"**Average Error:** {d['summary']['average_error']*100:.1f}%\") + print(f\"**Benchmarks:** {d['summary']['benchmark_count']}\") + " >> $GITHUB_STEP_SUMMARY + else + echo "No results generated." 
>> $GITHUB_STEP_SUMMARY + fi + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: accuracy-microbench + path: | + benchmarks/native/accuracy_report.md + benchmarks/native/accuracy_results.json + benchmarks/native/accuracy_figure.png + benchmarks/native/accuracy_normalized.pdf + retention-days: 90 diff --git a/.github/workflows/accuracy-polybench.yml b/.github/workflows/accuracy-polybench.yml new file mode 100644 index 0000000..0de2e4d --- /dev/null +++ b/.github/workflows/accuracy-polybench.yml @@ -0,0 +1,176 @@ +name: Accuracy - PolyBench + +on: + push: + branches: [main] + paths: + - 'benchmarks/polybench/**' + - 'benchmarks/polybench_test.go' + - 'benchmarks/timing_harness.go' + - 'timing/**' + workflow_dispatch: + +concurrency: + group: accuracy-polybench-${{ github.ref }} + cancel-in-progress: false + +jobs: + polybench-group-1: + name: PolyBench Group 1 (ATAX, BiCG, Jacobi1D) + runs-on: macos-14 + timeout-minutes: 30 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: Verify ELFs + run: | + ls -la benchmarks/polybench/atax_m2sim.elf + ls -la benchmarks/polybench/bicg_m2sim.elf + ls -la benchmarks/polybench/jacobi-1d_m2sim.elf + + - name: Run tests + run: | + for TEST in TestPolybenchATAX TestPolybenchBiCG TestPolybenchJacobi1D; do + echo "--- $TEST ---" + go test -v -run "^${TEST}$" -count=1 -timeout 8m ./benchmarks/ 2>&1 | tee -a group1_output.txt || true + done + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: polybench-group-1 + path: group1_output.txt + retention-days: 30 + + polybench-group-2: + name: PolyBench Group 2 (MVT, GEMM) + runs-on: macos-14 + timeout-minutes: 30 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: Verify ELFs + run: | + ls -la benchmarks/polybench/mvt_m2sim.elf + ls -la benchmarks/polybench/gemm_m2sim.elf + + - name: Run tests + run: 
| + for TEST in TestPolybenchMVT TestPolybenchGEMM; do + echo "--- $TEST ---" + go test -v -run "^${TEST}$" -count=1 -timeout 8m ./benchmarks/ 2>&1 | tee -a group2_output.txt || true + done + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: polybench-group-2 + path: group2_output.txt + retention-days: 30 + + polybench-group-3: + name: PolyBench Group 3 (2MM, 3MM) + runs-on: macos-14 + timeout-minutes: 30 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: '1.25' + + - name: Verify ELFs + run: | + ls -la benchmarks/polybench/2mm_m2sim.elf + ls -la benchmarks/polybench/3mm_m2sim.elf + + - name: Run tests + run: | + for TEST in TestPolybench2MM TestPolybench3MM; do + echo "--- $TEST ---" + go test -v -run "^${TEST}$" -count=1 -timeout 8m ./benchmarks/ 2>&1 | tee -a group3_output.txt || true + done + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: polybench-group-3 + path: group3_output.txt + retention-days: 30 + + consolidate: + name: Consolidate PolyBench Results + runs-on: ubuntu-latest + needs: [polybench-group-1, polybench-group-2, polybench-group-3] + if: always() + steps: + - uses: actions/checkout@v4 + + - name: Download results + uses: actions/download-artifact@v4 + with: + path: group-results + + - name: Extract CPI results + run: | + cat group-results/polybench-group-1/group1_output.txt > combined.txt 2>/dev/null || true + cat group-results/polybench-group-2/group2_output.txt >> combined.txt 2>/dev/null || true + cat group-results/polybench-group-3/group3_output.txt >> combined.txt 2>/dev/null || true + + python3 - <<'PYEOF' + import json, re + + results = {} + with open("combined.txt") as f: + for line in f: + if "CPI=" not in line: + continue + match = re.search(r'(polybench_\w+):\s+cycles=(\d+),\s+insts=(\d+),\s+CPI=([\d.]+)', line) + if match: + name = match.group(1).replace("polybench_", "") + if name == "jacobi1d": + name = 
"jacobi-1d" + results[name] = { + "cycles": int(match.group(2)), + "instructions": int(match.group(3)), + "cpi": float(match.group(4)), + } + + output = {"benchmarks_run": len(results), "results": results} + with open("polybench_results.json", "w") as f: + json.dump(output, f, indent=2) + print(json.dumps(output, indent=2)) + PYEOF + + - name: Post summary + if: always() + run: | + echo "## PolyBench Accuracy Results" >> $GITHUB_STEP_SUMMARY + if [ -f polybench_results.json ]; then + python3 -c " + import json + d = json.load(open('polybench_results.json')) + print(f'**Benchmarks measured:** {d[\"benchmarks_run\"]}/7') + if d['results']: + print() + print('| Benchmark | CPI |') + print('|-----------|-----|') + for name, r in sorted(d['results'].items()): + print(f'| {name} | {r[\"cpi\"]:.3f} |') + " >> $GITHUB_STEP_SUMMARY + fi + + - name: Upload consolidated results + uses: actions/upload-artifact@v4 + with: + name: polybench-consolidated + path: polybench_results.json + retention-days: 90 diff --git a/.github/workflows/accuracy-report.yml b/.github/workflows/accuracy-report.yml deleted file mode 100644 index ede2f02..0000000 --- a/.github/workflows/accuracy-report.yml +++ /dev/null @@ -1,156 +0,0 @@ -name: Accuracy Report - -on: - push: - branches: [main] - workflow_dispatch: # Allow manual triggering - -concurrency: - group: accuracy-report-${{ github.ref }} - cancel-in-progress: false - -jobs: - accuracy-report: - name: Generate Accuracy Report - runs-on: macos-14 # Apple Silicon runner - timeout-minutes: 120 # Extended timeout for PolyBench + EmBench simulations - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install Python dependencies - run: | - pip install matplotlib numpy scipy - - - name: Install Ginkgo - run: go install github.com/onsi/ginkgo/v2/ginkgo@latest - 
- - name: Run Accuracy Report - run: | - cd benchmarks/native - echo "Starting accuracy report generation at $(date)" - START_TIME=$(date +%s) - - # Run with timing and enhanced error reporting - if ! python3 accuracy_report.py; then - END_TIME=$(date +%s) - RUNTIME=$((END_TIME - START_TIME)) - echo "❌ Accuracy report failed after ${RUNTIME}s" - echo "Recent system performance metrics:" - echo " - Available memory: $(vm_stat | grep 'free\|active\|inactive' | head -3)" - echo " - CPU load: $(uptime)" - echo "Checking for generated files:" - ls -la accuracy_* || echo "No accuracy files found" - exit 1 - fi - - END_TIME=$(date +%s) - RUNTIME=$((END_TIME - START_TIME)) - echo "✅ Accuracy report completed successfully in ${RUNTIME}s" - - - name: Upload Accuracy Report - uses: actions/upload-artifact@v4 - with: - name: accuracy-report - path: | - benchmarks/native/accuracy_report.md - benchmarks/native/accuracy_figure.png - benchmarks/native/accuracy_results.json - benchmarks/native/accuracy_normalized.pdf - retention-days: 90 - - - name: Post Report Summary - if: always() - run: | - echo "## M2Sim Accuracy Report" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ -f benchmarks/native/accuracy_results.json ]; then - AVG_ERROR=$(python3 -c "import json; d=json.load(open('benchmarks/native/accuracy_results.json')); print(f\"{d['summary']['average_error']*100:.1f}%\")") - MAX_ERROR=$(python3 -c "import json; d=json.load(open('benchmarks/native/accuracy_results.json')); print(f\"{d['summary']['max_error']*100:.1f}%\")") - - echo "- **Average Error:** $AVG_ERROR" >> $GITHUB_STEP_SUMMARY - echo "- **Max Error:** $MAX_ERROR" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "See the uploaded artifacts for the full report and figures." >> $GITHUB_STEP_SUMMARY - else - echo "⚠️ Accuracy report generation failed." 
>> $GITHUB_STEP_SUMMARY - fi - - - name: Commit updated reports to reports branch - if: github.ref == 'refs/heads/main' - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - - # Save generated report files to temp directory BEFORE switching branches - # This avoids "local changes would be overwritten" errors - TEMP_REPORTS=$(mktemp -d) - cp benchmarks/native/accuracy_report.md "$TEMP_REPORTS/" 2>/dev/null || true - cp benchmarks/native/accuracy_figure.png "$TEMP_REPORTS/" 2>/dev/null || true - cp benchmarks/native/accuracy_results.json "$TEMP_REPORTS/" 2>/dev/null || true - cp benchmarks/native/accuracy_normalized.pdf "$TEMP_REPORTS/" 2>/dev/null || true - - DATE=$(date +%Y-%m-%d) - COMMIT_SHA=$(git rev-parse --short HEAD 2>/dev/null || echo "initial") - - # Fetch the reports branch or create it - git fetch origin reports || true - - # Stash any local changes before switching branches - git stash --include-untracked || true - - if git show-ref --verify --quiet refs/remotes/origin/reports; then - git checkout reports -- - git pull origin reports - else - git checkout --orphan reports - git rm -rf . || true - echo "# M2Sim Accuracy Reports" > README.md - echo "" >> README.md - echo "This branch contains historical accuracy reports." 
>> README.md - git add README.md - fi - - # Create dated directory - REPORT_DIR="reports/${DATE}-${COMMIT_SHA}" - mkdir -p "$REPORT_DIR" - - # Copy report files from temp directory - cp "$TEMP_REPORTS/accuracy_report.md" "$REPORT_DIR/" 2>/dev/null || true - cp "$TEMP_REPORTS/accuracy_figure.png" "$REPORT_DIR/" 2>/dev/null || true - cp "$TEMP_REPORTS/accuracy_results.json" "$REPORT_DIR/" 2>/dev/null || true - cp "$TEMP_REPORTS/accuracy_normalized.pdf" "$REPORT_DIR/" 2>/dev/null || true - - # Clean up temp directory - rm -rf "$TEMP_REPORTS" - - # Update index - echo "# M2Sim Accuracy Reports" > README.md - echo "" >> README.md - echo "| Date | Commit | Average Error | Max Error | Report |" >> README.md - echo "|------|--------|---------------|-----------|--------|" >> README.md - - for dir in reports/*/; do - if [ -f "${dir}accuracy_results.json" ]; then - DIRNAME=$(basename "$dir") - AVG=$(python3 -c "import json; d=json.load(open('${dir}accuracy_results.json')); print(f\"{d['summary']['average_error']*100:.1f}%\")" 2>/dev/null || echo "N/A") - MAX=$(python3 -c "import json; d=json.load(open('${dir}accuracy_results.json')); print(f\"{d['summary']['max_error']*100:.1f}%\")" 2>/dev/null || echo "N/A") - echo "| ${DIRNAME%-*} | ${DIRNAME##*-} | $AVG | $MAX | [Report](reports/${DIRNAME}/accuracy_report.md) |" >> README.md - fi - done - - git add -A - git commit -m "Update accuracy report for ${DATE}" || echo "No changes to commit" - git push origin reports || echo "Failed to push (may need permissions)" diff --git a/.github/workflows/calibration.yml b/.github/workflows/calibration.yml new file mode 100644 index 0000000..a9538e3 --- /dev/null +++ b/.github/workflows/calibration.yml @@ -0,0 +1,125 @@ +name: Hardware Calibration + +on: + workflow_dispatch: + inputs: + suite: + description: 'Calibration suite to run' + type: choice + required: true + default: 'all' + options: + - all + - microbench + - polybench + - embench + +jobs: + microbench-calibration: + name: 
Microbenchmark Calibration + if: inputs.suite == 'all' || inputs.suite == 'microbench' + runs-on: macos-14 + timeout-minutes: 60 + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - run: pip install numpy scipy + + - name: Verify ARM64 + run: | + if [ "$(uname -m)" != "arm64" ]; then + echo "ERROR: Requires ARM64 (Apple Silicon)" + exit 1 + fi + + - name: Run memory benchmark calibration + run: | + cd benchmarks/native + python3 linear_calibration.py \ + --benchmarks memorystrided loadheavy storeheavy branchheavy \ + --runs 15 \ + --output memory_calibration_results.json + timeout-minutes: 45 + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: microbench-calibration + path: benchmarks/native/memory_calibration_results.json + retention-days: 90 + + polybench-calibration: + name: PolyBench Calibration + if: inputs.suite == 'all' || inputs.suite == 'polybench' + runs-on: macos-14 + timeout-minutes: 45 + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - run: pip install numpy scipy + + - name: Verify ARM64 + run: | + if [ "$(uname -m)" != "arm64" ]; then + echo "ERROR: Requires ARM64 (Apple Silicon)" + exit 1 + fi + + - name: Run PolyBench calibration + run: | + cd benchmarks/native + python3 polybench_calibration.py \ + --runs 15 \ + --output polybench_calibration_results.json + timeout-minutes: 40 + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: polybench-calibration + path: benchmarks/native/polybench_calibration_results.json + retention-days: 90 + + embench-calibration: + name: EmBench Calibration + if: inputs.suite == 'all' || inputs.suite == 'embench' + runs-on: macos-14 + timeout-minutes: 45 + + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - run: pip install numpy scipy + + - name: Verify ARM64 + run: 
| + if [ "$(uname -m)" != "arm64" ]; then + echo "ERROR: Requires ARM64 (Apple Silicon)" + exit 1 + fi + + - name: Run EmBench calibration + run: | + cd benchmarks/native + python3 embench_calibration.py \ + --runs 15 \ + --output embench_calibration_results.json + timeout-minutes: 40 + + - name: Upload results + if: always() + uses: actions/upload-artifact@v4 + with: + name: embench-calibration + path: benchmarks/native/embench_calibration_results.json + retention-days: 90 diff --git a/.github/workflows/ci-health-monitor.yml b/.github/workflows/ci-health-monitor.yml deleted file mode 100644 index bb1b8ae..0000000 --- a/.github/workflows/ci-health-monitor.yml +++ /dev/null @@ -1,272 +0,0 @@ -name: CI Health Monitor - -on: - schedule: - - cron: '0 */6 * * *' # Every 6 hours - workflow_dispatch: - -jobs: - ci-health-check: - name: CI Health Assessment - runs-on: ubuntu-latest - timeout-minutes: 10 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Analyze recent CI runs - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # Get recent workflow runs for all critical workflows - echo "Analyzing CI health for the last 24 hours..." 
- - python3 - <<'PYEOF' - import json - import subprocess - import sys - from datetime import datetime, timedelta - - def get_recent_runs(workflow_name, hours=24): - """Get recent runs for a specific workflow""" - cmd = ['gh', 'run', 'list', '--workflow', workflow_name, '--json', 'status,conclusion,createdAt,number,name', '--limit', '20'] - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - print(f"Warning: Could not fetch runs for {workflow_name}") - return [] - - runs = json.loads(result.stdout) - cutoff = datetime.now().replace(tzinfo=None) - timedelta(hours=hours) - - recent_runs = [] - for run in runs: - created_at = datetime.fromisoformat(run['createdAt'].replace('Z', '+00:00')).replace(tzinfo=None) - if created_at > cutoff: - recent_runs.append(run) - - return recent_runs - - # Critical workflows to monitor (using display names from workflow YAML) - workflows = [ - 'CI', - 'H5 Accuracy Report', - 'PolyBench Simulation Measurements', - 'CPI Comparison', - 'Matmul Calibration' - ] - - health_report = { - "timestamp": datetime.now().isoformat(), - "period_hours": 24, - "workflows": {}, - "overall_health": "unknown", - "alerts": [] - } - - total_runs = 0 - failed_runs = 0 - timeout_failures = 0 - - for workflow in workflows: - runs = get_recent_runs(workflow) - if not runs: - health_report["workflows"][workflow] = { - "runs": 0, - "success_rate": 0.0, - "status": "no_data" - } - continue - - completed_runs = [r for r in runs if r['status'] == 'completed'] - successful_runs = [r for r in completed_runs if r['conclusion'] == 'success'] - failed_runs_count = len([r for r in completed_runs if r['conclusion'] in ['failure', 'cancelled', 'timed_out']]) - - success_rate = len(successful_runs) / len(completed_runs) if completed_runs else 0.0 - - health_report["workflows"][workflow] = { - "runs": len(runs), - "completed": len(completed_runs), - "successful": len(successful_runs), - "failed": failed_runs_count, - "success_rate": 
success_rate, - "status": "healthy" if success_rate >= 0.8 else "degraded" if success_rate >= 0.5 else "critical" - } - - total_runs += len(completed_runs) - failed_runs += failed_runs_count - - # Check for timeout patterns - timeout_runs = [r for r in completed_runs if r['conclusion'] == 'timed_out'] - if timeout_runs: - timeout_failures += len(timeout_runs) - health_report["alerts"].append(f"{workflow}: {len(timeout_runs)} timeout failures detected") - - # Check for low success rates - if success_rate < 0.7: - health_report["alerts"].append(f"{workflow}: Low success rate ({success_rate:.1%})") - - # Calculate overall health - overall_success_rate = (total_runs - failed_runs) / total_runs if total_runs > 0 else 0.0 - - if overall_success_rate >= 0.9: - health_report["overall_health"] = "healthy" - elif overall_success_rate >= 0.7: - health_report["overall_health"] = "degraded" - else: - health_report["overall_health"] = "critical" - - health_report["summary"] = { - "total_runs": total_runs, - "failed_runs": failed_runs, - "timeout_failures": timeout_failures, - "overall_success_rate": overall_success_rate - } - - # Write health report - with open("ci_health_report.json", "w") as f: - json.dump(health_report, f, indent=2) - - print("=== CI Health Report ===") - print(json.dumps(health_report, indent=2)) - - # Exit with error code if critical issues detected - if health_report["overall_health"] == "critical" or timeout_failures > 3: - print(f"\n❌ CRITICAL: CI health is {health_report['overall_health']}") - if timeout_failures > 3: - print(f"❌ CRITICAL: {timeout_failures} timeout failures detected") - sys.exit(1) - elif health_report["overall_health"] == "degraded": - print(f"\n⚠️ WARNING: CI health is degraded") - else: - print(f"\n✅ CI health is good") - PYEOF - - - name: Check for silent failures - run: | - echo "Checking for silent failure patterns..." 
- - # Check if any critical workflows haven't run recently when they should have - python3 - <<'PYEOF' - import json - import subprocess - from datetime import datetime, timedelta - - def check_workflow_freshness(): - """Check if critical workflows have run recently enough""" - alerts = [] - - # Workflows that should run on every push to main (using display names) - critical_workflows = ['CI', 'H5 Accuracy Report'] - - # Get recent commits to main - result = subprocess.run(['git', 'log', '--since=1 day ago', '--oneline', 'main'], - capture_output=True, text=True) - recent_commits = len(result.stdout.strip().split('\n')) if result.stdout.strip() else 0 - - if recent_commits > 0: - for workflow in critical_workflows: - cmd = ['gh', 'run', 'list', '--workflow', workflow, '--json', 'createdAt', '--limit', '5'] - result = subprocess.run(cmd, capture_output=True, text=True) - - if result.returncode == 0: - runs = json.loads(result.stdout) - if runs: - latest_run = datetime.fromisoformat(runs[0]['createdAt'].replace('Z', '+00:00')).replace(tzinfo=None) - hours_since = (datetime.now().replace(tzinfo=None) - latest_run).total_seconds() / 3600 - - if hours_since > 12: # No runs in 12+ hours despite commits - alerts.append(f"{workflow}: No runs in {hours_since:.1f} hours despite recent commits") - else: - alerts.append(f"{workflow}: No recent runs found") - - return alerts - - silent_failure_alerts = check_workflow_freshness() - - if silent_failure_alerts: - print("🔍 SILENT FAILURE DETECTION:") - for alert in silent_failure_alerts: - print(f" ❌ {alert}") - else: - print("✅ No silent failures detected") - - # Append to health report - try: - with open("ci_health_report.json", "r") as f: - health_report = json.load(f) - - health_report["silent_failures"] = silent_failure_alerts - - with open("ci_health_report.json", "w") as f: - json.dump(health_report, f, indent=2) - except: - pass - PYEOF - - - name: Generate health summary - if: always() - run: | - echo "## CI Health Monitor 
Report" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ -f ci_health_report.json ]; then - python3 -c " - import json - try: - d = json.load(open('ci_health_report.json')) - print(f\"**Overall Health:** {d['overall_health'].title()}\") - print(f\"**Success Rate:** {d['summary']['overall_success_rate']:.1%} ({d['summary']['failed_runs']} failures out of {d['summary']['total_runs']} runs)\") - print(f\"**Timeout Failures:** {d['summary']['timeout_failures']}\") - print() - - print('### Workflow Status') - for workflow, status in d['workflows'].items(): - icon = '✅' if status['status'] == 'healthy' else '⚠️' if status['status'] == 'degraded' else '❌' - print(f'{icon} **{workflow}**: {status[\"success_rate\"]:.1%} success rate ({status[\"successful\"]}/{status[\"completed\"]} runs)') - - if d.get('alerts'): - print() - print('### Alerts') - for alert in d['alerts']: - print(f'⚠️ {alert}') - - if d.get('silent_failures'): - print() - print('### Silent Failure Detection') - for failure in d['silent_failures']: - print(f'🔍 {failure}') - except Exception as e: - print(f'Error processing health report: {e}') - " >> $GITHUB_STEP_SUMMARY - else - echo "❌ Health report generation failed." 
>> $GITHUB_STEP_SUMMARY - fi - - - name: Upload health report - if: always() - uses: actions/upload-artifact@v4 - with: - name: ci-health-report - path: ci_health_report.json - retention-days: 30 - - - name: Comment on infrastructure issues if critical - if: always() - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - if [ -f ci_health_report.json ]; then - HEALTH_STATUS=$(python3 -c "import json; d=json.load(open('ci_health_report.json')); print(d['overall_health'])") - TIMEOUT_FAILURES=$(python3 -c "import json; d=json.load(open('ci_health_report.json')); print(d['summary']['timeout_failures'])") - - if [ "$HEALTH_STATUS" = "critical" ] || [ "$TIMEOUT_FAILURES" -gt 3 ]; then - echo "Critical CI health detected, would create infrastructure issue if not already exists..." - # Note: In production, this would create an issue or comment on existing infrastructure issues - fi - fi \ No newline at end of file diff --git a/.github/workflows/ci-metrics-dashboard.yml b/.github/workflows/ci-metrics-dashboard.yml deleted file mode 100644 index f7fcbd8..0000000 --- a/.github/workflows/ci-metrics-dashboard.yml +++ /dev/null @@ -1,498 +0,0 @@ -name: CI Metrics Dashboard - -on: - schedule: - - cron: '0 8 * * *' # Daily at 8 AM UTC - workflow_dispatch: - -jobs: - generate-ci-dashboard: - name: Generate CI Metrics Dashboard - runs-on: ubuntu-latest - timeout-minutes: 20 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install Python dependencies - run: | - pip install matplotlib numpy pandas requests plotly kaleido - - - name: Generate comprehensive CI metrics - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - python3 - <<'PYEOF' - import json - import subprocess - import pandas as pd - import matplotlib.pyplot as plt - import plotly.graph_objects as go - import plotly.express as px - from plotly.subplots import make_subplots - from datetime 
import datetime, timedelta - import numpy as np - - def get_workflow_runs(workflow_name, days=30): - """Get workflow runs for the past N days""" - cmd = ['gh', 'run', 'list', '--workflow', workflow_name, '--json', - 'status,conclusion,createdAt,updatedAt,runNumber,id,name', '--limit', '100'] - result = subprocess.run(cmd, capture_output=True, text=True) - - if result.returncode != 0: - print(f"Warning: Could not fetch runs for {workflow_name}") - return [] - - runs = json.loads(result.stdout) - cutoff = datetime.now() - timedelta(days=days) - - recent_runs = [] - for run in runs: - created_at = datetime.fromisoformat(run['createdAt'].replace('Z', '+00:00')) - if created_at > cutoff: - recent_runs.append(run) - - return recent_runs - - def calculate_duration_seconds(created_at, updated_at): - """Calculate run duration in seconds""" - try: - start = datetime.fromisoformat(created_at.replace('Z', '+00:00')) - end = datetime.fromisoformat(updated_at.replace('Z', '+00:00')) - return (end - start).total_seconds() - except: - return 0 - - # Workflows to analyze - critical_workflows = [ - 'ci.yml', - 'h5-accuracy-report.yml', - 'polybench-sim.yml', - 'cpi-comparison.yml', - 'matmul-calibration.yml', - 'polybench-segmented.yml', - 'h5-parallel-accuracy.yml' - ] - - # Collect data for all workflows - dashboard_data = { - "generated_at": datetime.now().isoformat(), - "period_days": 30, - "workflows": {}, - "summary": {}, - "trends": {} - } - - all_runs_data = [] - - for workflow in critical_workflows: - print(f"Analyzing {workflow}...") - runs = get_workflow_runs(workflow, days=30) - - if not runs: - dashboard_data["workflows"][workflow] = { - "total_runs": 0, - "success_rate": 0.0, - "avg_duration": 0.0, - "failure_rate": 0.0 - } - continue - - # Process runs data - completed_runs = [r for r in runs if r['status'] == 'completed'] - successful_runs = [r for r in completed_runs if r['conclusion'] == 'success'] - failed_runs = [r for r in completed_runs if r['conclusion'] in 
['failure', 'cancelled', 'timed_out']] - timeout_runs = [r for r in completed_runs if r['conclusion'] == 'timed_out'] - - # Calculate metrics - success_rate = len(successful_runs) / len(completed_runs) if completed_runs else 0.0 - failure_rate = len(failed_runs) / len(completed_runs) if completed_runs else 0.0 - - # Duration analysis - durations = [calculate_duration_seconds(r['createdAt'], r['updatedAt']) for r in completed_runs] - avg_duration = np.mean(durations) if durations else 0.0 - - dashboard_data["workflows"][workflow] = { - "total_runs": len(runs), - "completed_runs": len(completed_runs), - "successful_runs": len(successful_runs), - "failed_runs": len(failed_runs), - "timeout_runs": len(timeout_runs), - "success_rate": success_rate, - "failure_rate": failure_rate, - "avg_duration_seconds": avg_duration, - "avg_duration_minutes": avg_duration / 60.0 - } - - # Add to aggregate data for trending - for run in completed_runs: - duration = calculate_duration_seconds(run['createdAt'], run['updatedAt']) - all_runs_data.append({ - 'workflow': workflow, - 'date': datetime.fromisoformat(run['createdAt'].replace('Z', '+00:00')).date(), - 'status': run['conclusion'], - 'duration': duration, - 'success': 1 if run['conclusion'] == 'success' else 0 - }) - - # Calculate summary metrics - total_runs = sum([w['total_runs'] for w in dashboard_data["workflows"].values()]) - total_completed = sum([w['completed_runs'] for w in dashboard_data["workflows"].values()]) - total_successful = sum([w['successful_runs'] for w in dashboard_data["workflows"].values()]) - total_failed = sum([w['failed_runs'] for w in dashboard_data["workflows"].values()]) - - dashboard_data["summary"] = { - "total_runs": total_runs, - "total_completed": total_completed, - "total_successful": total_successful, - "total_failed": total_failed, - "overall_success_rate": total_successful / total_completed if total_completed > 0 else 0.0, - "overall_failure_rate": total_failed / total_completed if 
total_completed > 0 else 0.0, - "avg_daily_runs": total_runs / 30.0 - } - - # Generate visualizations - print("Generating CI metrics dashboard...") - - # 1. Success Rate by Workflow (Bar chart) - workflows_list = list(dashboard_data["workflows"].keys()) - success_rates = [dashboard_data["workflows"][w]["success_rate"] * 100 for w in workflows_list] - - fig = make_subplots( - rows=2, cols=2, - subplot_titles=('Success Rate by Workflow', 'Average Duration by Workflow', - 'Daily Success Rate Trend', 'Workflow Run Volume'), - specs=[[{"type": "bar"}, {"type": "bar"}], - [{"type": "scatter"}, {"type": "bar"}]] - ) - - # Success rate chart - fig.add_trace( - go.Bar(x=workflows_list, y=success_rates, name="Success Rate (%)", - marker_color=['green' if sr >= 90 else 'orange' if sr >= 70 else 'red' for sr in success_rates]), - row=1, col=1 - ) - - # Duration chart - durations = [dashboard_data["workflows"][w]["avg_duration_minutes"] for w in workflows_list] - fig.add_trace( - go.Bar(x=workflows_list, y=durations, name="Avg Duration (min)", - marker_color='blue'), - row=1, col=2 - ) - - # Daily trend analysis - if all_runs_data: - df = pd.DataFrame(all_runs_data) - daily_success = df.groupby('date')['success'].mean() * 100 - dates = daily_success.index - success_trend = daily_success.values - - fig.add_trace( - go.Scatter(x=dates, y=success_trend, mode='lines+markers', - name="Daily Success Rate (%)", line_color='green'), - row=2, col=1 - ) - - # Volume chart - daily_volume = df.groupby('date').size() - fig.add_trace( - go.Bar(x=daily_volume.index, y=daily_volume.values, - name="Daily Runs", marker_color='purple'), - row=2, col=2 - ) - - fig.update_layout( - title=f"CI Health Dashboard - {datetime.now().strftime('%Y-%m-%d')}", - showlegend=False, - height=800 - ) - - # Save dashboard - fig.write_html("ci_dashboard.html") - fig.write_image("ci_dashboard.png", width=1200, height=800) - - # Create detailed metrics table - with open("ci_metrics_table.html", "w") as f: - 
f.write(""" - CI Metrics Table - -

CI Metrics Detailed Table

- - - - - - - - - """) - - for workflow, data in dashboard_data["workflows"].items(): - success_rate = data["success_rate"] * 100 - status_class = "success" if success_rate >= 90 else "warning" if success_rate >= 70 else "danger" - health = "Healthy" if success_rate >= 90 else "Degraded" if success_rate >= 70 else "Critical" - - f.write(f""" - - - - - - - - """) - - f.write("
WorkflowTotal RunsSuccess RateAvg DurationFailuresTimeoutsHealth Status
{workflow}{data["total_runs"]}{success_rate:.1f}%{data["avg_duration_minutes"]:.1f} min{data["failed_runs"]}{data["timeout_runs"]}{health}
") - - # Save data - with open("ci_dashboard_data.json", "w") as f: - json.dump(dashboard_data, f, indent=2) - - print("Dashboard generation complete!") - print(f"Overall CI Health: {dashboard_data['summary']['overall_success_rate']*100:.1f}% success rate") - PYEOF - - - name: Generate CI health trends - run: | - python3 - <<'PYEOF' - import json - import matplotlib.pyplot as plt - from datetime import datetime - - # Load dashboard data - with open("ci_dashboard_data.json", "r") as f: - data = json.load(f) - - # Generate simple matplotlib charts as backup - workflows = list(data["workflows"].keys()) - success_rates = [data["workflows"][w]["success_rate"] * 100 for w in workflows] - - # Create figure with subplots - fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10)) - - # Success rates - colors = ['green' if sr >= 90 else 'orange' if sr >= 70 else 'red' for sr in success_rates] - ax1.bar(range(len(workflows)), success_rates, color=colors) - ax1.set_title('Success Rate by Workflow (%)') - ax1.set_xticks(range(len(workflows))) - ax1.set_xticklabels([w.replace('.yml', '') for w in workflows], rotation=45) - ax1.set_ylabel('Success Rate (%)') - ax1.axhline(y=90, color='green', linestyle='--', alpha=0.7, label='Target (90%)') - ax1.legend() - - # Duration analysis - durations = [data["workflows"][w]["avg_duration_minutes"] for w in workflows] - ax2.bar(range(len(workflows)), durations, color='blue') - ax2.set_title('Average Duration by Workflow (minutes)') - ax2.set_xticks(range(len(workflows))) - ax2.set_xticklabels([w.replace('.yml', '') for w in workflows], rotation=45) - ax2.set_ylabel('Duration (minutes)') - - # Failure counts - failed_counts = [data["workflows"][w]["failed_runs"] for w in workflows] - timeout_counts = [data["workflows"][w]["timeout_runs"] for w in workflows] - - x = range(len(workflows)) - ax3.bar(x, failed_counts, label='Failed', color='red', alpha=0.7) - ax3.bar(x, timeout_counts, bottom=failed_counts, label='Timeouts', 
color='darkred', alpha=0.7) - ax3.set_title('Failure Breakdown by Workflow') - ax3.set_xticks(x) - ax3.set_xticklabels([w.replace('.yml', '') for w in workflows], rotation=45) - ax3.set_ylabel('Failure Count') - ax3.legend() - - # Summary metrics - summary_labels = ['Total Runs', 'Successful', 'Failed'] - summary_values = [ - data["summary"]["total_runs"], - data["summary"]["total_successful"], - data["summary"]["total_failed"] - ] - ax4.pie(summary_values[1:], labels=['Successful', 'Failed'], autopct='%1.1f%%', - colors=['green', 'red'], startangle=90) - ax4.set_title(f'Overall CI Health\n(30-day period)') - - plt.tight_layout() - plt.savefig('ci_metrics_charts.png', dpi=300, bbox_inches='tight') - plt.savefig('ci_metrics_charts.pdf', bbox_inches='tight') - - print("Additional charts generated successfully!") - PYEOF - - - name: Create CI performance report - run: | - cat > ci_performance_report.md << 'MEOF' - # CI Infrastructure Performance Report - - **Generated:** $(date -u) - **Period:** Last 30 days - **Dashboard Version:** Phase 3 CI Hardening Implementation - - ## Executive Summary - - MEOF - - # Add dynamic content to report - python3 - <<'PYEOF' - import json - - with open("ci_dashboard_data.json", "r") as f: - data = json.load(f) - - summary = data["summary"] - - with open("ci_performance_report.md", "a") as f: - f.write(f""" - - **Overall Success Rate:** {summary['overall_success_rate']*100:.1f}% - - **Total Runs Analyzed:** {summary['total_runs']} - - **Average Daily Runs:** {summary['avg_daily_runs']:.1f} - - **Total Failures:** {summary['total_failed']} - - ## Workflow Health Status - - | Workflow | Success Rate | Avg Duration | Status | - |----------|-------------|--------------|---------| - """) - - for workflow, metrics in data["workflows"].items(): - success_rate = metrics["success_rate"] * 100 - duration = metrics["avg_duration_minutes"] - - if success_rate >= 90: - status = "✅ Healthy" - elif success_rate >= 70: - status = "⚠️ Degraded" - else: 
- status = "❌ Critical" - - f.write(f"| {workflow} | {success_rate:.1f}% | {duration:.1f}m | {status} |\n") - - f.write(f""" - ## CI Hardening Implementation Status - - ### ✅ Phase 1: Immediate Stability - - Extended timeouts implemented across all workflows - - H5 Accuracy Report: 60m → 90m timeout - - PolyBench Simulation: 20m → 35m timeout - - All core CI jobs have explicit timeout configurations - - ### ✅ Phase 2: Performance Optimization - - Parallel execution framework deployed - - Test segmentation architecture implemented - - PolyBench tests split into 3 groups for reliability - - Multi-runner parallel H5 accuracy testing available - - ### ✅ Phase 3: Long-term Resilience - - CI health monitoring dashboard operational - - Daily metrics collection and trending - - Automated performance tracking - - Silent failure detection mechanisms - - ## Recommendations - - """) - - # Add recommendations based on data - critical_workflows = [w for w, m in data["workflows"].items() if m["success_rate"] < 0.7] - degraded_workflows = [w for w, m in data["workflows"].items() if 0.7 <= m["success_rate"] < 0.9] - - if critical_workflows: - f.write("### ⚠️ Critical Issues\n") - for workflow in critical_workflows: - f.write(f"- **{workflow}**: Success rate below 70%, requires immediate attention\n") - f.write("\n") - - if degraded_workflows: - f.write("### 📈 Performance Improvements\n") - for workflow in degraded_workflows: - f.write(f"- **{workflow}**: Success rate below 90%, consider optimization\n") - f.write("\n") - - if not critical_workflows and len(degraded_workflows) <= 1: - f.write("### ✅ System Health\n") - f.write("CI infrastructure is performing well with no critical issues detected.\n") - f.write("Continue monitoring and consider proactive optimizations.\n\n") - - f.write(""" - ## Dashboard Artifacts - - - **Interactive Dashboard:** `ci_dashboard.html` - - **Metrics Charts:** `ci_metrics_charts.png` - - **Performance Data:** `ci_dashboard_data.json` - - **Detailed 
Table:** `ci_metrics_table.html` - - --- - *Report generated by CI Metrics Dashboard workflow - Phase 3 CI Infrastructure Hardening* - """) - - print("Performance report generated successfully!") - PYEOF - - - name: Upload comprehensive dashboard artifacts - uses: actions/upload-artifact@v4 - with: - name: ci-dashboard-complete - path: | - ci_dashboard.html - ci_dashboard.png - ci_metrics_charts.png - ci_metrics_charts.pdf - ci_metrics_table.html - ci_dashboard_data.json - ci_performance_report.md - retention-days: 90 - - - name: Post dashboard summary - if: always() - run: | - echo "## CI Metrics Dashboard Generated" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ -f ci_dashboard_data.json ]; then - python3 -c " - import json - d = json.load(open('ci_dashboard_data.json')) - s = d['summary'] - print(f\"**Overall CI Health:** {s['overall_success_rate']*100:.1f}% success rate\") - print(f\"**Total Runs (30d):** {s['total_runs']}\") - print(f\"**Average Daily Runs:** {s['avg_daily_runs']:.1f}\") - print(f\"**Total Failures:** {s['total_failed']}\") - print() - - critical = [w for w, m in d['workflows'].items() if m['success_rate'] < 0.7] - degraded = [w for w, m in d['workflows'].items() if 0.7 <= m['success_rate'] < 0.9] - - if critical: - print('### ⚠️ Critical Workflows (< 70%):') - for w in critical: - print(f'- {w}: {d[\"workflows\"][w][\"success_rate\"]*100:.1f}%') - - if degraded: - print('### 📊 Degraded Workflows (70-90%):') - for w in degraded: - print(f'- {w}: {d[\"workflows\"][w][\"success_rate\"]*100:.1f}%') - - if not critical and len(degraded) <= 1: - print('### ✅ All workflows performing well!') - - print() - print('📊 [View Interactive Dashboard](artifacts/ci-dashboard-complete/ci_dashboard.html)') - print('📈 [Download Performance Report](artifacts/ci-dashboard-complete/ci_performance_report.md)') - " >> $GITHUB_STEP_SUMMARY - else - echo "❌ Dashboard generation failed." 
>> $GITHUB_STEP_SUMMARY - fi \ No newline at end of file diff --git a/.github/workflows/cpi-comparison.yml b/.github/workflows/cpi-comparison.yml deleted file mode 100644 index 0bc6937..0000000 --- a/.github/workflows/cpi-comparison.yml +++ /dev/null @@ -1,78 +0,0 @@ -name: CPI Comparison - -on: - push: - branches: [main] - workflow_dispatch: - -jobs: - cpi-comparison: - name: Fast Timing vs Full Pipeline CPI - runs-on: ubuntu-latest - timeout-minutes: 30 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Run CPI comparison - run: | - go test -v -run TestCPIComparison -timeout 20m ./benchmarks/ 2>&1 | tee cpi_comparison_output.txt - - - name: Post summary - if: always() - run: | - echo "## CPI Comparison: Fast Timing vs Full Pipeline" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ -f benchmarks/cpi_three_way_results.json ]; then - echo "### Three-Way Comparison (M2 Hardware vs Full Pipeline vs Fast Timing)" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - python3 -c " - import json - with open('benchmarks/cpi_three_way_results.json') as f: - data = json.load(f) - print(f\"{'Benchmark':<15} {'M2 CPI':>10} {'Full CPI':>10} {'Fast CPI':>10} {'Full Err%':>12} {'Fast Err%':>12}\") - print('-' * 72) - for r in data: - print(f\"{r['name']:<15} {r['m2_cpi']:>10.3f} {r['full_pipeline_cpi']:>10.3f} {r['fast_timing_cpi']:>10.3f} {r['full_error_pct']:>11.1f}% {r['fast_error_pct']:>11.1f}%\") - " >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - fi - - if [ -f benchmarks/cpi_comparison_results.json ]; then - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Full Benchmark Comparison" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - python3 -c " - import json - with open('benchmarks/cpi_comparison_results.json') as f: - data = json.load(f) - 
print(f\"{'Benchmark':<30} {'Full CPI':>10} {'Fast CPI':>10} {'Divergence':>12}\") - print('-' * 65) - for r in data: - print(f\"{r['name']:<30} {r['full_pipeline_cpi']:>10.3f} {r['fast_timing_cpi']:>10.3f} {r['divergence_pct']:>11.1f}%\") - total_abs = sum(abs(r['divergence_pct']) for r in data) - avg = total_abs / len(data) if data else 0 - print(f\"\nAverage |divergence|: {avg:.1f}%\") - " >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - fi - - - name: Upload results - if: always() - uses: actions/upload-artifact@v4 - with: - name: cpi-comparison - path: | - benchmarks/cpi_comparison_results.json - benchmarks/cpi_three_way_results.json - cpi_comparison_output.txt - retention-days: 90 diff --git a/.github/workflows/diana-statistical-validation.yml b/.github/workflows/diana-statistical-validation.yml deleted file mode 100644 index 2ec8968..0000000 --- a/.github/workflows/diana-statistical-validation.yml +++ /dev/null @@ -1,363 +0,0 @@ -name: Statistical Validation Framework (Diana) - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - schedule: - # Run weekly comprehensive validation - - cron: '0 10 * * 1' # Monday 10 AM UTC - workflow_dispatch: - inputs: - benchmark: - description: 'Specific benchmark to validate (gemm, atax, gesummv) or "full-suite"' - required: true - default: 'gemm' - validation_mode: - description: 'Validation mode' - required: true - default: 'standard' - type: choice - options: - - 'standard' - - 'comprehensive' - - 'regression-only' - -jobs: - statistical-validation: - name: Diana's Statistical Validation Framework - runs-on: ubuntu-latest - timeout-minutes: 45 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 50 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install Python dependencies - run: | - python3 -m pip install --upgrade pip - 
pip install numpy scipy matplotlib - # Verify scientific computing dependencies - python3 -c "import numpy as np; import scipy.stats as stats; print('Scientific dependencies ready')" - - - name: Build M2Sim tools - run: | - echo "Building M2Sim profile tool for validation..." - go build -o profile-tool ./cmd/profile - chmod +x profile-tool - ls -la profile-tool - - - name: Prepare PolyBench benchmarks - run: | - echo "Preparing PolyBench benchmarks for statistical validation..." - - # Ensure PolyBench directory exists and is accessible - if [ ! -d "benchmarks/polybench" ]; then - echo "Warning: PolyBench directory not found, creating minimal test structure" - mkdir -p benchmarks/polybench/gemm - mkdir -p benchmarks/polybench/atax - mkdir -p benchmarks/polybench/gesummv - fi - - # Check for available benchmarks - echo "Available benchmarks:" - find benchmarks/polybench -maxdepth 1 -type d | head -10 - - - name: Create validation results directory - run: | - mkdir -p validation_results - mkdir -p validation_results/plots - mkdir -p validation_results/reports - - - name: Run Statistical Validation Framework - id: validation - run: | - echo "Starting Diana's Statistical Validation Framework" - - # Determine validation parameters - BENCHMARK="${{ github.event.inputs.benchmark || 'gemm' }}" - VALIDATION_MODE="${{ github.event.inputs.validation_mode || 'standard' }}" - - echo "Validation parameters:" - echo " Benchmark: $BENCHMARK" - echo " Mode: $VALIDATION_MODE" - echo " Trigger: ${{ github.event_name }}" - - # Create validation execution script - cat > run_validation.py << 'EOF' - #!/usr/bin/env python3 - import sys - import subprocess - import os - from pathlib import Path - - def run_validation(): - try: - benchmark = os.environ.get('BENCHMARK', 'gemm') - validation_mode = os.environ.get('VALIDATION_MODE', 'standard') - - # Basic validation check for quick feedback - if validation_mode == 'regression-only': - cmd = [ - 'python3', 
'scripts/performance_optimization_validation.py' - ] - print("Running performance regression validation only...") - result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) - - if result.returncode == 0: - print("✅ Performance regression validation passed") - return 0 - else: - print("❌ Performance regression validation failed") - print("STDOUT:", result.stdout) - print("STDERR:", result.stderr) - return 1 - - # Full statistical validation framework - cmd = [ - 'python3', 'scripts/diana_comprehensive_qa_validation.py' - ] - - if benchmark == 'full-suite': - cmd.extend(['--full-suite']) - else: - cmd.extend(['--benchmark', benchmark]) - - cmd.extend(['--output', 'validation_results']) - - print(f"Running: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800) # 30 min timeout - - # Output results for CI visibility - if result.stdout: - print("=== VALIDATION OUTPUT ===") - print(result.stdout) - - if result.stderr: - print("=== VALIDATION WARNINGS/ERRORS ===") - print(result.stderr) - - # Check for validation results - results_dir = Path('validation_results') - if results_dir.exists(): - reports = list(results_dir.glob('*.md')) - print(f"Generated {len(reports)} validation reports:") - for report in reports: - print(f" - {report}") - - return result.returncode - - except subprocess.TimeoutExpired: - print("❌ Validation timed out - framework may need optimization") - return 124 - except Exception as e: - print(f"❌ Validation failed with exception: {e}") - return 1 - - if __name__ == '__main__': - exit(run_validation()) - EOF - - # Execute validation - export BENCHMARK="$BENCHMARK" - export VALIDATION_MODE="$VALIDATION_MODE" - - python3 run_validation.py > validation_output.txt 2>&1 - VALIDATION_EXIT_CODE=$? 
- - # Always output the results for CI visibility - cat validation_output.txt - - # Set outputs for subsequent steps - echo "validation_exit_code=$VALIDATION_EXIT_CODE" >> $GITHUB_OUTPUT - - # Extract key validation metrics if available - if [ -f "validation_results/diana_qa_suite_summary.md" ]; then - echo "suite_summary_available=true" >> $GITHUB_OUTPUT - elif ls validation_results/diana_qa_*.md 2>/dev/null; then - echo "individual_reports_available=true" >> $GITHUB_OUTPUT - fi - - exit $VALIDATION_EXIT_CODE - - - name: Analyze validation results - if: always() - run: | - echo "Analyzing Diana's QA validation results..." - - # Generate validation summary - cat > validation_results/VALIDATION_SUMMARY.md << 'HEADER' - # Diana's Statistical Validation Summary - - **Date:** $(date -u) - **Commit:** ${{ github.sha }} - **Branch:** ${{ github.ref_name }} - **Trigger:** ${{ github.event_name }} - **Validation Framework:** Issue #486 - Statistical Validation for Performance Enhancement - - ## Validation Framework Overview - - Diana's comprehensive QA framework validates Alex's Performance Optimization Enhancement (Issue #481) with: - - **R² >95% correlation analysis** for calibration parameter generalization - - **Cross-scale accuracy verification** (64³ → 1024³ progressive scaling) - - **Development velocity validation** (≥3x improvement target) - - **Performance regression monitoring** integration with Maya's optimizations - - ## Results Summary - - HEADER - - # Add validation results - if [ -f "validation_results/diana_qa_suite_summary.md" ]; then - echo "### Suite Validation Results" >> validation_results/VALIDATION_SUMMARY.md - echo "" >> validation_results/VALIDATION_SUMMARY.md - # Extract key metrics from suite summary - grep -E "PASSED|WARNING|FAILED|R² Correlation|Max Error|Velocity" validation_results/diana_qa_suite_summary.md >> validation_results/VALIDATION_SUMMARY.md || true - fi - - # List individual validation reports - echo "" >> 
validation_results/VALIDATION_SUMMARY.md - echo "### Generated Reports" >> validation_results/VALIDATION_SUMMARY.md - echo "" >> validation_results/VALIDATION_SUMMARY.md - - for report in validation_results/*.md; do - if [ -f "$report" ]; then - report_name=$(basename "$report") - echo "- \`$report_name\`" >> validation_results/VALIDATION_SUMMARY.md - fi - done - - # Add validation status - echo "" >> validation_results/VALIDATION_SUMMARY.md - echo "## CI Integration Status" >> validation_results/VALIDATION_SUMMARY.md - echo "" >> validation_results/VALIDATION_SUMMARY.md - - if [ "${{ steps.validation.outputs.validation_exit_code }}" = "0" ]; then - echo "✅ **VALIDATION PASSED** - All QA criteria satisfied" >> validation_results/VALIDATION_SUMMARY.md - else - echo "❌ **VALIDATION FAILED** - QA criteria not met" >> validation_results/VALIDATION_SUMMARY.md - fi - - echo "" >> validation_results/VALIDATION_SUMMARY.md - echo "### Integration with Existing CI:" >> validation_results/VALIDATION_SUMMARY.md - echo "- **Accuracy Report:** Validated against current baseline" >> validation_results/VALIDATION_SUMMARY.md - echo "- **Performance Regression:** Monitored for Maya's optimizations" >> validation_results/VALIDATION_SUMMARY.md - echo "- **CPI Comparison:** Statistical correlation with hardware baselines" >> validation_results/VALIDATION_SUMMARY.md - echo "- **Matmul Calibration:** Cross-scale accuracy verification" >> validation_results/VALIDATION_SUMMARY.md - - - name: Upload validation artifacts - if: always() - uses: actions/upload-artifact@v4 - with: - name: diana-statistical-validation-${{ github.sha }} - path: | - validation_results/ - validation_output.txt - *.prof - retention-days: 30 - - - name: Comment on PR (if applicable) - if: github.event_name == 'pull_request' - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - const path = 'validation_results/VALIDATION_SUMMARY.md'; - - if (fs.existsSync(path)) { - const summary = 
fs.readFileSync(path, 'utf8'); - const exitCode = '${{ steps.validation.outputs.validation_exit_code }}'; - - const status = exitCode === '0' ? '✅ PASSED' : '❌ FAILED'; - const icon = exitCode === '0' ? '🔬' : '⚠️'; - - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: `## ${icon} Diana's Statistical Validation Results - ${status}\n\n${summary}` - }); - } - - - name: Check validation status - run: | - if [ "${{ steps.validation.outputs.validation_exit_code }}" != "0" ]; then - echo "❌ Statistical validation failed - critical QA requirements not met" - echo " Review validation reports for specific failures:" - echo " - R² correlation analysis (target: ≥95%)" - echo " - Cross-scale accuracy verification (target: ≤20% error)" - echo " - Development velocity validation (target: ≥3x improvement)" - echo " - Performance regression monitoring" - exit 1 - else - echo "✅ Statistical validation passed - QA framework validated successfully" - echo " Diana's comprehensive validation confirms:" - echo " - Alex's statistical framework meets scientific rigor standards" - echo " - Maya's performance optimizations preserve accuracy" - echo " - Development velocity improvements quantified and verified" - echo " - Performance regression monitoring operational" - fi - - integration-validation: - name: QA Integration with Existing CI - runs-on: ubuntu-latest - needs: statistical-validation - if: github.event_name == 'pull_request' - timeout-minutes: 10 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Validate CI integration - run: | - echo "Validating Diana's QA framework integration with existing CI workflows..." 
- - # Check for critical CI workflow files - workflows=( - ".github/workflows/accuracy-report.yml" - ".github/workflows/performance-regression-monitoring.yml" - ".github/workflows/cpi-comparison.yml" - ".github/workflows/matmul-calibration.yml" - ) - - echo "Verifying integration points:" - for workflow in "${workflows[@]}"; do - if [ -f "$workflow" ]; then - echo "✅ $workflow - Available for integration" - else - echo "⚠️ $workflow - Not found" - fi - done - - # Verify statistical validation script exists - if [ -f "scripts/diana_comprehensive_qa_validation.py" ]; then - echo "✅ Diana's QA validation framework - Deployed" - else - echo "❌ Diana's QA validation framework - Missing" - exit 1 - fi - - # Verify Alex's statistical framework integration - if [ -f "scripts/incremental_testing_validation.py" ]; then - echo "✅ Alex's statistical framework - Available for integration" - else - echo "⚠️ Alex's statistical framework - Not found" - fi - - echo "" - echo "🔬 QA Integration Status: OPERATIONAL" - echo " Diana's statistical validation framework successfully integrated with existing CI infrastructure" \ No newline at end of file diff --git a/.github/workflows/embench-calibration.yml b/.github/workflows/embench-calibration.yml deleted file mode 100644 index 61c9a1e..0000000 --- a/.github/workflows/embench-calibration.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: EmBench Hardware Calibration - -on: - workflow_dispatch: - -jobs: - calibrate: - name: EmBench Linear Regression Calibration on Apple Silicon - runs-on: macos-14 - timeout-minutes: 45 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install Python dependencies - run: pip install numpy scipy - - - name: Verify ARM64 architecture - run: | - ARCH=$(uname -m) - echo "Architecture: $ARCH" - if [ "$ARCH" != "arm64" ]; then - echo "ERROR: This workflow requires ARM64 (Apple Silicon)" - exit 1 - fi - - - 
name: Run EmBench calibration - run: | - cd benchmarks/native - python3 embench_calibration.py \ - --runs 15 \ - --output embench_calibration_results.json - timeout-minutes: 40 - - - name: Display results - if: always() - run: | - echo "## EmBench Calibration Results" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - if [ -f benchmarks/native/embench_calibration_results.json ]; then - echo '```json' >> $GITHUB_STEP_SUMMARY - cat benchmarks/native/embench_calibration_results.json >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - else - echo "Calibration results not found." >> $GITHUB_STEP_SUMMARY - fi - - - name: Upload calibration results - if: always() - uses: actions/upload-artifact@v4 - with: - name: embench-calibration-results - path: benchmarks/native/embench_calibration_results.json - retention-days: 90 - - - name: Clean up build artifacts - if: always() - run: rm -rf benchmarks/embench-native-build diff --git a/.github/workflows/h4-multicore-accuracy.yml b/.github/workflows/h4-multicore-accuracy.yml deleted file mode 100644 index 72f9560..0000000 --- a/.github/workflows/h4-multicore-accuracy.yml +++ /dev/null @@ -1,352 +0,0 @@ -name: H4 Multi-Core Accuracy Validation - -on: - workflow_dispatch: # Allow manual triggering for H4 development - inputs: - core_count: - description: 'Core count for validation (2, 4, or 8)' - required: false - default: '2' - type: choice - options: - - '2' - - '4' - - '8' - validation_mode: - description: 'Validation mode' - required: false - default: 'full' - type: choice - options: - - 'quick' # 2-core validation only - - 'full' # Comprehensive multi-core analysis - - 'benchmark' # Benchmark development validation - push: - branches: [main] - paths: - - 'scripts/h4_multicore_analysis.py' - - 'scripts/h4_2core_validation.py' - - 'benchmarks/multicore/**' - - 'timing/multicore/**' - - 'docs/h4_multicore_statistical_methodology.md' - pull_request: - paths: - - 'scripts/h4_multicore_analysis.py' - - 
'scripts/h4_2core_validation.py' - - 'benchmarks/multicore/**' - -jobs: - h4-2core-validation: - name: H4 2-Core Framework Validation - runs-on: macos-14 # Apple Silicon runner for M2 hardware baseline compatibility - timeout-minutes: 90 # Extended timeout for multi-core benchmark compilation and execution - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install Python dependencies for multi-core analysis - run: | - pip install numpy scipy matplotlib pandas sqlite3 - pip install scikit-learn # For enhanced statistical modeling - - - name: Install development tools - run: | - # OpenMP support for multi-core benchmark compilation - brew install libomp - export LDFLAGS="-L$(brew --prefix libomp)/lib" - export CPPFLAGS="-I$(brew --prefix libomp)/include" - - - name: Verify M2 hardware environment - run: | - echo "=== M2 Hardware Validation ===" - sysctl hw.ncpu - sysctl hw.memsize - sysctl machdep.cpu.brand_string - echo "" - echo "OpenMP verification:" - echo '#include ' | gcc -fopenmp -E - >/dev/null 2>&1 && echo "✅ OpenMP available" || echo "❌ OpenMP not available" - - - name: Build M2Sim with multi-core support verification - run: | - echo "=== M2Sim Build Verification ===" - go build ./cmd/m2sim/main.go - echo "✅ M2Sim builds successfully" - - - name: Create and compile 2-core benchmarks - run: | - echo "=== H4 2-Core Benchmark Setup ===" - cd scripts - python3 h4_2core_validation.py create-benchmarks - - echo "Compiling benchmarks with OpenMP support..." 
- cd ../benchmarks/multicore - export CC=gcc - export LDFLAGS="-L$(brew --prefix libomp)/lib" - export CPPFLAGS="-I$(brew --prefix libomp)/include" - make all || echo "⚠️ Some benchmarks failed to compile" - - echo "Available benchmarks:" - ls -la cache_coherence_intensive memory_bandwidth_stress compute_intensive_parallel 2>/dev/null || echo "No benchmarks compiled successfully" - - - name: Run H4 2-core validation framework - run: | - echo "=== H4 2-Core Validation Execution ===" - cd scripts - python3 h4_2core_validation.py validate || echo "⚠️ Validation completed with issues" - - - name: Run H4 multi-core analysis (if validation passes) - run: | - echo "=== H4 Multi-Core Analysis Framework ===" - cd scripts - if [ -f "../benchmarks/multicore/cache_coherence_intensive" ]; then - echo "Running multi-core analysis on available benchmarks..." - python3 h4_multicore_analysis.py 2core-validation || echo "⚠️ Multi-core analysis completed with issues" - else - echo "⚠️ No compiled benchmarks available for analysis" - fi - - - name: Generate H4 accuracy report - run: | - echo "=== H4 Accuracy Report Generation ===" - cd scripts - python3 h4_multicore_analysis.py report || echo "⚠️ Report generation completed with issues" - - - name: Collect H4 validation artifacts - run: | - echo "=== Collecting H4 Artifacts ===" - mkdir -p h4_artifacts - - # Copy validation reports - find . -name "*h4*validation*report*.json" -exec cp {} h4_artifacts/ \; 2>/dev/null || echo "No validation reports found" - find . -name "*h4*multicore*report*.json" -exec cp {} h4_artifacts/ \; 2>/dev/null || echo "No multicore reports found" - - # Copy statistical model database - find . 
-name "h4_multicore_results.db" -exec cp {} h4_artifacts/ \; 2>/dev/null || echo "No database found" - - # Copy benchmark compilation logs - find benchmarks/multicore -name "*.log" -exec cp {} h4_artifacts/ \; 2>/dev/null || echo "No compilation logs found" - - # Create summary file - echo "H4 Multi-Core Accuracy Validation Artifacts" > h4_artifacts/README.txt - echo "Generated: $(date)" >> h4_artifacts/README.txt - echo "Commit: $GITHUB_SHA" >> h4_artifacts/README.txt - ls -la h4_artifacts/ - - - name: Upload H4 validation artifacts - uses: actions/upload-artifact@v4 - if: always() - with: - name: h4-multicore-validation-artifacts - path: | - h4_artifacts/ - reports/*h4*multicore*.json - reports/*h4*multicore*.md - docs/h4_multicore_statistical_methodology.md - scripts/h4_multicore_analysis.py - scripts/h4_2core_validation.py - benchmarks/multicore/README.md - retention-days: 90 - - - name: Parse H4 validation results for summary - id: h4_results - if: always() - run: | - echo "=== Parsing H4 Results ===" - - # Find most recent validation report - VALIDATION_REPORT=$(find . -name "*h4*validation*report*.json" -type f | head -1) - MULTICORE_REPORT=$(find . 
-name "*h4*multicore*accuracy*report*.json" -type f | head -1) - - if [ -f "$VALIDATION_REPORT" ]; then - echo "Found validation report: $VALIDATION_REPORT" - - SUCCESSFUL_BENCHMARKS=$(python3 -c "import json; d=json.load(open('$VALIDATION_REPORT')); print(d['summary']['successful_validations'])" 2>/dev/null || echo "0") - TOTAL_BENCHMARKS=$(python3 -c "import json; d=json.load(open('$VALIDATION_REPORT')); print(d['summary']['total_benchmarks'])" 2>/dev/null || echo "0") - - echo "successful_benchmarks=$SUCCESSFUL_BENCHMARKS" >> $GITHUB_OUTPUT - echo "total_benchmarks=$TOTAL_BENCHMARKS" >> $GITHUB_OUTPUT - echo "validation_report_exists=true" >> $GITHUB_OUTPUT - else - echo "validation_report_exists=false" >> $GITHUB_OUTPUT - fi - - if [ -f "$MULTICORE_REPORT" ]; then - echo "Found multicore report: $MULTICORE_REPORT" - - H4_STATUS=$(python3 -c "import json; d=json.load(open('$MULTICORE_REPORT')); print('ACHIEVED' if d['summary']['h4_target_met'] else 'NOT_ACHIEVED')" 2>/dev/null || echo "UNKNOWN") - OVERALL_ERROR=$(python3 -c "import json; d=json.load(open('$MULTICORE_REPORT')); print(f\"{d['overall_accuracy']['average_error_pct']:.1f}%\")" 2>/dev/null || echo "N/A") - - echo "h4_status=$H4_STATUS" >> $GITHUB_OUTPUT - echo "overall_error=$OVERALL_ERROR" >> $GITHUB_OUTPUT - echo "multicore_report_exists=true" >> $GITHUB_OUTPUT - else - echo "multicore_report_exists=false" >> $GITHUB_OUTPUT - fi - - - name: Post H4 validation summary - if: always() - run: | - echo "## H4 Multi-Core Accuracy Validation Report" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ "${{ steps.h4_results.outputs.validation_report_exists }}" = "true" ]; then - echo "### 2-Core Validation Framework" >> $GITHUB_STEP_SUMMARY - echo "- **Benchmarks Validated:** ${{ steps.h4_results.outputs.successful_benchmarks }}/${{ steps.h4_results.outputs.total_benchmarks }}" >> $GITHUB_STEP_SUMMARY - echo "- **Target:** Minimum 3 successful validations" >> $GITHUB_STEP_SUMMARY - echo "" >> 
$GITHUB_STEP_SUMMARY - - if [ "${{ steps.h4_results.outputs.successful_benchmarks }}" -ge "3" ]; then - echo "✅ **2-Core Framework:** Validation PASSED" >> $GITHUB_STEP_SUMMARY - else - echo "❌ **2-Core Framework:** Validation FAILED" >> $GITHUB_STEP_SUMMARY - fi - else - echo "⚠️ **2-Core Validation:** No validation report generated" >> $GITHUB_STEP_SUMMARY - fi - - echo "" >> $GITHUB_STEP_SUMMARY - - if [ "${{ steps.h4_results.outputs.multicore_report_exists }}" = "true" ]; then - echo "### Multi-Core Accuracy Analysis" >> $GITHUB_STEP_SUMMARY - echo "- **H4 Status:** ${{ steps.h4_results.outputs.h4_status }}" >> $GITHUB_STEP_SUMMARY - echo "- **Overall Error:** ${{ steps.h4_results.outputs.overall_error }} (Target: <20%)" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ "${{ steps.h4_results.outputs.h4_status }}" = "ACHIEVED" ]; then - echo "✅ **H4 Accuracy Target:** ACHIEVED" >> $GITHUB_STEP_SUMMARY - else - echo "❌ **H4 Accuracy Target:** NOT ACHIEVED" >> $GITHUB_STEP_SUMMARY - fi - else - echo "⚠️ **Multi-Core Analysis:** No analysis report generated" >> $GITHUB_STEP_SUMMARY - fi - - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Next Steps" >> $GITHUB_STEP_SUMMARY - - if [ "${{ steps.h4_results.outputs.validation_report_exists }}" = "true" ] && [ "${{ steps.h4_results.outputs.successful_benchmarks }}" -ge "3" ]; then - echo "- ✅ 2-core framework validated - ready for 4-core extension" >> $GITHUB_STEP_SUMMARY - else - echo "- 🔧 2-core framework needs refinement - check benchmark compilation and M2Sim multi-core support" >> $GITHUB_STEP_SUMMARY - fi - - echo "- 📊 Download artifacts for detailed analysis and statistical models" >> $GITHUB_STEP_SUMMARY - echo "- 📈 See uploaded reports for accuracy breakdowns by benchmark category" >> $GITHUB_STEP_SUMMARY - - - name: Comment on H4 issue - if: github.ref == 'refs/heads/main' && always() - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - echo "=== Posting H4 Issue Comment ===" - - if [ 
"${{ steps.h4_results.outputs.validation_report_exists }}" = "true" ] || [ "${{ steps.h4_results.outputs.multicore_report_exists }}" = "true" ]; then - # Determine overall status - VALIDATION_STATUS="UNKNOWN" - if [ "${{ steps.h4_results.outputs.successful_benchmarks }}" -ge "3" ]; then - VALIDATION_STATUS="PASSED" - else - VALIDATION_STATUS="PARTIAL" - fi - - ACCURACY_STATUS="${{ steps.h4_results.outputs.h4_status }}" - - COMMENT_BODY="# [CI] H4 Multi-Core Accuracy Framework Results - - ## H4 Implementation Status Update - - **Validation Framework**: ${VALIDATION_STATUS} - - **2-Core Benchmarks**: ${{ steps.h4_results.outputs.successful_benchmarks }}/${{ steps.h4_results.outputs.total_benchmarks }} validated - - **Statistical Framework**: Multi-dimensional regression implemented - - **Accuracy Target**: ${ACCURACY_STATUS} (Target: <20% error) - - **Framework Components**: - - ✅ Multi-dimensional regression framework - - ✅ Cache coherence timing methodology - - ✅ 2-core validation suite - - ✅ CI integration pipeline - - **Technical Details**: - - **Commit**: ${GITHUB_SHA:0:8} - - **Workflow**: [View Details]($GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID) - - **Artifacts**: [Download H4 Reports]($GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID) - - **Next Implementation Phase**: - $(if [ "$VALIDATION_STATUS" = "PASSED" ]; then echo "Ready for 4-core framework extension and M2Sim multi-core integration"; else echo "2-core framework refinement and benchmark compilation fixes needed"; fi)" - - gh issue comment 474 --body "$COMMENT_BODY" || echo "Failed to comment on issue #474 - issue may not exist yet" - else - echo "⚠️ No validation results to report" - fi - - h4-statistical-validation: - name: H4 Statistical Framework Validation - runs-on: ubuntu-latest - timeout-minutes: 30 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - 
- name: Install Python dependencies - run: | - pip install numpy scipy matplotlib pandas sqlite3 scikit-learn pytest - - - name: Validate H4 statistical framework - run: | - echo "=== H4 Statistical Framework Validation ===" - cd scripts - - # Test statistical framework components - python3 -c " - from h4_multicore_analysis import H4MultiCoreAnalyzer - analyzer = H4MultiCoreAnalyzer() - print('✅ H4MultiCoreAnalyzer class loads successfully') - - # Test multi-dimensional regression framework - import numpy as np - X = np.array([[0.05, 1.2, 0.02, 1.1], [0.08, 1.5, 0.03, 1.3]]) - y = np.array([1.0, 1.2]) - print('✅ Multi-dimensional regression framework validated') - - print('✅ H4 statistical methodology ready for implementation') - " - - - name: Validate documentation completeness - run: | - echo "=== Documentation Validation ===" - - # Check for required H4 documentation - test -f "docs/h4_multicore_statistical_methodology.md" && echo "✅ Statistical methodology documented" || echo "❌ Missing methodology documentation" - test -f "scripts/h4_multicore_analysis.py" && echo "✅ Analysis framework implemented" || echo "❌ Missing analysis framework" - test -f "scripts/h4_2core_validation.py" && echo "✅ Validation framework implemented" || echo "❌ Missing validation framework" - - # Verify documentation quality - grep -q "Multi-dimensional regression" docs/h4_multicore_statistical_methodology.md && echo "✅ Statistical methodology documented" || echo "⚠️ Statistical details may be incomplete" - grep -q "Cache coherence" docs/h4_multicore_statistical_methodology.md && echo "✅ Cache coherence methodology documented" || echo "⚠️ Coherence methodology may be incomplete" - - - name: Upload framework validation results - uses: actions/upload-artifact@v4 - with: - name: h4-framework-validation - path: | - docs/h4_multicore_statistical_methodology.md - scripts/h4_multicore_analysis.py - scripts/h4_2core_validation.py - retention-days: 30 \ No newline at end of file diff --git 
a/.github/workflows/h5-accuracy-report.yml b/.github/workflows/h5-accuracy-report.yml deleted file mode 100644 index 3994b44..0000000 --- a/.github/workflows/h5-accuracy-report.yml +++ /dev/null @@ -1,120 +0,0 @@ -name: H5 Accuracy Report - -on: - workflow_dispatch: # Allow manual triggering - push: - branches: [main] - paths: - - 'benchmarks/**' - - 'h5_accuracy_report.py' - - 'timing/**' - -concurrency: - group: h5-accuracy-report-${{ github.ref }} - cancel-in-progress: false - -jobs: - h5-accuracy-report: - name: Generate H5 Milestone Accuracy Report - runs-on: macos-14 # Apple Silicon runner for native M2 matching - timeout-minutes: 120 # Extended timeout for comprehensive accuracy testing - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install Python dependencies - run: | - pip install matplotlib numpy scipy - - - name: Install Ginkgo - run: go install github.com/onsi/ginkgo/v2/ginkgo@latest - - - name: Verify PolyBench ELFs exist - run: | - echo "Checking PolyBench ELF files..." - ls -la benchmarks/polybench/*.elf || echo "Some ELF files missing - will skip those benchmarks" - - - name: Verify EmBench ELFs exist - run: | - echo "Checking EmBench ELF files..." - ls -la benchmarks/aha-mont64-m2sim/*.elf benchmarks/crc32-m2sim/*.elf benchmarks/edn-m2sim/*.elf benchmarks/huffbench-m2sim/*.elf benchmarks/matmult-int-m2sim/*.elf benchmarks/statemate-m2sim/*.elf benchmarks/primecount-m2sim/*.elf || echo "Some EmBench ELF files missing - will skip those benchmarks" - - - name: Run H5 Accuracy Report - run: | - echo "Running H5 accuracy framework..." 
- python3 h5_accuracy_report.py - - - name: Upload H5 Accuracy Report - uses: actions/upload-artifact@v4 - with: - name: h5-accuracy-report - path: | - h5_accuracy_report.md - h5_accuracy_results.json - benchmarks/native/accuracy_report.md - benchmarks/native/accuracy_figure.png - benchmarks/native/accuracy_results.json - benchmarks/native/accuracy_normalized.pdf - retention-days: 90 - - - name: Post H5 Report Summary - if: always() - run: | - echo "## H5 Milestone Accuracy Report" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ -f h5_accuracy_results.json ]; then - TOTAL_BENCHMARKS=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(d['h5_milestone']['total_benchmarks'])" 2>/dev/null || echo "0") - OVERALL_ERROR=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(f\"{d['h5_milestone']['overall_average_error']*100:.1f}%\")" 2>/dev/null || echo "N/A") - H5_STATUS=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(d['h5_milestone']['status'])" 2>/dev/null || echo "unknown") - MICRO_COUNT=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(d['categories']['microbenchmarks']['count'])" 2>/dev/null || echo "0") - POLYBENCH_COUNT=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(d['categories']['polybench']['count'])" 2>/dev/null || echo "0") - MICRO_ERROR=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(f\"{d['categories']['microbenchmarks']['average_error']*100:.1f}%\")" 2>/dev/null || echo "N/A") - POLYBENCH_ERROR=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(f\"{d['categories']['polybench']['average_error']*100:.1f}%\")" 2>/dev/null || echo "N/A") - - echo "### H5 Status: $H5_STATUS" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "- **Total Benchmarks:** $TOTAL_BENCHMARKS (Target: 15+)" >> $GITHUB_STEP_SUMMARY - echo "- 
**Overall Average Error:** $OVERALL_ERROR (Target: <20%)" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Breakdown by Category" >> $GITHUB_STEP_SUMMARY - echo "- **Microbenchmarks:** $MICRO_COUNT benchmarks, $MICRO_ERROR average error" >> $GITHUB_STEP_SUMMARY - echo "- **PolyBench Intermediate:** $POLYBENCH_COUNT benchmarks, $POLYBENCH_ERROR average error" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "See the uploaded artifacts for the detailed H5 accuracy report." >> $GITHUB_STEP_SUMMARY - else - echo "⚠️ H5 accuracy report generation failed." >> $GITHUB_STEP_SUMMARY - fi - - - name: Comment on H5 Issue - if: github.ref == 'refs/heads/main' && always() - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - if [ -f h5_accuracy_results.json ]; then - TOTAL_BENCHMARKS=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(d['h5_milestone']['total_benchmarks'])" 2>/dev/null || echo "0") - OVERALL_ERROR=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(f\"{d['h5_milestone']['overall_average_error']*100:.1f}%\")" 2>/dev/null || echo "N/A") - H5_STATUS=$(python3 -c "import json; d=json.load(open('h5_accuracy_results.json')); print(d['h5_milestone']['status'])" 2>/dev/null || echo "unknown") - - COMMENT_BODY="# [CI] H5 Accuracy Framework Results - ## H5 Milestone Validation Complete - **Status**: ${H5_STATUS} - - **Total Benchmarks**: ${TOTAL_BENCHMARKS} (Target: 15+) - - **Overall Average Error**: ${OVERALL_ERROR} (Target: <20%) - **Commit**: ${GITHUB_SHA:0:8} - **Workflow Run**: [View Details]($GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID) - [Download H5 Accuracy Report Artifacts]($GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID)" - - gh issue comment 460 --body "$COMMENT_BODY" || echo "Failed to comment on issue #460" - fi \ No newline at end of file diff --git a/.github/workflows/matmul-calibration.yml 
b/.github/workflows/matmul-calibration.yml index 199d7ae..18d132c 100644 --- a/.github/workflows/matmul-calibration.yml +++ b/.github/workflows/matmul-calibration.yml @@ -1,10 +1,6 @@ name: Matmul Calibration on: - push: - branches: [main] - pull_request: - branches: [main] workflow_dispatch: jobs: diff --git a/.github/workflows/memory-bench-calibration.yml b/.github/workflows/memory-bench-calibration.yml deleted file mode 100644 index dff84e5..0000000 --- a/.github/workflows/memory-bench-calibration.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: Memory Subsystem Calibration - -on: - workflow_dispatch: - -jobs: - calibrate: - name: Run Memory Calibration on Apple Silicon - runs-on: macos-14 - timeout-minutes: 60 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install dependencies - run: pip install numpy scipy - - - name: Verify ARM64 architecture - run: | - ARCH=$(uname -m) - echo "Architecture: $ARCH" - if [ "$ARCH" != "arm64" ]; then - echo "ERROR: This workflow requires ARM64 (Apple Silicon)" - exit 1 - fi - - - name: Run memory benchmark calibration - run: | - cd benchmarks/native - python3 linear_calibration.py \ - --benchmarks memorystrided loadheavy storeheavy branchheavy \ - --runs 15 \ - --output memory_calibration_results.json - timeout-minutes: 45 - - - name: Display calibration results - if: always() - run: | - if [ -f benchmarks/native/memory_calibration_results.json ]; then - echo "## Memory Calibration Results" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo '```json' >> $GITHUB_STEP_SUMMARY - cat benchmarks/native/memory_calibration_results.json >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - else - echo "Calibration results not found." 
>> $GITHUB_STEP_SUMMARY - fi - - - name: Upload calibration results - if: always() - uses: actions/upload-artifact@v4 - with: - name: memory-calibration-results - path: benchmarks/native/memory_calibration_results.json - retention-days: 90 diff --git a/.github/workflows/performance-profiling.yml b/.github/workflows/performance-profiling.yml deleted file mode 100644 index 8f07102..0000000 --- a/.github/workflows/performance-profiling.yml +++ /dev/null @@ -1,218 +0,0 @@ -name: Performance Profiling Analysis - -on: - workflow_dispatch: - inputs: - bench_filter: - description: 'Benchmark regex filter (e.g., "Pipeline", "Decoder", or ".")' - required: false - default: '.' - benchtime: - description: 'Iterations per benchmark (e.g., 1000x or 5s)' - required: false - default: '1000x' - cpu_profile: - description: 'Generate CPU profile (pprof)' - type: boolean - required: false - default: true - mem_profile: - description: 'Generate memory profile (pprof)' - type: boolean - required: false - default: true - schedule: - - cron: '0 2 * * 0' # Sunday 2 AM UTC - -jobs: - profile-pipeline: - name: Profile Pipeline Benchmarks - runs-on: ubuntu-latest - timeout-minutes: 30 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Create output directory - run: mkdir -p profiling-results - - - name: Run benchmarks with timing - run: | - FILTER="${{ github.event.inputs.bench_filter || '.' 
}}" - BENCHTIME="${{ github.event.inputs.benchtime || '1000x' }}" - - go test -run='XXX_NO_MATCH' -bench="$FILTER" \ - -benchtime="$BENCHTIME" -count=5 \ - -benchmem \ - ./timing/pipeline/ | tee profiling-results/bench-output.txt - - - name: Generate CPU profile - if: github.event.inputs.cpu_profile != 'false' - run: | - # Run a representative benchmark for CPU profiling - go test -run='XXX_NO_MATCH' -bench=BenchmarkPipelineTick8Wide \ - -benchtime=5s -count=1 \ - -cpuprofile=profiling-results/cpu.prof \ - ./timing/pipeline/ - - # Generate text report from CPU profile - go tool pprof -text profiling-results/cpu.prof > profiling-results/cpu-profile-text.txt 2>&1 || true - go tool pprof -top profiling-results/cpu.prof > profiling-results/cpu-top.txt 2>&1 || true - - - name: Generate memory profile - if: github.event.inputs.mem_profile != 'false' - run: | - go test -run='XXX_NO_MATCH' -bench=BenchmarkPipelineTick8Wide \ - -benchtime=5s -count=1 \ - -memprofile=profiling-results/mem.prof \ - ./timing/pipeline/ - - go tool pprof -text -alloc_space profiling-results/mem.prof > profiling-results/mem-alloc-text.txt 2>&1 || true - go tool pprof -top -alloc_space profiling-results/mem.prof > profiling-results/mem-alloc-top.txt 2>&1 || true - - - name: Generate summary report - if: always() - run: | - cat > profiling-results/SUMMARY.md << 'HEADER' - # Performance Profiling Results - HEADER - - echo "**Date:** $(date -u)" >> profiling-results/SUMMARY.md - echo "**Commit:** ${{ github.sha }}" >> profiling-results/SUMMARY.md - echo "" >> profiling-results/SUMMARY.md - - echo "## Benchmark Results" >> profiling-results/SUMMARY.md - echo '```' >> profiling-results/SUMMARY.md - cat profiling-results/bench-output.txt >> profiling-results/SUMMARY.md - echo '```' >> profiling-results/SUMMARY.md - echo "" >> profiling-results/SUMMARY.md - - if [ -f profiling-results/cpu-top.txt ]; then - echo "## CPU Profile (Top Functions)" >> profiling-results/SUMMARY.md - echo '```' >> 
profiling-results/SUMMARY.md - head -30 profiling-results/cpu-top.txt >> profiling-results/SUMMARY.md - echo '```' >> profiling-results/SUMMARY.md - echo "" >> profiling-results/SUMMARY.md - fi - - if [ -f profiling-results/mem-alloc-top.txt ]; then - echo "## Memory Allocation Profile (Top)" >> profiling-results/SUMMARY.md - echo '```' >> profiling-results/SUMMARY.md - head -30 profiling-results/mem-alloc-top.txt >> profiling-results/SUMMARY.md - echo '```' >> profiling-results/SUMMARY.md - fi - - - name: Upload profiling results - if: always() - uses: actions/upload-artifact@v4 - with: - name: performance-profiling-results - path: profiling-results/ - retention-days: 30 - - - name: Display summary - if: always() - run: | - if [ -f profiling-results/SUMMARY.md ]; then - cat profiling-results/SUMMARY.md >> "$GITHUB_STEP_SUMMARY" - fi - - profile-cmd: - name: Profile via cmd/profile tool - runs-on: ubuntu-latest - timeout-minutes: 30 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Build profile tool - run: go build -o profile-tool ./cmd/profile - - - name: Create output directory - run: mkdir -p cmd-profiling-results - - - name: Profile emulation mode - run: | - # Use a microbenchmark ELF if available, otherwise skip - ELF=$(find benchmarks -name '*.elf' -type f | head -1) - if [ -z "$ELF" ]; then - echo "No ELF binaries found, skipping cmd/profile tests" - exit 0 - fi - - echo "Profiling: $ELF" - ./profile-tool \ - -cpuprofile=cmd-profiling-results/emu-cpu.prof \ - -memprofile=cmd-profiling-results/emu-mem.prof \ - -duration=10s \ - "$ELF" > cmd-profiling-results/emu-output.txt 2>&1 || true - - - name: Profile timing mode - run: | - ELF=$(find benchmarks -name '*.elf' -type f | head -1) - if [ -z "$ELF" ]; then - exit 0 - fi - - ./profile-tool -timing \ - -cpuprofile=cmd-profiling-results/timing-cpu.prof \ - -memprofile=cmd-profiling-results/timing-mem.prof \ - 
-duration=10s \ - "$ELF" > cmd-profiling-results/timing-output.txt 2>&1 || true - - - name: Profile fast-timing mode - run: | - ELF=$(find benchmarks -name '*.elf' -type f | head -1) - if [ -z "$ELF" ]; then - exit 0 - fi - - ./profile-tool -fast-timing \ - -cpuprofile=cmd-profiling-results/fast-timing-cpu.prof \ - -memprofile=cmd-profiling-results/fast-timing-mem.prof \ - -duration=10s \ - "$ELF" > cmd-profiling-results/fast-timing-output.txt 2>&1 || true - - - name: Generate mode comparison - if: always() - run: | - echo "# cmd/profile Mode Comparison" > cmd-profiling-results/SUMMARY.md - echo "" >> cmd-profiling-results/SUMMARY.md - - for mode in emu timing fast-timing; do - outfile="cmd-profiling-results/${mode}-output.txt" - if [ -f "$outfile" ]; then - echo "## ${mode}" >> cmd-profiling-results/SUMMARY.md - echo '```' >> cmd-profiling-results/SUMMARY.md - cat "$outfile" >> cmd-profiling-results/SUMMARY.md - echo '```' >> cmd-profiling-results/SUMMARY.md - echo "" >> cmd-profiling-results/SUMMARY.md - fi - done - - - name: Upload results - if: always() - uses: actions/upload-artifact@v4 - with: - name: cmd-profiling-results - path: cmd-profiling-results/ - retention-days: 30 - - - name: Display summary - if: always() - run: | - if [ -f cmd-profiling-results/SUMMARY.md ]; then - cat cmd-profiling-results/SUMMARY.md >> "$GITHUB_STEP_SUMMARY" - fi diff --git a/.github/workflows/performance-regression-monitoring.yml b/.github/workflows/performance-regression-monitoring.yml deleted file mode 100644 index edb22a5..0000000 --- a/.github/workflows/performance-regression-monitoring.yml +++ /dev/null @@ -1,282 +0,0 @@ -name: Performance Regression Monitoring - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - schedule: - # Run nightly performance monitoring - - cron: '0 6 * * *' # 6 AM UTC daily - -jobs: - performance-baseline: - name: Performance Baseline Monitoring - runs-on: ubuntu-latest - timeout-minutes: 20 - - steps: - - name: Checkout code 
- uses: actions/checkout@v4 - with: - # Get enough history for comparison - fetch-depth: 100 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Create results directory - run: mkdir -p performance-results - - - name: Run performance validation - id: validation - run: | - # Install Python dependencies if needed - python3 -m pip install --upgrade pip - - # Run the performance validation script - cd ${{ github.workspace }} - python3 scripts/performance_optimization_validation.py > performance-results/validation-output.txt 2>&1 || true - - # Extract key metrics for comparison - echo "benchmark_results<> $GITHUB_OUTPUT - if [ -f "performance-results/validation-output.txt" ]; then - grep -E "ns/op|allocs/op" performance-results/validation-output.txt | head -10 >> $GITHUB_OUTPUT - fi - echo "EOF" >> $GITHUB_OUTPUT - - - name: Run critical path benchmarks - run: | - echo "Running critical path benchmarks for regression detection..." - - # Core pipeline benchmarks - go test -bench=BenchmarkPipelineTick8Wide -benchtime=5000x -count=3 \ - ./timing/pipeline/ | tee performance-results/pipeline-benchmarks.txt - - # Decoder benchmarks (optimization target) - go test -bench='BenchmarkDecoder.*' -benchtime=5000x -count=3 \ - ./timing/pipeline/ | tee performance-results/decoder-benchmarks.txt || true - - # Memory-intensive benchmarks - go test -bench='BenchmarkPipeline.*LoadHeavy.*' -benchtime=1000x -count=3 \ - ./timing/pipeline/ | tee performance-results/memory-benchmarks.txt || true - - - name: Analyze performance trends - run: | - cat > performance-results/analysis.py << 'EOF' - #!/usr/bin/env python3 - import re - import sys - from pathlib import Path - - def parse_benchmark_line(line): - """Parse a Go benchmark line to extract metrics.""" - if 'ns/op' not in line: - return None - - parts = line.strip().split() - if len(parts) < 3: - return None - - name = parts[0] - try: - ns_per_op = float(parts[2]) - return (name, ns_per_op) - except 
ValueError: - return None - - def analyze_benchmarks(benchmark_file): - """Analyze benchmark results for performance regression.""" - if not Path(benchmark_file).exists(): - return {} - - with open(benchmark_file, 'r') as f: - lines = f.readlines() - - benchmarks = {} - for line in lines: - result = parse_benchmark_line(line) - if result: - name, ns_per_op = result - if name not in benchmarks: - benchmarks[name] = [] - benchmarks[name].append(ns_per_op) - - # Calculate averages - averages = {} - for name, values in benchmarks.items(): - if values: - averages[name] = sum(values) / len(values) - - return averages - - # Analyze all benchmark files - files = ['pipeline-benchmarks.txt', 'decoder-benchmarks.txt', 'memory-benchmarks.txt'] - all_results = {} - - for filename in files: - results = analyze_benchmarks(f'performance-results/{filename}') - all_results.update(results) - - # Generate performance summary - print("=== Performance Analysis Summary ===") - print(f"Commit: {sys.argv[1] if len(sys.argv) > 1 else 'unknown'}") - print(f"Total benchmarks analyzed: {len(all_results)}") - print() - - # Performance thresholds (in ns/op) - thresholds = { - 'BenchmarkPipelineTick8Wide': 5000, # 5μs threshold - 'BenchmarkDecoderDecode': 1000, # 1μs threshold - 'BenchmarkDecoderDecodeInto': 500, # 0.5μs threshold - } - - # Check for regressions - regressions = [] - improvements = [] - - print("| Benchmark | Performance (ns/op) | Status |") - print("|-----------|-------------------|--------|") - - for name, avg_time in sorted(all_results.items()): - threshold = thresholds.get(name, 10000) # Default 10μs threshold - - if avg_time > threshold: - status = "⚠️ REGRESSION" - regressions.append(name) - elif avg_time < threshold * 0.7: # 30% better than threshold - status = "✅ IMPROVEMENT" - improvements.append(name) - else: - status = "✅ NORMAL" - - print(f"| {name} | {avg_time:.1f} | {status} |") - - print() - - # Summary - if regressions: - print(f"🚨 PERFORMANCE REGRESSIONS DETECTED: 
{len(regressions)}") - for name in regressions: - print(f" - {name}: {all_results[name]:.1f} ns/op") - sys.exit(1) - elif improvements: - print(f"🚀 PERFORMANCE IMPROVEMENTS DETECTED: {len(improvements)}") - for name in improvements: - print(f" - {name}: {all_results[name]:.1f} ns/op") - else: - print("✅ No significant performance changes detected") - - EOF - - python3 performance-results/analysis.py "${{ github.sha }}" | tee performance-results/analysis-summary.txt - - - name: Generate performance report - if: always() - run: | - cat > performance-results/PERFORMANCE_REPORT.md << 'HEADER' - # Performance Monitoring Report - - **Date:** $(date -u) - **Commit:** ${{ github.sha }} - **Branch:** ${{ github.ref_name }} - **Trigger:** ${{ github.event_name }} - - ## Optimization Status - - This report validates the performance optimizations implemented in Issue #487: - - ✅ Instruction decoder memory allocation optimization - - ✅ Branch predictor reuse enhancement - - ✅ Critical path bottleneck elimination - - ## Performance Results - - HEADER - - # Add benchmark results - if [ -f performance-results/analysis-summary.txt ]; then - echo "" >> performance-results/PERFORMANCE_REPORT.md - echo "### Benchmark Analysis" >> performance-results/PERFORMANCE_REPORT.md - echo '```' >> performance-results/PERFORMANCE_REPORT.md - cat performance-results/analysis-summary.txt >> performance-results/PERFORMANCE_REPORT.md - echo '```' >> performance-results/PERFORMANCE_REPORT.md - fi - - # Add detailed benchmark data - echo "" >> performance-results/PERFORMANCE_REPORT.md - echo "### Detailed Results" >> performance-results/PERFORMANCE_REPORT.md - echo "" >> performance-results/PERFORMANCE_REPORT.md - - for file in pipeline-benchmarks.txt decoder-benchmarks.txt memory-benchmarks.txt; do - if [ -f "performance-results/$file" ]; then - echo "#### ${file}" >> performance-results/PERFORMANCE_REPORT.md - echo '```' >> performance-results/PERFORMANCE_REPORT.md - grep 'Benchmark' 
"performance-results/$file" >> performance-results/PERFORMANCE_REPORT.md - echo '```' >> performance-results/PERFORMANCE_REPORT.md - echo "" >> performance-results/PERFORMANCE_REPORT.md - fi - done - - # Add optimization impact summary - cat >> performance-results/PERFORMANCE_REPORT.md << 'FOOTER' - - ## Optimization Impact Assessment - - ### Success Metrics (from Issue #487): - - **Target**: 50-80% reduction in calibration iteration time - - **Approach**: Data-driven optimization based on profiling results - - **Focus**: Critical path optimization while preserving accuracy - - ### Implementation Status: - - ✅ **Critical Path Analysis**: Bottlenecks identified via systematic profiling - - ✅ **Memory Optimization**: Decoder allocation hotspot eliminated - - ✅ **Performance Monitoring**: CI integration for regression detection - - ✅ **Validation Framework**: Automated optimization impact measurement - - ### Next Steps: - - Monitor performance trends across commits - - Validate calibration workflow speed improvements - - Extend optimization to additional bottlenecks as identified - - FOOTER - - - name: Upload performance results - if: always() - uses: actions/upload-artifact@v4 - with: - name: performance-monitoring-results-${{ github.sha }} - path: performance-results/ - retention-days: 30 - - - name: Comment on PR (if applicable) - if: github.event_name == 'pull_request' - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - const path = 'performance-results/PERFORMANCE_REPORT.md'; - - if (fs.existsSync(path)) { - const report = fs.readFileSync(path, 'utf8'); - - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: `## 🚀 Performance Monitoring Results\n\n${report}` - }); - } - - - name: Check performance regression status - run: | - # This step will fail the workflow if regressions are detected - # The analysis.py script exits with code 1 if regressions 
are found - if grep -q "PERFORMANCE REGRESSIONS DETECTED" performance-results/analysis-summary.txt; then - echo "❌ Performance regressions detected - failing the build" - cat performance-results/analysis-summary.txt - exit 1 - else - echo "✅ No performance regressions detected" - fi \ No newline at end of file diff --git a/.github/workflows/performance-regression.yml b/.github/workflows/performance-regression.yml index e606aa6..3661607 100644 --- a/.github/workflows/performance-regression.yml +++ b/.github/workflows/performance-regression.yml @@ -3,8 +3,6 @@ name: Performance Regression Detection on: pull_request: types: [opened, synchronize, ready_for_review] - push: - branches: [main] jobs: performance-regression: diff --git a/.github/workflows/polybench-calibration.yml b/.github/workflows/polybench-calibration.yml deleted file mode 100644 index 4caaf59..0000000 --- a/.github/workflows/polybench-calibration.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: PolyBench Hardware Calibration - -on: - workflow_dispatch: - -jobs: - calibrate: - name: PolyBench Linear Regression Calibration on Apple Silicon - runs-on: macos-14 - timeout-minutes: 45 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install Python dependencies - run: pip install numpy scipy - - - name: Verify ARM64 architecture - run: | - ARCH=$(uname -m) - echo "Architecture: $ARCH" - if [ "$ARCH" != "arm64" ]; then - echo "ERROR: This workflow requires ARM64 (Apple Silicon)" - exit 1 - fi - - - name: Run PolyBench calibration - run: | - cd benchmarks/native - python3 polybench_calibration.py \ - --runs 15 \ - --output polybench_calibration_results.json - timeout-minutes: 40 - - - name: Display results - if: always() - run: | - echo "## PolyBench Calibration Results" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - if [ -f benchmarks/native/polybench_calibration_results.json ]; then - 
echo '```json' >> $GITHUB_STEP_SUMMARY - cat benchmarks/native/polybench_calibration_results.json >> $GITHUB_STEP_SUMMARY - echo '```' >> $GITHUB_STEP_SUMMARY - else - echo "Calibration results not found." >> $GITHUB_STEP_SUMMARY - fi - - - name: Upload calibration results - if: always() - uses: actions/upload-artifact@v4 - with: - name: polybench-calibration-results - path: benchmarks/native/polybench_calibration_results.json - retention-days: 90 diff --git a/.github/workflows/polybench-segmented.yml b/.github/workflows/polybench-segmented.yml deleted file mode 100644 index c85bd67..0000000 --- a/.github/workflows/polybench-segmented.yml +++ /dev/null @@ -1,217 +0,0 @@ -name: PolyBench Segmented Testing - -on: - workflow_dispatch: - push: - branches: [main] - paths: - - 'benchmarks/polybench_test.go' - - 'benchmarks/timing_harness.go' - - 'timing/**' - -jobs: - polybench-group-1: - name: PolyBench Group 1 (ATAX, BiCG, Jacobi1D) - runs-on: macos-14 - timeout-minutes: 30 - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Verify PolyBench ELFs exist - run: | - echo "Checking required ELF files for Group 1..." 
- ls -la benchmarks/polybench/atax_m2sim.elf - ls -la benchmarks/polybench/bicg_m2sim.elf - ls -la benchmarks/polybench/jacobi-1d_m2sim.elf - - - name: Run Group 1 tests - run: | - echo "Running PolyBench Group 1: ATAX, BiCG, Jacobi1D" - go test -v -run "TestPolybenchATAX|TestPolybenchBiCG|TestPolybenchJacobi1D" -count=1 -timeout 25m ./benchmarks/ 2>&1 | tee group1_output.txt - - - name: Upload Group 1 results - if: always() - uses: actions/upload-artifact@v4 - with: - name: polybench-group-1-results - path: group1_output.txt - retention-days: 7 - - polybench-group-2: - name: PolyBench Group 2 (MVT, GEMM) - runs-on: macos-14 - timeout-minutes: 30 - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Verify PolyBench ELFs exist - run: | - echo "Checking required ELF files for Group 2..." - ls -la benchmarks/polybench/mvt_m2sim.elf - ls -la benchmarks/polybench/gemm_m2sim.elf - - - name: Run Group 2 tests - run: | - echo "Running PolyBench Group 2: MVT, GEMM" - go test -v -run "TestPolybenchMVT|TestPolybenchGEMM" -count=1 -timeout 25m ./benchmarks/ 2>&1 | tee group2_output.txt - - - name: Upload Group 2 results - if: always() - uses: actions/upload-artifact@v4 - with: - name: polybench-group-2-results - path: group2_output.txt - retention-days: 7 - - polybench-group-3: - name: PolyBench Group 3 (2MM, 3MM) - runs-on: macos-14 - timeout-minutes: 30 - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Verify PolyBench ELFs exist - run: | - echo "Checking required ELF files for Group 3..." 
- ls -la benchmarks/polybench/2mm_m2sim.elf - ls -la benchmarks/polybench/3mm_m2sim.elf - - - name: Run Group 3 tests - run: | - echo "Running PolyBench Group 3: 2MM, 3MM" - go test -v -run "TestPolybench2MM|TestPolybench3MM" -count=1 -timeout 25m ./benchmarks/ 2>&1 | tee group3_output.txt - - - name: Upload Group 3 results - if: always() - uses: actions/upload-artifact@v4 - with: - name: polybench-group-3-results - path: group3_output.txt - retention-days: 7 - - consolidate-results: - name: Consolidate PolyBench Results - runs-on: ubuntu-latest - needs: [polybench-group-1, polybench-group-2, polybench-group-3] - if: always() - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download all group results - uses: actions/download-artifact@v4 - with: - path: group-results - - - name: Consolidate and analyze results - run: | - echo "Consolidating results from all PolyBench groups..." - - # Combine all output files - cat group-results/polybench-group-1-results/group1_output.txt > consolidated_polybench.txt || echo "Group 1 results missing" - cat group-results/polybench-group-2-results/group2_output.txt >> consolidated_polybench.txt || echo "Group 2 results missing" - cat group-results/polybench-group-3-results/group3_output.txt >> consolidated_polybench.txt || echo "Group 3 results missing" - - # Extract CPI data - python3 - <<'PYEOF' - import json - import re - import os - - results = {} - try: - with open("consolidated_polybench.txt") as f: - for line in f: - if "CPI=" not in line: - continue - match = re.search(r'(polybench_\w+):\s+cycles=(\d+),\s+insts=(\d+),\s+CPI=([\d.]+)', line) - if match: - name = match.group(1) - cycles = int(match.group(2)) - insts = int(match.group(3)) - cpi = float(match.group(4)) - short_name = name.replace("polybench_", "") - if short_name == "jacobi1d": - short_name = "jacobi-1d" - results[short_name] = { - "sim_name": name, - "cycles": cycles, - "instructions": insts, - "cpi": cpi, - } - except FileNotFoundError: - 
print("WARNING: No consolidated results file found") - - output = { - "source": "polybench_segmented_testing", - "benchmarks_run": len(results), - "results": results, - "segmented_execution": True, - "groups_completed": [] - } - - if os.path.exists("group-results/polybench-group-1-results/group1_output.txt"): - output["groups_completed"].append("group-1") - if os.path.exists("group-results/polybench-group-2-results/group2_output.txt"): - output["groups_completed"].append("group-2") - if os.path.exists("group-results/polybench-group-3-results/group3_output.txt"): - output["groups_completed"].append("group-3") - - with open("polybench_segmented_results.json", "w") as f: - json.dump(output, f, indent=2) - - print(json.dumps(output, indent=2)) - print(f"\nSegmented testing results: {len(results)} benchmarks from {len(output['groups_completed'])} groups") - PYEOF - - - name: Upload consolidated results - uses: actions/upload-artifact@v4 - with: - name: polybench-segmented-consolidated - path: | - polybench_segmented_results.json - consolidated_polybench.txt - retention-days: 30 - - - name: Post segmented testing summary - if: always() - run: | - echo "## PolyBench Segmented Testing Results" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ -f polybench_segmented_results.json ]; then - python3 -c " - import json - d = json.load(open('polybench_segmented_results.json')) - print(f'**Benchmarks measured:** {d[\"benchmarks_run\"]}/7 (from {len(d[\"groups_completed\"])} groups)') - print(f'**Groups completed:** {\", \".join(d[\"groups_completed\"])}') - print() - if d['results']: - print('| Benchmark | Cycles | Instructions | CPI |') - print('|-----------|--------|--------------|-----|') - for name, r in sorted(d['results'].items()): - print(f'| {name} | {r[\"cycles\"]} | {r[\"instructions\"]} | {r[\"cpi\"]:.3f} |') - else: - print('No benchmark results extracted.') - " >> $GITHUB_STEP_SUMMARY - else - echo "Segmented testing consolidation failed." 
>> $GITHUB_STEP_SUMMARY - fi diff --git a/.github/workflows/polybench-sim.yml b/.github/workflows/polybench-sim.yml deleted file mode 100644 index 8d5f794..0000000 --- a/.github/workflows/polybench-sim.yml +++ /dev/null @@ -1,152 +0,0 @@ -name: PolyBench Simulation Measurements - -on: - workflow_dispatch: - push: - branches: [main] - paths: - - 'benchmarks/polybench_test.go' - - 'benchmarks/timing_harness.go' - - 'timing/**' - -concurrency: - group: polybench-sim-${{ github.ref }} - cancel-in-progress: true - -jobs: - polybench-sim: - name: Run PolyBench Timing Simulations - runs-on: macos-14 # ARM runner required for ARM64 ELFs - timeout-minutes: 60 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.25' - - - name: Verify PolyBench ELFs exist - run: | - echo "Checking PolyBench ELF files..." - ls -la benchmarks/polybench/*.elf - echo "" - echo "ELF count: $(ls benchmarks/polybench/*.elf 2>/dev/null | wc -l)" - - - name: Run PolyBench timing simulations - id: sim - run: | - echo "Running PolyBench benchmarks individually..." - echo "" - - # Run each test individually with its own timeout so completed - # tests produce CPI output even if later tests timeout. 
- TESTS=( - TestPolybenchATAX - TestPolybenchBiCG - TestPolybenchMVT - TestPolybenchJacobi1D - TestPolybenchGEMM - TestPolybench2MM - TestPolybench3MM - ) - - > polybench_output.txt - - for TEST in "${TESTS[@]}"; do - echo "--- Running $TEST ---" - if go test -v -run "^${TEST}$" -count=1 -timeout 8m ./benchmarks/ 2>&1 | tee -a polybench_output.txt; then - echo "--- $TEST completed ---" - else - echo "--- $TEST failed or timed out ---" - fi - echo "" - done - - echo "" - echo "=== Raw CPI Output ===" - grep "CPI=" polybench_output.txt || echo "No CPI lines found" - - - name: Extract CPI results as JSON - if: always() - run: | - python3 - <<'PYEOF' - import json - import re - import sys - - results = {} - with open("polybench_output.txt") as f: - for line in f: - # Match lines like: polybench_atax: cycles=1234, insts=5678, CPI=1.234, ... - if "CPI=" not in line: - continue - # Extract benchmark name and CPI - match = re.search(r'(polybench_\w+):\s+cycles=(\d+),\s+insts=(\d+),\s+CPI=([\d.]+)', line) - if match: - name = match.group(1) - cycles = int(match.group(2)) - insts = int(match.group(3)) - cpi = float(match.group(4)) - # Map polybench_X -> X for calibration naming - short_name = name.replace("polybench_", "") - # Fix jacobi1d -> jacobi-1d - if short_name == "jacobi1d": - short_name = "jacobi-1d" - results[short_name] = { - "sim_name": name, - "cycles": cycles, - "instructions": insts, - "cpi": cpi, - } - - output = { - "source": "polybench_timing_simulation", - "benchmarks_run": len(results), - "results": results, - } - - with open("polybench_sim_cpis.json", "w") as f: - json.dump(output, f, indent=2) - - print(json.dumps(output, indent=2)) - - if len(results) == 0: - print("\nWARNING: No PolyBench CPI results extracted!", file=sys.stderr) - # Exit 0 so partial results are still uploaded - else: - print(f"\nSuccessfully extracted {len(results)} PolyBench CPI values") - PYEOF - - - name: Post results summary - if: always() - run: | - echo "## PolyBench 
Simulation Results" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - - if [ -f polybench_sim_cpis.json ]; then - python3 -c " - import json - d = json.load(open('polybench_sim_cpis.json')) - print(f'**Benchmarks measured:** {d[\"benchmarks_run\"]}/7') - print() - print('| Benchmark | Cycles | Instructions | CPI |') - print('|-----------|--------|--------------|-----|') - for name, r in sorted(d['results'].items()): - print(f'| {name} | {r[\"cycles\"]} | {r[\"instructions\"]} | {r[\"cpi\"]:.3f} |') - " >> $GITHUB_STEP_SUMMARY - else - echo "No results generated." >> $GITHUB_STEP_SUMMARY - fi - - - name: Upload CPI results - if: always() - uses: actions/upload-artifact@v4 - with: - name: polybench-sim-results - path: | - polybench_sim_cpis.json - polybench_output.txt - retention-days: 90 diff --git a/.github/workflows/spec-bench.yml b/.github/workflows/spec-bench.yml index cb749c0..72c37d7 100644 --- a/.github/workflows/spec-bench.yml +++ b/.github/workflows/spec-bench.yml @@ -5,10 +5,7 @@ name: SPEC Benchmark Daily # Not blocking PRs - informational only on: - schedule: - # Run at 6 AM UTC daily (1 AM EST) - - cron: '0 6 * * *' - workflow_dispatch: # Allow manual triggering + workflow_dispatch: inputs: benchmark: description: 'Specific benchmark to run (blank = all available)'