Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
336 changes: 336 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,336 @@
# Benchmarks the moonfish engine against Stockfish on pull requests.
name: Stockfish Benchmark

on:
  pull_request:
    paths:
      # Only run benchmarks when engine code changes
      - 'moonfish/**'
      - 'opening_book/**'
      - 'pyproject.toml'
      - 'requirements.txt'

# contents: read is enough for checkout; pull-requests: write is used by the
# jobs that add reactions and post the results comment.
permissions:
  contents: read
  pull-requests: write

env:
  # Environment values are strings; quote so no YAML loader retypes them.
  GIT_LFS_SKIP_SMUDGE: "1"
  MOONFISH_OPENING_BOOK: ${{ github.workspace }}/opening_book/cerebellum.bin

jobs:
# Marks the pull request with an "eyes" reaction to signal that a benchmark
# run has started. Best effort: a failed API call never fails the job.
react-start:
  runs-on: ubuntu-latest
  if: github.event_name == 'pull_request'
  steps:
    - name: Add eyes reaction to PR
      env:
        GH_TOKEN: ${{ github.token }}
        # Passed through the environment instead of being interpolated into
        # the script text, so the shell only ever sees them as data.
        REPO: ${{ github.repository }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
      run: |
        gh api "repos/${REPO}/issues/${PR_NUMBER}/reactions" \
          -f content='eyes' --silent || true

# Plays moonfish against Stockfish with cutechess-cli, one matrix leg per
# (chunk, skill_level) pair, then uploads that leg's PGN, log and parsed score.
benchmark:
  runs-on: ubuntu-latest
  needs: react-start
  strategy:
    fail-fast: false
    matrix:
      chunk: [0, 1, 2, 3, 4]  # 5 parallel jobs, 20 rounds each = 100 total per skill level
      skill_level: [3, 4, 5]  # Test against multiple skill levels
  env:
    UV_SYSTEM_PYTHON: "1"

  steps:
    - uses: actions/checkout@v4
      with:
        lfs: false
        fetch-depth: 0

    # With LFS disabled the checkout may contain only a pointer file; replace
    # it with the real book from the release assets.
    - name: Ensure opening book
      run: |
        set -euo pipefail

        if [ -f opening_book/cerebellum.bin ]; then
          if head -1 opening_book/cerebellum.bin | grep -q "git-lfs"; then
            echo "LFS pointer detected; downloading opening book..."
            rm -f opening_book/cerebellum.bin
          else
            echo "Opening book already present."
            exit 0
          fi
        fi

        echo "Downloading full opening book from release..."
        mkdir -p opening_book
        # --fail: an HTTP error must abort the step instead of saving the
        # error page as cerebellum.bin.
        curl --fail --location --retry 3 -o opening_book/cerebellum.bin "https://github.com/luccabb/moonfish/releases/download/v1.0.0/cerebellum.bin"

    # Size sanity check: anything under 10 MB is treated as a pointer file.
    - name: Verify opening book
      run: |
        ls -lh opening_book/cerebellum.bin
        python - <<'PY'
        import os, sys
        path = "opening_book/cerebellum.bin"
        size = os.path.getsize(path)
        print(f"opening book size: {size} bytes")
        if size < 10_000_000:
            print("opening book too small; likely an LFS pointer", file=sys.stderr)
            sys.exit(1)
        PY

    - name: Install uv
      uses: astral-sh/setup-uv@v5
      with:
        enable-cache: true
        cache-dependency-glob: "requirements.txt"

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.10'

    - name: Install dependencies
      run: make install

    # Looks up the starting position so an unreadable book fails here rather
    # than in the middle of the benchmark.
    - name: Validate opening book with python-chess
      run: |
        python - <<'PY'
        import chess
        import chess.polyglot
        book_path = "opening_book/cerebellum.bin"
        with chess.polyglot.MemoryMappedReader(book_path) as reader:
            entry = reader.find(chess.Board())
            print(f"book entry: {entry.move.uci()}")
        PY

    - name: Install Stockfish
      run: |
        sudo apt-get update
        sudo apt-get install -y stockfish

    - name: Install cutechess-cli dependencies
      run: |
        sudo apt-get install -y cmake qt5-qmake qtbase5-dev qtbase5-dev-tools libqt5svg5-dev

    - name: Cache cutechess-cli
      id: cache-cutechess
      uses: actions/cache@v4
      with:
        path: /usr/local/bin/cutechess-cli
        key: cutechess-cli-1.4.0

    # Only compiled when the cached binary was not restored.
    - name: Build cutechess-cli
      if: steps.cache-cutechess.outputs.cache-hit != 'true'
      run: |
        git clone --depth 1 --branch v1.4.0 https://github.com/cutechess/cutechess.git /tmp/cutechess
        cd /tmp/cutechess
        mkdir build && cd build
        cmake ..
        make -j$(nproc)
        sudo cp cutechess-cli /usr/local/bin/

    - name: Build moonfish binary
      run: make build-lichess

    - name: Run Stockfish benchmark
      run: |
        # Without pipefail the pipeline's status is tee's, so a cutechess-cli
        # failure would be reported as success.
        set -o pipefail

        CHUNK=${{ matrix.chunk }}
        SKILL=${{ matrix.skill_level }}
        ROUNDS_PER_CHUNK=20
        CONCURRENCY=20  # single source for both the log line and the -concurrency flag
        SEED=$((CHUNK * 1000 + SKILL * 100 + 42))  # Different seed per chunk/skill for opening variety

        echo "Running moonfish vs Stockfish benchmark (chunk $CHUNK, skill $SKILL)..."
        echo "Stockfish skill level: $SKILL"
        echo "Moonfish: 60s per move, Stockfish: 60+5 time control"
        echo "Rounds: $ROUNDS_PER_CHUNK, Concurrency: $CONCURRENCY, Seed: $SEED"
        echo ""

        cutechess-cli \
          -engine name=moonfish cmd=./dist/moonfish dir=. proto=uci tc=inf st=60 timemargin=10000 \
          -engine name=stockfish cmd=stockfish proto=uci option.Skill\ Level=$SKILL option.Threads=1 tc=60+5 timemargin=10000 \
          -rounds $ROUNDS_PER_CHUNK \
          -repeat \
          -concurrency $CONCURRENCY \
          -pgnout benchmark-skill$SKILL-chunk$CHUNK.pgn \
          -srand $SEED \
          -recover \
          2>&1 | tee benchmark-skill$SKILL-chunk$CHUNK.log

        echo ""
        echo "=== Benchmark Results (Skill $SKILL, Chunk $CHUNK) ==="
        tail -20 benchmark-skill$SKILL-chunk$CHUNK.log

    - name: Parse results
      run: |
        CHUNK=${{ matrix.chunk }}
        SKILL=${{ matrix.skill_level }}
        PGN="benchmark-skill$SKILL-chunk$CHUNK.pgn"
        LOG="benchmark-skill$SKILL-chunk$CHUNK.log"

        # Extract score line from log
        SCORE=$(grep "Score of moonfish vs stockfish:" "$LOG" | tail -1)
        # -n … p prints only on a match, so a missing score line yields an
        # empty string (defaulted to 0 below) instead of echoing the input.
        SCORE_RE='s/.*: ([0-9]+) - ([0-9]+) - ([0-9]+).*/'
        WINS=$(echo "$SCORE" | sed -nE "${SCORE_RE}\1/p")
        LOSSES=$(echo "$SCORE" | sed -nE "${SCORE_RE}\2/p")
        DRAWS=$(echo "$SCORE" | sed -nE "${SCORE_RE}\3/p")
        WINS=${WINS:-0}
        LOSSES=${LOSSES:-0}
        DRAWS=${DRAWS:-0}

        # Count games with Result tag $1 in which moonfish holds colour tag $2.
        # grep -c already prints 0 when nothing matches but exits 1; "|| true"
        # keeps that single 0 ("|| echo 0" would emit a second one).
        count_games() {
          grep -B5 "Result \"$1\"" "$PGN" | grep -c "$2 \"moonfish\"" || true
        }

        # Parse PGN for detailed stats
        # Moonfish as White: wins/losses/draws
        WHITE_WINS=$(count_games '1-0' White)
        WHITE_LOSSES=$(count_games '0-1' White)
        WHITE_DRAWS=$(count_games '1/2-1/2' White)

        # Moonfish as Black: wins/losses/draws
        BLACK_WINS=$(count_games '0-1' Black)
        BLACK_LOSSES=$(count_games '1-0' Black)
        BLACK_DRAWS=$(count_games '1/2-1/2' Black)

        # Save detailed results
        cat > results-skill$SKILL-chunk$CHUNK.txt << EOF
        SKILL=$SKILL
        WINS=$WINS
        LOSSES=$LOSSES
        DRAWS=$DRAWS
        WHITE_WINS=$WHITE_WINS
        WHITE_LOSSES=$WHITE_LOSSES
        WHITE_DRAWS=$WHITE_DRAWS
        BLACK_WINS=$BLACK_WINS
        BLACK_LOSSES=$BLACK_LOSSES
        BLACK_DRAWS=$BLACK_DRAWS
        EOF

        echo "Skill $SKILL, Chunk $CHUNK: W=$WINS L=$LOSSES D=$DRAWS (White: $WHITE_WINS-$WHITE_LOSSES-$WHITE_DRAWS, Black: $BLACK_WINS-$BLACK_LOSSES-$BLACK_DRAWS)"

    # always(): keep the PGN and log available even when an earlier step failed.
    - name: Upload chunk results
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: benchmark-skill${{ matrix.skill_level }}-chunk${{ matrix.chunk }}
        path: |
          benchmark-skill${{ matrix.skill_level }}-chunk${{ matrix.chunk }}.pgn
          benchmark-skill${{ matrix.skill_level }}-chunk${{ matrix.chunk }}.log
          results-skill${{ matrix.skill_level }}-chunk${{ matrix.chunk }}.txt

# Collects every chunk artifact, sums the per-chunk scores for each skill
# level, posts the summary as a PR comment and swaps the PR reaction.
aggregate:
  runs-on: ubuntu-latest
  needs: benchmark
  # NOTE(review): for a matrix job `needs.benchmark.result` appears to be one
  # string, which would make this equivalent to `== 'success'` and skip
  # aggregation when any single chunk fails — confirm that is intended.
  if: ${{ !cancelled() && contains(join(needs.benchmark.result, ','), 'success') }}
  steps:
    - uses: actions/checkout@v4

    - name: Download all chunk results
      uses: actions/download-artifact@v4
      with:
        pattern: benchmark-skill*-chunk*
        merge-multiple: true

    - name: Merge all PGN files
      run: |
        for SKILL in 3 4 5; do
          cat benchmark-skill$SKILL-chunk*.pgn > benchmark-skill$SKILL-all.pgn 2>/dev/null || echo "No PGN files for skill $SKILL"
        done

    - name: Aggregate results
      run: |
        echo "Aggregating results from all chunks..."

        # Build comment body
        {
          echo "## 🔬 Stockfish Benchmark Results"
          echo ""

          for SKILL in 3 4 5; do
            # Initialize counters
            TOTAL_WINS=0 TOTAL_LOSSES=0 TOTAL_DRAWS=0
            TOTAL_WHITE_WINS=0 TOTAL_WHITE_LOSSES=0 TOTAL_WHITE_DRAWS=0
            TOTAL_BLACK_WINS=0 TOTAL_BLACK_LOSSES=0 TOTAL_BLACK_DRAWS=0

            for f in results-skill$SKILL-chunk*.txt; do
              [ -f "$f" ] || continue
              # Read KEY=VALUE pairs as data instead of eval'ing the file:
              # only known keys with purely numeric values are summed, and
              # leading whitespace is stripped *before* the key is matched.
              while IFS='=' read -r key value || [ -n "$key" ]; do
                [[ "$value" =~ ^[0-9]+$ ]] || continue
                # 10# forces base 10 so a leading zero is not read as octal.
                case "$key" in
                  WINS) TOTAL_WINS=$((TOTAL_WINS + 10#$value)) ;;
                  LOSSES) TOTAL_LOSSES=$((TOTAL_LOSSES + 10#$value)) ;;
                  DRAWS) TOTAL_DRAWS=$((TOTAL_DRAWS + 10#$value)) ;;
                  WHITE_WINS) TOTAL_WHITE_WINS=$((TOTAL_WHITE_WINS + 10#$value)) ;;
                  WHITE_LOSSES) TOTAL_WHITE_LOSSES=$((TOTAL_WHITE_LOSSES + 10#$value)) ;;
                  WHITE_DRAWS) TOTAL_WHITE_DRAWS=$((TOTAL_WHITE_DRAWS + 10#$value)) ;;
                  BLACK_WINS) TOTAL_BLACK_WINS=$((TOTAL_BLACK_WINS + 10#$value)) ;;
                  BLACK_LOSSES) TOTAL_BLACK_LOSSES=$((TOTAL_BLACK_LOSSES + 10#$value)) ;;
                  BLACK_DRAWS) TOTAL_BLACK_DRAWS=$((TOTAL_BLACK_DRAWS + 10#$value)) ;;
                esac
              done < <(sed 's/^[[:space:]]*//' "$f")
            done

            TOTAL=$((TOTAL_WINS + TOTAL_LOSSES + TOTAL_DRAWS))
            WHITE_TOTAL=$((TOTAL_WHITE_WINS + TOTAL_WHITE_LOSSES + TOTAL_WHITE_DRAWS))
            BLACK_TOTAL=$((TOTAL_BLACK_WINS + TOTAL_BLACK_LOSSES + TOTAL_BLACK_DRAWS))

            echo "### vs Stockfish Skill Level $SKILL"
            echo ""
            echo "| Metric | Wins | Losses | Draws | Total | Win % |"
            echo "|--------|------|--------|-------|-------|-------|"

            # Each row is emitted only when it has games, which also guards
            # the division below against a zero denominator.
            if [ "$TOTAL" -gt 0 ]; then
              WIN_RATE=$(echo "scale=1; $TOTAL_WINS * 100 / $TOTAL" | bc)
              echo "| **Overall** | $TOTAL_WINS | $TOTAL_LOSSES | $TOTAL_DRAWS | $TOTAL | ${WIN_RATE}% |"
            fi
            if [ "$WHITE_TOTAL" -gt 0 ]; then
              WHITE_WIN_RATE=$(echo "scale=1; $TOTAL_WHITE_WINS * 100 / $WHITE_TOTAL" | bc)
              echo "| As White | $TOTAL_WHITE_WINS | $TOTAL_WHITE_LOSSES | $TOTAL_WHITE_DRAWS | $WHITE_TOTAL | ${WHITE_WIN_RATE}% |"
            fi
            if [ "$BLACK_TOTAL" -gt 0 ]; then
              BLACK_WIN_RATE=$(echo "scale=1; $TOTAL_BLACK_WINS * 100 / $BLACK_TOTAL" | bc)
              echo "| As Black | $TOTAL_BLACK_WINS | $TOTAL_BLACK_LOSSES | $TOTAL_BLACK_DRAWS | $BLACK_TOTAL | ${BLACK_WIN_RATE}% |"
            fi

            # Parse game endings (excluding checkmates, which are covered by win/loss stats)
            PGN="benchmark-skill$SKILL-all.pgn"
            if [ -f "$PGN" ]; then
              ENDINGS=$(grep -oE ', [^}]+\}' "$PGN" | sed 's/, //; s/}//' | grep -v 'mates' | sort | uniq -c | sort -rn)
              if [ -n "$ENDINGS" ]; then
                echo ""
                echo "**Non-checkmate endings:**"
                echo "$ENDINGS" | while read -r count ending; do
                  echo "- $ending: $count"
                done
              fi
            fi
            echo ""
          done

          echo "<details><summary>Configuration</summary>"
          echo ""
          echo "- 5 chunks × 20 rounds × 3 skill levels = 300 total games"
          echo "- Each opening played with colors reversed (-repeat) for fairness"
          echo "- Moonfish: 60s per move"
          echo "- Stockfish: 60+5 time control"
          echo ""
          echo "</details>"
        } > pr-comment.md

        # Also write to step summary
        cat pr-comment.md >> "$GITHUB_STEP_SUMMARY"

    - name: Comment on PR
      if: github.event_name == 'pull_request'
      env:
        GH_TOKEN: ${{ github.token }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
      run: |
        gh pr comment "$PR_NUMBER" --body-file pr-comment.md

    - name: Update PR reaction (eyes -> thumbs up)
      if: github.event_name == 'pull_request'
      env:
        GH_TOKEN: ${{ github.token }}
        # Passed through the environment rather than interpolated into the script.
        REPO: ${{ github.repository }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
      run: |
        # Remove the eyes reaction this workflow added; reactions left by
        # other users are not touched.
        REACTIONS=$(gh api "repos/${REPO}/issues/${PR_NUMBER}/reactions" \
          --jq '.[] | select(.content == "eyes" and .user.login == "github-actions[bot]") | .id' || true)
        for ID in $REACTIONS; do
          gh api -X DELETE "repos/${REPO}/issues/${PR_NUMBER}/reactions/$ID" --silent || true
        done
        # Add thumbs up
        gh api "repos/${REPO}/issues/${PR_NUMBER}/reactions" \
          -f content='+1' --silent || true

    - name: Upload aggregated results
      uses: actions/upload-artifact@v4
      with:
        name: benchmark-aggregated
        path: |
          benchmark-skill*-all.pgn
          results-*.txt
11 changes: 11 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ on:
pull_request:
branches: [ master ]

env:
GIT_LFS_SKIP_SMUDGE: 1

jobs:
test:
runs-on: ${{ matrix.os }}
Expand All @@ -16,6 +19,8 @@ jobs:

steps:
- uses: actions/checkout@v4
with:
lfs: false

- name: Install uv
uses: astral-sh/setup-uv@v5
Expand Down Expand Up @@ -55,6 +60,8 @@ jobs:
UV_SYSTEM_PYTHON: 1
steps:
- uses: actions/checkout@v4
with:
lfs: false

- name: Install uv
uses: astral-sh/setup-uv@v5
Expand All @@ -81,6 +88,8 @@ jobs:
UV_SYSTEM_PYTHON: 1
steps:
- uses: actions/checkout@v4
with:
lfs: false

- name: Install uv
uses: astral-sh/setup-uv@v5
Expand All @@ -107,6 +116,8 @@ jobs:
UV_SYSTEM_PYTHON: 1
steps:
- uses: actions/checkout@v4
with:
lfs: false

- name: Install uv
uses: astral-sh/setup-uv@v5
Expand Down
Loading