diff --git a/.gitignore b/.gitignore index 2e7237d7..d4be3118 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,22 @@ __pycache__/ # C extensions *.so +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# IDE files +.vscode/ +.idea/ +*.swp +*.swo +*~ + # Distribution / packaging .Python build/ @@ -36,7 +52,7 @@ MANIFEST pip-log.txt pip-delete-this-directory.txt -# Unit test / coverage reports +# Testing and coverage htmlcov/ .tox/ .nox/ @@ -49,6 +65,23 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ + +# MADEngine specific +credential.json +data.json +*.log +*.csv +*.html +library_trace.csv +library_perf.csv +perf.csv +perf.html + +# Temporary and build files +temp/ +tmp/ +*.tmp +.pytest_cache/ cover/ # Translations @@ -101,4 +134,12 @@ scripts/ .*_env/ .vscode/ -tmp/ \ No newline at end of file +build_manifest.json +tmp/ +k8s_manifests/ +k8s_results/ +rocprof_output/ +rpd_output/ +slurm_output/ +MagicMock/ +.madengine_session_start \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..76c8fd63 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,36 @@ +# Pre-commit hooks configuration +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-json + - id: check-toml + - id: check-added-large-files + - id: check-merge-conflict + - id: debug-statements + + - repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black + language_version: python3 + + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + + - repo: https://github.com/pycqa/flake8 + rev: 6.0.0 + hooks: + - id: flake8 + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.3.0 + hooks: + - id: mypy + additional_dependencies: [types-requests, types-PyYAML] + exclude: ^(tests/|scripts/) diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..0f2f086c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,148 @@ +# Changelog + +All notable changes to madengine will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Fixed +- **ROCprofv3 Argument Parsing**: Fixed rocprof_wrapper.sh argument parsing with custom commands + - Test `test_can_change_default_behavior_of_profiling_tool_with_additionalContext` now includes required `--` separator + - Without `--`, rocprofv3 would incorrectly parse application command as profiler boolean option + - Error manifested as: `ValueError: invalid truth value bash (type=str)` + - Fix ensures compatibility with both rocprof (legacy) and rocprofv3 (ROCm >= 7.0) +- **Error Pattern Detection**: Fixed false failure detection in HuggingFace GPT2/BERT models + - ROCProf logging messages (E20251230/W20251230 prefixes) no longer trigger false failures + - Added benign pattern list to exclude profiling tool output from error detection + - Made error patterns more specific (e.g., `RuntimeError:` instead of `Error:`) + - Improved performance metric extraction robustness to prevent bash segfaults during profiling + - Tests: Added `TestErrorPatternMatching` class in `tests/unit/test_error_handling.py` +- Removed stale compiled Python file (`__init__.pyc`) from source tree +- Cleaned up unused `typing_extensions` import in `core/console.py` +- Improved type hint accuracy in `Console.sh()` method docstring + +### Documentation +- **ROCprofv3 Usage Guide**: Enhanced documentation for custom profiling commands + - Added section in `docs/profiling.md` explaining the `--` separator requirement + - Added "Best Practices" section in `examples/profiling-configs/README.md` + - Enhanced `rocprof_wrapper.sh` header comments with usage examples + - Clarified that `--` must always be included when using custom rocprof commands + - Documented auto-detection behavior between rocprof (legacy) and rocprofv3 + +### Breaking Changes +- **CLI Unification**: Simplified command-line interface + - ✅ `madengine` is now the unified CLI command (previously `madengine-cli`) + - ❌ Removed legacy `madengine` v1.x CLI (previously `mad.py`) + - ❌ Removed `madengine-cli` alias (use `madengine` instead) + - **Migration**: Simply replace `madengine-cli` with `madengine` in your scripts + - All functionality remains identical, just cleaner command naming + +### Removed +- **Legacy CLI Components**: + - `src/madengine/mad.py` - Legacy CLI entry point (v1.x) + - `src/madengine/tools/run_models.py` - Legacy model runner + - `docs/legacy-cli.md` - Legacy CLI documentation +- Justification: Modern `madengine` CLI (formerly `madengine-cli`) provides all functionality plus K8s, SLURM, and distributed support + +### Security +- **CRITICAL:** Fixed SQL injection vulnerability in legacy database module (`src/madengine/db/database_functions.py`) + - Replaced string formatting with parameterized queries using SQLAlchemy `text()` + - Prevents potential SQL injection attacks in `get_matching_db_entries()` function +- Fixed 4 instances of bare `except:` blocks that could mask critical exceptions + - `kubernetes.py`: Replaced with specific exception types (`ConfigException`, `FileNotFoundError`, `ApiException`) + - `console.py`: Replaced with specific exception types (`OSError`, `ValueError`) for resource cleanup + +### Added +- **ROCprofv3 Profiling Suite** (ROCm 7.0+): 8 pre-configured profiling profiles for AI model benchmarking + - `rocprofv3_compute` - Compute-bound analysis (VALU/SALU instructions, wave execution) + - `rocprofv3_memory` - Memory-bound analysis (cache metrics, memory bandwidth) + - `rocprofv3_communication` - Multi-GPU communication analysis (RCCL traces, inter-GPU transfers) + - `rocprofv3_full` - Comprehensive profiling with all metrics (high overhead) + - `rocprofv3_lightweight` - Minimal overhead profiling (production-friendly) + - `rocprofv3_perfetto` - Perfetto UI compatible trace generation + - `rocprofv3_api_overhead` - API call timing analysis (HIP/HSA/marker traces) + - `rocprofv3_pc_sampling` - Kernel hotspot identification (PC sampling at 1000 Hz) +- **Hardware Counter Definitions**: 4 counter files for targeted profiling scenarios + - `compute_bound.txt` - Wave execution, ALU instructions, wait states + - `memory_bound.txt` - Cache hit rates, memory controller traffic, LDS usage + - `communication_bound.txt` - PCIe traffic, atomic operations, synchronization + - `full_profile.txt` - Comprehensive metrics for complete analysis +- **Profiling Configuration Examples**: 6 ready-to-use JSON configs in `examples/profiling-configs/` + - Single-GPU profiles (compute, memory, lightweight) + - Multi-GPU distributed training profile + - Comprehensive full-stack profiling + - Multi-node SLURM deployment config +- **Comprehensive Launcher Support**: Full K8s and SLURM support for 6 distributed frameworks + - TorchTitan: LLM pre-training with FSDP2+TP+PP+CP parallelism + - vLLM: High-throughput LLM inference with continuous batching + - SGLang: Fast LLM inference with structured generation + - DeepSpeed: ZeRO optimization training (K8s support added) + - Megatron-LM: Large-scale transformer training (K8s + SLURM) + - torchrun: Standard PyTorch DDP/FSDP +- **Centralized Launcher Documentation**: `docs/distributed-launchers.md` with comprehensive guide +- **Example Configurations**: 6 new minimal configs for distributed launchers (K8s) +- Comprehensive development tooling and configuration +- Pre-commit hooks for code quality +- Makefile for common development tasks +- Developer guide with coding standards +- Type checking with mypy +- Code formatting with black and isort +- Enhanced .gitignore for better file exclusions +- CI/CD configuration templates +- **Major Documentation Refactor**: Complete integration of distributed execution and CLI guides into README.md +- Professional open-source project structure with badges and table of contents +- Comprehensive MAD package integration documentation +- Enhanced model discovery and tag system documentation +- Modern deployment scenarios and configuration examples + +### Changed +- **README.md**: Added launcher ecosystem highlights to v2.0 features +- **K8s README**: Updated with new launcher configs and comprehensive launcher section +- **Documentation Structure**: Consolidated all launcher docs into single comprehensive guide +- Improved package initialization and imports +- Replaced print statements with proper logging in main CLI +- Enhanced error handling and logging throughout codebase +- Cleaned up setup.py for better maintainability +- Updated development dependencies in pyproject.toml +- **Complete README.md overhaul**: Merged all documentation into a single, comprehensive source +- Restructured documentation to emphasize MAD package integration +- Enhanced CLI usage examples and distributed execution workflows +- Improved developer contribution guidelines and legacy compatibility notes + +### Changed (Previous) +- Removed Python cache files from repository +- Fixed import organization and structure +- Improved docstring formatting and consistency +- Cleaned up documentation fragmentation + +### Removed +- Unnecessary debug print statements +- Python cache files and build artifacts +- **Legacy documentation files**: `docs/distributed-execution-solution.md` and `docs/madengine-cli-guide.md` +- **Duplicate documentation**: `docs/TORCHTITAN_LAUNCHER.md` (consolidated into distributed-launchers.md) +- Redundant documentation scattered across multiple files + +## [Previous Versions] + +For changes in previous versions, please refer to the git history. + +--- + +## Guidelines for Changelog Updates + +### Categories +- **Added** for new features +- **Changed** for changes in existing functionality +- **Deprecated** for soon-to-be removed features +- **Removed** for now removed features +- **Fixed** for any bug fixes +- **Security** for vulnerability fixes + +### Format +- Keep entries brief but descriptive +- Include ticket/issue numbers when applicable +- Group related changes together +- Use present tense ("Add feature" not "Added feature") +- Target audience: users and developers of the project diff --git a/README.md b/README.md index 28907fcb..d0840b63 100644 --- a/README.md +++ b/README.md @@ -1,426 +1,675 @@ # madengine -Set of interfaces to run various AI models from public MAD. -# What is madengine? +[![Python](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://python.org) +[![CI](https://img.shields.io/badge/CI-GitHub%20Actions-green.svg)](https://github.com/ROCm/madengine/actions) +[![Code Style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Version](https://img.shields.io/badge/version-2.0-brightgreen.svg)](CHANGELOG.md) +[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) + +> **AI model automation and benchmarking platform for local and distributed execution** + +madengine is a modern CLI tool for running Large Language Models (LLMs) and Deep Learning models across local and distributed environments. Built for the [MAD (Model Automation and Dashboarding)](https://github.com/ROCm/MAD) ecosystem, it provides seamless execution from single GPUs to multi-node clusters. + +## 📖 Table of Contents + +- [Key Features](#-key-features) +- [Quick Start](#-quick-start) +- [Commands](#-commands) +- [Documentation](#-documentation) +- [Architecture](#-architecture) +- [Feature Matrix](#-feature-matrix) +- [Usage Examples](#-usage-examples) +- [Model Discovery](#-model-discovery) +- [Performance Profiling](#-performance-profiling) +- [Reporting and Database](#-reporting-and-database) +- [Installation](#-installation) +- [Tips & Best Practices](#-tips--best-practices) +- [Contributing](#-contributing) +- [License](#-license) +- [Links & Resources](#-links--resources) + +## ✨ Key Features + +- **🚀 Modern CLI** - Rich terminal output with Typer and Rich +- **🎯 Simple Deployment** - Run locally or deploy to Kubernetes/SLURM via configuration +- **🔧 Distributed Launchers** - Full support for torchrun, DeepSpeed, Megatron-LM, TorchTitan, vLLM, SGLang +- **🐳 Container-Native** - Docker-based execution with GPU support (ROCm, CUDA) +- **📊 Performance Tools** - Integrated profiling with rocprof/rocprofv3, rocblas, MIOpen, RCCL tracing +- **🎯 ROCprofv3 Profiles** - 8 pre-configured profiles for compute/memory/communication bottleneck analysis +- **🔍 Environment Validation** - TheRock ROCm detection and validation tools +- **⚙️ Intelligent Defaults** - Minimal K8s configs with automatic preset application + +## 🚀 Quick Start -An AI Models automation and dashboarding command-line tool to run LLMs and Deep Learning models locally or remotelly with CI. - -The madengine library is to support AI automation having following features: -- AI Models run reliably on supported platforms and drive software quality -- Simple, minimalistic, out-of-the-box solution that enable confidence on hardware and software stack -- Real-time, audience-relevant AI Models performance metrics tracking, presented in clear, intuitive manner -- Best-practices for handling internal projects and external open-source projects - -# Installation - -madengine is meant to be used in conjunction with [MAD](https://github.com/ROCm/MAD). Below are the steps to set it up and run it using the command line interface (CLI). - -## Clone MAD -``` -git clone git@github.com:ROCm/MAD.git -cd MAD -``` - -## Install madengine - -### Install from source - -``` -# Create virtual environment if necessary -python3 -m venv venv - -# Active the virtual environment venv -source venv/bin/activate +```bash +# Install madengine +pip install git+https://github.com/ROCm/madengine.git -# Clone madengine -git clone git@github.com:ROCm/madengine.git +# Clone MAD package (required for models) +git clone https://github.com/ROCm/MAD.git && cd MAD -# Change current working directory to madengine -cd madengine - -# Install madengine from source: -pip install . +# Discover available models +madengine discover --tags dummy +# Run locally +madengine run --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' ``` -### Install from repo - -You can also install the madengine library directly from the Github repository. +**Results saved to `perf_entry.csv`** -``` -pip install git+https://github.com/ROCm/madengine.git@main -``` - -## Clone +## 📋 Commands -# Run madengine CLI +madengine provides five main commands for model automation and benchmarking: -How to run madengine CLI on your local machine. +| Command | Description | Use Case | +|---------|-------------|----------| +| **[discover](#-model-discovery)** | Find available models | Model exploration and validation | +| **[build](#building-images)** | Build Docker images | Create containerized models | +| **[run](#-usage-examples)** | Execute models | Local and distributed execution | +| **[report](docs/cli-reference.md#report---generate-reports)** | Generate HTML reports | Convert CSV to viewable reports | +| **[database](docs/cli-reference.md#database---upload-to-mongodb)** | Upload to MongoDB | Store results in database | -```shell -(venv) test-node:~/MAD$ madengine --help -usage: madengine [-h] [-v] {run,discover,report,database} ... +**Quick Start:** -A Model automation and dashboarding command-line tool to run LLMs and Deep Learning models locally. +```bash +# Discover models +madengine discover --tags dummy + +# Build image +madengine build --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Run model +madengine run --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Generate report +madengine report to-html --csv-file perf_entry.csv + +# Upload results +madengine database --csv-file perf_entry.csv --db mydb --collection results +``` + +For detailed command options, see the **[CLI Command Reference](docs/cli-reference.md)**. + +## 📚 Documentation + +| Guide | Description | +|-------|-------------| +| [Installation](docs/installation.md) | Complete installation instructions | +| [Usage Guide](docs/usage.md) | Commands, workflows, and examples | +| **[CLI Reference](docs/cli-reference.md)** | **Detailed command options and examples** | +| [Deployment](docs/deployment.md) | Kubernetes and SLURM deployment | +| [Configuration](docs/configuration.md) | Advanced configuration options | +| [Batch Build](docs/batch-build.md) | Selective builds for CI/CD | +| [Launchers](docs/launchers.md) | Distributed training frameworks | +| [Profiling](docs/profiling.md) | Performance analysis tools | +| [Contributing](docs/contributing.md) | How to contribute | + +## 🏗️ Architecture + +``` + ┌────────────────────────────────────────┐ + │ madengine CLI v2.0 │ + │ (Typer + Rich Terminal Interface) │ + └────────────────────────────────────────┘ + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ │ │ │ │ + ┌────▼────┐ ┌───▼────┐ ┌────▼────┐ ┌────▼─────┐ ┌─────▼─────┐ + │discover │ │ build │ │ run │ │ report │ │ database │ + │ │ │ │ │ │ │ │ │ │ + └────┬────┘ └───┬────┘ └────┬────┘ └────┬─────┘ └─────┬─────┘ + │ │ │ │ │ + │ │ │ │ │ + ▼ ▼ ▼ │ │ + ┌────────────────────────────────────┐ │ │ + │ Model Discovery System │ │ │ + │ • Root models (models.json) │ │ │ + │ • Directory models (scripts/) │ │ │ + │ • Dynamic models (get_models.py) │ │ │ + └────────────────────────────────────┘ │ │ + │ │ │ + ▼ │ │ + ┌────────────────────────┐ │ │ + │ Orchestration Layer │ │ │ + │ • BuildOrchestrator │◄───────────────---┘ │ + │ • RunOrchestrator │ │ + └────────┬───────────────┘ │ + │ │ + ┌────────┼────────┐ │ + │ │ │ │ + ┌────▼───┐ ┌─▼──────┐ ┌▼─────────┐ │ + │ Local │ │ K8s │ │ SLURM │ │ + │ Docker │ │ Jobs │ │ Jobs │ │ + └────┬───┘ └─┬──────┘ └┬─────────┘ │ + │ │ │ │ + └───────┼─────────┘ │ + │ │ + ┌───────┴─────────┐ │ + │ Distributed │ │ + │ Launchers │ │ + └───────┬─────────┘ │ + │ │ + ┌──────────┼──────────┐ │ + │ │ │ │ + ┌──▼───┐ ┌──▼───┐ ┌──▼───┐ │ + │Train │ │Train │ │Infer │ │ + │ │ │ │ │ │ │ + └──┬───┘ └──┬───┘ └──┬───┘ │ + │ │ │ │ + torchrun DeepSpeed vLLM │ + TorchTitan Megatron SGLang │ + -LM (Disagg) │ + │ │ + ▼ │ + ┌────────────────┐ │ + │ Performance │ │ + │ Output │ │ + │ (CSV/JSON) │ │ + └────┬───────────┘ │ + │ │ + └──────────────┬────────────────────────────────────---┘ + │ + ┌─────────────┴─────────────┐ + │ │ + ┌────▼─────┐ ┌─────▼──────┐ + │ Reporting│ │ Database │ + │ • to-html│ │ • MongoDB │ + │ • to-email │ • Upload │ + └──────────┘ └────────────┘ +``` + +**Component Flow:** + +1. **CLI Layer** - User interface with 5 commands (discover, build, run, report, database) +2. **Model Discovery** - Find and validate models from MAD package +3. **Orchestration** - BuildOrchestrator & RunOrchestrator manage workflows +4. **Execution Targets** - Local Docker, Kubernetes Jobs, or SLURM Jobs +5. **Distributed Launchers** - Training (torchrun, DeepSpeed, TorchTitan, Megatron-LM) and Inference (vLLM, SGLang) +6. **Performance Output** - CSV/JSON results with metrics +7. **Post-Processing** - Report generation (HTML/Email) and database upload (MongoDB) + +## 🎯 Feature Matrix + +### Supported Launchers & Infrastructure + +| Launcher | Local | Kubernetes | SLURM | Type | Key Features | +|----------|-------|-----------|-------|------|--------------| +| **torchrun** | ✅ | ✅ | ✅ | Training | PyTorch DDP/FSDP, elastic training | +| **DeepSpeed** | ✅ | ✅ | ✅ | Training | ZeRO optimization, pipeline parallelism | +| **Megatron-LM** | ✅ | ✅ | ✅ | Training | Tensor+Pipeline parallel, large transformers | +| **TorchTitan** | ✅ | ✅ | ✅ | Training | FSDP2+TP+PP+CP, Llama 3.1 (8B-405B) | +| **vLLM** | ✅ | ✅ | ✅ | Inference | v1 engine, PagedAttention, Ray cluster | +| **SGLang** | ✅ | ✅ | ✅ | Inference | RadixAttention, structured generation | +| **SGLang Disagg** | ❌ | ✅ | ✅ | Inference | Disaggregated prefill/decode, Mooncake, 3+ nodes | + +**Note:** All launchers support single-GPU, multi-GPU (single node), and multi-node (where infrastructure allows). See [Launchers Guide](docs/launchers.md) for details. + +### Parallelism Capabilities + +| Launcher | Data Parallel | Tensor Parallel | Pipeline Parallel | Context Parallel | Ray Cluster | Architecture | +|----------|--------------|----------------|-------------------|-----------------|-------------|--------------| +| **torchrun** | ✅ DDP/FSDP | ❌ | ❌ | ❌ | ❌ | Unified | +| **DeepSpeed** | ✅ ZeRO | ❌ | ✅ | ❌ | ❌ | Unified | +| **Megatron-LM** | ✅ | ✅ | ✅ | ❌ | ❌ | Unified | +| **TorchTitan** | ✅ FSDP2 | ✅ | ✅ | ✅ | ❌ | Unified | +| **vLLM** | ❌ | ✅ | ✅ | ❌ | ✅ Multi-node | Unified | +| **SGLang** | ❌ | ✅ | ❌ | ❌ | ✅ Multi-node | Unified | +| **SGLang Disagg** | ❌ | ✅ | ✅ (via disagg) | ❌ | ✅ Multi-node | Disaggregated | + +### Infrastructure Capabilities + +| Feature | Local | Kubernetes | SLURM | +|---------|-------|-----------|-------| +| **Execution** | Docker containers | K8s Jobs | SLURM jobs | +| **Multi-Node** | ❌ | ✅ Indexed Jobs | ✅ Job arrays | +| **Resource Mgmt** | Manual | Declarative (YAML) | Batch scheduler | +| **Monitoring** | Docker logs | kubectl/dashboard | squeue/scontrol | +| **Auto-scaling** | ❌ | ✅ | ❌ | +| **Network** | Host | CNI plugin | InfiniBand/Ethernet | + +## 💻 Usage Examples + +### Local Execution -optional arguments: - -h, --help show this help message and exit - -v, --version show program's version number and exit +```bash +# Single GPU +madengine run --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Multi-GPU with torchrun (DDP/FSDP) +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "0,1,2,3", + "distributed": { + "launcher": "torchrun", + "nproc_per_node": 4 + } + }' + +# With DeepSpeed (ZeRO optimization) +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "all", + "distributed": { + "launcher": "deepspeed", + "nproc_per_node": 8 + } + }' +``` + +### Kubernetes Deployment -Commands: - Available commands for running models, generating reports, and toolings. +```bash +# Minimal config (auto-defaults applied) +madengine run --tags model \ + --additional-context '{"k8s": {"gpu_count": 2}}' + +# Multi-node inference with vLLM +madengine run --tags model \ + --additional-context '{ + "k8s": { + "namespace": "ml-team", + "gpu_count": 8 + }, + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4 + } + }' + +# SGLang with structured generation +madengine run --tags model \ + --additional-context '{ + "k8s": {"gpu_count": 4}, + "distributed": { + "launcher": "sglang", + "nproc_per_node": 4 + } + }' +``` + +### SLURM Deployment - {run,discover,report,database} - run Run models on container - discover Discover the models - report Generate report of models - database CRUD for database -``` +```bash +# Build phase (local or CI) +madengine build --tags model \ + --registry gcr.io/myproject \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Deploy phase (on SLURM login node) +madengine run --manifest-file build_manifest.json \ + --additional-context '{ + "slurm": { + "partition": "gpu", + "nodes": 4, + "gpus_per_node": 8, + "time": "24:00:00" + }, + "distributed": { + "launcher": "torchtitan", + "nnodes": 4, + "nproc_per_node": 8 + } + }' +``` + +### Common Workflows + +**Development → Testing → Production:** -## Run models locally +```bash +# 1. Develop locally with single GPU +madengine run --tags model \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' -Command to run LLMs and Deep Learning Models on container. +# 2. Test multi-GPU locally +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "0,1", + "distributed": {"launcher": "torchrun", "nproc_per_node": 2} + }' -``` -# An example CLI command to run a model -madengine run --tags pyt_huggingface_bert --live-output --additional-context "{'guest_os': 'UBUNTU'}" -``` +# 3. Build and push to registry +madengine build --tags model \ + --registry docker.io/myorg \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' -```shell -(venv) test-node:~/MAD$ madengine run --help -usage: madengine run [-h] [--tags TAGS [TAGS ...]] [--timeout TIMEOUT] [--live-output] [--clean-docker-cache] [--additional-context-file ADDITIONAL_CONTEXT_FILE] - [--additional-context ADDITIONAL_CONTEXT] [--data-config-file-name DATA_CONFIG_FILE_NAME] [--tools-json-file-name TOOLS_JSON_FILE_NAME] - [--generate-sys-env-details GENERATE_SYS_ENV_DETAILS] [--force-mirror-local FORCE_MIRROR_LOCAL] [--keep-alive] [--keep-model-dir] - [--skip-model-run] [--disable-skip-gpu-arch] [-o OUTPUT] - -Run LLMs and Deep Learning models on container - -optional arguments: - -h, --help show this help message and exit - --tags TAGS [TAGS ...] - tags to run (can be multiple). - --timeout TIMEOUT time out for model run in seconds; Overrides per-model timeout if specified or default timeout of 7200 (2 hrs). Timeout of 0 will never - timeout. - --live-output prints output in real-time directly on STDOUT - --clean-docker-cache rebuild docker image without using cache - --additional-context-file ADDITIONAL_CONTEXT_FILE - additonal context, as json file, to filter behavior of workloads. Overrides detected contexts. - --additional-context ADDITIONAL_CONTEXT - additional context, as string representation of python dict, to filter behavior of workloads. Overrides detected contexts and additional- - context-file. - --data-config-file-name DATA_CONFIG_FILE_NAME - custom data configuration file. - --tools-json-file-name TOOLS_JSON_FILE_NAME - custom tools json configuration file. - --generate-sys-env-details GENERATE_SYS_ENV_DETAILS - generate system config env details by default - --force-mirror-local FORCE_MIRROR_LOCAL - Path to force all relevant dataproviders to mirror data locally on. - --keep-alive keep Docker container alive after run; will keep model directory after run - --keep-model-dir keep model directory after run - --skip-model-run skips running the model; will not keep model directory after run unless specified through keep-alive or keep-model-dir - --disable-skip-gpu-arch - disables skipping model based on gpu architecture - -o OUTPUT, --output OUTPUT - output file +# 4. Deploy to Kubernetes +madengine run --manifest-file build_manifest.json ``` -For each model in models.json, the script -- builds docker images associated with each model. The images are named 'ci-$(model_name)', and are not removed after the script completes. -- starts the docker container, with name, 'container_$(model_name)'. The container should automatically be stopped and removed whenever the script exits. -- clones the git 'url', and runs the 'script' -- compiles the final perf.csv and perf.html - -### Tag functionality for running model - -With the tag functionality, the user can select a subset of the models, that have the corresponding tags matching user specified tags, to be run. User specified tags can be specified with the `--tags` argument. If multiple tags are specified, all models that match any tag is selected. -Each model name in models.json is automatically a tag that can be used to run that model. Tags are also supported in comma-separated form as a Jenkins parameter. - - -#### Search models with tags +**CI/CD Pipeline:** -Use cases of running models with static and dynamic search. Tags option supports searching models in models.json, scripts/model_dir/models.json, and scripts/model_dir/get_models_json.py. A user can add new models not only to the models.json file of DLM but also to the model folder in Flexible. To do this, the user needs to follow these steps: - -Update models.json: Add the new model's configuration details to the models.json file. This includes specifying the model's name, version, and any other relevant metadata. -Place Model Files: Copy the model files into the appropriate directory within the model folder in Flexible. Ensure that the folder structure and file naming conventions match the expected format. - -``` -# 1. run models in ~/MAD/models.json -(venv) test-node:~/MAD$ madengine run --tags dummy --live-output - -# 2. run model in ~/MAD/scripts/dummy2/models.json -(venv) test-node:~/MAD$ madengine run --tags dummy2:dummy_2 --live-output +```bash +# Batch build (selective rebuilds) +madengine build --batch-manifest batch.json \ + --registry docker.io/myorg -# 3. run model in ~/MAD/scripts/dummy3/get_models_json.py -(venv) test-node:~/MAD$ madengine run --tags dummy3:dummy_3 --live-output +# Run tests +madengine run --manifest-file build_manifest.json \ + --additional-context '{"k8s": {"namespace": "ci-test"}}' -# 4. run model with configurations -(venv) test-node:~/MAD$ madengine run --tags dummy2:dummy_2:batch_size=512:in=32:out=16 --live-output +# Generate and email reports +madengine report to-email --directory ./results --output ci_report.html -# 5. run model with configurations -(venv) test-node:~/MAD$ madengine run --tags dummy3:dummy_3:batch_size=512:in=32:out=16 --live-output +# Upload to database +madengine database --csv-file perf_entry.csv \ + --database-name ci_db --collection-name test_results ``` -The configs of batch_size512:in32:out16 will be pass to environment variables and build arguments of docker. - -### Custom timeouts -The default timeout for model run is 2 hrs. This can be overridden if the model in models.json contains a `'timeout' : TIMEOUT` entry. Both the default timeout and/or timeout specified in models.json can be overridden using `--timeout TIMEOUT` command line argument. Having `TIMEOUT` set to 0 means that the model run will never timeout. - -### Live output functionality -By default, `madengine` is silent. The output is piped into log files. By specifying `--live-output`, the output is printed in real-time to STDOUT. - -### Contexts -Contexts are run-time parameters that change how the model is executed. Some contexts are auto-detected. Detected contexts may be over-ridden. Contexts are also used to filter Dockerfile used in model. - -For more details, see [How to provide contexts](docs/how-to-provide-contexts.md) +See [Usage Guide](docs/usage.md), [Configuration Guide](docs/configuration.md), and [CLI Reference](docs/cli-reference.md) for more examples. -### Credentials -Credentials to clone model git urls are provided in a centralized `credential.json` file. Models that require special credentials for cloning have a special `cred` field in the model definition in `models.json`. This field denotes the specific credential in `credential.json` to use. Public models repositories can skip the `cred` field. +### Building Images -There are several types of credentials supported. +```bash +# Build single model +madengine build --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' -1. For HTTP/HTTPS git urls, `username` and `password` should be provided in the credential. For Source Code Management(SCM) systems that support Access Tokens, the token can be substituted for the `password` field. The `username` and `password` will be passed as a docker build argument and a container environment variable in the docker build and run steps. Fore example, for `"cred":"AMD_GITHUB"` field in `models.json` and entry `"AMD_GITHUB": { "username": "github_username", "password":"pass" }` in `credential.json` the following docker build arguments and container environment variables will be added: `AMD_GITHUB_USERNAME="github_username"` and `AMD_GITHUB_PASSWORD="pass"`. - -2. For SSH git urls, `username` and `ssh_key_file` should be provided in the credential. The `username` is the SSH username, and `ssh_key_file` is the private ssh key, that has been registed with the SCM system. -Due to legal requirements, the Credentials to access all models is not provided by default in DLM. Please contact the model owner if you wish to access and run the model. +# Build with registry (for distributed deployment) +madengine build --tags model1 model2 \ + --registry localhost:5000 \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' -3. For NAS urls, `HOST`, `PORT`, `USERNAME`, and `PASSWORD` should be provided in the credential. Please check env variables starting with NAS in [Environment Variables] (https://github.com/ROCm/madengine/blob/main/README.md#environment-variables) +# Build for multiple GPU architectures +madengine build --tags model \ + --target-archs gfx908 gfx90a gfx942 \ + --registry gcr.io/myproject -3. For AWS S3 urls, `USERNAME`, and `PASSWORD` should be provided in the credential with var name as MAD_AWS_S3 as mentioned in [Environment Variables] (https://github.com/ROCm/madengine/blob/main/README.md#environment-variables) +# Batch build mode (selective builds for CI/CD) +madengine build --batch-manifest examples/build-manifest/batch.json \ + --registry docker.io/myorg +# Clean rebuild (no Docker cache) +madengine build --tags model --clean-docker-cache \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +``` -### Local data provider -The DLM user may wish to run a model locally multiple times, with the input data downloaded once, and reused subsquently. This functionality is only supported on models that support the Data Provider functionality. That is, the model specification in `models.json` have the `data` field, which points to a data specification in `data.json`. +**Output:** Creates `build_manifest.json` with built image names and configurations. -To use existing data on a local path, add to the data specification, using a `local` field within `data.json`. By default, this path is mounted read-only. To change this path to read-write, specify the `readwrite` field to `'true'` in the data configuration. +See [Batch Build Guide](docs/batch-build.md) and examples in [`examples/build-manifest/`](examples/build-manifest/). -If no data exists in local path, a local copy of data can be downloaded using by setting the `mirrorlocal` field in data specification in `data.json`. Not all providers support `mirrorlocal`. For the ones that do support this feature, the remote data is mirrored on this host path during the first run. In subsequent runs, the data may be reused through synchronization mechanisms. If the user wishes to skip the remote synchronization, the same location can be set as a `local` data provider in data.json, with higher precedence, or as the only provider for the data, by locally editing `data.json`. +## 🔍 Model Discovery -Alternatively, the command-line argument, `--force-mirror-local` forces local mirroring on *all* workloads, to the provided FORCEMIRRORLOCAL path. +madengine discovers models from the MAD package using three methods: -## Discover models +```bash +# Root models (models.json) +madengine discover --tags pyt_huggingface_bert -Commands for discovering models through models.json, scripts/{model_dir}/models.json, or scripts/{model_dir}/get_models_json.py +# Directory-specific (scripts/{dir}/models.json) +madengine discover --tags dummy2:dummy_2 +# Dynamic with parameters (scripts/{dir}/get_models_json.py) +madengine discover --tags dummy3:dummy_3:batch_size=512 ``` -(venv) test-node:~/MAD$ madengine discover --help -usage: madengine discover [-h] [--tags TAGS [TAGS ...]] -Discover the models +## 📊 Performance Profiling -optional arguments: - -h, --help show this help message and exit - --tags TAGS [TAGS ...] - tags to discover models (can be multiple). -``` +madengine includes integrated profiling tools for AMD ROCm: -Use cases about how to discover models: +```bash +# GPU profiling with rocprof +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [{"name": "rocprof"}] + }' + +# ROCprofv3 (ROCm 7.0+) - Advanced profiling with pre-configured profiles +madengine run --tags model \ + --additional-context '{"tools": [{"name": "rocprofv3_compute"}]}' + +# Use configuration files for complex setups +madengine run --tags model \ + --additional-context-file examples/profiling-configs/rocprofv3_multi_gpu.json + +# Library tracing (rocBLAS, MIOpen, Tensile, RCCL) +madengine run --tags model \ + --additional-context '{"tools": [{"name": "rocblas_trace"}]}' + +# Power and VRAM monitoring +madengine run --tags model \ + --additional-context '{"tools": [ + {"name": "gpu_info_power_profiler"}, + {"name": "gpu_info_vram_profiler"} + ]}' + +# Multiple tools (stackable) +madengine run --tags model \ + --additional-context '{"tools": [ + {"name": "rocprofv3_memory"}, + {"name": "rocblas_trace"}, + {"name": "gpu_info_power_profiler"} + ]}' +``` + +**Available Tools:** + +| Tool | Purpose | Output | +|------|---------|--------| +| `rocprof` | GPU kernel profiling | Kernel timings, occupancy | +| `rocprofv3_compute` | Compute-bound analysis (ROCm 7.0+) | ALU metrics, wave execution | +| `rocprofv3_memory` | Memory-bound analysis (ROCm 7.0+) | Cache hits, bandwidth | +| `rocprofv3_communication` | Multi-GPU communication (ROCm 7.0+) | RCCL traces, inter-GPU transfers | +| `rocprofv3_lightweight` | Minimal overhead profiling (ROCm 7.0+) | HIP and kernel traces | +| `rocblas_trace` | rocBLAS library calls | Function calls, arguments | +| `miopen_trace` | MIOpen library calls | Conv/pooling operations | +| `tensile_trace` | Tensile GEMM library | Matrix multiply details | +| `rccl_trace` | RCCL collective ops | Communication patterns | +| `gpu_info_power_profiler` | GPU power consumption | Power usage over time | +| `gpu_info_vram_profiler` | GPU memory usage | VRAM utilization | +| `therock_check` | TheRock ROCm validation | Installation detection | + +**ROCprofv3 Profiles** (ROCm 7.0+): + +madengine provides 8 pre-configured ROCprofv3 profiles for different bottleneck scenarios: + +- `rocprofv3_compute` - Compute-bound workloads (transformers, dense ops) +- `rocprofv3_memory` - Memory-bound workloads (large batches, high-res) +- `rocprofv3_communication` - Multi-GPU distributed training +- `rocprofv3_full` - Comprehensive profiling (all metrics, high overhead) +- `rocprofv3_lightweight` - Minimal overhead (production-friendly) +- `rocprofv3_perfetto` - Perfetto UI compatible traces +- `rocprofv3_api_overhead` - API call timing analysis +- `rocprofv3_pc_sampling` - Kernel hotspot identification + +See [`examples/profiling-configs/`](examples/profiling-configs/) for ready-to-use configuration files. + +**TheRock Validation:** +```bash +# Validate TheRock installation (AMD's pip-based ROCm) +madengine run --tags dummy_therock \ + --additional-context '{"tools": [{"name": "therock_check"}]}' ``` -# 1 discover all models in DLM -(venv) test-node:~/MAD$ madengine discover - -# 2. discover specified model using tags in models.json of DLM -(venv) test-node:~/MAD$ madengine discover --tags dummy - -# 3. discover specified model using tags in scripts/{model_dir}/models.json with static search i.e. models.json -(venv) test-node:~/MAD$ madengine discover --tags dummy2/dummy_2 -# 4. discover specified model using tags in scripts/{model_dir}/get_models_json.py with dynamic search i.e. get_models_json.py -(venv) test-node:~/MAD$ madengine discover --tags dummy3/dummy_3 +See [Profiling Guide](docs/profiling.md) for detailed usage and analysis. -# 5. pass additional args to your model script from CLI -(venv) test-node:~/MAD$ madengine discover --tags dummy3/dummy_3:bs16 - -# 6. get multiple models using tags -(venv) test-node:~/MAD$ madengine discover --tags pyt_huggingface_bert pyt_huggingface_gpt2 -``` +## 📊 Reporting and Database -Note: You cannot use a backslash '/' or a colon ':' in a model name or a tag for a model in `models.json` or `get_models_json.py` +### Generate Reports -## Generate reports +Convert performance CSV files to HTML reports: -Commands for generating reports. +```bash +# Single CSV to HTML +madengine report to-html --csv-file perf_entry.csv +# Consolidated email report (all CSVs in directory) +madengine report to-email --directory ./results --output summary.html ``` -(venv) test-node:~/MAD$ madengine report --help -usage: madengine report [-h] {update-perf,to-html,to-email} ... -optional arguments: - -h, --help show this help message and exit +### Upload to Database + +Store performance results in MongoDB: -Report Commands: - Available commands for generating reports. +```bash +# Set MongoDB connection +export MONGO_HOST=mongodb.example.com +export MONGO_PORT=27017 +export MONGO_USER=myuser +export MONGO_PASSWORD=mypassword - {update-perf,to-html,to-email} - update-perf Update perf.csv to database - to-html Convert CSV to HTML report of models - to-email Convert CSV to Email of models +# Upload CSV to MongoDB +madengine database --csv-file perf_entry.csv \ + --database-name performance_db \ + --collection-name model_runs ``` -### Report command - Update perf CSV to database +**Use Cases:** +- Track performance over time +- Compare results across different configurations +- Build performance dashboards +- Automated CI/CD reporting -Update perf.csv to database +See [CLI Reference](docs/cli-reference.md) for complete options. -``` -(venv) test-node:~/MAD$ madengine report update-perf --help -usage: madengine report update-perf [-h] [--single_result SINGLE_RESULT] [--exception-result EXCEPTION_RESULT] [--failed-result FAILED_RESULT] - [--multiple-results MULTIPLE_RESULTS] [--perf-csv PERF_CSV] [--model-name MODEL_NAME] [--common-info COMMON_INFO] - -Update performance metrics of models perf.csv to database. - -optional arguments: - -h, --help show this help message and exit - --single_result SINGLE_RESULT - path to the single result json - --exception-result EXCEPTION_RESULT - path to the single result json - --failed-result FAILED_RESULT - path to the single result json - --multiple-results MULTIPLE_RESULTS - path to the results csv - --perf-csv PERF_CSV - --model-name MODEL_NAME - --common-info COMMON_INFO -``` +## 📦 Installation -### Report command - Convert CSV to HTML +```bash +# Basic installation +pip install git+https://github.com/ROCm/madengine.git -Convert CSV to HTML report of models +# With Kubernetes support +pip install "madengine[kubernetes] @ git+https://github.com/ROCm/madengine.git" +# Development installation +git clone https://github.com/ROCm/madengine.git +cd madengine && pip install -e ".[dev]" ``` -(venv) test-node:~/MAD$ madengine report to-html --help -usage: madengine report to-html [-h] [--csv-file-path CSV_FILE_PATH] - -Convert CSV to HTML report of models. -optional arguments: - -h, --help show this help message and exit - --csv-file-path CSV_FILE_PATH -``` +See [Installation Guide](docs/installation.md) for detailed instructions. -### Report command - Convert CSV to Email +## 💡 Tips & Best Practices -Convert CSV to Email report of models +### General Usage -``` -(venv) test-node:~/MAD$ madengine report to-email --help -usage: madengine report to-email [-h] [--csv-file-path CSV_FILE_PATH] +- **Use configuration files** for complex setups instead of long command lines +- **Test locally first** with single GPU before scaling to multi-node +- **Enable verbose logging** (`--verbose`) when debugging issues +- **Use `--live-output`** for real-time monitoring of long-running operations -Convert CSV to Email of models. +### Build & Deployment -optional arguments: - -h, --help show this help message and exit - --csv-file-path CSV_FILE_PATH - Path to the directory containing the CSV files. -``` +- **Separate build and run phases** for distributed deployments +- **Use registries** for multi-node execution (K8s/SLURM) +- **Use batch build mode** for CI/CD to optimize build times +- **Specify `--target-archs`** when building for multiple GPU architectures -## Database +### Performance -Commands for database, such as create and update table of DB. +- **Start with small timeouts** and increase as needed +- **Use profiling tools** to identify bottlenecks +- **Monitor GPU utilization** with `gpu_info_power_profiler` +- **Profile library calls** with rocBLAS/MIOpen tracing -``` -(venv) test-node:~/MAD$ madengine database --help -usage: madengine database [-h] {create-table,update-table,upload-mongodb} ... +### Troubleshooting -optional arguments: - -h, --help show this help message and exit +```bash +# Check model is available +madengine discover --tags your_model -Database Commands: - Available commands for database, such as creating and updating table in DB. +# Verbose output for debugging +madengine run --tags model --verbose --live-output - {create-table,update-table,upload-mongodb} - create-table Create table in DB - update-table Update table in DB - upload-mongodb Update table in DB -``` +# Keep container alive for inspection +madengine run --tags model --keep-alive -### Database - Create Table +# Clean rebuild if build fails +madengine build --tags model --clean-docker-cache --verbose ``` -(venv) test-node:~/MAD$ madengine database create-table --help -usage: madengine database create-table [-h] [-v] -Create table in DB. +**Common Issues:** +- **False failures with profiling**: If models show FAILURE but have performance metrics, see [Profiling Troubleshooting](docs/profiling.md#false-failure-detection-with-rocprof) +- **ROCProf log errors**: Messages like `E20251230` are informational logs, not errors (fixed in v2.0+) +- **Configuration errors**: Validate JSON with `python -m json.tool your-config.json` -optional arguments: - -h, --help show this help message and exit - -v, --verbose verbose output -``` +## 🤝 Contributing -### Database - Update Table -``` -(venv) test-node:~/MAD$ madengine database update-table --help -usage: madengine database update-table [-h] [--csv-file-path CSV_FILE_PATH] [--model-json-path MODEL_JSON_PATH] +We welcome contributions! See [Contributing Guide](docs/contributing.md) for details. -Update table in DB. +```bash +git clone https://github.com/ROCm/madengine.git +cd madengine +python3 -m venv venv && source venv/bin/activate +pip install -e ".[dev]" -optional arguments: - -h, --help show this help message and exit - --csv-file-path CSV_FILE_PATH - Path to the csv file - --model-json-path MODEL_JSON_PATH - Path to the model json file -``` +# Run all tests +pytest -### Database - Upload MongoDB +# Run specific test module +pytest tests/unit/test_error_handling.py -v +# Run error pattern tests +pytest tests/unit/test_error_handling.py::TestErrorPatternMatching -v ``` -(venv) test-node:~/MAD$ madengine database upload-mongodb --help -usage: madengine database upload-mongodb [-h] [--type TYPE] [--file-path FILE_PATH] [--name NAME] -Update table in DB. +## 📄 License -optional arguments: - -h, --help show this help message and exit - --type TYPE type of document to upload: job or run - --file-path FILE_PATH - total path to directory where perf_entry.csv, *env.csv, and *.log are stored - --name NAME name of model to upload -``` +MIT License - see [LICENSE](LICENSE) file for details. -## Tools in madengine +## 🔗 Links & Resources -There are some tools distributed with madengine together. They work with madengine CLI to profile GPU and get trace of ROCm libraries. +### Documentation +- **[CLI Reference](docs/cli-reference.md)** - Complete command options +- **[Usage Guide](docs/usage.md)** - Workflows and examples +- **[Deployment Guide](docs/deployment.md)** - Kubernetes/SLURM deployment +- **[Configuration Guide](docs/configuration.md)** - Advanced configuration +- **[All Docs](docs/)** - Complete documentation index -### Tools - GPU Info Profile +### External Resources +- **MAD Package**: https://github.com/ROCm/MAD +- **Issues & Support**: https://github.com/ROCm/madengine/issues +- **ROCm Documentation**: https://rocm.docs.amd.com/ -Profile GPU usage of running LLMs and Deep Learning models. +### Getting Help -``` -(venv) test-node:~/MAD$ madengine run --tags pyt_huggingface_bert --additional-context "{'guest_os': 'UBUNTU','tools': [{'name':'rocprof'}]}" +**Command Help:** +```bash +madengine --help # Main help +madengine --help # Command-specific help +madengine report --help # Sub-app help +madengine report to-html --help # Sub-command help ``` -### Tools - Trace Libraries of ROCm +**Quick Checks:** +```bash +# Verify installation +madengine --version -Trace library usage of running LLMs and Deep Learning models. A demo of running model with tracing rocBlas. +# Discover available models +madengine discover -``` -(venv) test-node:~/MAD$ madengine run --tags pyt_huggingface_bert --additional-context "{'guest_os': 'UBUNTU','tools': [{'name':'rocblas_trace'}]}" +# Check specific model +madengine discover --tags your_model --verbose ``` -## Environment Variables +**Troubleshooting:** +- Check [CLI Reference](docs/cli-reference.md) for all command options +- Enable `--verbose` flag for detailed error messages +- See [Usage Guide](docs/usage.md) troubleshooting section +- Report issues: https://github.com/ROCm/madengine/issues -Madengine also exposes environment variables to allow for models location setting or data loading at DLM/MAD runtime. +--- -| Field | Description | -|-----------------------------| ----------------------------------------------------------------------------------| -| MODEL_DIR | the location of models dir | -| PUBLIC_GITHUB_ROCM_KEY | username and token of GitHub | -| MAD_AWS_S3 | the username and password of AWS S3 | -| NAS_NODES | the list of credentials of NAS Nodes | +## ⚠️ Migration Notice (v2.0.0+) -Examples for running models using environment variables. -```bash -# Apply AWS S3 -MAD_AWS_S3='{"USERNAME":"username","PASSWORD":"password"}' madengine run --tags dummy_data_aws --live-output +The CLI has been unified! Starting from v2.0.0: +- ✅ Use `madengine` (unified modern CLI with K8s, SLURM, distributed support) +- ❌ Legacy v1.x CLI has been removed -# Apply customized NAS -NAS_NODES=[{"HOST":"hostname","PORT":"22","USERNAME":"username","PASSWORD":"password"}] madengine run --tags dummy_data_austin_nas --live-output -``` - -## Unit Test -Run pytest to validate unit tests of MAD Engine. +--- -``` -pytest -v -s -``` +**Code Quality**: Clean codebase with no dead code, comprehensive test coverage, and following Python best practices. diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..ca9ebb4a --- /dev/null +++ b/docs/README.md @@ -0,0 +1,211 @@ +# madengine Documentation + +Complete documentation for madengine - AI model automation and distributed benchmarking platform. + +## 📚 Documentation Index + +### Getting Started + +| Guide | Description | +|-------|-------------| +| [Installation](installation.md) | Complete installation instructions | +| [Usage Guide](usage.md) | Commands, configuration, and examples | + +### Configuration & Deployment + +| Guide | Description | +|-------|-------------| +| [Configuration](configuration.md) | Advanced configuration options | +| [Batch Build](batch-build.md) | Selective builds with batch manifests | +| [Deployment](deployment.md) | Kubernetes and SLURM deployment | +| [Launchers](launchers.md) | Multi-node training frameworks | + +### Advanced Topics + +| Guide | Description | +|-------|-------------| +| [Profiling](profiling.md) | Performance analysis tools | +| [Contributing](contributing.md) | How to contribute to madengine | + +### Reference + +| Guide | Description | +|-------|-------------| +| **[CLI Reference](cli-reference.md)** | **Complete command-line options and examples** | +| [Recent Fixes](RECENT_FIXES.md) | Latest bug fixes and improvements | + +## 🏗️ Architecture + +``` + ┌────────────────────────────────────────┐ + │ madengine CLI v2.0 │ + │ (Typer + Rich Terminal Interface) │ + └────────────────────────────────────────┘ + │ + ┌─────────────────────────────┼─────────────────────────────┐ + │ │ │ │ │ + ┌────▼────┐ ┌───▼────┐ ┌────▼────┐ ┌────▼─────┐ ┌─────▼─────┐ + │discover │ │ build │ │ run │ │ report │ │ database │ + │ │ │ │ │ │ │ │ │ │ + └────┬────┘ └───┬────┘ └────┬────┘ └────┬─────┘ └─────┬─────┘ + │ │ │ │ │ + │ │ │ │ │ + ▼ ▼ ▼ │ │ + ┌────────────────────────────────────┐ │ │ + │ Model Discovery System │ │ │ + │ • Root models (models.json) │ │ │ + │ • Directory models (scripts/) │ │ │ + │ • Dynamic models (get_models.py) │ │ │ + └────────────────────────────────────┘ │ │ + │ │ │ + ▼ │ │ + ┌────────────────────────┐ │ │ + │ Orchestration Layer │ │ │ + │ • BuildOrchestrator │◄───────────────┘ │ + │ • RunOrchestrator │ │ + └────────┬───────────────┘ │ + │ │ + ┌────────┼────────┐ │ + │ │ │ │ + ┌────▼───┐ ┌─▼──────┐ ┌▼─────────┐ │ + │ Local │ │ K8s │ │ SLURM │ │ + │ Docker │ │ Jobs │ │ Jobs │ │ + └────┬───┘ └─┬──────┘ └┬─────────┘ │ + │ │ │ │ + └───────┼─────────┘ │ + │ │ + ┌───────┴─────────┐ │ + │ Distributed │ │ + │ Launchers │ │ + └───────┬─────────┘ │ + │ │ + ┌──────────┼──────────┐ │ + │ │ │ │ + ┌──▼───┐ ┌──▼───┐ ┌──▼───┐ │ + │Train │ │Train │ │Infer │ │ + │ │ │ │ │ │ │ + └──┬───┘ └──┬───┘ └──┬───┘ │ + │ │ │ │ + torchrun DeepSpeed vLLM │ + TorchTitan Megatron SGLang │ + -LM (Disagg) │ + │ │ + ▼ │ + ┌────────────────┐ │ + │ Performance │ │ + │ Output │ │ + │ (CSV/JSON) │ │ + └────┬───────────┘ │ + │ │ + └──────────────┬────────────────────────────────────┘ + │ + ┌─────────────┴─────────────┐ + │ │ + ┌────▼─────┐ ┌─────▼──────┐ + │ Reporting│ │ Database │ + │ • to-html│ │ • MongoDB │ + │ • to-email │ • Upload │ + └──────────┘ └────────────┘ +``` + +**Component Flow:** + +1. **CLI Layer** - User interface with 5 commands (discover, build, run, report, database) +2. **Model Discovery** - Find and validate models from MAD package +3. **Orchestration** - BuildOrchestrator & RunOrchestrator manage workflows +4. **Execution Targets** - Local Docker, Kubernetes Jobs, or SLURM Jobs +5. **Distributed Launchers** - Training (torchrun, DeepSpeed, TorchTitan, Megatron-LM) and Inference (vLLM, SGLang) +6. **Performance Output** - CSV/JSON results with metrics +7. **Post-Processing** - Report generation (HTML/Email) and database upload (MongoDB) + +## 🚀 Quick Links + +- **Main Repository**: https://github.com/ROCm/madengine +- **MAD Package**: https://github.com/ROCm/MAD +- **Issues**: https://github.com/ROCm/madengine/issues +- **ROCm Documentation**: https://rocm.docs.amd.com/ + +## 📖 Documentation by Use Case + +### I want to... + +**Run a model locally** +→ [Installation](installation.md) → [Usage Guide](usage.md) + +**Deploy to Kubernetes** +→ [Configuration](configuration.md) → [Deployment](deployment.md) + +**Deploy to SLURM** +→ [Configuration](configuration.md) → [Deployment](deployment.md) + +**Build multiple models selectively (CI/CD)** +→ [Batch Build](batch-build.md) + +**Profile model performance** +→ [Profiling](profiling.md) + +**Multi-node distributed training** +→ [Launchers](launchers.md) → [Deployment](deployment.md) + +**Contribute to madengine** +→ [Contributing](contributing.md) + +## 🔍 Key Concepts + +### MAD Package + +madengine operates within the MAD (Model Automation and Dashboarding) ecosystem. The MAD package contains: +- Model definitions (`models.json`) +- Execution scripts (`run.sh`) +- Docker configurations +- Data provider configurations (`data.json`) +- Credentials (`credential.json`) + +### CLI Interface + +**`madengine`** - Modern CLI with: +- Rich terminal output +- Distributed deployment support (K8s, SLURM) +- Build/run separation +- Manifest-based execution + +### Deployment Targets + +- **Local** - Docker containers on local machine +- **Kubernetes** - Cloud-native container orchestration +- **SLURM** - HPC cluster job scheduling + +### Distributed Launchers + +- **torchrun** - PyTorch DDP/FSDP +- **deepspeed** - ZeRO optimization +- **megatron** - Large transformers (SLURM only) +- **torchtitan** - LLM pre-training +- **vllm** - LLM inference +- **sglang** - Structured generation + +## 📝 Documentation Standards + +This documentation follows these principles: + +1. **Task-oriented** - Organized by what users want to accomplish +2. **Progressive disclosure** - Start simple, add complexity as needed +3. **Examples first** - Show working examples before explaining details +4. **Consistent naming** - Files follow simple naming pattern (no prefixes) +5. **Up-to-date** - Reflects current implementation (v2.0) + +## 🤝 Contributing to Documentation + +Documentation improvements are welcome! Please: + +1. Keep examples working and tested +2. Use consistent formatting and style +3. Update cross-references when moving content +4. Mark deprecated content clearly +5. Follow the existing structure + +See [Contributing Guide](contributing.md) for details. + +## 📄 License + +madengine is licensed under the MIT License. See [LICENSE](../LICENSE) for details. diff --git a/docs/batch-build.md b/docs/batch-build.md new file mode 100644 index 00000000..24983d98 --- /dev/null +++ b/docs/batch-build.md @@ -0,0 +1,245 @@ +# Batch Build Guide + +Complete guide to using batch manifests for selective model builds in CI/CD pipelines. + +## Overview + +Batch build mode enables selective builds with per-model configuration through a JSON manifest file. This is ideal for CI/CD pipelines where you need fine-grained control over which models to rebuild. + +## Usage + +```bash +madengine build --batch-manifest examples/build-manifest/batch.json \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +``` + +## Manifest Format + +### Basic Structure + +```json +[ + { + "model_name": "model1", + "build_new": true, + "registry": "docker.io/myorg", + "registry_image": "myorg/model1" + }, + { + "model_name": "model2", + "build_new": false + } +] +``` + +### Field Reference + +#### Required Fields + +| Field | Type | Description | +|-------|------|-------------| +| `model_name` | string | Model tag to include in manifest | + +#### Optional Fields + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `build_new` | boolean | `false` | `true`: Build from source
`false`: Reference existing image | +| `registry` | string | - | Per-model Docker registry (overrides global `--registry`) | +| `registry_image` | string | - | Custom registry image name/namespace | + +## Key Features + +### Selective Building + +- Models with `"build_new": true` are built from source +- Models with `"build_new": false` are referenced without building +- All models are included in the output `build_manifest.json` + +### Per-Model Registry Override + +Each model can specify its own registry: + +```json +[ + { + "model_name": "public_model", + "build_new": true, + "registry": "docker.io/myorg" + }, + { + "model_name": "private_model", + "build_new": true, + "registry": "gcr.io/myproject" + } +] +``` + +### Mutual Exclusivity with --tags + +Cannot use `--batch-manifest` and `--tags` together: + +```bash +# ❌ Error +madengine build --batch-manifest batch.json --tags model1 + +# ✅ Correct +madengine build --batch-manifest batch.json +``` + +## Common Use Cases + +### CI/CD Incremental Builds + +Rebuild only changed models while referencing stable ones: + +**Example:** [`examples/build-manifest/ci_incremental.json`](../examples/build-manifest/ci_incremental.json) + +```json +[ + {"model_name": "changed_model", "build_new": true}, + {"model_name": "stable_model_1", "build_new": false}, + {"model_name": "stable_model_2", "build_new": false} +] +``` + +**Usage:** +```bash +madengine build --batch-manifest examples/build-manifest/ci_incremental.json \ + --registry docker.io/myorg \ + --additional-context-file config.json +``` + +### Multi-Registry Deployment + +Deploy models to different registries: + +```json +[ + { + "model_name": "public_model", + "build_new": true, + "registry": "docker.io/myorg" + }, + { + "model_name": "private_model", + "build_new": true, + "registry": "gcr.io/myproject" + } +] +``` + +### Custom Image Names + +Specify custom image names and tags: + +```json +[ + { + "model_name": "my_model", + "build_new": true, + "registry": "docker.io/myorg", + "registry_image": "myorg/custom-name:v2.0" + } +] +``` + +## Complete Workflow + +### 1. Create Batch Manifest + +```bash +cat > my_batch.json << 'EOF' +[ + { + "model_name": "dummy", + "build_new": true + }, + { + "model_name": "stable_model", + "build_new": false, + "registry": "docker.io/myorg", + "registry_image": "myorg/stable:v1.0" + } +] +EOF +``` + +### 2. Build with Batch Manifest + +```bash +madengine build --batch-manifest my_batch.json \ + --registry localhost:5000 \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU" + }' \ + --verbose +``` + +### 3. Use Output Manifest + +The command generates `build_manifest.json` containing: +- Built models with their new image names +- Referenced models with their existing image names +- Per-model registry configuration + +Run the models: +```bash +madengine run --manifest-file build_manifest.json +``` + +## Examples + +See [`examples/build-manifest/`](../examples/build-manifest/) directory for: +- [`batch.json`](../examples/build-manifest/batch.json) - Basic example with all field types +- [`ci_incremental.json`](../examples/build-manifest/ci_incremental.json) - CI/CD incremental build pattern + +## Command Reference + +### Build Command + +```bash +madengine build [OPTIONS] +``` + +**Batch Build Options:** +- `--batch-manifest PATH` - Input batch manifest file (mutually exclusive with `--tags`) +- `--registry, -r URL` - Global Docker registry (can be overridden per model) +- `--additional-context, -c JSON` - Configuration as JSON string +- `--additional-context-file, -f PATH` - Configuration file +- `--manifest-output, -m PATH` - Output manifest file (default: `build_manifest.json`) +- `--verbose, -v` - Verbose logging + +### Output + +Creates `build_manifest.json` with: +```json +{ + "built_images": { + "image_name": { + "docker_image": "...", + "registry": "...", + ... + } + }, + "built_models": {...}, + "deployment_config": {...}, + "summary": {...} +} +``` + +## Best Practices + +1. **Version Control**: Keep batch manifests in version control for reproducibility +2. **Start Simple**: Begin with basic manifests and add complexity as needed +3. **Test Locally**: Validate batch manifests locally before CI/CD deployment +4. **Consistent Naming**: Use descriptive model names and consistent registry paths +5. **Document Changes**: Add comments in commit messages explaining manifest changes + +## See Also + +- [Configuration Guide](configuration.md) - Additional context and build arguments +- [Usage Guide](usage.md) - General build and run workflows +- [Deployment Guide](deployment.md) - Kubernetes and SLURM deployment + diff --git a/docs/cli-reference.md b/docs/cli-reference.md new file mode 100644 index 00000000..3528f4c7 --- /dev/null +++ b/docs/cli-reference.md @@ -0,0 +1,610 @@ +# CLI Command Reference + +Complete reference for all madengine CLI commands with detailed options and examples. + +## Table of Contents + +- [Overview](#overview) +- [Global Options](#global-options) +- [Commands](#commands) + - [discover](#discover---discover-available-models) + - [build](#build---build-docker-images) + - [run](#run---execute-models) + - [report](#report---generate-reports) + - [database](#database---upload-to-mongodb) +- [Exit Codes](#exit-codes) + +## Overview + +madengine provides a modern CLI for AI model automation and distributed execution. All commands follow a consistent pattern with rich terminal output and comprehensive error handling. + +```bash +madengine [OPTIONS] COMMAND [ARGS]... +``` + +## Global Options + +These options are available for the main `madengine` command: + +| Option | Description | +|--------|-------------| +| `--version` | Show version and exit | +| `--help` | Show help message and exit | + +## Commands + +### `discover` - Discover Available Models + +Discover all models available in the MAD package based on specified tags. + +**Usage:** + +```bash +madengine discover [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--tags` | `-t` | TEXT | `[]` | Model tags to discover (can specify multiple) | +| `--verbose` | `-v` | FLAG | `False` | Enable verbose logging | + +**Examples:** + +```bash +# Discover all models +madengine discover + +# Discover specific models by tag +madengine discover --tags dummy pyt_huggingface_bert + +# Multiple tags with comma separation +madengine discover --tags dummy,multi,vllm + +# With verbose output +madengine discover --tags model --verbose + +# Directory-specific models +madengine discover --tags dummy2:dummy_2 + +# Dynamic models with parameters +madengine discover --tags dummy3:dummy_3:batch_size=512 +``` + +**Discovery Methods:** + +1. **Root models** - From `models.json` in MAD package root +2. **Directory-specific** - From `scripts/{dir}/models.json` +3. **Dynamic models** - Generated by `scripts/{dir}/get_models_json.py` + +--- + +### `build` - Build Docker Images + +Build Docker images for models, optionally pushing them to a registry. + +**Usage:** + +```bash +madengine build [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--tags` | `-t` | TEXT | `[]` | Model tags to build (can specify multiple) | +| `--target-archs` | `-a` | TEXT | `[]` | Target GPU architectures (e.g., gfx908,gfx90a,gfx942) | +| `--registry` | `-r` | TEXT | `None` | Docker registry to push images to | +| `--batch-manifest` | | TEXT | `None` | Input batch.json file for batch build mode | +| `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | +| `--additional-context-file` | `-f` | TEXT | `None` | File containing additional context JSON | +| `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache | +| `--manifest-output` | `-m` | TEXT | `build_manifest.json` | Output file for build manifest | +| `--summary-output` | `-s` | TEXT | `None` | Output file for build summary JSON | +| `--live-output` | `-l` | FLAG | `False` | Print output in real-time | +| `--verbose` | `-v` | FLAG | `False` | Enable verbose logging | + +**Examples:** + +```bash +# Basic build +madengine build --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Build with registry +madengine build --tags model \ + --registry docker.io/myorg \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Build multiple models +madengine build --tags model1 model2 model3 \ + --registry localhost:5000 + +# Build for multiple GPU architectures +madengine build --tags model \ + --target-archs gfx908 gfx90a gfx942 \ + --registry gcr.io/myproject + +# Clean rebuild without cache +madengine build --tags model --clean-docker-cache + +# Batch build mode (selective builds) +madengine build --batch-manifest batch.json \ + --registry docker.io/myorg \ + --additional-context-file config.json + +# Custom manifest output +madengine build --tags model \ + --manifest-output my_manifest.json \ + --summary-output build_summary.json + +# Real-time output with verbose logging +madengine build --tags model --live-output --verbose +``` + +**Required Context for Build:** + +- `gpu_vendor`: `"AMD"` or `"NVIDIA"` +- `guest_os`: `"UBUNTU"` or `"CENTOS"` + +**Batch Build Mode:** + +When using `--batch-manifest`, provide a JSON file with selective build configuration: + +```json +[ + { + "model_name": "model1", + "build_new": true, + "registry": "docker.io/myorg", + "registry_image": "custom-namespace/model1" + }, + { + "model_name": "model2", + "build_new": false + } +] +``` + +See [Batch Build Guide](batch-build.md) for details. + +--- + +### `run` - Execute Models + +Run models locally or deploy to Kubernetes/SLURM clusters. + +**Usage:** + +```bash +madengine run [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--tags` | `-t` | TEXT | `[]` | Model tags to run (can specify multiple) | +| `--manifest-file` | `-m` | TEXT | `""` | Build manifest file path (for pre-built images) | +| `--registry` | `-r` | TEXT | `None` | Docker registry URL | +| `--timeout` | | INT | `-1` | Timeout in seconds (-1=default 7200s, 0=no timeout) | +| `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | +| `--additional-context-file` | `-f` | TEXT | `None` | File containing additional context JSON | +| `--keep-alive` | | FLAG | `False` | Keep Docker containers alive after run | +| `--keep-model-dir` | | FLAG | `False` | Keep model directory after run | +| `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache (full workflow) | +| `--manifest-output` | | TEXT | `build_manifest.json` | Output file for build manifest (full workflow) | +| `--summary-output` | `-s` | TEXT | `None` | Output file for summary JSON | +| `--live-output` | `-l` | FLAG | `False` | Print output in real-time | +| `--output` | `-o` | TEXT | `perf_entry.csv` | Performance output file | +| `--ignore-deprecated` | | FLAG | `False` | Force run deprecated models | +| `--data-config` | | TEXT | `data.json` | Custom data configuration file | +| `--tools-config` | | TEXT | `tools.json` | Custom tools JSON configuration | +| `--sys-env-details` | | FLAG | `True` | Generate system config env details | +| `--force-mirror-local` | | TEXT | `None` | Path to force local data mirroring | +| `--disable-skip-gpu-arch` | | FLAG | `False` | Disable skipping models based on GPU architecture | +| `--verbose` | `-v` | FLAG | `False` | Enable verbose logging | +| `--cleanup-perf` | | FLAG | `False` | Remove intermediate perf_entry files after run (keeps perf.csv and perf_super files) | + +**Examples:** + +```bash +# Local execution +madengine run --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Run with pre-built images (manifest-based) +madengine run --manifest-file build_manifest.json + +# Multi-GPU with torchrun +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "0,1,2,3", + "distributed": { + "launcher": "torchrun", + "nproc_per_node": 4 + } + }' + +# Kubernetes deployment (minimal config) +madengine run --tags model \ + --additional-context '{"k8s": {"gpu_count": 2}}' + +# Kubernetes multi-node with vLLM +madengine run --tags model \ + --additional-context '{ + "k8s": {"gpu_count": 8}, + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4 + } + }' + +# SLURM deployment +madengine run --tags model \ + --additional-context '{ + "slurm": { + "partition": "gpu", + "nodes": 4, + "gpus_per_node": 8 + }, + "distributed": { + "launcher": "torchtitan", + "nnodes": 4, + "nproc_per_node": 8 + } + }' + +# With profiling tools +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + {"name": "rocprof"}, + {"name": "gpu_info_power_profiler"} + ] + }' + +# Custom timeout (2 hours) +madengine run --tags model --timeout 7200 + +# No timeout (run indefinitely) +madengine run --tags model --timeout 0 + +# Keep container alive for debugging +madengine run --tags model --keep-alive --verbose + +# Real-time output +madengine run --tags model --live-output + +# Custom performance output file +madengine run --tags model --output my_perf_results.csv + +# Clean up intermediate perf files after run +madengine run --tags model --cleanup-perf + +# Using configuration file +madengine run --tags model \ + --additional-context-file k8s-config.json +``` + +**Execution Modes:** + +1. **Full Workflow** - Build + Run (when no manifest exists) +2. **Execution Only** - Run only (when manifest-file provided and exists) +3. **Manifest-based** - Use pre-built images from manifest + +**Deployment Targets:** + +- **Local** - Docker containers on local machine +- **Kubernetes** - Detected when `k8s` key present in context +- **SLURM** - Detected when `slurm` key present in context + +**Performance Output:** + +Results are saved to CSV file (default: `perf_entry.csv`) with metrics including: +- Execution time +- GPU utilization +- Memory usage +- Model-specific performance metrics + +--- + +### `report` - Generate Reports + +Generate HTML reports from CSV performance files. + +#### Subcommands + +##### `report to-html` - Convert CSV to HTML + +Convert a single CSV file to HTML table format. + +**Usage:** + +```bash +madengine report to-html [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Required | Description | +|--------|-------|------|----------|-------------| +| `--csv-file` | | TEXT | **Yes** | Path to the CSV file to convert | +| `--verbose` | `-v` | FLAG | No | Enable verbose logging | + +**Examples:** + +```bash +# Convert CSV to HTML +madengine report to-html --csv-file perf_entry.csv + +# With custom CSV file +madengine report to-html --csv-file results/perf_mi300.csv + +# Verbose output +madengine report to-html --csv-file perf.csv --verbose +``` + +**Output:** Creates `{filename}.html` in the same directory as the CSV file. + +--- + +##### `report to-email` - Generate Email Report + +Convert all CSV files in a directory to a consolidated email-ready HTML report. + +**Usage:** + +```bash +madengine report to-email [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--directory` | `--dir` | TEXT | `"."` | Path to directory containing CSV files | +| `--output` | `-o` | TEXT | `run_results.html` | Output HTML filename | +| `--verbose` | `-v` | FLAG | `False` | Enable verbose logging | + +**Examples:** + +```bash +# Generate email report from current directory +madengine report to-email + +# Specify directory +madengine report to-email --directory ./results + +# Custom output filename +madengine report to-email --dir ./results --output summary.html + +# Verbose output +madengine report to-email --directory ./results --verbose +``` + +**Output:** Creates consolidated HTML report suitable for email distribution. + +--- + +### `database` - Upload to MongoDB + +Upload CSV performance data to MongoDB database. + +**Usage:** + +```bash +madengine database [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Required | Description | +|--------|-------|------|---------|----------|-------------| +| `--csv-file` | | TEXT | `perf_entry.csv` | No | Path to the CSV file to upload | +| `--database-name` | `--db` | TEXT | `None` | **Yes** | Name of the MongoDB database | +| `--collection-name` | `--collection` | TEXT | `None` | **Yes** | Name of the MongoDB collection | +| `--verbose` | `-v` | FLAG | `False` | No | Enable verbose logging | + +**Examples:** + +```bash +# Upload to MongoDB +madengine database \ + --csv-file perf_entry.csv \ + --database-name mydb \ + --collection-name results + +# Short option names +madengine database \ + --csv-file perf.csv \ + --db test \ + --collection perf_data + +# With verbose output +madengine database \ + --csv-file perf.csv \ + --db mydb \ + --collection results \ + --verbose +``` + +**Environment Variables:** + +MongoDB connection details are read from environment variables: + +| Variable | Description | Example | +|----------|-------------|---------| +| `MONGO_HOST` | MongoDB host address | `localhost` or `mongodb.example.com` | +| `MONGO_PORT` | MongoDB port | `27017` | +| `MONGO_USER` | MongoDB username | `admin` | +| `MONGO_PASSWORD` | MongoDB password | `secretpassword` | + +**Example Setup:** + +```bash +export MONGO_HOST=mongodb.example.com +export MONGO_PORT=27017 +export MONGO_USER=myuser +export MONGO_PASSWORD=mypassword + +madengine database \ + --csv-file perf_entry.csv \ + --db performance_db \ + --collection model_runs +``` + +--- + +## Exit Codes + +madengine uses standard exit codes to indicate success or failure: + +| Code | Constant | Description | +|------|----------|-------------| +| `0` | `SUCCESS` | Command completed successfully | +| `1` | `FAILURE` | General failure | +| `2` | `INVALID_ARGS` | Invalid command-line arguments or configuration | +| `3` | `BUILD_FAILURE` | One or more builds failed | +| `4` | `RUN_FAILURE` | One or more model executions failed | + +**Example Usage in Scripts:** + +```bash +#!/bin/bash + +madengine build --tags model +if [ $? -eq 0 ]; then + echo "Build successful" + madengine run --manifest-file build_manifest.json +else + echo "Build failed with exit code $?" + exit 1 +fi +``` + +--- + +## Configuration File Format + +For complex configurations, use JSON files with `--additional-context-file`: + +**Example: `config.json`** + +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "0,1,2,3", + "timeout_multiplier": 2.0, + "docker_env_vars": { + "PYTORCH_TUNABLEOP_ENABLED": "1", + "HSA_ENABLE_SDMA": "0", + "NCCL_DEBUG": "INFO" + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 4 + } +} +``` + +**Example: `k8s-config.json`** + +```json +{ + "gpu_vendor": "AMD", + "k8s": { + "namespace": "ml-team", + "gpu_count": 8, + "cpu_request": "32", + "memory_request": "256Gi", + "node_selector": { + "gpu-type": "mi300x" + } + }, + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4 + } +} +``` + +**Example: `slurm-config.json`** + +```json +{ + "gpu_vendor": "AMD", + "slurm": { + "partition": "gpu", + "nodes": 4, + "gpus_per_node": 8, + "time": "24:00:00", + "account": "ml_research", + "qos": "high" + }, + "distributed": { + "launcher": "torchtitan", + "nnodes": 4, + "nproc_per_node": 8 + } +} +``` + +--- + +## Environment Variables + +madengine recognizes these environment variables: + +| Variable | Description | Default | +|----------|-------------|---------| +| `MODEL_DIR` | Path to MAD package directory | Auto-detected | +| `MAD_VERBOSE_CONFIG` | Enable verbose configuration logging | `false` | +| `MAD_DOCKERHUB_USER` | Docker Hub username | None | +| `MAD_DOCKERHUB_PASSWORD` | Docker Hub password/token | None | +| `MAD_DOCKERHUB_REPO` | Docker Hub repository | None | +| `MAD_CONTAINER_IMAGE` | Pre-built container image to use | None | +| `MONGO_HOST` | MongoDB host for database command | `localhost` | +| `MONGO_PORT` | MongoDB port for database command | `27017` | +| `MONGO_USER` | MongoDB username | None | +| `MONGO_PASSWORD` | MongoDB password | None | + +--- + +## Best Practices + +1. **Use configuration files** for complex setups instead of long command lines +2. **Separate build and run phases** for distributed deployments +3. **Test locally first** before deploying to clusters +4. **Use registries** for distributed execution across multiple nodes +5. **Enable verbose logging** (`--verbose`) when debugging issues +6. **Use real-time output** (`--live-output`) for long-running operations +7. **Version your configuration files** alongside your model code +8. **Use batch build mode** for CI/CD pipelines to optimize build times + +--- + +## Related Documentation + +- [Usage Guide](usage.md) - Comprehensive usage examples and workflows +- [Configuration Guide](configuration.md) - Advanced configuration options +- [Deployment Guide](deployment.md) - Kubernetes and SLURM deployment details +- [Batch Build Guide](batch-build.md) - Selective builds with batch manifests +- [Launchers Guide](launchers.md) - Distributed training frameworks +- [Profiling Guide](profiling.md) - Performance analysis tools + +--- + +**Version:** 2.0.0 +**Last Updated:** December 2025 + diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 00000000..8b6ff44c --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,682 @@ +# Configuration Guide + +Complete guide to configuring madengine for various use cases and environments. + +## Configuration Methods + +### 1. Inline JSON String + +```bash +madengine run --tags model \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +``` + +### 2. Configuration File + +```bash +madengine run --tags model --additional-context-file config.json +``` + +**config.json:** +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "timeout_multiplier": 2.0 +} +``` + +## Basic Configuration + +### Required for Local Execution + +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU" +} +``` + +**gpu_vendor** (case-insensitive): +- `"AMD"` - AMD ROCm GPUs +- `"NVIDIA"` - NVIDIA CUDA GPUs + +**guest_os** (case-insensitive): +- `"UBUNTU"` - Ubuntu Linux +- `"CENTOS"` - CentOS Linux + +## Build Configuration + +### Batch Manifest + +Use batch manifest files for selective builds with per-model configuration: + +```bash +madengine build --batch-manifest batch.json \ + --registry my-registry.com \ + --additional-context-file config.json +``` + +**Batch manifest structure** (`batch.json`): + +```json +[ + { + "model_name": "model1", + "build_new": true, + "registry": "registry1.io", + "registry_image": "namespace/model1" + }, + { + "model_name": "model2", + "build_new": false, + "registry": "registry2.io", + "registry_image": "namespace/model2" + } +] +``` + +**Fields:** +- `model_name` (string, required): Model tag to include +- `build_new` (boolean, optional, default: `false`): Whether to build this model + - `true`: Build the model from source + - `false`: Reference existing image without rebuilding +- `registry` (string, optional): Per-model registry override +- `registry_image` (string, optional): Custom registry image name/namespace + +**Key Behaviors:** +- Only models with `"build_new": true` are built +- Models with `"build_new": false` are included in output manifest without building +- Per-model `registry` overrides the global `--registry` flag +- Cannot use `--batch-manifest` and `--tags` together (mutually exclusive) + +**Use Case - CI/CD Incremental Builds:** + +```json +[ + {"model_name": "changed_model", "build_new": true}, + {"model_name": "stable_model1", "build_new": false}, + {"model_name": "stable_model2", "build_new": false} +] +``` + +This allows you to rebuild only changed models while maintaining references to existing stable images in a single manifest. + +## Docker Configuration + +### Environment Variables + +Pass environment variables to containers: + +```json +{ + "docker_env_vars": { + "HSA_ENABLE_SDMA": "0", + "PYTORCH_TUNABLEOP_ENABLED": "1", + "NCCL_DEBUG": "INFO" + } +} +``` + +### Custom Base Image + +Override Docker base image: + +```json +{ + "MAD_CONTAINER_IMAGE": "rocm/pytorch:custom-tag" +} +``` + +Or override BASE_DOCKER in FROM line: + +```json +{ + "docker_build_arg": { + "BASE_DOCKER": "rocm/pytorch:rocm6.1_ubuntu22.04_py3.10" + } +} +``` + +### Build Arguments + +Pass build-time variables: + +```json +{ + "docker_build_arg": { + "ROCM_VERSION": "6.1", + "PYTHON_VERSION": "3.10", + "CUSTOM_ARG": "value" + } +} +``` + +### Mount Host Directories + +Mount host directories inside containers: + +```json +{ + "docker_mounts": { + "/data-inside-container": "/data-on-host", + "/models": "/home/user/models" + } +} +``` + +### Select GPUs and CPUs + +Specify GPU and CPU subsets: + +```json +{ + "docker_gpus": "0,2-4,7", + "docker_cpus": "0-15,32-47" +} +``` + +Format: Comma-separated list with hyphen ranges. + +## Performance Configuration + +### Timeout Settings + +```json +{ + "timeout_multiplier": 2.0 +} +``` + +Or use command-line option: + +```bash +madengine run --tags model --timeout 7200 +``` + +### Local Data Mirroring + +Force local data caching: + +```json +{ + "mirrorlocal": "/tmp/local_mirror" +} +``` + +Or use command-line option: + +```bash +madengine run --tags model --force-mirror-local /tmp/mirror +``` + +## Kubernetes Deployment + +### Minimal Configuration + +```json +{ + "k8s": { + "gpu_count": 1 + } +} +``` + +Automatically applies: +- Namespace: `default` +- Resource limits based on GPU count +- Image pull policy: `IfNotPresent` +- Service account: `default` +- GPU vendor detection from context + +### Full Configuration + +```json +{ + "k8s": { + "gpu_count": 2, + "namespace": "ml-team", + "gpu_vendor": "AMD", + "memory": "32Gi", + "memory_limit": "64Gi", + "cpu": "16", + "cpu_limit": "32", + "service_account": "madengine-sa", + "image_pull_policy": "Always", + "image_pull_secrets": ["my-registry-secret"] + } +} +``` + +**K8s Options:** +- `gpu_count` - Number of GPUs (required) +- `namespace` - Kubernetes namespace (default: `default`) +- `gpu_vendor` - GPU vendor override (auto-detected from context) +- `memory` - Memory request (default: auto-scaled by GPU count) +- `memory_limit` - Memory limit (default: 2× memory request) +- `cpu` - CPU cores request (default: auto-scaled by GPU count) +- `cpu_limit` - CPU cores limit (default: 2× CPU request) +- `service_account` - Service account name +- `image_pull_policy` - `Always`, `IfNotPresent`, or `Never` +- `image_pull_secrets` - List of image pull secrets + +### Multi-Node Kubernetes + +```json +{ + "k8s": { + "gpu_count": 8 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 4 + } +} +``` + +## SLURM Deployment + +### Basic Configuration + +```json +{ + "slurm": { + "partition": "gpu", + "gpus_per_node": 4, + "time": "02:00:00" + } +} +``` + +### Full Configuration + +```json +{ + "slurm": { + "partition": "gpu", + "account": "research_group", + "qos": "normal", + "gpus_per_node": 8, + "nodes": 1, + "time": "24:00:00", + "mem": "64G", + "mail_user": "user@example.com", + "mail_type": "ALL" + } +} +``` + +**SLURM Options:** +- `partition` - SLURM partition name (required) +- `account` - Billing account +- `qos` - Quality of Service +- `gpus_per_node` - GPUs per node (default: 1) +- `nodes` - Number of nodes (default: 1) +- `time` - Wall time limit HH:MM:SS (required) +- `mem` - Memory per node (e.g., "64G") +- `mail_user` - Email for notifications +- `mail_type` - Notification types (BEGIN, END, FAIL, ALL) + +### Multi-Node SLURM + +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 4, + "gpus_per_node": 8, + "time": "48:00:00" + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 4, + "nproc_per_node": 8 + } +} +``` + +## Distributed Training + +### Launcher Configuration + +```json +{ + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 4, + "master_port": 29500 + } +} +``` + +**Launcher Options:** +- `launcher` - Framework name (required) +- `nnodes` - Number of nodes +- `nproc_per_node` - Processes/GPUs per node +- `master_port` - Master communication port (default: 29500) + +**Supported Launchers:** +- `torchrun` - PyTorch DDP/FSDP +- `deepspeed` - ZeRO optimization +- `megatron` - Large transformers (K8s + SLURM) +- `torchtitan` - LLM pre-training +- `vllm` - LLM inference +- `sglang` - Structured generation + +See [Launchers Guide](launchers.md) for details. + +### TorchTitan Configuration + +```json +{ + "distributed": { + "launcher": "torchtitan", + "nnodes": 4, + "nproc_per_node": 8 + }, + "env_vars": { + "TORCHTITAN_TENSOR_PARALLEL_SIZE": "8", + "TORCHTITAN_PIPELINE_PARALLEL_SIZE": "4", + "TORCHTITAN_FSDP_ENABLED": "1" + } +} +``` + +### vLLM Configuration + +```json +{ + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4 + }, + "vllm": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1 + } +} +``` + +## Profiling Configuration + +### Basic Profiling + +```json +{ + "tools": [ + {"name": "rocprof"} + ] +} +``` + +### Custom Tool Configuration + +```json +{ + "tools": [ + { + "name": "rocprof", + "cmd": "rocprof --timestamp on", + "env_vars": { + "NCCL_DEBUG": "INFO" + } + } + ] +} +``` + +### Multiple Tools (Stackable) + +```json +{ + "tools": [ + {"name": "rocprof"}, + {"name": "miopen_trace"}, + {"name": "rocblas_trace"} + ] +} +``` + +**Available Tools:** +- `rocprof` - GPU profiling +- `rpd` - ROCm Profiler Data +- `rocblas_trace` - rocBLAS library tracing +- `miopen_trace` - MIOpen library tracing +- `tensile_trace` - Tensile library tracing +- `rccl_trace` - RCCL communication tracing +- `gpu_info_power_profiler` - Power consumption profiling +- `gpu_info_vram_profiler` - VRAM usage profiling + +See [Profiling Guide](profiling.md) for details. + +## Pre/Post Execution Scripts + +Run scripts before and after model execution: + +```json +{ + "pre_scripts": [ + { + "path": "scripts/common/pre_scripts/setup.sh", + "args": "-v" + } + ], + "encapsulate_script": "scripts/common/wrapper.sh", + "post_scripts": [ + { + "path": "scripts/common/post_scripts/cleanup.sh", + "args": "-r" + } + ] +} +``` + +## Model Arguments + +Pass arguments to model execution script: + +```json +{ + "model_args": "--model_name_or_path bigscience/bloom --batch_size 32" +} +``` + +## Data Provider Configuration + +Configure in `data.json` (MAD package root): + +```json +{ + "data_sources": { + "model_data": { + "nas": {"path": "/home/datum"}, + "minio": {"path": "s3://datasets/datum"}, + "aws": {"path": "s3://datasets/datum"} + } + }, + "mirrorlocal": "/tmp/local_mirror" +} +``` + +## Credential Configuration + +Configure in `credential.json` (MAD package root): + +```json +{ + "dockerhub": { + "username": "your_username", + "password": "your_token", + "repository": "myorg" + }, + "AMD_GITHUB": { + "username": "github_username", + "password": "github_token" + }, + "MAD_AWS_S3": { + "username": "aws_access_key", + "password": "aws_secret_key" + } +} +``` + +### Environment Variable Override + +```bash +export MAD_DOCKERHUB_USER=myusername +export MAD_DOCKERHUB_PASSWORD=mytoken +export MAD_DOCKERHUB_REPO=myorg +``` + +## Configuration Priority + +For Kubernetes/SLURM deployments: +1. CLI overrides (`--additional-context`) - Highest +2. User config file (`--additional-context-file`) +3. Profile presets (single-gpu/multi-gpu/multi-node) +4. GPU vendor presets (AMD/NVIDIA optimizations) +5. Base defaults (k8s/defaults.json) +6. Environment variables +7. Built-in fallbacks - Lowest + +## Complete Examples + +### Local GPU Development + +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "0", + "docker_env_vars": { + "PYTORCH_TUNABLEOP_ENABLED": "1" + } +} +``` + +### Kubernetes Single-GPU + +```json +{ + "k8s": { + "gpu_count": 1, + "namespace": "dev" + } +} +``` + +### Kubernetes Multi-GPU Training + +```json +{ + "k8s": { + "gpu_count": 4, + "memory": "64Gi", + "cpu": "32" + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 4 + } +} +``` + +### SLURM Multi-Node + +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 8, + "gpus_per_node": 8, + "time": "72:00:00", + "account": "research_proj" + }, + "distributed": { + "launcher": "deepspeed", + "nnodes": 8, + "nproc_per_node": 8 + } +} +``` + +### Production with Profiling + +```json +{ + "k8s": { + "gpu_count": 2, + "namespace": "production", + "memory": "32Gi" + }, + "tools": [ + {"name": "rocprof"}, + {"name": "gpu_info_power_profiler"} + ], + "docker_env_vars": { + "NCCL_DEBUG": "INFO", + "PYTORCH_TUNABLEOP_ENABLED": "1" + } +} +``` + +## Troubleshooting + +### Configuration Not Applied + +```bash +# Verify configuration is valid JSON +python -m json.tool config.json + +# Use verbose logging +madengine run --tags model \ + --additional-context-file config.json \ + --verbose +``` + +### Environment Variables Not Set + +```bash +# Check environment variables +env | grep MAD + +# Verify Docker receives env vars +docker inspect container_name | grep -A 10 Env +``` + +### GPU Vendor Auto-Detection + +madengine auto-detects GPU vendor if not specified: +- Looks for ROCm drivers → AMD +- Looks for CUDA drivers → NVIDIA +- Falls back to configuration or fails + +Override with explicit configuration: + +```json +{ + "gpu_vendor": "AMD" +} +``` + +## Best Practices + +1. **Use configuration files** for complex settings +2. **Start with minimal configs** and add as needed +3. **Validate JSON syntax** before running +4. **Use environment variables** for sensitive data +5. **Test locally first** before deploying +6. **Enable verbose logging** when debugging +7. **Document custom configurations** for team use + +## Next Steps + +- [Usage Guide](usage.md) - Using madengine commands +- [Deployment Guide](deployment.md) - Deploy to clusters +- [Profiling Guide](profiling.md) - Performance analysis +- [Launchers Guide](launchers.md) - Distributed training frameworks + diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 00000000..c6832178 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,219 @@ +# Contributing to madengine + +Thank you for your interest in contributing! We welcome all contributions, whether they are bug fixes, new features, or improvements to documentation. + +## Getting Started + +### 1. Fork and Clone + +```bash +# Fork on GitHub, then clone your fork +git clone https://github.com/YOUR_USERNAME/madengine.git +cd madengine +``` + +### 2. Setup Development Environment + +```bash +# Create virtual environment +python3 -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install in development mode with all dependencies +pip install -e ".[dev]" + +# Setup pre-commit hooks (optional but recommended) +pre-commit install +``` + +### 3. Create a Branch + +```bash +git checkout -b feature/your-feature-name +# or +git checkout -b fix/your-bugfix-name +``` + +## Development Workflow + +### Making Changes + +1. **Implement your changes** in the appropriate files +2. **Write tests** for new functionality (place in `tests/` directory) +3. **Update documentation** if needed +4. **Follow code standards** (see below) + +### Code Standards + +- **Style**: Black formatting (88 character line length) +- **Imports**: Organized with isort +- **Type Hints**: Add type hints for all public functions +- **Docstrings**: Use Google-style docstrings +- **Testing**: Maintain 95%+ test coverage for new code + +### Running Tests + +```bash +# Run all tests +pytest + +# Run with coverage report +pytest --cov=src/madengine --cov-report=html + +# Run specific test file +pytest tests/test_cli.py + +# Run tests matching pattern +pytest -k "test_build" +``` + +### Code Quality Checks + +```bash +# Format code +black src/ tests/ +isort src/ tests/ + +# Lint code +flake8 src/ tests/ + +# Type checking +mypy src/madengine + +# Run all quality checks (if pre-commit installed) +pre-commit run --all-files +``` + +## Commit Guidelines + +Use conventional commit format: + +```bash +# Good commit messages +git commit -m "feat(cli): add SLURM runner support" +git commit -m "fix(k8s): handle connection timeouts gracefully" +git commit -m "docs: update deployment examples" +git commit -m "test: add integration tests for build command" + +# Commit types +# feat: New feature +# fix: Bug fix +# docs: Documentation changes +# test: Test additions/changes +# refactor: Code refactoring +# style: Code style changes (formatting, etc.) +# perf: Performance improvements +# chore: Build process or auxiliary tool changes +``` + +## Submitting Changes + +### 1. Push to Your Fork + +```bash +git push origin feature/your-feature-name +``` + +### 2. Create Pull Request + +1. Go to the [madengine repository](https://github.com/ROCm/madengine) +2. Click "New Pull Request" +3. Select your fork and branch +4. Provide a clear description: + - What changes were made + - Why the changes were needed + - Any related issues (use `Fixes #123` to auto-close issues) + +### 3. Pull Request Checklist + +- [ ] Tests pass locally (`pytest`) +- [ ] Code follows style guidelines (`black`, `isort`, `flake8`) +- [ ] New tests added for new functionality +- [ ] Documentation updated if needed +- [ ] Commit messages follow conventional format +- [ ] No merge conflicts with main branch + +## Review Process + +1. **Automated Checks**: CI/CD runs tests and linting +2. **Code Review**: Maintainers review your code +3. **Feedback**: Address any requested changes +4. **Approval**: Once approved, your PR will be merged + +## Areas for Contribution + +### High Priority + +- Additional deployment backends +- Performance optimizations +- Enhanced error messages +- Test coverage improvements + +### Medium Priority + +- CLI enhancements +- Documentation improvements +- Monitoring and observability +- Configuration simplification + +### Good First Issues + +Look for issues labeled `good-first-issue` on GitHub. + +## Development Tips + +### Project Structure + +``` +madengine/ +├── src/madengine/ +│ ├── cli/ # CLI commands +│ ├── orchestration/ # Build and run orchestrators +│ ├── deployment/ # K8s and SLURM deployment +│ ├── execution/ # Container execution +│ ├── core/ # Core utilities +│ └── utils/ # Helper functions +├── tests/ # Test suite +├── docs/ # Documentation +└── examples/ # Example configurations +``` + +### Testing Philosophy + +- **Unit Tests**: Fast, isolated tests for individual components +- **Integration Tests**: End-to-end workflow testing +- **Fixtures**: Use pytest fixtures for common test data +- **Mocking**: Mock external dependencies (Docker, K8s API, etc.) + +### Debugging + +```bash +# Run with verbose logging +madengine run --tags model --verbose + +# Keep containers alive for debugging +madengine run --tags model --keep-alive + +# Use Python debugger +python -m pdb -m madengine.cli.app run --tags model +``` + +## Getting Help + +- **GitHub Issues**: https://github.com/ROCm/madengine/issues +- **Discussions**: https://github.com/ROCm/madengine/discussions +- **Documentation**: [docs/](.) + +## Code of Conduct + +Be respectful and constructive in all interactions. We aim to foster an inclusive and welcoming community. + +## Recognition + +Contributors are recognized in: +- **CHANGELOG.md**: All contributions documented +- **GitHub Contributors**: Automatic recognition +- **Release Notes**: Major contributions highlighted + +Thank you for contributing to madengine! + diff --git a/docs/deployment.md b/docs/deployment.md new file mode 100644 index 00000000..a7ff3fb3 --- /dev/null +++ b/docs/deployment.md @@ -0,0 +1,440 @@ +# Deployment Guide + +Deploy madengine workloads to Kubernetes or SLURM clusters for distributed execution. + +## Overview + +madengine supports two deployment backends: + +- **Kubernetes** - Cloud-native container orchestration +- **SLURM** - HPC cluster job scheduling + +Deployment is configured via `--additional-context` and happens automatically during the run phase. + +## Deployment Workflow + +``` +┌─────────────────────────────────────────────┐ +│ 1. Build Phase (Local or CI/CD) │ +│ madengine build --tags model │ +│ → Creates Docker image │ +│ → Pushes to registry │ +│ → Generates build_manifest.json │ +└─────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────┐ +│ 2. Deploy Phase (Run with Context) │ +│ madengine run │ +│ --manifest-file build_manifest.json │ +│ --additional-context '{"deploy":...}' │ +│ → Detects deployment target │ +│ → Creates K8s Job or SLURM script │ +│ → Submits and monitors execution │ +└─────────────────────────────────────────────┘ +``` + +## Kubernetes Deployment + +### Prerequisites + +- Kubernetes cluster with GPU support +- GPU device plugin installed ([AMD](https://github.com/ROCm/k8s-device-plugin) or [NVIDIA](https://github.com/NVIDIA/k8s-device-plugin)) +- Kubeconfig configured (`~/.kube/config` or in-cluster) +- Docker registry accessible from cluster + +### Quick Start + +#### Minimal Configuration (Recommended) + +```json +{ + "k8s": { + "gpu_count": 1 + } +} +``` + +This automatically applies intelligent defaults for namespace, resources, image pull policy, etc. + +#### Build and Deploy + +```bash +# 1. Build image +madengine build --tags my_model \ + --registry my-registry.io \ + --additional-context-file k8s-config.json + +# 2. Deploy to Kubernetes +madengine run \ + --manifest-file build_manifest.json \ + --timeout 3600 +``` + +The deployment target is automatically detected from the `k8s` key in the config. + +### Configuration Options + +**k8s-config.json:** + +```json +{ + "k8s": { + "gpu_count": 2, + "namespace": "ml-team", + "gpu_vendor": "AMD", + "memory": "32Gi", + "cpu": "16", + "service_account": "madengine-sa", + "image_pull_policy": "Always" + } +} +``` + +**Configuration Priority:** +1. User config (`--additional-context-file`) +2. Profile presets (single-gpu/multi-gpu) +3. GPU vendor presets (AMD/NVIDIA) +4. Base defaults + +See [examples/k8s-configs/](../examples/k8s-configs/) for complete examples. + +### Multi-Node Training + +For distributed training across multiple nodes: + +```json +{ + "k8s": { + "gpu_count": 8 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 4 + } +} +``` + +This creates: +- Kubernetes Indexed Job with 2 completions +- Headless service for pod discovery +- Automatic rank assignment via `JOB_COMPLETION_INDEX` +- `MAD_MULTI_NODE_RUNNER` environment variable with torchrun command + +**Supported Launchers:** +- `torchrun` - PyTorch DDP/FSDP +- `deepspeed` - ZeRO optimization +- `torchtitan` - LLM pre-training +- `vllm` - LLM inference +- `sglang` - Structured generation + +See [Distributed Launchers Guide](distributed-launchers.md) for details. + +### Monitoring + +```bash +# Check job status +kubectl get jobs -n your-namespace + +# View pod logs +kubectl logs -f job/madengine-job-xxx -n your-namespace + +# Check pod status +kubectl get pods -n your-namespace +``` + +### Cleanup + +Jobs are automatically cleaned up after completion (configurable via `ttlSecondsAfterFinished`). + +Manual cleanup: + +```bash +kubectl delete job madengine-job-xxx -n your-namespace +``` + +## SLURM Deployment + +### Prerequisites + +- Access to SLURM login node +- SLURM commands available (`sbatch`, `squeue`, `scontrol`) +- Shared filesystem for MAD package and results +- Module system or container runtime (Singularity/Apptainer) + +### Quick Start + +#### Configuration + +**slurm-config.json:** + +```json +{ + "slurm": { + "partition": "gpu", + "gpus_per_node": 4, + "time": "02:00:00", + "account": "my_account" + } +} +``` + +#### Build and Deploy + +```bash +# 1. Build image (on build node or locally) +madengine build --tags my_model \ + --registry my-registry.io \ + --additional-context-file slurm-config.json + +# 2. SSH to SLURM login node +ssh user@hpc-login.example.com + +# 3. Deploy to SLURM +cd /shared/workspace +madengine run \ + --manifest-file build_manifest.json \ + --timeout 7200 +``` + +The deployment target is automatically detected from the `slurm` key in the config. + +### Configuration Options + +**slurm-config.json:** + +```json +{ + "slurm": { + "partition": "gpu", + "account": "research_group", + "qos": "normal", + "gpus_per_node": 8, + "nodes": 1, + "time": "24:00:00", + "mail_user": "user@example.com", + "mail_type": "ALL" + } +} +``` + +**Common SLURM Options:** +- `partition`: SLURM partition name +- `account`: Billing account +- `qos`: Quality of Service +- `gpus_per_node`: Number of GPUs per node +- `nodes`: Number of nodes (for multi-node) +- `time`: Wall time limit (HH:MM:SS) +- `mem`: Memory per node (e.g., "64G") + +See [examples/slurm-configs/](../examples/slurm-configs/) for complete examples. + +### Multi-Node Training + +For distributed training across SLURM nodes: + +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 4, + "gpus_per_node": 8, + "time": "48:00:00" + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 4, + "nproc_per_node": 8 + } +} +``` + +SLURM automatically provides: +- Node list via `$SLURM_JOB_NODELIST` +- Master address detection +- Network interface configuration +- Rank assignment via `$SLURM_PROCID` + +### Monitoring + +```bash +# Check job queue +squeue -u $USER + +# Monitor job progress +squeue -j + +# View job details +scontrol show job + +# Check output logs +tail -f slurm-.out +``` + +### Cancellation + +```bash +# Cancel job +scancel + +# Cancel all your jobs +scancel -u $USER +``` + +## Deployment Comparison + +| Feature | Kubernetes | SLURM | +|---------|-----------|-------| +| **Environment** | Cloud, on-premise | HPC clusters | +| **Orchestration** | Automatic | Job scheduler | +| **Dependencies** | Python library (`kubernetes`) | CLI commands only | +| **Multi-node Setup** | Headless service + DNS | SLURM env vars | +| **Resource Management** | Declarative (YAML) | Batch script | +| **Best For** | Cloud deployments, microservices | Academic HPC, supercomputers | + +## Configuration Examples + +### Single-GPU Development (K8s) + +```json +{ + "k8s": { + "gpu_count": 1, + "namespace": "dev" + } +} +``` + +### Multi-GPU Training (K8s) + +```json +{ + "k8s": { + "gpu_count": 4, + "memory": "64Gi", + "cpu": "32" + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 4 + } +} +``` + +### Multi-Node Training (K8s) + +```json +{ + "k8s": { + "gpu_count": 8, + "namespace": "ml-training" + }, + "distributed": { + "launcher": "torchtitan", + "nnodes": 4, + "nproc_per_node": 8 + } +} +``` + +### Single-Node SLURM + +```json +{ + "slurm": { + "partition": "gpu", + "gpus_per_node": 8, + "time": "12:00:00" + } +} +``` + +### Multi-Node SLURM + +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 8, + "gpus_per_node": 8, + "time": "72:00:00", + "account": "research_proj" + }, + "distributed": { + "launcher": "deepspeed", + "nnodes": 8, + "nproc_per_node": 8 + } +} +``` + +## Troubleshooting + +### Kubernetes Issues + +**Image Pull Failures:** +```bash +# Check image exists +docker pull /: + +# Verify image pull secrets +kubectl get secrets -n your-namespace + +# Check pod events +kubectl describe pod -n your-namespace +``` + +**Resource Issues:** +```bash +# Check node resources +kubectl describe nodes | grep -A5 "Allocated resources" + +# Check GPU availability +kubectl get nodes -o custom-columns=NAME:.metadata.name,GPU:.status.capacity.'amd\.com/gpu' +``` + +### SLURM Issues + +**Job Pending:** +```bash +# Check reason +squeue -j -o "%.18i %.9P %.8j %.8u %.2t %.10M %.6D %R" + +# Check partition status +sinfo -p gpu +``` + +**Out of Resources:** +```bash +# Check available resources +sinfo -o "%P %.5a %.10l %.6D %.6t %N" + +# Adjust resource requests in config +``` + +## Best Practices + +### For Kubernetes + +1. Use minimal configs with intelligent defaults +2. Specify resource limits to prevent over-allocation +3. Use appropriate namespaces for isolation +4. Configure image pull policies based on registry location +5. Monitor pod resource usage with `kubectl top` + +### For SLURM + +1. Start with conservative time limits +2. Use appropriate QoS for priority +3. Monitor job efficiency with `seff ` +4. Use shared filesystem for input/output +5. Test with single node before scaling + +## Next Steps + +- [Distributed Launchers Guide](distributed-launchers.md) - Multi-node training frameworks +- [K8s Examples](../examples/k8s-configs/) - Complete Kubernetes configurations +- [SLURM Examples](../examples/slurm-configs/) - Complete SLURM configurations +- [User Guide](user-guide.md) - General usage instructions + diff --git a/docs/how-to-build.md b/docs/how-to-build.md deleted file mode 100644 index de3d4499..00000000 --- a/docs/how-to-build.md +++ /dev/null @@ -1,24 +0,0 @@ -# Build MADEngine - -Clone the madengine repository to your local machine and build it from source by following these steps: - -```shell -git clone git@github.com:ROCm/madengine.git - -# Change folder to madengine -cd madengine - -# Now run this command from the same directory where pyproject.toml is located: -pip install . -``` - -## Install from GitHub - -You can also directly install the madengine library from the repository. - -```shell -pip intall git+https://username:password@github.com/ROCm/madengine.git@main -``` - -After a successful installation, you can use `pip list`/`pip freeze` to verify that madengine was succesfully installed in your environment. -You can then use the madengine CLI to run containerized models from [MAD](https://github.com/ROCm/MAD). diff --git a/docs/how-to-collect-competitive-library-perf.md b/docs/how-to-collect-competitive-library-perf.md deleted file mode 100644 index 3622e663..00000000 --- a/docs/how-to-collect-competitive-library-perf.md +++ /dev/null @@ -1,31 +0,0 @@ - -# How to collect competitive library performance - -## Profile the AI Model - -The goal is to generate a list of library API config calls in a csv file (library_trace.csv). -See [How to profile a Model](how-to-profile-a-model.md) - -Examples: - -```shell -madengine run --tags pyt_torchvision_alexnet --additional-context "{'guest_os': 'UBUNTU', 'tools': [{'name':'miopen_trace'}] }" -madengine run --tags pyt_torchvision_alexnet --additional-context "{'guest_os': 'UBUNTU', 'tools': [{'name':'rocblas_trace'}] }" -``` -or alternatively, collect everything in one run - -```shell -madengine run --tags pyt_torchvision_alexnet --additional-context "{'guest_os': 'UBUNTU', 'tools': [{'name':'miopen_trace'},{'name':'rocblas_trace'}] }" -``` - -## Measure competitive library configuration performance - -Here, the library config trace collected in previous section is used to collect competitive performance. This section works the same on AMD and NVIDIA gpus. - -The code assumes library_trace.csv exists in root folder, and produces a library_perf.csv. - -Examples: - -```shell -madengine run --tags pyt_library_config_perf -``` diff --git a/docs/how-to-contribute.md b/docs/how-to-contribute.md deleted file mode 100644 index 51dc08c9..00000000 --- a/docs/how-to-contribute.md +++ /dev/null @@ -1,71 +0,0 @@ -# Contributing to madengine Library - -Thank you for your interest in contributing to our madengine library! We welcome all contributions, whether they are bug fixes, new features, or improvements to documentation. Please follow the steps below to get started: - -## Getting started - -1. Fork the Repository: Start by forking the repository on GitHub to your own account. - -2. Clone the Repository: Clone your forked repository to your local machine: - -```shell -git clone https://github.com/ROCm/madengine.git -cd madengine -``` - -3. Create a Branch: Create a new branch for your changes: - -```shell -git checkout -b feature-or-bugfix-name -``` - -4. Install Dependencies: Install madengine and required dependencies using `pip`: - -```shell -pip install -e .[dev] -``` - -## Making changes - -1. Implement Your Changes: Make your changes or add new features in the appropriate files. - -2. Write Tests: Ensure that you write tests for your changes. Place your test files in the `tests` directory. - -## Validating changes with `pytest` - -1. Install `pytest`: If you haven't already, install `pytest`: - -```shell -pip install pytest -``` - -2. Run Tests: Run the tests to validate your changes: - -```shell -pytest -``` - -3. Check Test Results: Ensure all tests pass. If any tests fail, debug and fix the issues. - -## Submitting your changes - -1. Commit Your Changes: Commit your changes with a meaningful commit message: - -```shell -git add . -git commit -m "Description of your changes" -``` - -2. Push to GitHub: Push your changes to your forked repository: - -```shell -git push origin feature-or-bugfix-name -``` - -3. Create a Pull Request: Go to the original repository on GitHub and create a pull request from your forked repository. Provide a clear description of your changes and any relevant information. - -## Review process - -Your pull request will be reviewed by the maintainers. They may request changes or provide feedback. Once your pull request is approved, it will be merged into the main branch. - -Thank you for your contribution! diff --git a/docs/how-to-profile-a-model.md b/docs/how-to-profile-a-model.md deleted file mode 100644 index 7ee05f25..00000000 --- a/docs/how-to-profile-a-model.md +++ /dev/null @@ -1,168 +0,0 @@ -# How to profile a Model - -madengine now supports several tools for profiling. This is provided via the `additional-context` option and the `additional-context-file`. (Given the complexity of these configuration snippets, we recommend to use the `additional-context-file`.) - -For example to use the `rocprof` tool, one just needs to provide a `additional-context-file` with the following: - -```json -{ - "tools": [{ - "name": "rocprof" - }] -} -``` - -This results in a file named `rocprof_output` which contains all the resulting profiling information. - -NOTE: This feature only supports profiling a single workload so the tag provided should be the workload's name (e.g. `pyt_torchvison_alexnet`) - -## Changing the default behavior - -Providing an `additional-context-file` with the contents above will use `rocprof` default behavior. The default behavior for supported tools can be found in `./scripts/common/tools.json`. There are two keys we can change that will modify a tool's behavior, namely `cmd` and `env_vars`. The `cmd` key's value will be the full command to be placed before the python command that runs our model. - -For example, we can change then default command of `rocprof` with the following: - -```json -{ - "tools": [{ - "name": "rocprof", - "cmd": "rocprof --timestamp on " - }] -} -``` - -The above configuration changes the default behavior to use `timestamp` instead of `hip-trace`. (NOTE: `rocprof` is a binary itself and so is required in our `cmd` value.) - -There is also support for setting tool specific environment variables. - -```json -{ - "tools": [{ - "name": "rocprof", - "env_vars": { - "NCCL_DEBUG": "INFO" - } - }] -} - -``` - -## Stackable design - -The profiling/tracing tools follow a stackable design, where multiple tools can be stacked on top of each other. The order in which the tools are specified is the same order in which the tools are applied, with the initial tool forming the innermost envelope around the workload, and the final tool forming the outermost envelope around the workload. - -In the example below, rocprof is the innermost tool, and miopen_trace is the outermost. During runtime, the outermost tool setup is done first, followed by innermost tool setup. Then, the workload is run. The innermost scaffold is deconstucted first, followed by outermost scaffold. - -```json -{ - "tools": [{ - "name": "rocprof" - }, - { - "name": "miopen_trace" - }] -} -``` - -## List of supported tools for profiling - -### rocprof -ROCprofiler can be used to profile the application, with the rocprof tool. - -```json -{ - "tools": [{ - "name": "rocprof" - }] -} -``` - -### rpd -This mode is used to profile using rpd. - -```json -{ - "tools": [{ - "name": "rpd" - }] -} -``` - -### rocblas_trace -This mode is used to trace rocBLAS calls within an application. The rocBLAS calls reside in the output log file. This tool also generates a library_trace csv file that contains the summary of library, configs. - -```json -{ - "tools": [{ - "name": "rocblas_trace" - }] -} -``` - -### miopen_trace -This mode is used to trace MIOpen calls within an application. The MIOpen calls reside in the output log file. This tool also generates a library_trace csv file that contains the summary of library, configs. - -```json -{ - "tools": [{ - "name": "miopen_trace" - }] -} -``` - -### tensile_trace -This mode is used to trace Tensile calls within an application. The Tensile calls reside in the output log file. This tool also generates a library_trace csv file that contains the summary of library, configs. - -```json -{ - "tools": [{ - "name": "tensile_trace" - }] -} -``` - -### rccl_trace -This mode is used to trace RCCL calls within an application. The RCCL calls reside in the output log file. - -```json -{ - "tools": [{ - "name": "rccl_trace" - }] -} -``` -### gpu_info_power_profiler & gpu_info_vram_profiler -For `gpu_info_power_profiler`: - -```json -{"tools": [{"name": "gpu_info_power_profiler"}]} -``` - -For `gpu_info_vram_profiler`: - -```json -{"tools": [{"name": "gpu_info_vram_profiler"}]} -``` - -Currently, `gpu_info_power_profiler` and `gpu_info_vram_profiler` supports ROCm and CUDA, and it profiles real-time power and vram consumption for the workloads. The ouput of the profile is a `gpu_info_power_profiler_output.csv`or `gpu_info_vram_profiler_output.csv`. - -The default `env_vars` for the `gpu_info_power_profiler` `gpu_info_vram_profiler` can be found in `madengine/scripts/common/tools.json`: - -```json -"env_vars": {"DEVICE":"0", "SAMPLING_RATE":"0.1", "MODE":"power", "DUAL-GCD":"false"} -``` - -These two profiling tools share the same backend and -- `DEVICE` can be `"all"` or a string of device index like `"0"` or `"0,1,2"`. When the `MODE` is `"power"`, the device must be a "master" GCD on an OAM (the profiler will issue an error if the device is a secondary die). The tool automatically filters out the "master" GCDs when the value of this field is `"all"`. -- `SAMPLING_RATE` is the sampling interval for the profiler in **seconds**. -- `MODE` supports `"power"` and `"vram"`. -- `DUAL-GCD` launches the same workload on two GCDs if value is "true" **and** the container got two GCDs; therefore, to enable `DUAL_GCD`, one needs to set `"n_gpus": "2"` for the model in `models.json`. - - -## For developers - -This functionality is provided by pre- and post-scripts, which initially sets up the tool and then saves the wanted information while also cleaning up. These scripts are found in `./scripts/common/pre_scripts` and `./scripts/common/post_scripts`. The end result, in some cases, will be a directory called `tool_name_output` and will contain all of the results. The pre-scripts will deal with initial setup and installation, while the post-scripts deals with saving to output directory and cleanup. - -The `./scripts/common/tools.json` file is where the tools default behavior is defined. See previous tools there for examples. - - diff --git a/docs/how-to-provide-contexts.md b/docs/how-to-provide-contexts.md deleted file mode 100644 index 89c33887..00000000 --- a/docs/how-to-provide-contexts.md +++ /dev/null @@ -1,158 +0,0 @@ - -# How to provide Contexts - -Each model in models.json specifies a `dockerfile` that represents a collection of Dockerfiles, that start with the string. All Dockerfiles have individual context, given by `# CONTEXT` comment in the header of file. madengine automatically detects the hardware context within which it runs. Examples of hardware contexts include Host Operating System or GPU vendor. - -The Dockerfile collection is filtered through the detected hardware contexts. For each Dockerfile context that exists in the detected contexts, the value is compared. All common values have to match for the Dockerfile to be selected. The model is run for all filtered Dockerfiles. - -Additional contexts may be specified through `--additional-context` argument. -For example, for models supporting both `'guest_os'` as UBUNTU and CENTOS, one may choose to run only the CENTOS image using `--additional-context "{'guest_os': 'CENTOS'}"'. Without this additional context, both UBUNTU and CENTOS images are used to run the model. - -Additional contexts may also be specified through a json file, given by `--additional-context-file` argument. -For example, for models supporting both `'guest_os'` as UBUNTU and CENTOS, one may choose to run only the CENTOS image using `--additional-context-file addln_ctx.json, where the contents of addln_ctx.json might be - -```json -{ - "guest_os": "CENTOS" -} -``` - -## Changing image from commandline or file - -The `--additional-context` and `--additional-context-file` can be used to pass in a user-provided image. - -```shell -madengine run --tags {model} --additional-context "{'MAD_CONTAINER_IMAGE': 'rocm/pytorch:my_local_tag'}" -``` - -or using file for `--additional-context-file` as - -```json -{ - "MAD_CONTAINER_IMAGE": "rocm/pytorch:my_local_tag" -} -``` - -## Changing base docker from commandline or file - -The `--additional-context` and `--additional-context-file` can be used to override `BASE_DOCKER` used in the `FROM` line of Dockerfiles. - -```shell -madengine run --tags {model} --additional-context "{'docker_build_arg':{'BASE_DOCKER':'compute-artifactory.amd.com:5000/...' }}" -``` - -or using file for `--additional-context-file` as - -```json -{ - "docker_build_arg": {"BASE_DOCKER": "compute-artifactory.amd.com:5000/..."} -} -``` - -## Providing environment variables to docker container - -The `--additional-context` and `--additional-context-file` can be used to provide environment variables to docker containers. - - ```shell -madengine run --tags {model} --additional-context "{'docker_env_vars':{'HSA_ENABLE_SDMA':'0'} }" -``` - -or using file for --additional-context-file as - -```json -{ - "docker_env_vars": {"HSA_ENABLE_SDMA": "0"} -} -``` - -There are also model-environment variables that one can change at madengine runtime. - -```json -{ - "docker_env_vars": {"MAD_MODEL_NUM_EPOCHS": "5"} -} -``` -This example will set the number of epochs to `5` for a particular model. Please see [How to add a Model](how-to-add-a-model.md) for the list of model-environment variables available. - -## Mounting host folders inside docker container -The `--additional-context` and `--additional-context-file` can be used to provide mount paths into docker containers. - - ```shell -madengine run --tags {model} --additional-context "{'docker_mounts':{'/data-path-inside-container':'/data-path-on-host'} }" -``` - -or using file for --additional-context-file as - -```json -{ - "docker_mounts": {"/data-path-inside-container": "/data-path-on-host"} -} -``` - -## Running pre/post model run scripts - -The `--additional-context` and `--additional-context-file` can be used to provide scripts to be run before and after the model run. Commands that encapsulate the model run script can also be provided. - -```shell ---additional-context "{'pre_scripts':[{'path':'your/path/to/pre_script.sh', 'args':'-r'}], 'encapsulate_script':'your/path/to/encapsulate_script.sh', 'post_scripts':[{'path':'your/path/to/post_script.sh', 'args':'-p'}]}" -``` - -or using file for --additional-context-file as - -```json -{ - "pre_scripts":[ - { - "path":"your/path/to/pre_script.sh", - "args":"-r" - } - ], - "encapsulate_script":"your/path/to/encapsulate_script.sh", - "post_scripts":[ - { - "path":"your/path/to/post_script.sh", - "args":"-p" - } - ] -} -``` - -These scripts have their respective directories `/scripts/common/pre_scripts/` and `/scripts/common/post_scripts/`, but it is not necessary to place them there. If you do decide - -to place them in these directories you will need to append their respective paths to your script name for the path variable(s) in the additional-context and additional-context-file. -Also note that you can run multiple post and pre scripts. - -## Selecting gpus and cpus within docker container -The `--additional-context` and `--additional-context-file` can be used to provide a sub-list of cpus or gpus, available within a container. - -The gpus/cpus are comma-separated, and ranges may be denoted with hyphen. - -```shell ---additional-context "{'docker_gpus':'0,2-4,5-5,7', 'docker_cpus':'14-18,32,44-44,62'}" -``` - -or using file for --additional-context-file as - -```json -{ - "docker_gpus":"0,2-4,5-5,7", - "docker_cpus":"14-18,32,44-44,62" -} -``` - -## Providing model script with arguments - -Given additional context can modify existing model arguments to dlm run script by adding "model_args" value -Note: the values given through "model_args" are dependant on arguments the selected run script is expecting - -```shell ---additional-context "{'model_args':'--model_name_or_path bigscience/bloom'}" -``` - -or using the file for --additional-context-file as - -```shell -{ - "model_args": "--model_name_or_path bigscience/bloom" -} -``` diff --git a/docs/how-to-quick-start.md b/docs/how-to-quick-start.md deleted file mode 100644 index 241c048b..00000000 --- a/docs/how-to-quick-start.md +++ /dev/null @@ -1,128 +0,0 @@ -# Quickstart - -Run madengine CLI on your local machine. - -```shell -(venv) test-node:~/MAD$ madengine --help -usage: madengine [-h] [-v] {run,discover,report,database} ... - -A Models automation and dashboarding command-line tool to run LLMs and Deep Learning models locally. - -optional arguments: - -h, --help show this help message and exit - -v, --version show program's version number and exit - -Commands: - Available commands for running models, generating reports, and toolings. - - {run,discover,report,database} - run Run models on container - discover Discover the models. - report Generate report of models - database CRUD for database -``` - -## Run models - -You can use `madengine run` to benchmark the training and inference performance of various LLM and Deep Learning models/frameworks listed in [MAD](https://github.com/ROCm/MAD). - -```shell -(venv) test-node:~/MAD$ madengine run --help -usage: madengine run [-h] [--tags TAGS [TAGS ...]] [--timeout TIMEOUT] [--live-output] [--clean-docker-cache] [--additional-context-file ADDITIONAL_CONTEXT_FILE] - [--additional-context ADDITIONAL_CONTEXT] [--data-config-file-name DATA_CONFIG_FILE_NAME] [--tools-json-file-name TOOLS_JSON_FILE_NAME] - [--generate-sys-env-details GENERATE_SYS_ENV_DETAILS] [--force-mirror-local FORCE_MIRROR_LOCAL] [--keep-alive] [--keep-model-dir] - [--skip-model-run] [--disable-skip-gpu-arch] [-o OUTPUT] - -Run LLMs and Deep Learning models on container - -optional arguments: - -h, --help show this help message and exit - --tags TAGS [TAGS ...] - tags to run (can be multiple). - --timeout TIMEOUT time out for model run in seconds; Overrides per-model timeout if specified or default timeout of 7200 (2 hrs). Timeout - of 0 will never timeout. - --live-output prints output in real-time directly on STDOUT - --clean-docker-cache rebuild docker image without using cache - --additional-context-file ADDITIONAL_CONTEXT_FILE - additonal context, as json file, to filter behavior of workloads. Overrides detected contexts. - --additional-context ADDITIONAL_CONTEXT - additional context, as string representation of python dict, to filter behavior of workloads. Overrides detected contexts and additional- - context-file. - --data-config-file-name DATA_CONFIG_FILE_NAME - custom data configuration file. - --tools-json-file-name TOOLS_JSON_FILE_NAME - custom tools json configuration file. - --generate-sys-env-details GENERATE_SYS_ENV_DETAILS - generate system config env details by default - --force-mirror-local FORCE_MIRROR_LOCAL - Path to force all relevant dataproviders to mirror data locally on. - --keep-alive keep Docker container alive after run; will keep model directory after run - --keep-model-dir keep model directory after run - --skip-model-run skips running the model; will not keep model directory after run unless specified through keep-alive or keep-model-dir - --disable-skip-gpu-arch - disables skipping model based on gpu architecture - -o OUTPUT, --output OUTPUT - output file -``` - -A CLI example to run a model (See pyt_huggingface_bert in https://github.com/ROCm/MAD/models.json): - -```shell -madengine run --tags pyt_huggingface_bert --live-output --additional-context "{'guest_os': 'UBUNTU'}" -``` - -## Generate perf reports - -Commands for generating reports. - -```shell -(venv) test-node:~/MAD$ madengine report --help -usage: madengine report [-h] {update-perf,to-html,to-email} ... - -optional arguments: - -h, --help show this help message and exit - -Report Commands: - Available commands for generating reports. - - {update-perf,to-html,to-email} - update-perf Update perf.csv to database - to-html Convert CSV to HTML report of models - to-email Convert CSV to Email of models -``` - -## Database - -Commands for database, such as create and update table of DB. - -```shell -(venv) test-node:~/MAD$ madengine database --help -usage: madengine database [-h] {create-table,update-table,upload-mongodb} ... - -optional arguments: - -h, --help show this help message and exit - -Database Commands: - Available commands for database, such as creating and updating table in DB. - - {create-table,update-table,upload-mongodb} - create-table Create table in DB - update-table Update table in DB - upload-mongodb Update table in DB -``` - -## Tools in madengine - -There are some additional tools packaged with madengine. They work with madengine CLI to profile GPU usage and get trace of ROCm libraries. - -An example of profiling GPU usage with [rocprof](https://rocm.docs.amd.com/projects/rocprofiler/en/latest/). - -```shell -madengine run --tags pyt_huggingface_bert --additional-context "{'guest_os': 'UBUNTU','tools': [{'name':'rocprof'}]}" -``` - -An example of tracing library usage with [rocblas](https://rocm.docs.amd.com/projects/rocBLAS/en/latest/reference/logging.html). - -```shell -madengine run --tags pyt_huggingface_bert --additional-context "{'guest_os': 'UBUNTU','tools': [{'name':'rocblas_trace'}]}" -``` \ No newline at end of file diff --git a/docs/how-to-run-multi-node.md b/docs/how-to-run-multi-node.md deleted file mode 100644 index 5c84e6cf..00000000 --- a/docs/how-to-run-multi-node.md +++ /dev/null @@ -1,91 +0,0 @@ -# How to Run Mulit-Node - -**NOTE: all of the commands/examples shown below are only showing the multi-node arguments - you will probably need to add the other arguments for your run on top of these.** - -## Multi-Node Runners - -There are two mulit-node `RUNNER`s in DLM/MAD, namely `torchrun` and `mpirun` (coming soon). Each of these `RUNNER`s are enabled in the model's bash script via the environment variable `MAD_MULTI_NODE_RUNNER`. For example in the `pyt_megatron_lm_train_llama2_7b` script, this feature is enabled with the following code - -```bash -run_cmd=" - $MAD_MULTI_NODE_RUNNER \ - $TRAIN_SCRIPT \ - $GPT_ARGS \ - $DATA_ARGS \ - $OUTPUT_ARGS \ - $EXTRA_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH -" -``` - -Note the use of the `$MAD_MULTI_NODE_RUNNER` environment variable. This environment variable will be expanded into which ever `RUNNER` is chosen at DLM/MAD runtime. - -### torchrun - -Default `RUNNER` is `torchrun` , `MASTER_ADDR` is `localhost` , `NNODES` is 1 , `NODE_RANK` is 0, additional context `multi_node_args` is not necessary to run on single node - -```bash -madengine run --tags {model} -``` - -#### Two-Node Example - -Using the `torchrun` `RUNNER` requires you to execute the DLM/MAD CLI command on each node manually. `NCCL_SOCKET_IFNAME` , `GLOO_SOCKET_IFNAME` needs to be set using `ifconfig` from `net-tools` - -```bash -apt install net-tools -``` - -So let's assume the first node is our "master" node and has an IP=10.227.23.63 - -On first node, run the following: - -```bash -madengine run --tags {model} --additional-context "{'multi_node_args': {'RUNNER': 'torchrun', 'MASTER_ADDR': '10.227.23.63', 'MASTER_PORT': '400', 'NNODES': '2', 'NODE_RANK': '0'}}" -``` - -On the second node, run the following: - -```bash -madengine run --tags {model} --additional-context "{'multi_node_args':{'RUNNER': 'torchrun', 'MASTER_ADDR': '10.227.23.63', 'MASTER_PORT': '400', 'NNODES': '2', 'NODE_RANK': '1'}}" -``` - -### mpirun - -Coming Soon! - -## Sharing Data - -DLM/MAD multi-node feature assumes the dataset is in a shared-file system for all participating nodes. For example, look at the following 2-node run of the Megatron-LM Llama2 workload. - -On the first node (assumed to be master node), run the following: - -```bash -madengine run --tags pyt_megatron_lm_train_llama2_7b --additional-context "{'multi_node_args': {'RUNNER': 'torchrun', 'MASTER_ADDR': '10.194.129.113', 'MASTER_PORT': '4000', 'NNODES': '2', 'NODE_RANK': '0', 'NCCL_SOCKET_IFNAME': 'ens14np0', 'GLOO_SOCKET_IFNAME': 'ens14np0'}}" --force-mirror-local /nfs/data -``` - -On the second node, run the following: - -```bash -madengine run --tags pyt_megatron_lm_train_llama2_7b --additional-context "{'multi_node_args': {'RUNNER': 'torchrun', 'MASTER_ADDR': '10.194.129.113', 'MASTER_PORT': '4000', 'NNODES': '2', 'NODE_RANK': '1', 'NCCL_SOCKET_IFNAME': 'ens14np0', 'GLOO_SOCKET_IFNAME': 'ens14np0'}}" --force-mirror-local /nfs/data -``` - -You can see at the end of these commands, we are pointing DLM/MAD to the shared-file system where the data can be located. - -**NOTE: The above commands assumes the shared-file system is mounted at `/nfs` in the commands above. If this is not the case and a user simply copies/pastes the above commands on two nodes, DLM/MAD will create a folder called `nfs` on each node and copy the data there, which is not desired behavior.** - -## TODO - -### RUNNER - -- [ ] mpirun (requires ansible integration) - -### Job Schedulare - -- [ ] SLURM -- [ ] Kubernetes - -### Design Consideration - -- [ ] Having the python model script launched by individual bash scripts can be limiting for multi-node. Perhaps we can explore a full python workflow for multi-node and only the job scheduler uses a bash script like SLURM using sbatch script. diff --git a/docs/img/architecture_overview.png b/docs/img/architecture_overview.png new file mode 100755 index 00000000..7bf972b3 Binary files /dev/null and b/docs/img/architecture_overview.png differ diff --git a/docs/img/distributed_workflow.png b/docs/img/distributed_workflow.png new file mode 100755 index 00000000..a6723b44 Binary files /dev/null and b/docs/img/distributed_workflow.png differ diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 00000000..d3f79b85 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,156 @@ +# Installation Guide + +Complete installation instructions for madengine. + +## Prerequisites + +- **Python 3.8+** with pip +- **Docker** with GPU support (ROCm for AMD, CUDA for NVIDIA) +- **Git** for repository management +- **MAD package** - Required for model discovery and execution + +## Quick Install + +### From GitHub + +```bash +# Basic installation +pip install git+https://github.com/ROCm/madengine.git + +# With Kubernetes support +pip install "madengine[kubernetes] @ git+https://github.com/ROCm/madengine.git" + +# With all optional dependencies +pip install "madengine[all] @ git+https://github.com/ROCm/madengine.git" +``` + +### Development Installation + +```bash +# Clone repository +git clone https://github.com/ROCm/madengine.git +cd madengine + +# Create virtual environment (recommended) +python3 -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install in editable mode with dev dependencies +pip install -e ".[dev]" + +# Setup pre-commit hooks (optional, for contributors) +pre-commit install +``` + +## Optional Dependencies + +| Extra | Install Command | Use Case | +|-------|----------------|----------| +| `kubernetes` | `pip install madengine[kubernetes]` | Kubernetes deployment support | +| `dev` | `pip install madengine[dev]` | Development tools (pytest, black, mypy, etc.) | +| `all` | `pip install madengine[all]` | All optional dependencies | + +**Note**: SLURM deployment requires no additional Python dependencies (uses CLI commands). + +## MAD Package Setup + +madengine requires the MAD package for model definitions and execution scripts. + +```bash +# Clone MAD package +git clone https://github.com/ROCm/MAD.git +cd MAD + +# Install madengine within MAD directory +pip install git+https://github.com/ROCm/madengine.git + +# Verify installation +madengine --version +madengine discover # Test model discovery +``` + +## Docker GPU Setup + +### AMD ROCm + +```bash +# Test ROCm GPU access +docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video \ + rocm/pytorch:latest rocm-smi + +# Verify with madengine +madengine run --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +``` + +### NVIDIA CUDA + +```bash +# Test CUDA GPU access +docker run --rm --gpus all nvidia/cuda:latest nvidia-smi + +# Verify with madengine +madengine run --tags dummy \ + --additional-context '{"gpu_vendor": "NVIDIA", "guest_os": "UBUNTU"}' +``` + +## Verify Installation + +```bash +# Check installation +madengine --version +madengine --version + +# Test basic functionality (requires MAD package) +cd /path/to/MAD +madengine discover --tags dummy +madengine run --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +``` + +## Troubleshooting + +### Import Errors + +If you get import errors, ensure your virtual environment is activated and madengine is installed: + +```bash +pip list | grep madengine +``` + +### Docker Permission Issues + +If you encounter Docker permission errors: + +```bash +# Add user to docker group (Linux) +sudo usermod -aG docker $USER +newgrp docker +``` + +### ROCm GPU Not Detected + +```bash +# Check ROCm installation +rocm-smi + +# Verify devices are accessible +ls -la /dev/kfd /dev/dri +``` + +### MAD Package Not Found + +Ensure you're running madengine commands from within a MAD package directory: + +```bash +cd /path/to/MAD +export MODEL_DIR=$(pwd) +madengine discover +``` + +## Next Steps + +- [User Guide](user-guide.md) - Learn how to use madengine +- [Deployment Guide](deployment.md) - Deploy to Kubernetes or SLURM +- [Quick Start](how-to-quick-start.md) - Run your first model + diff --git a/docs/launchers.md b/docs/launchers.md new file mode 100644 index 00000000..b4ae7d34 --- /dev/null +++ b/docs/launchers.md @@ -0,0 +1,791 @@ +# Distributed Launchers Guide + +Complete reference for all distributed execution launchers supported by madengine. + +--- + +## Overview + +madengine provides unified support for multiple distributed frameworks, enabling seamless execution across training and inference workloads on both Kubernetes and SLURM clusters. + +### Supported Launchers + +| Launcher | Type | Use Case | K8s | SLURM | Multi-Node | +|----------|------|----------|-----|-------|------------| +| **torchrun** | Training | PyTorch DDP/FSDP training | ✅ | ✅ | ✅ | +| **DeepSpeed** | Training | ZeRO optimization training | ✅ | ✅ | ✅ | +| **Megatron-LM** | Training | Large-scale transformer training | ✅ | ✅ | ✅ | +| **TorchTitan** | Training | LLM pre-training (FSDP2+TP+PP) | ✅ | ✅ | ✅ | +| **vLLM** | Inference | High-throughput LLM serving | ✅ | ✅ | ✅ | +| **SGLang** | Inference | Fast LLM inference | ✅ | ✅ | ✅ | +| **SGLang Disaggregated** | Inference | Large-scale disaggregated inference | ✅ | ✅ | ✅ (min 3) | + +--- + +## Quick Start + +### Basic Configuration + +```json +{ + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 8 + } +} +``` + +### Deployment + +```bash +# Build with configuration +madengine build --tags my_model \ + --additional-context-file config.json + +# Deploy to K8s or SLURM +madengine run --manifest-file build_manifest.json +``` + +--- + +## Launcher Details + +### 1. torchrun (PyTorch Distributed) + +**Purpose**: Standard PyTorch distributed training with DDP/FSDP + +**When to Use**: +- ✅ Multi-GPU/multi-node PyTorch training +- ✅ Data Parallel or Fully Sharded Data Parallel +- ✅ Standard distributed training patterns + +**Configuration**: +```json +{ + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 8, + "master_port": 29500 + } +} +``` + +**Features**: +- Automatic rank assignment +- NCCL backend for GPU communication +- Elastic training support +- Compatible with all PyTorch models + +**Examples**: +- K8s: `examples/k8s-configs/minimal/torchrun-multi-gpu-minimal.json` +- SLURM: `examples/slurm-configs/minimal/torchrun-multi-node-minimal.json` + +--- + +### 2. DeepSpeed + +**Purpose**: Memory-efficient training with ZeRO optimization + +**When to Use**: +- ✅ Large models that don't fit in GPU memory +- ✅ ZeRO optimization stages (ZeRO-1, ZeRO-2, ZeRO-3) +- ✅ Gradient accumulation and mixed precision + +**Configuration**: +```json +{ + "distributed": { + "launcher": "deepspeed", + "nnodes": 4, + "nproc_per_node": 8 + } +} +``` + +**Features**: +- ZeRO memory optimization +- Pipeline parallelism +- Gradient accumulation +- Mixed precision training +- Automatic hostfile generation (K8s) + +**Architecture**: +- Uses its own launcher (not torchrun) +- Manages process spawning internally +- Requires DeepSpeed config file in model script + +**Examples**: +- SLURM: `examples/slurm-configs/basic/04-multi-node-advanced.json` + +--- + +### 3. Megatron-LM + +**Purpose**: Large-scale transformer model training + +**When to Use**: +- ✅ GPT, BERT, T5 style transformers +- ✅ Tensor and pipeline parallelism +- ✅ Very large models (70B+ parameters) + +**Configuration**: +```json +{ + "distributed": { + "launcher": "megatron", + "nnodes": 4, + "nproc_per_node": 8 + } +} +``` + +**Features**: +- Tensor parallelism across GPUs +- Pipeline parallelism across nodes +- Optimized for transformer architectures +- Built on top of torchrun +- Automatic TP/PP size configuration + +**Availability**: +- ✅ K8s: Fully supported (dedicated launcher) +- ✅ SLURM: Fully supported + +**Examples**: +- K8s: `examples/k8s-configs/minimal/megatron-lm-minimal.json` +- K8s Multi-node: `examples/k8s-configs/basic/megatron-lm-multi-node-basic.json` +- SLURM: `examples/slurm-configs/minimal/megatron-lm-minimal.json` +- SLURM Multi-node: `examples/slurm-configs/basic/09-megatron-lm-multi-node.json` + +**Environment Variables** (automatically set by launcher): +```bash +# Megatron-Core standard variables +TENSOR_MODEL_PARALLEL_SIZE # Tensor parallelism (GPUs per node) +PIPELINE_MODEL_PARALLEL_SIZE # Pipeline parallelism (typically = nnodes) +CONTEXT_PARALLEL_SIZE # Context parallelism (default: 1) +``` + +**Note**: The launcher automatically configures: +- Single-node: TP only (PP=1) +- Multi-node: TP across GPUs + PP across nodes + +--- + +### 4. TorchTitan + +**Purpose**: Production LLM pre-training with multi-dimensional parallelism + +**Reference**: [pytorch/torchtitan](https://github.com/pytorch/torchtitan) + +**When to Use**: +- ✅ Llama 3.1 (8B to 405B) pre-training +- ✅ Multi-dimensional parallelism (FSDP2 + TP + PP + CP) +- ✅ Production-scale LLM training + +**Configuration**: +```json +{ + "distributed": { + "launcher": "torchtitan", + "nnodes": 4, + "nproc_per_node": 8 + } +} +``` + +**Parallelism Strategies**: +- **FSDP2**: Fully Sharded Data Parallel v2 for parameter sharding +- **TP**: Tensor Parallel - split model layers across GPUs +- **PP**: Pipeline Parallel - split model stages across nodes +- **CP**: Context Parallel - distributed context processing + +**Features**: +- Uses torchrun as underlying launcher +- Configured via TOML files +- Automatic parallelism detection +- Float8 and MXFP8 support +- Gradient accumulation +- Distributed checkpointing + +**Environment Variables**: +```bash +TORCHTITAN_TENSOR_PARALLEL_SIZE=8 +TORCHTITAN_PIPELINE_PARALLEL_SIZE=4 +TORCHTITAN_FSDP_ENABLED=1 +TORCHTITAN_CONTEXT_PARALLEL_SIZE=1 +``` + +**Single vs Multi-Node**: +- Single-node: TP only across GPUs +- Multi-node: TP + PP + FSDP2 combined + +**Examples**: +- K8s: `examples/k8s-configs/minimal/torchtitan-single-node-minimal.json` +- SLURM: `examples/slurm-configs/minimal/torchtitan-single-node-minimal.json` + +**Model Configuration** (TOML): +```toml +[model] +name = "llama3" +flavor = "8B" + +[training] +tensor_parallel_degree = 8 +pipeline_parallel_degree = 1 +batch_size = 1 +seq_len = 8192 +``` + +--- + +### 5. vLLM + +**Purpose**: High-throughput LLM inference serving + +**Reference**: [vllm-project/vllm](https://github.com/vllm-project/vllm) + +**When to Use**: +- ✅ LLM inference with high throughput +- ✅ Continuous batching +- ✅ PagedAttention for memory efficiency + +**Configuration**: +```json +{ + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4 + } +} +``` + +**Features**: +- Continuous batching for high throughput +- PagedAttention memory optimization +- Tensor Parallelism support +- Ray for distributed coordination +- No torchrun needed (manages own processes) + +**Architecture**: +- Single-node: TP across GPUs, no Ray +- Multi-node (K8s): Data Parallelism with independent replicas per pod +- Multi-node (SLURM): TP + PP with Ray cluster + +**Environment Variables**: +```bash +VLLM_TENSOR_PARALLEL_SIZE=4 +VLLM_PIPELINE_PARALLEL_SIZE=1 +VLLM_DISTRIBUTED_BACKEND="auto" # or "ray" for multi-node +``` + +**Examples**: +- K8s: `examples/k8s-configs/minimal/vllm-single-node-minimal.json` +- SLURM: `examples/slurm-configs/minimal/vllm-single-node-minimal.json` + +--- + +### 6. SGLang + +**Purpose**: Fast LLM inference with structured generation + +**Reference**: [sgl-project/sglang](https://github.com/sgl-project/sglang) + +**When to Use**: +- ✅ Structured LLM generation +- ✅ Fast inference with caching +- ✅ OpenAI-compatible API + +**Configuration**: +```json +{ + "distributed": { + "launcher": "sglang", + "nnodes": 2, + "nproc_per_node": 4 + } +} +``` + +**Features**: +- Native launcher (sglang.launch_server) +- RadixAttention for prefix caching +- Tensor Parallelism +- Ray for distributed execution +- No torchrun needed + +**Architecture**: +- Single-node: TP across GPUs +- Multi-node: Native multi-node support with Ray + +**Environment Variables**: +```bash +SGLANG_TENSOR_PARALLEL_SIZE=4 +SGLANG_PIPELINE_PARALLEL_SIZE=1 +``` + +**Examples**: +- K8s: `examples/k8s-configs/minimal/sglang-single-node-minimal.json` +- SLURM: `examples/slurm-configs/basic/07-sglang-single-node.json` + +--- + +### 7. SGLang Disaggregated (NEW!) + +**Purpose**: Large-scale disaggregated LLM inference with specialized prefill/decode clusters + +**Reference**: [sgl-project/sglang](https://github.com/sgl-project/sglang) | [Mooncake Framework](https://github.com/kvcache-ai/Mooncake) + +**When to Use**: +- ✅ Large-scale LLM inference (multi-node clusters) +- ✅ Optimized resource allocation (separate prefill/decode) +- ✅ High-throughput production deployments +- ✅ Workload-specific optimization (tune prefill/decode ratio) + +**Architecture**: + +SGLang Disaggregated separates inference into specialized node pools: + +``` +┌─────────────────────────────────────────────────┐ +│ SGLang Disaggregated Cluster │ +├─────────────────────────────────────────────────┤ +│ Node 0: Proxy (Load Balancer) │ +│ Nodes 1-P: Prefill Servers (~40%) │ +│ Nodes P+1-N: Decode Servers (~60%) │ +│ │ +│ Communication: Mooncake (KV cache transfer) │ +└─────────────────────────────────────────────────┘ +``` + +**Configuration**: + +```json +{ + "distributed": { + "launcher": "sglang-disagg", + "nnodes": 5, + "nproc_per_node": 8, + "sglang_disagg": { + "prefill_nodes": 2, + "decode_nodes": 2 + } + } +} +``` + +**Minimum Requirements**: +- **Nodes**: Minimum 3 nodes (1 proxy + 1 prefill + 1 decode) +- **GPUs**: Minimum 1 GPU per node (for tensor parallelism) +- **Network**: High-speed interconnect (InfiniBand recommended for production) + +**Node Roles**: +1. **Proxy Node (Rank 0)**: Load balancer, request router (mini_lb) +2. **Prefill Nodes**: Process input prompts, generate KV cache +3. **Decode Nodes**: Receive KV cache, generate output tokens + +**Automatic Split (Default)**: +- Uses 40/60 golden ratio for prefill/decode +- Formula: `prefill = max(1, (nnodes - 1) * 2 // 5)` + +| Total Nodes | Proxy | Prefill | Decode | +|-------------|-------|---------|--------| +| 3 | 1 | 1 (33%) | 1 (33%) | +| 5 | 1 | 2 (40%) | 2 (40%) | +| 7 | 1 | 2 (29%) | 4 (57%) | +| 11 | 1 | 4 (40%) | 6 (60%) | + +**Custom Split (NEW Feature!)**: + +Override automatic split based on workload characteristics: + +```json +{ + "distributed": { + "launcher": "sglang-disagg", + "nnodes": 7, + "nproc_per_node": 8, + "sglang_disagg": { + "prefill_nodes": 4, + "decode_nodes": 2 + } + } +} +``` + +**Custom Split Use Cases**: + +| Workload Type | Recommended Split | Example (7 nodes) | +|---------------|------------------|-------------------| +| Long prompts (code gen) | 60% prefill | `prefill: 4, decode: 2` | +| Long outputs (creative) | 30% prefill | `prefill: 2, decode: 4` | +| Balanced (default) | 40% prefill | Omit sglang_disagg | +| Document processing | 50% prefill | `prefill: 3, decode: 3` | + +**Validation Rules**: +- `prefill_nodes >= 1` +- `decode_nodes >= 1` +- `prefill_nodes + decode_nodes + 1 == nnodes` + +**Features**: +- Disaggregated prefill/decode architecture +- Mooncake framework for KV cache transfer +- Automatic or custom node role assignment +- RadixAttention for KV cache efficiency +- Ray cluster coordination +- No torchrun needed (manages own processes) + +**Environment Variables (K8s)**: +```bash +POD_INDEX=${JOB_COMPLETION_INDEX} # Pod index for role assignment +TOTAL_PODS=5 # Total number of pods +PREFILL_COUNT=2 # Number of prefill nodes +DECODE_COUNT=2 # Number of decode nodes +TP_SIZE=8 # Tensor parallel size +``` + +**Environment Variables (SLURM)**: +```bash +SGLANG_DISAGG_MODE="enabled" +SGLANG_DISAGG_PREFILL_NODES=2 +SGLANG_DISAGG_DECODE_NODES=2 +SGLANG_DISAGG_TOTAL_NODES=5 +SGLANG_TP_SIZE=8 +SGLANG_NODE_RANK=${SLURM_PROCID} +SGLANG_NODE_IPS="10.0.0.1,10.0.0.2,..." +``` + +**Examples**: +- K8s Minimal: `examples/k8s-configs/minimal/sglang-disagg-minimal.json` +- K8s Basic: `examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json` +- K8s Custom: `examples/k8s-configs/basic/sglang-disagg-custom-split.json` +- SLURM Minimal: `examples/slurm-configs/minimal/sglang-disagg-minimal.json` +- SLURM Basic: `examples/slurm-configs/basic/sglang-disagg-multi-node.json` +- SLURM Custom: `examples/slurm-configs/basic/sglang-disagg-custom-split.json` + +**Comparison: SGLang vs SGLang Disaggregated**: + +| Feature | SGLang | SGLang Disaggregated | +|---------|--------|---------------------| +| **Architecture** | Unified | Separated prefill/decode | +| **Min Nodes** | 1 | 3 | +| **Node Types** | Same for all | Specialized (proxy/prefill/decode) | +| **KV Transfer** | In-memory | Mooncake framework | +| **Load Balancer** | Ray | mini_lb (dedicated) | +| **Best For** | General inference | Large-scale clusters | +| **Optimization** | General | Workload-specific tuning | + +**Production Considerations**: +1. **Install Mooncake**: Full framework with RDMA support +2. **Configure Network**: InfiniBand/RoCE for high-speed KV transfer +3. **Setup etcd**: For distributed coordination +4. **Monitor Metrics**: Track prefill latency, decode throughput, queue depths +5. **Tune Split**: Adjust prefill/decode ratio based on workload + +**Performance Tuning**: +```bash +# Start with automatic split +madengine run --tags model --config minimal-config.json + +# Monitor bottleneck (prefill latency vs decode throughput) +# If prefill is bottleneck → increase prefill nodes +# If decode is bottleneck → increase decode nodes + +# Apply custom split +madengine run --tags model --config custom-split-config.json +``` + +**Troubleshooting**: + +1. **"requires minimum 3 nodes"** + - Solution: Set `nnodes >= 3` + +2. **"prefill_nodes + decode_nodes + 1 must equal nnodes"** + - Solution: Verify math in custom split configuration + +3. **Pod/Node stuck in Init** + - K8s: Check headless service creation + - SLURM: Verify node IP discovery + +4. **High KV cache transfer latency** + - Enable RDMA/InfiniBand + - Configure Mooncake transfer backend + - Check network connectivity + +--- + +## Comparison Matrix + +### Training Launchers + +| Feature | torchrun | DeepSpeed | Megatron-LM | TorchTitan | +|---------|----------|-----------|-------------|------------| +| **Data Parallel** | ✅ DDP | ✅ ZeRO | ✅ | ✅ FSDP2 | +| **Tensor Parallel** | ❌ | ❌ | ✅ | ✅ | +| **Pipeline Parallel** | ❌ | ✅ | ✅ | ✅ | +| **Memory Efficiency** | Medium | High (ZeRO) | High | Very High | +| **Ease of Use** | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐ | ⭐⭐⭐⭐ | +| **Model Size** | Small-Medium | Medium-Large | Very Large | Very Large | +| **K8s Support** | ✅ | ✅ | ❌ | ✅ | +| **SLURM Support** | ✅ | ✅ | ✅ | ✅ | + +### Inference Launchers + +| Feature | vLLM | SGLang | SGLang Disaggregated | +|---------|------|--------|----------------------| +| **Throughput** | Very High | High | Very High | +| **Memory Efficiency** | PagedAttention | RadixAttention | RadixAttention + Mooncake | +| **Batching** | Continuous | Continuous | Continuous | +| **API** | OpenAI-compatible | OpenAI-compatible | OpenAI-compatible | +| **Structured Gen** | Limited | ✅ Native | ✅ Native | +| **Multi-Node** | ✅ Ray | ✅ Ray | ✅ Ray + mini_lb | +| **Architecture** | Unified | Unified | Disaggregated | +| **Min Nodes** | 1 | 1 | 3 | +| **Specialization** | ❌ | ❌ | ✅ Prefill/Decode | +| **Custom Split** | ❌ | ❌ | ✅ | +| **K8s Support** | ✅ | ✅ | ✅ | +| **SLURM Support** | ✅ | ✅ | ✅ | + +--- + +## Configuration Best Practices + +### 1. Launcher Selection + +**Training Workloads**: +``` +Small models (< 1B) → torchrun +Medium models (1B-10B) → DeepSpeed or torchrun +Large models (10B-70B) → TorchTitan or Megatron-LM +Very large (70B+) → TorchTitan with full parallelism +``` + +**Inference Workloads**: +``` +High throughput → vLLM or SGLang Disaggregated +Structured generation → SGLang or SGLang Disaggregated +Memory constrained → vLLM (PagedAttention) +Large-scale clusters (5+) → SGLang Disaggregated +Workload-specific tuning → SGLang Disaggregated +``` + +### 2. Resource Allocation + +**GPU Count Guidelines**: +```json +{ + "k8s": { + "gpu_count": 8 // Matches nproc_per_node + }, + "distributed": { + "nnodes": 4, + "nproc_per_node": 8 // Total: 32 GPUs + } +} +``` + +**Memory Recommendations**: +- torchrun: 16GB per GPU minimum +- DeepSpeed: 32GB per GPU (ZeRO-3) +- TorchTitan: 64GB+ per GPU (large models) +- vLLM: 32GB per GPU (depends on model size) + +### 3. Multi-Node Setup + +**Kubernetes**: +- Automatic headless service creation +- Pod discovery via DNS +- Uses `JOB_COMPLETION_INDEX` for rank + +**SLURM**: +- Uses SLURM environment variables +- Automatic node discovery +- Network interface configuration + +--- + +## Environment Variables + +### Common Variables (All Launchers) + +```bash +NNODES=4 # Number of nodes +NPROC_PER_NODE=8 # GPUs per node +NODE_RANK=0 # Current node rank (0-based) +MASTER_ADDR=master.local # Master node address +MASTER_PORT=29500 # Master communication port +``` + +### Launcher-Specific + +**torchrun**: +```bash +MAD_MULTI_NODE_RUNNER="torchrun --nnodes=4 --nproc_per_node=8 ..." +``` + +**DeepSpeed**: +```bash +MAD_MULTI_NODE_RUNNER="deepspeed --num_gpus=8 --hostfile=/tmp/hostfile ..." +``` + +**Megatron-LM**: +```bash +# Megatron-Core standard environment variables +TENSOR_MODEL_PARALLEL_SIZE=8 # Tensor parallelism size +PIPELINE_MODEL_PARALLEL_SIZE=4 # Pipeline parallelism size +CONTEXT_PARALLEL_SIZE=1 # Context parallelism size +MAD_MULTI_NODE_RUNNER="torchrun ..." # Uses torchrun (SLURM only) +``` + +**TorchTitan**: +```bash +TORCHTITAN_TENSOR_PARALLEL_SIZE=8 +TORCHTITAN_PIPELINE_PARALLEL_SIZE=4 +TORCHTITAN_FSDP_ENABLED=1 +MAD_MULTI_NODE_RUNNER="torchrun ..." +``` + +**vLLM**: +```bash +VLLM_TENSOR_PARALLEL_SIZE=4 +VLLM_DISTRIBUTED_BACKEND="ray" +# No MAD_MULTI_NODE_RUNNER (vLLM manages processes) +``` + +**SGLang**: +```bash +SGLANG_TENSOR_PARALLEL_SIZE=4 +NCCL_INIT_ADDR="master:29500" +# No MAD_MULTI_NODE_RUNNER (SGLang manages processes) +``` + +**SGLang Disaggregated**: +```bash +SGLANG_DISAGG_MODE="enabled" +SGLANG_DISAGG_PREFILL_NODES=2 +SGLANG_DISAGG_DECODE_NODES=2 +SGLANG_DISAGG_TOTAL_NODES=5 +SGLANG_TP_SIZE=8 +SGLANG_NODE_RANK=${SLURM_PROCID} +# No MAD_MULTI_NODE_RUNNER (SGLang disagg manages processes) +``` + +--- + +## Troubleshooting + +### Common Issues + +**1. Launcher Not Found** +```bash +Error: Unknown launcher type 'xyz' +``` +Solution: Use one of: `torchrun`, `deepspeed`, `megatron`, `torchtitan`, `vllm`, `sglang`, `sglang-disagg` + +**2. Multi-Node Communication Fails** +```bash +Error: Connection timeout to master node +``` +Solutions: +- Check network connectivity between nodes +- Verify `MASTER_ADDR` is correct +- Ensure firewall allows `MASTER_PORT` +- For K8s: Check headless service created + +**3. GPU Visibility Issues** +```bash +Error: Expected 8 GPUs but found 0 +``` +Solutions: +- Verify `gpu_count` matches `nproc_per_node` +- Check GPU resource name (`amd.com/gpu` vs `nvidia.com/gpu`) +- Ensure ROCm/CUDA drivers installed + +**4. Ray Cluster Issues (vLLM/SGLang)** +```bash +Error: Ray cluster failed to start +``` +Solutions: +- Clean existing Ray processes: `ray stop --force` +- Check port 6379 is available +- Verify network interface configuration +- For multi-node: ensure pods can communicate + +--- + +## Advanced Topics + +### Custom Launcher Scripts + +madengine provides `$MAD_MULTI_NODE_RUNNER` for frameworks that use torchrun: + +```bash +#!/bin/bash +# Your model script + +# For torchrun/deepspeed/megatron/torchtitan +$MAD_MULTI_NODE_RUNNER your_training_script.py --args + +# For vLLM/sglang (no MAD_MULTI_NODE_RUNNER) +python your_inference_script.py --args +``` + +### Launcher Detection + +madengine automatically: +1. Detects launcher from `distributed.launcher` field +2. Sets up appropriate environment variables +3. Generates launcher-specific commands +4. Creates multi-node infrastructure (K8s services, SLURM env) + +### Performance Optimization + +**AMD MI300X**: +```json +{ + "context": { + "env_vars": { + "PYTORCH_TUNABLEOP_ENABLED": "1", + "NCCL_IB_DISABLE": "0", + "NCCL_NET_GDR_LEVEL": "5" + } + } +} +``` + +**NVIDIA H100/A100**: +```json +{ + "context": { + "env_vars": { + "NCCL_ALGO": "Ring", + "NCCL_PROTO": "Simple", + "CUDA_DEVICE_MAX_CONNECTIONS": "1" + } + } +} +``` + +--- + +## References + +### Official Documentation +- [PyTorch Distributed](https://pytorch.org/tutorials/beginner/dist_overview.html) +- [DeepSpeed](https://www.deepspeed.ai/) +- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) +- [TorchTitan](https://github.com/pytorch/torchtitan) +- [vLLM](https://docs.vllm.ai/) +- [SGLang](https://github.com/sgl-project/sglang) + +### madengine Documentation +- [K8s Configuration Guide](../examples/k8s-configs/README.md) +- [SLURM Configuration Guide](../examples/slurm-configs/README.md) +- [How to Run Multi-Node](how-to-run-multi-node.md) + +### Example Configurations +- [K8s Examples](../examples/k8s-configs/) +- [SLURM Examples](../examples/slurm-configs/) +- [Test Fixtures](../tests/fixtures/dummy/) + diff --git a/docs/profiling.md b/docs/profiling.md new file mode 100644 index 00000000..89dfde6b --- /dev/null +++ b/docs/profiling.md @@ -0,0 +1,974 @@ +# Profiling Guide + +Complete guide to profiling model performance and analyzing library calls with madengine. + +## Overview + +madengine integrates multiple profiling and tracing tools to analyze GPU usage, library calls, and system performance. Tools are configured via `--additional-context` and applied in a stackable design pattern. + +## Quick Start + +### Basic GPU Profiling + +```bash +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [{"name": "rocprof"}] + }' +``` + +**Output:** `rocprof_output/` directory with profiling results + +### Using Configuration Files + +For complex profiling setups, use configuration files: + +**profiling-config.json:** +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + {"name": "rocprof"} + ] +} +``` + +```bash +madengine run --tags model --additional-context-file profiling-config.json +``` + +## Profiling Tools + +### rocprof - GPU Profiling + +Profile GPU kernels and HIP API calls: + +```json +{ + "tools": [ + {"name": "rocprof"} + ] +} +``` + +**Default Behavior:** HIP trace mode +**Output:** `rocprof_output/` directory + +**Custom Configuration:** +```json +{ + "tools": [ + { + "name": "rocprof", + "cmd": "rocprof --timestamp on", + "env_vars": { + "NCCL_DEBUG": "INFO" + } + } + ] +} +``` + +#### ROCm Profiler Version Compatibility + +madengine uses `rocprof_wrapper.sh` to automatically handle the transition between rocprof (legacy) and rocprofv3: + +| ROCm Version | Profiler Used | Command Syntax | +|--------------|---------------|----------------| +| ROCm < 7.0 | rocprof (legacy) | `rocprof [options] ` | +| ROCm >= 7.0 | rocprofv3 (preferred) | `rocprofv3 [options] -- ` | + +**Key Points:** + +1. **Automatic Detection:** The wrapper detects which profiler is available and uses the appropriate syntax +2. **Separator Requirement:** When using custom commands with `rocprof_wrapper.sh`, always include the trailing `--`: + ```json + { + "name": "rocprof", + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --sys-trace --" + } + ``` +3. **Backward Compatibility:** The `--` works with both rocprof and rocprofv3, ensuring your configurations work across ROCm versions + +**Example - Custom Command with Wrapper:** +```json +{ + "tools": [ + { + "name": "rocprof", + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --sys-trace --", + "env_vars": { + "HSA_ENABLE_SDMA": "0" + } + } + ] +} +``` + +### rpd - ROCm Profiler Data + +Collect comprehensive ROCm profiling data: + +```json +{ + "tools": [ + {"name": "rpd"} + ] +} +``` + +**Output:** ROCm profiler data files + +### ROCprofv3 - Advanced GPU Profiling + +ROCprofv3 is the next-generation profiler for ROCm 7.0+ with enhanced features and better performance. madengine provides pre-configured profiles for common bottleneck scenarios. + +#### Available ROCprofv3 Profiles + +**Compute-Bound Analysis:** +```json +{ + "tools": [ + {"name": "rocprofv3_compute"} + ] +} +``` +- **Use Case**: Models bottlenecked by ALU operations +- **Metrics**: Wave execution, VALU/SALU instructions, wait states +- **Output Format**: Perfetto trace with hardware counters + +**Memory-Bound Analysis:** +```json +{ + "tools": [ + {"name": "rocprofv3_memory"} + ] +} +``` +- **Use Case**: Models bottlenecked by memory bandwidth +- **Metrics**: Cache hits/misses, memory transfers, LDS usage +- **Output Format**: Perfetto trace with memory counters + +**Communication-Bound Analysis (Multi-GPU):** +```json +{ + "tools": [ + {"name": "rocprofv3_communication"} + ] +} +``` +- **Use Case**: Multi-GPU distributed training +- **Metrics**: RCCL traces, inter-GPU transfers, synchronization +- **Output Format**: Perfetto trace with RCCL data + +**Comprehensive Profiling:** +```json +{ + "tools": [ + {"name": "rocprofv3_full"} + ] +} +``` +- **Use Case**: Complete analysis with all metrics (high overhead) +- **Metrics**: All traces + counters + stats +- **Output Format**: Perfetto trace with full instrumentation + +**Lightweight Profiling:** +```json +{ + "tools": [ + {"name": "rocprofv3_lightweight"} + ] +} +``` +- **Use Case**: Production-like profiling with minimal overhead +- **Metrics**: HIP and kernel traces only +- **Output Format**: JSON (compact) + +**Perfetto Visualization:** +```json +{ + "tools": [ + {"name": "rocprofv3_perfetto"} + ] +} +``` +- **Use Case**: Generate Perfetto-compatible traces +- **Metrics**: HIP, kernel, memory traces +- **Output Format**: Perfetto trace file (`.pftrace`) +- **View at**: https://ui.perfetto.dev/ + +**API Overhead Analysis:** +```json +{ + "tools": [ + {"name": "rocprofv3_api_overhead"} + ] +} +``` +- **Use Case**: Analyze HIP/HSA API call overhead +- **Metrics**: API call timing and statistics +- **Output Format**: JSON with stats + +**PC Sampling (Hotspot Analysis):** +```json +{ + "tools": [ + {"name": "rocprofv3_pc_sampling"} + ] +} +``` +- **Use Case**: Identify kernel hotspots +- **Metrics**: Program counter sampling at 1000 Hz +- **Output Format**: Perfetto trace with PC samples + +#### Using Pre-Configured Profiles + +madengine provides ready-to-use configuration files in `examples/profiling-configs/`: + +```bash +# Compute-bound profiling +madengine run --tags your_model \ + --additional-context-file examples/profiling-configs/rocprofv3_compute_bound.json + +# Memory-bound profiling +madengine run --tags your_model \ + --additional-context-file examples/profiling-configs/rocprofv3_memory_bound.json + +# Multi-GPU profiling +madengine run --tags your_model \ + --additional-context-file examples/profiling-configs/rocprofv3_multi_gpu.json + +# Comprehensive profiling +madengine run --tags your_model \ + --additional-context-file examples/profiling-configs/rocprofv3_comprehensive.json +``` + +See `examples/profiling-configs/README.md` for complete documentation. + +#### Custom ROCprofv3 Commands + +For advanced users, customize rocprofv3 invocation: + +```json +{ + "tools": [ + { + "name": "rocprof", + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --kernel-trace --memory-copy-trace --rccl-trace --counter-collection -i custom_counters.txt --output-format pftrace --stats -d ./my_output --", + "env_vars": { + "RCCL_DEBUG": "TRACE", + "HSA_ENABLE_SDMA": "0" + } + } + ] +} +``` + +**Important:** The `--` separator at the end of the `cmd` string is **required** when using `rocprof_wrapper.sh`. This separator distinguishes between profiler options and the application command: + +- **rocprofv3 (ROCm >= 7.0):** Requires `--` separator → `rocprofv3 [options] -- ` +- **rocprof (legacy):** Works with or without `--` → `rocprof [options] ` + +The wrapper auto-detects which profiler is available and formats arguments correctly. Always include the trailing `--` in your custom commands to ensure compatibility with both versions. + +#### Hardware Counter Collection + +Custom counter files are in `scripts/common/tools/counters/`: +- `compute_bound.txt` - ALU and execution metrics +- `memory_bound.txt` - Cache and memory metrics +- `communication_bound.txt` - PCIe and synchronization metrics +- `full_profile.txt` - Comprehensive metrics + +Create your own counter file: +```text +# my_counters.txt +pmc: SQ_WAVES +pmc: SQ_INSTS_VALU +pmc: L2CacheHit +pmc: TCC_HIT_sum +``` + +Then use it: +```bash +madengine run --tags your_model \ + --additional-context '{ + "tools": [{ + "name": "rocprof", + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --counter-collection -i my_counters.txt --output-format pftrace -d ./output --" + }] + }' +``` + +### rocblas_trace - rocBLAS Library Tracing + +Trace rocBLAS API calls and configurations: + +```json +{ + "tools": [ + {"name": "rocblas_trace"} + ] +} +``` + +**Output:** +- Trace logs in execution output +- `library_trace.csv` with library call summary + +**Use Case:** Analyze BLAS operations, identify optimization opportunities + +### miopen_trace - MIOpen Library Tracing + +Trace MIOpen API calls for deep learning operations: + +```json +{ + "tools": [ + {"name": "miopen_trace"} + ] +} +``` + +**Output:** +- Trace logs in execution output +- `library_trace.csv` with convolution, pooling, and other DNN operations + +**Use Case:** Optimize deep learning layers, analyze convolution configurations + +### tensile_trace - Tensile Library Tracing + +Trace Tensile matrix operations: + +```json +{ + "tools": [ + {"name": "tensile_trace"} + ] +} +``` + +**Output:** +- Trace logs in execution output +- `library_trace.csv` with matrix operation details + +**Use Case:** Analyze GEMM operations, optimize matrix multiplications + +### rccl_trace - RCCL Communication Tracing + +Trace RCCL collective communication operations: + +```json +{ + "tools": [ + {"name": "rccl_trace"} + ] +} +``` + +**Output:** Trace logs with communication patterns + +**Use Case:** Debug multi-GPU communication, optimize distributed training + +### gpu_info_power_profiler - Power Consumption + +Profile real-time GPU power consumption: + +```json +{ + "tools": [ + {"name": "gpu_info_power_profiler"} + ] +} +``` + +**Output:** `gpu_info_power_profiler_output.csv` + +**Configuration:** +```json +{ + "tools": [ + { + "name": "gpu_info_power_profiler", + "env_vars": { + "POWER_DEVICE": "0", + "POWER_SAMPLING_RATE": "0.1" + } + } + ] +} +``` + +**Environment Variables:** +- `POWER_DEVICE` - GPU device(s): `"0"`, `"0,1,2"`, or `"all"` (default: `"all"`) +- `POWER_SAMPLING_RATE` - Sampling interval in seconds (default: `"0.1"`) +- `POWER_MODE` - Must be `"power"` for this tool (default: `"power"`) +- `POWER_DUAL_GCD` - Enable dual-GCD mode: `"true"` or `"false"` (default: `"false"`) + +**Note:** To customize, override in tools configuration: +```json +{ + "tools": [ + { + "name": "gpu_info_power_profiler", + "env_vars": { + "POWER_DEVICE": "0,1", + "POWER_SAMPLING_RATE": "0.2" + } + } + ] +} +``` + +**Supported Platforms:** ROCm and CUDA + +### gpu_info_vram_profiler - VRAM Usage + +Profile real-time GPU memory consumption: + +```json +{ + "tools": [ + {"name": "gpu_info_vram_profiler"} + ] +} +``` + +**Output:** `gpu_info_vram_profiler_output.csv` + +**Configuration:** +```json +{ + "tools": [ + { + "name": "gpu_info_vram_profiler", + "env_vars": { + "VRAM_DEVICE": "all", + "VRAM_SAMPLING_RATE": "0.5" + } + } + ] +} +``` + +**Environment Variables:** +- `VRAM_DEVICE` - GPU device(s): `"0"`, `"0,1,2"`, or `"all"` (default: `"all"`) +- `VRAM_SAMPLING_RATE` - Sampling interval in seconds (default: `"0.1"`) +- `VRAM_MODE` - Must be `"vram"` for this tool (default: `"vram"`) +- `VRAM_DUAL_GCD` - Enable dual-GCD mode: `"true"` or `"false"` (default: `"false"`) + +**Using Both Profilers Together:** +```json +{ + "tools": [ + {"name": "gpu_info_power_profiler"}, + {"name": "gpu_info_vram_profiler"} + ] +} +``` +This will generate both `gpu_info_power_profiler_output.csv` and `gpu_info_vram_profiler_output.csv`. +- `SAMPLING_RATE` - Sampling interval in seconds +- `MODE` - Must be `"vram"` for this tool +- `DUAL-GCD` - Enable dual-GCD mode + +**Supported Platforms:** ROCm and CUDA + +## Stackable Design + +Tools can be stacked to collect multiple types of profiling data simultaneously. Tools are applied in order, with the first tool being innermost: + +```json +{ + "tools": [ + {"name": "rocprof"}, + {"name": "miopen_trace"}, + {"name": "rocblas_trace"} + ] +} +``` + +**Execution Order:** +1. **Setup:** rocblas_trace → miopen_trace → rocprof +2. **Run:** Model execution +3. **Teardown:** rocprof → miopen_trace → rocblas_trace + +**Example:** +```bash +madengine run --tags pyt_torchvision_alexnet \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + {"name": "rocprof"}, + {"name": "miopen_trace"} + ] + }' +``` + +## Competitive Library Performance Analysis + +### Overview + +Analyze and compare performance of different library configurations by: +1. Collecting library call traces +2. Measuring performance of different configurations +3. Comparing competitive implementations + +### Step 1: Collect Library Traces + +Collect library API call traces: + +```bash +# Trace MIOpen calls +madengine run --tags pyt_torchvision_alexnet \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [{"name": "miopen_trace"}] + }' + +# Trace rocBLAS calls +madengine run --tags pyt_torchvision_alexnet \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [{"name": "rocblas_trace"}] + }' +``` + +Or collect both in one run: + +```bash +madengine run --tags pyt_torchvision_alexnet \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + {"name": "miopen_trace"}, + {"name": "rocblas_trace"} + ] + }' +``` + +**Output:** `library_trace.csv` containing library calls and configurations + +### Step 2: Measure Library Configuration Performance + +Use the collected traces to benchmark different library configurations: + +```bash +madengine run --tags pyt_library_config_perf +``` + +**Prerequisites:** +- `library_trace.csv` must exist in the current directory +- Contains library call configurations from Step 1 + +**Output:** `library_perf.csv` with performance data for each configuration + +**Platform Support:** Works on both AMD and NVIDIA GPUs + +### Step 3: Analysis + +Compare results from `library_perf.csv` to: +- Identify optimal library configurations +- Compare performance across different implementations +- Validate optimization opportunities + +## Common Usage Patterns + +### Full Performance Analysis + +```bash +# Step 1: Collect comprehensive traces +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + {"name": "rocprof"}, + {"name": "gpu_info_power_profiler"}, + {"name": "gpu_info_vram_profiler"} + ] + }' + +# Step 2: Analyze results +ls -lh rocprof_output/ +cat gpu_info_power_profiler_output.csv +cat gpu_info_vram_profiler_output.csv +``` + +### Library Optimization Workflow + +```bash +# 1. Profile current implementation +madengine run --tags model \ + --additional-context '{"tools": [{"name": "miopen_trace"}]}' + +# 2. Test library configurations +madengine run --tags pyt_library_config_perf + +# 3. Analyze and compare +python analyze_library_perf.py library_perf.csv +``` + +### Multi-GPU Profiling + +```bash +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "0,1,2,3", + "tools": [ + { + "name": "gpu_info_power_profiler", + "env_vars": { + "DEVICE": "all", + "SAMPLING_RATE": "0.1" + } + }, + {"name": "rccl_trace"} + ] + }' +``` + +## Output Files Reference + +| Tool | Output File(s) | Content | +|------|---------------|---------| +| `rocprof` | `rocprof_output/*` | GPU kernel traces, HIP API calls | +| `rpd` | Various RPD files | ROCm profiler data | +| `rocblas_trace` | `library_trace.csv`, logs | rocBLAS API calls | +| `miopen_trace` | `library_trace.csv`, logs | MIOpen API calls | +| `tensile_trace` | `library_trace.csv`, logs | Tensile operations | +| `rccl_trace` | Execution logs | RCCL communication | +| `gpu_info_power_profiler` | `gpu_info_power_profiler_output.csv` | Power consumption over time | +| `gpu_info_vram_profiler` | `gpu_info_vram_profiler_output.csv` | VRAM usage over time | + +## Tool Configuration Options + +All tools support these configuration keys: + +### cmd - Custom Command + +Override the default profiling command: + +```json +{ + "tools": [ + { + "name": "rocprof", + "cmd": "rocprof --timestamp on --hip-trace" + } + ] +} +``` + +**Note:** Tool binary name must be included in custom commands. + +### env_vars - Environment Variables + +Set tool-specific environment variables: + +```json +{ + "tools": [ + { + "name": "rocprof", + "env_vars": { + "NCCL_DEBUG": "INFO", + "HSA_ENABLE_SDMA": "0" + } + } + ] +} +``` + +## Best Practices + +### 1. Profile Single Workloads + +Profiling works best with single model tags: + +```bash +# Good +madengine run --tags pyt_torchvision_alexnet \ + --additional-context '{"tools": [{"name": "rocprof"}]}' + +# Avoid +madengine run --tags model1 model2 model3 \ + --additional-context '{"tools": [{"name": "rocprof"}]}' +``` + +### 2. Use Configuration Files + +For complex profiling setups: + +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + { + "name": "rocprof", + "cmd": "rocprof --timestamp on" + }, + { + "name": "gpu_info_power_profiler", + "env_vars": { + "POWER_DEVICE": "all", + "POWER_SAMPLING_RATE": "0.1" + } + } + ] +} +``` + +### 3. Optimize Sampling Rates + +Balance detail vs. overhead: + +```json +{ + "tools": [ + { + "name": "gpu_info_power_profiler", + "env_vars": { + "SAMPLING_RATE": "1.0" // Less overhead, less detail + } + } + ] +} +``` + +### 4. Stack Related Tools + +Group related profiling tools: + +```json +{ + "tools": [ + {"name": "miopen_trace"}, + {"name": "rocblas_trace"}, + {"name": "tensile_trace"} + ] +} +``` + +### 5. Separate Profiling Runs + +For performance-critical profiling: + +```bash +# Baseline run (no profiling) +madengine run --tags model + +# Profiling run +madengine run --tags model \ + --additional-context '{"tools": [{"name": "rocprof"}]}' +``` + +## Troubleshooting + +### Profiling Tool Not Found + +**Error:** Tool binary not available + +**Solution:** +```bash +# Verify tool is installed +which rocprof +which rocblas-bench + +# Check container has tools +docker run --rm rocm/pytorch:latest which rocprof +``` + +### Empty Output Files + +**Error:** Profiling produces empty results + +**Causes:** +- Model execution too fast +- Incorrect device selection +- Tool configuration error + +**Solutions:** +- Increase workload size +- Verify GPU device IDs +- Check tool logs for errors + +### High Profiling Overhead + +**Error:** Profiling significantly slows execution + +**Solutions:** +- Reduce sampling rate +- Use fewer stacked tools +- Profile subset of execution +- Use targeted profiling + +### library_trace.csv Not Generated + +**Error:** Library trace file missing + +**Causes:** +- No library calls made +- Tool not properly initialized +- Output directory permission issues + +**Solutions:** +- Verify model uses the library (e.g., uses convolutions for MIOpen) +- Check execution logs for errors +- Verify write permissions + +### False Failure Detection with ROCProf + +**Issue:** Model runs marked as FAILURE despite successful execution + +**Symptoms:** +- Status shows FAILURE but performance metrics are reported +- Log contains ROCProf messages like `E20251230 ... Opened result file` +- Error pattern `Error:` detected in logs + +**Root Cause:** +ROCProf uses glog-style logging where `E` prefix means "Error level log" (not an actual error). These informational messages were incorrectly triggering failure detection. + +**Fixed in:** madengine v2.0+ + +**Verification:** +```bash +# Run with profiling - should show SUCCESS status +madengine run --tags pyt_huggingface_gpt2 \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [{"name": "rocprof"}, {"name": "rpd"}] + }' + +# Check status in output +# ✅ Expected: Status = SUCCESS, Performance = ~38-40 samples/second +``` + +**Technical Details:** +- ROCProf log patterns now excluded from error detection +- Error patterns made more specific (e.g., `RuntimeError:` vs `Error:`) +- Performance extraction hardened against bash segfaults during profiling +- Tests: `pytest tests/unit/test_error_handling.py::TestErrorPatternMatching` + +## Developer Information + +### Tool Implementation + +Profiling functionality is implemented via pre/post scripts: + +**Location:** +- Pre-scripts: `scripts/common/pre_scripts/` +- Post-scripts: `scripts/common/post_scripts/` + +**Workflow:** +1. Pre-script: Tool setup and initialization +2. Model execution: Tool collects data +3. Post-script: Save results, cleanup + +### Default Tool Configuration + +Tool defaults are defined in `scripts/common/tools.json`: + +```json +{ + "rocprof": { + "cmd": "rocprof --hip-trace", + "env_vars": {} + }, + "gpu_info_power_profiler": { + "env_vars": { + "DEVICE": "0", + "SAMPLING_RATE": "0.1", + "MODE": "power", + "DUAL-GCD": "false" + } + } +} +``` + +### Adding Custom Tools + +To add new profiling tools: + +1. Create pre-script: `scripts/common/pre_scripts/tool_name_pre.sh` +2. Create post-script: `scripts/common/post_scripts/tool_name_post.sh` +3. Add default config to `scripts/common/tools.json` +4. Test with madengine + +## Environment Validation Tools + +### TheRock Detection + +Validate [TheRock](https://github.com/ROCm/TheRock) ROCm installations before running models. TheRock is AMD's lightweight build system for HIP and ROCm, distributed via Python pip packages. + +**Enable TheRock validation:** + +```bash +madengine run --tags dummy_therock \ + --tools therock_check \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +``` + +**Standalone detection:** + +```bash +# Shell script (quick check) +bash src/madengine/scripts/common/tools/detect_therock.sh + +# Python script (detailed output) +python3 src/madengine/scripts/common/tools/therock_detector.py --verbose + +# JSON output (for scripting) +python3 src/madengine/scripts/common/tools/therock_detector.py --json +``` + +**Detection methods:** +- Python pip installations (`~/.local/lib/python*/site-packages/rocm`) +- Virtual environments with rocm packages +- System packages (`/usr/lib/python*/site-packages/rocm`) +- Tarball installations +- Local build directories +- Environment variables (`ROCM_PATH`, `HIP_PATH`) + +**Configuration in tools.json:** + +```json +{ + "therock_check": { + "pre_scripts": [ + { + "path": "scripts/common/tools/detect_therock.sh" + } + ], + "cmd": "", + "env_vars": {}, + "post_scripts": [] + } +} +``` + +**Features:** +- Non-blocking validation (warnings only) +- Automatic integration in `dummy_therock` model +- Reports GPU targets and installation paths +- Exit code 0 = found, 1 = not found + +**Resources:** +- [TheRock GitHub](https://github.com/ROCm/TheRock) +- [TheRock Releases](https://github.com/ROCm/TheRock/blob/main/RELEASES.md) + +## Next Steps + +- [Configuration Guide](configuration.md) - Detailed profiling configuration +- [Usage Guide](usage.md) - Running models with profiling +- [Deployment Guide](deployment.md) - Profiling in distributed environments diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 00000000..85989c2e --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,698 @@ +# Usage Guide + +Complete guide to using madengine for running AI models locally and in distributed environments. + +> **📖 Quick Reference:** For detailed command options and flags, see the **[CLI Command Reference](cli-reference.md)**. + +## Quick Start + +### Prerequisites + +- Python 3.8+ with madengine installed +- Docker with GPU support +- MAD package cloned locally + +```bash +git clone https://github.com/ROCm/MAD.git +cd MAD +pip install git+https://github.com/ROCm/madengine.git +``` + +### Your First Model + +```bash +# Discover models +madengine discover --tags dummy + +# Run locally +madengine run --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +``` + +Results are saved to `perf_entry.csv`. + +## Commands Overview + +madengine provides five main commands: + +| Command | Purpose | Common Options | +|---------|---------|----------------| +| `discover` | Find available models | `--tags`, `--verbose` | +| `build` | Build Docker images | `--tags`, `--registry`, `--batch-manifest` | +| `run` | Execute models | `--tags`, `--manifest-file`, `--timeout` | +| `report` | Generate HTML reports | `to-html`, `to-email` | +| `database` | Upload to MongoDB | `--csv-file`, `--database-name` | + +For complete command options and detailed examples, see **[CLI Command Reference](cli-reference.md)**. + +### Quick Command Examples + +```bash +# Discover models +madengine discover --tags dummy + +# Build image +madengine build --tags model \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Run model +madengine run --tags model \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Generate HTML report +madengine report to-html --csv-file perf_entry.csv + +# Upload to MongoDB +madengine database --csv-file perf_entry.csv \ + --database-name mydb --collection-name results +``` + +## Model Discovery + +madengine supports three discovery methods: + +### 1. Root Models (models.json) + +Central model definitions in MAD package root: + +```bash +madengine discover --tags dummy pyt_huggingface_bert +``` + +### 2. Directory-Specific Models + +Models organized in subdirectories (`scripts/{dir}/models.json`): + +```bash +madengine discover --tags dummy2:dummy_2 +``` + +### 3. Dynamic Models with Parameters + +Python-generated models (`scripts/{dir}/get_models_json.py`): + +```bash +madengine discover --tags dummy3:dummy_3:batch_size=512:in=32 +``` + +## Build Workflow + +### Basic Build + +Create Docker images and manifest: + +```bash +madengine build --tags model \ + --registry localhost:5000 \ + --additional-context-file config.json +``` + +Creates `build_manifest.json`: + +```json +{ + "models": [ + { + "model_name": "my_model", + "image": "localhost:5000/my_model:20240115_123456", + "tag": "my_model" + } + ], + "registry": "localhost:5000", + "build_timestamp": "2024-01-15T12:34:56Z" +} +``` + +### Build with Deployment Config + +Include deployment configuration: + +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "k8s": { + "gpu_count": 2, + "namespace": "ml-team" + } +} +``` + +```bash +madengine build --tags model \ + --registry docker.io/myorg \ + --additional-context-file k8s-config.json +``` + +The deployment config is saved in `build_manifest.json` and used during run phase. + +### Registry Authentication + +Configure in `credential.json` (MAD package root): + +```json +{ + "dockerhub": { + "username": "your_username", + "password": "your_token", + "repository": "myorg" + } +} +``` + +Or use environment variables: + +```bash +export MAD_DOCKERHUB_USER=your_username +export MAD_DOCKERHUB_PASSWORD=your_token +export MAD_DOCKERHUB_REPO=myorg +``` + +### Batch Build Mode + +Batch build mode enables selective builds with per-model configuration, ideal for CI/CD pipelines where you need fine-grained control over which models to rebuild. + +#### Batch Manifest Format + +Create a JSON file (e.g., `batch.json`) with a list of model entries: + +```json +[ + { + "model_name": "model1", + "build_new": true, + "registry": "my-registry.com", + "registry_image": "custom-namespace/model1" + }, + { + "model_name": "model2", + "build_new": false, + "registry": "my-registry.com", + "registry_image": "custom-namespace/model2" + }, + { + "model_name": "model3", + "build_new": true + } +] +``` + +**Fields:** +- `model_name` (required): Model tag to include +- `build_new` (optional, default: false): If true, build this model; if false, reference existing image +- `registry` (optional): Per-model registry override +- `registry_image` (optional): Custom registry image name/namespace + +#### Usage Example + +```bash +# Basic batch build +madengine build --batch-manifest batch.json \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# With global registry (can be overridden per model) +madengine build --batch-manifest batch.json \ + --registry localhost:5000 \ + --additional-context-file config.json + +# Verbose output +madengine build --batch-manifest batch.json \ + --registry my-registry.com \ + --verbose +``` + +#### Key Features + +**Selective Building**: Only models with `"build_new": true` are built. Models with `"build_new": false` are added to the output manifest without building, useful for referencing existing images. + +**Per-Model Registry Override**: Each model can specify its own `registry` and `registry_image`, overriding the global `--registry` flag. + +**Mutually Exclusive**: Cannot use `--batch-manifest` and `--tags` together. + +#### Use Cases + +**CI/CD Incremental Builds**: +```json +[ + {"model_name": "changed_model", "build_new": true}, + {"model_name": "unchanged_model1", "build_new": false}, + {"model_name": "unchanged_model2", "build_new": false} +] +``` + +**Multi-Registry Deployment**: +```json +[ + { + "model_name": "public_model", + "build_new": true, + "registry": "docker.io/myorg" + }, + { + "model_name": "private_model", + "build_new": true, + "registry": "gcr.io/myproject" + } +] +``` + +**Development vs Production**: +```json +[ + { + "model_name": "dev_model", + "build_new": true, + "registry": "localhost:5000" + }, + { + "model_name": "prod_model", + "build_new": false, + "registry": "prod-registry.com", + "registry_image": "production/model" + } +] +``` + +## Run Workflow + +### Local Execution + +Run on local machine: + +```bash +madengine run --tags model \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +``` + +**Required for Local:** +- `gpu_vendor`: "AMD", "NVIDIA" +- `guest_os`: "UBUNTU", "CENTOS" + +### Deploy to Kubernetes + +```bash +# Build phase +madengine build --tags model \ + --registry gcr.io/myproject \ + --additional-context '{"k8s": {"gpu_count": 2}}' + +# Deploy phase +madengine run --manifest-file build_manifest.json +``` + +Deployment target is automatically detected from `k8s` key in configuration. + +### Deploy to SLURM + +```bash +# Build phase (local or CI) +madengine build --tags model \ + --registry my-registry.io \ + --additional-context '{"slurm": {"partition": "gpu", "gpus_per_node": 4}}' + +# Deploy phase (on SLURM login node) +ssh user@hpc-login.example.com +madengine run --manifest-file build_manifest.json +``` + +Deployment target is automatically detected from `slurm` key in configuration. + +## Common Usage Patterns + +### Configuration Files + +Use configuration files for complex settings: + +**config.json:** +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "timeout_multiplier": 2.0, + "docker_env_vars": { + "PYTORCH_TUNABLEOP_ENABLED": "1", + "HSA_ENABLE_SDMA": "0" + } +} +``` + +```bash +madengine run --tags model --additional-context-file config.json +``` + +### Custom Timeouts + +```bash +# Override default timeout +madengine run --tags model --timeout 7200 + +# No timeout (run indefinitely) +madengine run --tags model --timeout 0 +``` + +### Debugging + +```bash +# Keep containers alive +madengine run --tags model --keep-alive + +# Verbose output +madengine run --tags model --verbose --live-output + +# Both +madengine run --tags model --keep-alive --verbose --live-output +``` + +### Clean Rebuild + +```bash +# Rebuild without Docker cache +madengine build --tags model --clean-docker-cache +``` + +## Performance Profiling + +Profile GPU usage and library calls: + +```bash +# GPU profiling +madengine run --tags model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [{"name": "rocprof"}] + }' + +# Library tracing +madengine run --tags model \ + --additional-context '{"tools": [{"name": "rocblas_trace"}]}' + +# Multiple tools (stackable) +madengine run --tags model \ + --additional-context '{"tools": [ + {"name": "rocprof"}, + {"name": "miopen_trace"} + ]}' +``` + +See [Profiling Guide](profiling.md) and [CLI Reference - run command](cli-reference.md#run---execute-models) for details. + +## Reporting and Database Integration + +### Generate HTML Reports + +Convert performance CSV files to viewable HTML reports: + +```bash +# Single CSV to HTML +madengine report to-html --csv-file perf_entry.csv + +# Result: Creates perf_entry.html in same directory +``` + +### Consolidated Email Reports + +Generate a single HTML report from multiple CSV files: + +```bash +# Process all CSV files in current directory +madengine report to-email + +# Specify directory +madengine report to-email --directory ./results + +# Custom output filename +madengine report to-email --dir ./results --output weekly_summary.html +``` + +**Use Cases:** +- Weekly performance summaries +- CI/CD result reports +- Team email distributions +- Performance trend analysis + +### Upload to MongoDB + +Store performance data in MongoDB for long-term tracking: + +```bash +# Configure MongoDB connection +export MONGO_HOST=mongodb.example.com +export MONGO_PORT=27017 +export MONGO_USER=performance_user +export MONGO_PASSWORD=secretpassword + +# Upload results +madengine database \ + --csv-file perf_entry.csv \ + --database-name performance_tracking \ + --collection-name model_runs + +# Upload specific results +madengine database \ + --csv-file results/perf_mi300.csv \ + --db benchmarks \ + --collection mi300_results +``` + +**Integration Workflow:** + +```bash +# 1. Run benchmarks +madengine run --tags model1 model2 model3 \ + --output perf_entry.csv + +# 2. Generate HTML report +madengine report to-html --csv-file perf_entry.csv + +# 3. Upload to database +madengine database \ + --csv-file perf_entry.csv \ + --db benchmarks \ + --collection daily_runs + +# 4. Send email report +madengine report to-email --output daily_summary.html +# (Then use your email tool to send daily_summary.html) +``` + +See [CLI Reference](cli-reference.md#report---generate-reports) and [CLI Reference](cli-reference.md#database---upload-to-mongodb) for complete options. + +## Multi-Node Training + +Configure distributed training: + +```json +{ + "k8s": { + "gpu_count": 8 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 4 + } +} +``` + +**Supported Launchers:** +- `torchrun` - PyTorch DDP/FSDP +- `deepspeed` - ZeRO optimization +- `megatron` - Large transformers (K8s + SLURM) +- `torchtitan` - LLM pre-training +- `vllm` - LLM inference +- `sglang` - Structured generation + +See [Launchers Guide](launchers.md) for details. + +## Output and Results + +### Performance CSV + +Results are saved to `perf_entry.csv`: + +```csv +model_name,execution_time,gpu_utilization,memory_used,... +my_model,125.3,98.5,15.2,... +``` + +### Build Manifest + +`build_manifest.json` contains: +- Built image names and tags +- Model configurations +- Deployment configuration +- Build timestamp + +Use this manifest to run pre-built images: + +```bash +madengine run --manifest-file build_manifest.json +``` + +## Troubleshooting + +### Model Not Found + +```bash +# Ensure you're in MAD directory +cd /path/to/MAD +madengine discover --tags your_model +``` + +### Docker Permission Denied + +```bash +# Add user to docker group (Linux) +sudo usermod -aG docker $USER +newgrp docker +``` + +### GPU Not Detected + +```bash +# AMD GPUs +rocm-smi + +# NVIDIA GPUs +nvidia-smi + +# Test with Docker +docker run --rm --device=/dev/kfd --device=/dev/dri \ + rocm/pytorch:latest rocm-smi +``` + +### Build Failures + +```bash +# Check Docker daemon +docker ps + +# Rebuild without cache +madengine build --tags model --clean-docker-cache --verbose +``` + +## Environment Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `MODEL_DIR` | MAD package directory | `/path/to/MAD` | +| `MAD_VERBOSE_CONFIG` | Verbose config logging | `"true"` | +| `MAD_DOCKERHUB_USER` | Docker Hub username | `"myusername"` | +| `MAD_DOCKERHUB_PASSWORD` | Docker Hub password | `"mytoken"` | +| `MAD_DOCKERHUB_REPO` | Docker Hub repository | `"myorg"` | + +## Best Practices + +1. **Use configuration files** for complex settings +2. **Separate build and run** for distributed deployments +3. **Test locally first** before deploying to clusters +4. **Use registries** for distributed execution +5. **Enable verbose logging** when debugging +6. **Start with small timeouts** and increase as needed + +## Command-Line Tips + +### Using Configuration Files + +For complex configurations, use JSON files: + +```bash +# Create config.json +cat > config.json << 'EOF' +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "0,1,2,3", + "timeout_multiplier": 2.0, + "distributed": { + "launcher": "torchrun", + "nproc_per_node": 4 + } +} +EOF + +# Use with commands +madengine build --tags model --additional-context-file config.json +madengine run --tags model --additional-context-file config.json +``` + +### Multiple Tags + +Specify tags in multiple ways: + +```bash +# Space-separated +madengine run --tags model1 --tags model2 --tags model3 + +# Comma-separated +madengine run --tags model1,model2,model3 + +# Mix both +madengine run --tags model1 --tags model2,model3 +``` + +### Debugging Commands + +```bash +# Full verbose output with real-time logs +madengine run --tags model --verbose --live-output + +# Keep container alive for inspection +madengine run --tags model --keep-alive + +# Check what will be discovered +madengine discover --tags model --verbose +``` + +### CI/CD Integration + +```bash +#!/bin/bash +# Example CI script + +set -e # Exit on error + +# Build images +madengine build --batch-manifest batch.json \ + --registry docker.io/myorg \ + --verbose + +# Run tests +madengine run --manifest-file build_manifest.json \ + --timeout 3600 + +# Check exit code +if [ $? -eq 0 ]; then + echo "✅ Tests passed" + + # Generate and upload results + madengine report to-email --output ci_results.html + madengine database \ + --csv-file perf_entry.csv \ + --db ci_results \ + --collection ${CI_BUILD_ID} +else + echo "❌ Tests failed" + exit 1 +fi +``` + +## Next Steps + +### Documentation + +- **[CLI Reference](cli-reference.md)** - Complete command options and examples +- [Configuration Guide](configuration.md) - Advanced configuration options +- [Deployment Guide](deployment.md) - Kubernetes and SLURM deployment +- [Batch Build Guide](batch-build.md) - Selective builds for CI/CD +- [Profiling Guide](profiling.md) - Performance analysis +- [Launchers Guide](launchers.md) - Multi-node training frameworks + +### Quick Links + +- [Main README](../README.md) - Project overview +- [Installation Guide](installation.md) - Setup instructions +- [Contributing Guide](contributing.md) - How to contribute +- [GitHub Issues](https://github.com/ROCm/madengine/issues) - Report issues or get help + diff --git a/examples/build-manifest/batch.json b/examples/build-manifest/batch.json new file mode 100644 index 00000000..8996e43b --- /dev/null +++ b/examples/build-manifest/batch.json @@ -0,0 +1,24 @@ +[ + { + "model_name": "model1", + "build_new": true, + "registry": "docker.io/myorg", + "registry_image": "myorg/model1" + }, + { + "model_name": "model2", + "build_new": true, + "registry": "localhost:5000" + }, + { + "model_name": "model3", + "build_new": false, + "registry": "gcr.io/myproject", + "registry_image": "myproject/stable-model3" + }, + { + "model_name": "model4", + "build_new": false + } +] + diff --git a/examples/build-manifest/ci_incremental.json b/examples/build-manifest/ci_incremental.json new file mode 100644 index 00000000..af83ee86 --- /dev/null +++ b/examples/build-manifest/ci_incremental.json @@ -0,0 +1,23 @@ +[ + { + "model_name": "changed_model_1", + "build_new": true + }, + { + "model_name": "changed_model_2", + "build_new": true + }, + { + "model_name": "stable_model_1", + "build_new": false + }, + { + "model_name": "stable_model_2", + "build_new": false + }, + { + "model_name": "stable_model_3", + "build_new": false + } +] + diff --git a/examples/k8s-configs/README.md b/examples/k8s-configs/README.md new file mode 100644 index 00000000..dc3e979c --- /dev/null +++ b/examples/k8s-configs/README.md @@ -0,0 +1,1202 @@ +# Kubernetes Configuration Guide + +Complete reference for deploying madengine workloads on Kubernetes clusters. + +--- + +## 📋 Table of Contents + +- [Minimal Configuration (NEW!)](#-minimal-configuration-new) +- [Quick Start](#-quick-start) +- [Available Configurations](#-available-configurations) +- [Decision Matrix](#-decision-matrix-which-config-to-use) +- [Usage Examples](#-usage-examples) +- [Data Providers](#-data-providers-with-kubernetes) +- [Configuration Reference](#-configuration-reference) +- [Best Practices](#-best-practices) +- [Troubleshooting](#-troubleshooting) + +--- + +## 🌟 Minimal Configuration (NEW!) + +**madengine v2.0+ includes built-in presets!** You only need to specify what's unique: + +### Single GPU - Just 1 Field! +```json +{ + "k8s": { + "gpu_count": 1 + } +} +``` +**Note**: No `"deploy": "k8s"` needed - automatically inferred from `k8s` field presence! + +### Multi-GPU (2 GPUs) +```json +{ + "k8s": { + "gpu_count": 2 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 2 + } +} +``` + +### Multi-Node (2 nodes × 2 GPUs) +```json +{ + "k8s": { + "gpu_count": 2 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 2 + } +} +``` + +**Auto-Applied Defaults:** +- ✅ Deployment type (k8s/slurm/local) inferred from config structure +- ✅ Resource limits (memory, CPU) based on GPU count +- ✅ AMD/NVIDIA-specific optimizations +- ✅ ROCm/CUDA environment variables +- ✅ NCCL/RCCL configuration +- ✅ Multi-node settings (host_ipc, etc.) + +**See:** [minimal/](minimal/) directory for more examples and documentation. + +--- + +## 🚀 Quick Start + +### Option 1: Minimal Configuration (Recommended) + +```bash +# Create minimal config +cat > my-config.json << EOF +{ + "k8s": { + "gpu_count": 1 + } +} +EOF + +# Build and run +MODEL_DIR=tests/fixtures/dummy madengine build \ + --tags my_model \ + --additional-context-file my-config.json \ + --registry dockerhub + +MODEL_DIR=tests/fixtures/dummy madengine run \ + --manifest-file build_manifest.json \ + --live-output +``` + +### Option 2: Full Configuration (Advanced) + +#### 1. Choose a Configuration + +```bash +# For single GPU testing +cp examples/k8s-configs/01-single-node-single-gpu.json my-config.json + +# For multi-GPU (2 GPUs) +cp examples/k8s-configs/02-single-node-multi-gpu.json my-config.json + +# For multi-node distributed (2 nodes × 2 GPUs) +cp examples/k8s-configs/03-multi-node-basic.json my-config.json + +# For data provider with auto-PVC +cp examples/k8s-configs/06-data-provider-with-pvc.json my-config.json +``` + +#### 2. Customize for Your Cluster (Optional) + +With built-in defaults, customization is optional. Override only what you need: + +```json +{ + "k8s": { + "namespace": "my-namespace", // Override default "default" + "memory": "32Gi", // Override auto-calculated memory + "node_selector": { // Optional: target specific nodes + "node.kubernetes.io/instance-type": "Standard_ND96isr_H100_v5" + } + } +} +``` + +#### 3. Build and Deploy + +```bash +# Build container image +MODEL_DIR=tests/fixtures/dummy madengine build \ + --tags my_model \ + --additional-context-file my-config.json \ + --registry dockerhub + +# Deploy and run +MODEL_DIR=tests/fixtures/dummy madengine run \ + --manifest-file build_manifest.json \ + --live-output +``` + +--- + +## 📁 Available Configurations + +### Minimal Configs (NEW - Recommended for Most Users) + +Located in [`minimal/`](minimal/) directory: + +**General Purpose:** + +| File | Description | GPU Count | +|------|-------------|-----------| +| [`minimal/single-gpu-minimal.json`](minimal/single-gpu-minimal.json) | Single GPU with auto-defaults | 1 | +| [`minimal/multi-gpu-minimal.json`](minimal/multi-gpu-minimal.json) | Multi-GPU with auto-defaults | 2 | +| [`minimal/multi-node-minimal.json`](minimal/multi-node-minimal.json) | Multi-node with auto-defaults | 2×2 | +| [`minimal/nvidia-gpu-minimal.json`](minimal/nvidia-gpu-minimal.json) | NVIDIA GPUs with auto-defaults | 4 | +| [`minimal/custom-namespace-minimal.json`](minimal/custom-namespace-minimal.json) | Shows override examples | 1 | + +**Distributed Launchers:** + +| File | Launcher | Description | GPUs | +|------|----------|-------------|------| +| [`minimal/torchtitan-single-node-minimal.json`](minimal/torchtitan-single-node-minimal.json) | TorchTitan | LLM pre-training (single-node) | 8 | +| [`minimal/vllm-single-node-minimal.json`](minimal/vllm-single-node-minimal.json) | vLLM | LLM inference (single-node) | 4 | +| [`minimal/sglang-single-node-minimal.json`](minimal/sglang-single-node-minimal.json) | SGLang | LLM inference (single-node) | 4 | + +**See [minimal/README.md](minimal/README.md) for detailed documentation and [docs/distributed-launchers.md](../../docs/distributed-launchers.md) for launcher details.** + +### Full Configs (Reference Examples) + +Complete configurations showing all available fields: + +**Training Configs:** + +| File | GPUs | Nodes | Launcher | Use Case | +|------|------|-------|----------|----------| +| [`01-single-node-single-gpu.json`](01-single-node-single-gpu.json) | 1 | 1 | None | Basic testing, small models | +| [`01-single-node-single-gpu-tools.json`](01-single-node-single-gpu-tools.json) | 1 | 1 | None | Single GPU + monitoring | +| [`02-single-node-multi-gpu.json`](02-single-node-multi-gpu.json) | 2 | 1 | torchrun | Multi-GPU training | +| [`02-single-node-multi-gpu-tools.json`](02-single-node-multi-gpu-tools.json) | 2 | 1 | torchrun | Multi-GPU + monitoring | +| [`03-multi-node-basic.json`](03-multi-node-basic.json) | 2/node | 2 | torchrun | Multi-node basics (4 GPUs total) | +| [`04-multi-node-advanced.json`](04-multi-node-advanced.json) | 2/node | 4 | torchrun | Production multi-node (8 GPUs) | +| [`05-nvidia-gpu-example.json`](05-nvidia-gpu-example.json) | 4 | 1 | torchrun | NVIDIA GPUs (A100, H100) | +| [`06-data-provider-with-pvc.json`](06-data-provider-with-pvc.json) | 2 | 1+ | torchrun | **Data provider with auto-PVC** | + +**Distributed Launcher Configs (basic/):** + +| File | GPUs | Nodes | Launcher | Use Case | +|------|------|-------|----------|----------| +| [`basic/torchtitan-multi-node-basic.json`](basic/torchtitan-multi-node-basic.json) | 8/node | 4 | TorchTitan | Llama 3.1 70B+ training | +| [`basic/vllm-multi-node-basic.json`](basic/vllm-multi-node-basic.json) | 4/node | 2 | vLLM | High-throughput inference | +| [`basic/sglang-multi-node-basic.json`](basic/sglang-multi-node-basic.json) | 4/node | 2 | SGLang | Distributed inference | + +--- + +## 🎯 Decision Matrix: Which Config to Use? + +### By GPU Requirements + +| Scenario | Config File | GPUs | Nodes | +|----------|-------------|------|-------| +| **Quick test** | `01-single-node-single-gpu.json` | 1 | 1 | +| **Single GPU benchmark** | `01-single-node-single-gpu-tools.json` | 1 | 1 | +| **Multi-GPU (2 GPUs)** | `02-single-node-multi-gpu.json` | 2 | 1 | +| **Multi-GPU + monitoring** | `02-single-node-multi-gpu-tools.json` | 2 | 1 | +| **Multi-node (4 GPUs)** | `03-multi-node-basic.json` | 2×2 | 2 | +| **Multi-node (8 GPUs)** | `04-multi-node-advanced.json` | 2×4 | 4 | +| **NVIDIA GPUs** | `05-nvidia-gpu-example.json` | 4 | 1 | +| **With data download** | `06-data-provider-with-pvc.json` | 2 | 1+ | + +### By Use Case + +| Use Case | Recommended Config | +|----------|-------------------| +| **Development/Testing** | `01-single-node-single-gpu.json` | +| **Small models (BERT, ResNet)** | `01-single-node-single-gpu.json` | +| **Medium models (GPT-2, Stable Diffusion)** | `02-single-node-multi-gpu.json` | +| **Large models (LLaMA-13B)** | `03-multi-node-basic.json` | +| **Very large models (LLaMA-70B+)** | `04-multi-node-advanced.json` | +| **Models requiring datasets** | `06-data-provider-with-pvc.json` | +| **Busy/shared clusters** | `02-single-node-multi-gpu.json` (2 GPUs) | + +--- + +## 💻 Usage Examples + +### Example 1: Single GPU Test + +```bash +MODEL_DIR=tests/fixtures/dummy madengine build \ + --tags dummy \ + --additional-context-file examples/k8s-configs/01-single-node-single-gpu.json \ + --registry dockerhub + +MODEL_DIR=tests/fixtures/dummy madengine run \ + --manifest-file build_manifest.json \ + --live-output +``` + +### Example 2: Multi-GPU Training (2 GPUs) + +```bash +MODEL_DIR=tests/fixtures/dummy madengine build \ + --tags dummy_torchrun \ + --additional-context-file examples/k8s-configs/02-single-node-multi-gpu.json \ + --registry dockerhub + +MODEL_DIR=tests/fixtures/dummy madengine run \ + --manifest-file build_manifest.json \ + --live-output +``` + +### Example 3: Multi-Node Training (2 nodes, 4 GPUs) + +```bash +MODEL_DIR=tests/fixtures/dummy madengine build \ + --tags dummy_torchrun \ + --additional-context-file examples/k8s-configs/03-multi-node-basic.json \ + --registry dockerhub + +MODEL_DIR=tests/fixtures/dummy madengine run \ + --manifest-file build_manifest.json \ + --live-output +``` + +### Example 4: With Data Provider (Auto-PVC) + +```bash +MODEL_DIR=tests/fixtures/dummy madengine build \ + --tags dummy_torchrun_data_minio \ + --additional-context-file examples/k8s-configs/06-data-provider-with-pvc.json \ + --registry dockerhub + +MODEL_DIR=tests/fixtures/dummy madengine run \ + --manifest-file build_manifest.json \ + --live-output + +# Verify PVC was auto-created +kubectl get pvc madengine-shared-data +``` + +--- + +## 📦 Data Providers with Kubernetes + +**NEW:** madengine automatically handles data provisioning for K8s deployments! + +### ✨ Auto-PVC Feature + +**No manual PVC creation needed!** madengine automatically: +1. Creates `madengine-shared-data` PVC if it doesn't exist +2. Selects appropriate access mode (RWO for single-node, RWX for multi-node) +3. Downloads data on first run +4. Reuses data on subsequent runs + +### Quick Setup + +**Step 1: Use data provider config** +```bash +madengine build --tags dummy_torchrun_data_minio \ + --additional-context-file examples/k8s-configs/06-data-provider-with-pvc.json \ + --registry dockerhub +``` + +**Step 2: Run (PVC auto-created)** +```bash +madengine run --manifest-file build_manifest.json --live-output + +# Output shows: +# 📦 Data provider detected: Will auto-create shared data PVC +# PVC name: madengine-shared-data (reusable across runs) +# Access mode: RWO for single-node, RWX for multi-node (auto-selected) +``` + +**Step 3: Verify (optional)** +```bash +# Check PVC status +kubectl get pvc madengine-shared-data + +# Check PVC contents +kubectl exec -it -- ls -lh /data/ +``` + +### How It Works + +``` +┌─────────────────────────────────────────────────────────────┐ +│ 1. madengine detects data provider in model config │ +├─────────────────────────────────────────────────────────────┤ +│ 2. Auto-creates madengine-shared-data PVC (if not exists) │ +│ • Single-node: ReadWriteOnce (RWO) │ +│ • Multi-node: ReadWriteMany (RWX) │ +├─────────────────────────────────────────────────────────────┤ +│ 3. Mounts PVC at /data in pod │ +├─────────────────────────────────────────────────────────────┤ +│ 4. Downloads data from MinIO/S3/NAS to /data │ +├─────────────────────────────────────────────────────────────┤ +│ 5. Training starts with data at /data/ │ +├─────────────────────────────────────────────────────────────┤ +│ 6. PVC persists - subsequent runs skip download! ✅ │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Supported Data Providers + +| Provider | Protocol | Configuration | +|----------|----------|---------------| +| **MinIO** | S3-compatible | Automatic (credentials from `credential.json`) | +| **AWS S3** | S3 | AWS credentials in environment or `credential.json` | +| **NAS** | SSH/rsync | NAS credentials in `credential.json` | +| **Local** | Filesystem | Pre-mounted PVC | + +### Storage Classes + +**Single-Node (RWO)**: +- ✅ `local-path` (Rancher) +- ✅ AWS EBS (`gp3`, `io2`) +- ✅ Azure Disk +- ✅ Any RWO storage class + +**Multi-Node (RWX)**: +- ✅ NFS (`nfs-client`) +- ✅ CephFS +- ✅ GlusterFS +- ✅ AWS EFS +- ✅ Azure Files +- ❌ `local-path` (RWO only) + +### Custom PVC (Optional) + +To use an existing PVC instead of auto-creation: + +```json +{ + "k8s": { + "data_pvc": "my-existing-pvc" // Skip auto-creation + } +} +``` + +--- + +## 📖 Configuration Reference + +### Configuration Structure + +```json +{ + "_comment": "Description of this configuration", + "gpu_vendor": "AMD|NVIDIA", + "guest_os": "UBUNTU", + "deploy": "k8s", + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "gpu_count": 2, + + "memory": "64Gi", + "memory_limit": "128Gi", + "cpu": "16", + "cpu_limit": "32", + + "image_pull_policy": "Always", + "backoff_limit": 3, + + "node_selector": {}, + "tolerations": [], + + "data_pvc": null, // Optional: for data providers + "results_pvc": null // Optional: custom results storage + }, + + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 2, + "master_port": 29500 + }, + + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_IB_DISABLE": "1", + "OMP_NUM_THREADS": "8" + }, + + "debug": false +} +``` + +### Field Reference + +#### Top-Level Fields + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `gpu_vendor` | string | **Yes** | `"AMD"` or `"NVIDIA"` | +| `guest_os` | string | **Yes** | `"UBUNTU"`, `"RHEL"`, etc. | +| `deploy` | string | **Yes** | Must be `"k8s"` | +| `k8s` | object | **Yes** | Kubernetes configuration | +| `distributed` | object | No | Distributed training (for torchrun) | +| `env_vars` | object | No | Custom environment variables | +| `debug` | boolean | No | Enable debug mode (saves manifests) | + +#### K8s Configuration Fields + +**Required:** + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `gpu_count` | integer | - | **Number of GPUs per pod** | + +**Optional - Basic:** + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `kubeconfig` | string | `~/.kube/config` | Path to kubeconfig | +| `namespace` | string | `"default"` | Kubernetes namespace | +| `gpu_resource_name` | string | `"amd.com/gpu"` | GPU resource (`"nvidia.com/gpu"` for NVIDIA) | + +**Optional - Resources:** + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `memory` | string | `"128Gi"` | Memory request (e.g., `"16Gi"`, `"64Gi"`) | +| `memory_limit` | string | `"256Gi"` | Memory limit (typically 2× memory) | +| `cpu` | string | `"32"` | CPU cores request | +| `cpu_limit` | string | `"64"` | CPU cores limit (typically 2× cpu) | + +**Optional - Job Control:** + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `image_pull_policy` | string | `"Always"` | `"Always"`, `"IfNotPresent"`, or `"Never"` | +| `backoff_limit` | integer | `3` | Retry attempts before marking failed | +| `host_ipc` | boolean | `false` | Enable shared memory (required for multi-node) | + +**Optional - Node Selection:** + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `node_selector` | object | `{}` | Label selectors for pod placement | +| `tolerations` | array | `[]` | Tolerations for tainted nodes | + +**Optional - Storage:** + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `data_pvc` | string | `null` | Data PVC name (auto-created if using data provider) | +| `results_pvc` | string | `null` | Results PVC name (auto-created by default) | + +#### Distributed Execution Fields + +Configuration for distributed workloads (training and inference): + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `launcher` | string | - | Launcher type: `torchrun`, `deepspeed`, `torchtitan`, `vllm`, `sglang` | +| `enabled` | boolean | `false` | Enable distributed execution (legacy, prefer `launcher`) | +| `backend` | string | `"nccl"` | `"nccl"`, `"gloo"`, or `"mpi"` | +| `nnodes` | integer | `1` | Number of nodes | +| `nproc_per_node` | integer | gpu_count | Processes per node (= GPUs per node) | +| `master_port` | integer | `29500` | Master communication port | + +#### Environment Variables + +Custom environment variables for containers: + +```json +{ + "env_vars": { + // NCCL/RCCL (AMD distributed execution) + "NCCL_DEBUG": "WARN", // "INFO" for debugging, "WARN" for production + "NCCL_IB_DISABLE": "1", // Disable InfiniBand (required for K8s) + "NCCL_SOCKET_IFNAME": "eth0", // Network interface + "TORCH_NCCL_HIGH_PRIORITY": "1", // RCCL optimization for FSDP + "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", // Multi-node error handling + + // AMD ROCm optimizations + "GPU_MAX_HW_QUEUES": "2", // MI series optimization + "HSA_ENABLE_SDMA": "0", // Disable SDMA for multi-GPU + "HSA_FORCE_FINE_GRAIN_PCIE": "1", // Multi-node communication + "RCCL_ENABLE_HIPGRAPH": "0", // Disable for compatibility + + // MIOpen + "MIOPEN_FIND_MODE": "1", // Use compiled kernels + "MIOPEN_USER_DB_PATH": "/tmp/.miopen", // Writable cache location + + // General + "OMP_NUM_THREADS": "8" // OpenMP threads + } +} +``` + +--- + +## 🎓 Best Practices + +### Resource Sizing + +**Single GPU:** +``` +GPUs: 1 +Memory: 16Gi (request), 32Gi (limit) +CPU: 8 (request), 16 (limit) +``` + +**Multi-GPU (2 GPUs):** +``` +GPUs: 2 +Memory: 64Gi (request), 128Gi (limit) +CPU: 16 (request), 32 (limit) +``` + +**Multi-Node (2 nodes × 2 GPUs):** +``` +GPUs: 2 per node (4 total) +Memory: 64Gi per node +CPU: 16 per node +host_ipc: true (required!) +``` + +**Multi-Node Advanced (4 nodes × 2 GPUs):** +``` +GPUs: 2 per node (8 total) +Memory: 128Gi per node +CPU: 24 per node +host_ipc: true +PVCs: Recommended for data and results +``` + +### Distributed Launchers + +**Training Launchers:** +- **torchrun**: Standard PyTorch DDP/FSDP training +- **deepspeed**: ZeRO optimization for memory efficiency +- **torchtitan**: LLM pre-training with multi-dimensional parallelism (FSDP2+TP+PP) + +**Inference Launchers:** +- **vllm**: High-throughput LLM serving with continuous batching +- **sglang**: Fast LLM inference with structured generation + +**When to use distributed launchers:** +✅ Multi-GPU on single node (2+ GPUs) +✅ Multi-node distributed workloads +✅ Large model training or inference +✅ Production-scale deployments + +**When NOT to use:** +❌ Single GPU workloads +❌ Simple benchmarks without distributed execution +❌ Development and testing (use single GPU) + +**See [docs/distributed-launchers.md](../../docs/distributed-launchers.md) for comprehensive launcher guide.** + +### AMD ROCm Optimizations + +**Always set in K8s:** +- `NCCL_IB_DISABLE=1` - InfiniBand not available in K8s +- `NCCL_SOCKET_IFNAME=eth0` - Use Ethernet interface +- `MIOPEN_FIND_MODE=1` - Avoid MIOpen find-db warnings +- `MIOPEN_USER_DB_PATH=/tmp/.miopen` - Writable cache + +**For multi-GPU:** +- `TORCH_NCCL_HIGH_PRIORITY=1` - RCCL optimization +- `GPU_MAX_HW_QUEUES=2` - MI series GPUs +- `HSA_ENABLE_SDMA=0` - Disable SDMA for better P2P + +**For multi-node:** +- `host_ipc: true` - Required for shared memory +- `HSA_FORCE_FINE_GRAIN_PCIE=1` - Cross-node communication +- `TORCH_NCCL_ASYNC_ERROR_HANDLING=1` - Better error handling + +### For Busy/Shared Clusters + +✅ **Recommendations:** +- Use 1-2 GPUs instead of 8 to avoid scheduling conflicts +- Test with single-GPU first, then scale up +- Monitor GPU availability: `kubectl describe nodes | grep amd.com/gpu` +- Use node selectors to target specific node types +- Consider resource quotas and limits + +--- + +## 🐛 Troubleshooting + +### Pod Stuck in Pending + +**Symptoms:** +```bash +kubectl get pods +# NAME READY STATUS RESTARTS AGE +# madengine-job-xxxxx 0/1 Pending 0 5m +``` + +**Solutions:** + +1. **Check GPU availability:** +```bash +kubectl describe nodes | grep -A5 "amd.com/gpu\|nvidia.com/gpu" +# Shows: Allocatable vs Allocated +``` + +2. **Reduce GPU count:** +```json +{ + "k8s": { + "gpu_count": 1 // Try 1 instead of 2 + } +} +``` + +3. **Check node selectors:** +```bash +kubectl get nodes --show-labels | grep instance-type +# Verify your node_selector matches actual node labels +``` + +### NCCL/RCCL Errors + +**Error: "Duplicate GPU detected"** +``` +Solution: gpu_count in config must match nproc_per_node in distributed config +``` + +**Error: "Network connection failed"** +``` +Solution: Verify NCCL_SOCKET_IFNAME matches your network interface +Check: kubectl exec -- ip addr +``` + +**Error: "NCCL initialization failed"** +``` +Solution: Ensure these are set: + NCCL_IB_DISABLE=1 + NCCL_SOCKET_IFNAME=eth0 +Enable debug: NCCL_DEBUG=INFO +``` + +### Out of Memory (OOM) + +**Symptoms:** +```bash +kubectl get pods +# NAME READY STATUS RESTARTS AGE +# madengine-job-xxxxx 0/1 OOMKilled 0 2m +``` + +**Solutions:** + +1. **Increase memory limit:** +```json +{ + "k8s": { + "memory": "128Gi", // Increase request + "memory_limit": "256Gi" // Increase limit (2× request) + } +} +``` + +2. **Reduce batch size** (in model config) + +3. **Enable gradient checkpointing** (model-specific) + +### Job Failed + +**Check logs:** +```bash +kubectl logs +kubectl describe pod +``` + +**Common issues:** +- Image pull failed: Check registry credentials +- Permission denied: Check security context and PVC permissions +- Command not found: Verify scripts are in container +- Timeout: Increase `backoff_limit` or job timeout + +### Multi-Node Communication Fails + +**Symptoms:** +``` +NCCL WARN ... Connection refused +NCCL WARN ... Unable to find NCCL communicator +``` + +**Solutions:** + +1. **Enable host_ipc:** +```json +{ + "k8s": { + "host_ipc": true // Required for multi-node! + } +} +``` + +2. **Verify headless service:** +```bash +kubectl get svc | grep madengine +# Should show ClusterIP: None (headless) +``` + +3. **Check DNS resolution:** +```bash +kubectl exec -- nslookup madengine-job-name.default.svc.cluster.local +``` + +4. **Increase timeout:** +```json +{ + "env_vars": { + "NCCL_TIMEOUT": "600" // 10 minutes + } +} +``` + +### Data Provider Issues + +**Error: "Read-only file system"** +``` +Solution: Bug in template - should be fixed in latest version +The data PVC mount must have readOnly: false +``` + +**Error: "Data file not found"** +``` +Check: +1. PVC exists: kubectl get pvc madengine-shared-data +2. PVC is Bound: kubectl describe pvc madengine-shared-data +3. Data downloaded: kubectl exec -- ls -lh /data/ +4. MAD_DATAHOME=/data set correctly +``` + +**Error: "PVC pending"** +``` +Solution: Storage class issue +Check: kubectl describe pvc madengine-shared-data +Fix: Ensure your cluster has NFS storage class for RWX +For single-node: Any storage class works (uses RWO) +``` + +--- + +## 🔍 Configuration Comparison + +| Feature | Single GPU | Multi-GPU (2) | Multi-Node (2×2) | Advanced (4×2) | +|---------|------------|---------------|------------------|----------------| +| **GPUs** | 1 | 2 | 4 | 8 | +| **Nodes** | 1 | 1 | 2 | 4 | +| **Memory** | 16Gi | 64Gi | 64Gi/node | 128Gi/node | +| **CPU** | 8 | 16 | 16/node | 24/node | +| **torchrun** | ❌ | ✅ | ✅ | ✅ | +| **host_ipc** | ❌ | ❌ | ✅ | ✅ | +| **NCCL Vars** | Basic | Yes | Full | Advanced | +| **PVCs** | No | No | Optional | Recommended | +| **Tolerations** | No | No | No | Yes | +| **Complexity** | ⭐ | ⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐ | + +--- + +## 📚 Advanced Topics + +### Node Selectors + +Target specific node types: + +```json +{ + "k8s": { + "node_selector": { + "node.kubernetes.io/instance-type": "Standard_ND96isr_H100_v5", + "gpu-type": "mi300x", + "zone": "us-west-2a" + } + } +} +``` + +Check available labels: +```bash +kubectl get nodes --show-labels +``` + +### Tolerations + +Schedule on tainted nodes: + +```json +{ + "k8s": { + "tolerations": [ + { + "key": "gpu", + "operator": "Equal", + "value": "true", + "effect": "NoSchedule" + } + ] + } +} +``` + +### Custom Storage Classes + +For multi-node with custom NFS: + +```json +{ + "k8s": { + "storage_class": "nfs-client" // Your NFS storage class + } +} +``` + +Check available storage classes: +```bash +kubectl get storageclass +``` + +### Debug Mode + +Save rendered K8s manifests for inspection: + +```json +{ + "debug": true, + "k8s": { + "output_dir": "./debug_manifests" + } +} +``` + +Manifests saved to: +- `./debug_manifests/job.yaml` +- `./debug_manifests/configmap.yaml` +- `./debug_manifests/service.yaml` (multi-node only) + +--- + +## 📊 Resource Scaling Guide + +### Single GPU (Development/Testing) +``` +GPUs: 1 +Memory: 16Gi (request), 32Gi (limit) +CPU: 8 (request), 16 (limit) +Use Case: Small models, debugging, cost-effective testing +``` + +### 2 GPUs (Recommended for Shared Clusters) +``` +GPUs: 2 +Memory: 64Gi (request), 128Gi (limit) +CPU: 16 (request), 32 (limit) +Use Case: Multi-GPU training, testing on busy clusters +``` + +### 4 GPUs (Multi-Node Testing) +``` +Configuration: 2 nodes × 2 GPUs per node +Memory: 64Gi per node +CPU: 16 per node +host_ipc: true (required!) +Use Case: Distributed training development +``` + +### 8 GPUs (Production Multi-Node) +``` +Configuration: 4 nodes × 2 GPUs per node +Memory: 128Gi per node +CPU: 24 per node +host_ipc: true +PVCs: Recommended +Use Case: Large-scale production training +``` + +--- + +## 🎯 Examples by Scenario + +### Scenario 1: Quick Smoke Test + +```bash +# Use minimal config (defaults for everything) +madengine build --tags dummy \ + --additional-context-file examples/k8s-configs/01-single-node-single-gpu.json \ + --registry dockerhub + +madengine run --manifest-file build_manifest.json +``` + +### Scenario 2: Benchmark on Busy Cluster + +```bash +# Use 2 GPUs to avoid scheduling conflicts +madengine build --tags resnet50 \ + --additional-context-file examples/k8s-configs/02-single-node-multi-gpu.json \ + --registry dockerhub + +madengine run --manifest-file build_manifest.json --live-output +``` + +### Scenario 3: Large Model Training + +```bash +# Multi-node for large models +madengine build --tags llama_13b \ + --additional-context-file examples/k8s-configs/03-multi-node-basic.json \ + --registry dockerhub + +madengine run --manifest-file build_manifest.json --live-output +``` + +### Scenario 4: Production with Datasets + +```bash +# Data provider with auto-PVC +madengine build --tags bert_large \ + --additional-context-file examples/k8s-configs/06-data-provider-with-pvc.json \ + --registry dockerhub + +madengine run --manifest-file build_manifest.json --live-output + +# Verify PVC +kubectl get pvc madengine-shared-data +kubectl exec -- ls -lh /data/ +``` + +### Scenario 5: GPU Profiling + +```bash +# Use *-tools.json variant for monitoring +madengine build --tags model \ + --additional-context-file examples/k8s-configs/02-single-node-multi-gpu-tools.json \ + --registry dockerhub + +madengine run --manifest-file build_manifest.json --live-output + +# Profiling results in PVC +kubectl cp :/results/gpu_info_*.csv ./ +``` + +--- + +## 🔧 Customization Guide + +### Start from Example + +```bash +# Copy closest match +cp examples/k8s-configs/02-single-node-multi-gpu.json my-custom-config.json + +# Edit +vim my-custom-config.json +``` + +### Common Customizations + +**Change GPU count:** +```json +{ + "k8s": { + "gpu_count": 4 // Change from 2 to 4 + }, + "distributed": { + "nproc_per_node": 4 // Must match gpu_count + } +} +``` + +**Target specific node type:** +```json +{ + "k8s": { + "node_selector": { + "gpu-type": "mi300x" + } + } +} +``` + +**Increase memory:** +```json +{ + "k8s": { + "memory": "128Gi", + "memory_limit": "256Gi" // 2× memory + } +} +``` + +**Add custom environment variables:** +```json +{ + "env_vars": { + "MY_CUSTOM_VAR": "value", + "BATCH_SIZE": "256" + } +} +``` + +--- + +## 📈 Performance Tips + +### Multi-GPU Scaling + +**Expected Scaling Efficiency:** +- 2 GPUs: ~95-100% (ideal: 2× single GPU) +- 4 GPUs: ~85-95% (network overhead) +- 8 GPUs: ~80-90% (more communication) + +**Factors affecting scaling:** +- Model size (larger = better scaling) +- Batch size (larger = less communication) +- Network bandwidth (faster = better) +- NCCL configuration (optimized = better) + +### NCCL Tuning for AMD + +**Basic (included in examples):** +```json +{ + "NCCL_DEBUG": "WARN", + "NCCL_IB_DISABLE": "1", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2" +} +``` + +**Advanced (for production):** +```json +{ + "NCCL_DEBUG": "WARN", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", + "GPU_MAX_HW_QUEUES": "2", + "HSA_ENABLE_SDMA": "0", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "RCCL_ENABLE_HIPGRAPH": "0", + "MIOPEN_FIND_MODE": "1" +} +``` + +### Monitoring During Training + +```bash +# Watch pod status +kubectl get pods -w + +# Monitor resource usage +kubectl top pods + +# Stream logs +kubectl logs -f + +# Check GPU utilization (from pod) +kubectl exec -- rocm-smi + +# Check NCCL communication (multi-node) +kubectl logs | grep NCCL +``` + +--- + +## 🎓 Learning Path + +### Level 1: Beginner +1. Start with `01-single-node-single-gpu.json` +2. Test on single GPU +3. Understand basic K8s concepts +4. Monitor logs and results + +### Level 2: Intermediate +1. Try `02-single-node-multi-gpu.json` +2. Learn distributed execution with torchrun (training workloads) +3. Understand NCCL configuration +4. Profile GPU utilization + +### Level 3: Advanced +1. Deploy `03-multi-node-basic.json` +2. Master multi-node networking +3. Optimize NCCL parameters +4. Use PVCs for data and results + +### Level 4: Expert +1. Customize `04-multi-node-advanced.json` +2. Fine-tune for your cluster +3. Implement node affinity and tolerations +4. Scale to 8+ nodes + +--- + +## 📋 Configuration Checklist + +Before deploying to production: + +- [ ] Tested on single GPU first +- [ ] Verified GPU availability on cluster +- [ ] Set appropriate memory and CPU limits +- [ ] Configured node selectors (if needed) +- [ ] Set NCCL environment variables +- [ ] Enabled `host_ipc` for multi-node +- [ ] Tested with small batch size first +- [ ] Configured PVCs for data (if using data providers) +- [ ] Set up monitoring and logging +- [ ] Tested failure scenarios (backoff_limit) + +--- + +## 🔗 Related Documentation + +- **Main Documentation**: `../../README.md` +- **Data Provider Guide**: `../../docs/K8S_DATA_PROVIDER_GUIDE.md` (if exists) +- **Deployment Guide**: `../../K8S_DEPLOYMENT_GUIDE.md` (if exists) +- **Performance CSV Format**: `../../PERF_CSV_UNIFIED_FORMAT.md` (if exists) + +--- + +## 📝 File Structure + +``` +examples/k8s-configs/ +├── README.md # This file +├── 01-single-node-single-gpu.json # 1 GPU, basic +├── 01-single-node-single-gpu-tools.json # 1 GPU + monitoring +├── 02-single-node-multi-gpu.json # 2 GPUs, distributed +├── 02-single-node-multi-gpu-tools.json # 2 GPUs + monitoring +├── 03-multi-node-basic.json # 2 nodes × 2 GPUs +├── 04-multi-node-advanced.json # 4 nodes × 2 GPUs +├── 05-nvidia-gpu-example.json # NVIDIA GPUs +└── 06-data-provider-with-pvc.json # Data provider + auto-PVC +``` + +--- + +## ✅ Summary + +- **8 configuration files** covering all common scenarios +- **Auto-PVC creation** for data providers - no manual setup! +- **Production-ready** with best practices +- **Well-documented** with inline comments +- **Tested** on AMD MI300X and NVIDIA clusters +- **Ready to use** - just copy and customize! + +--- + +**Last Updated**: December 6, 2025 +**Status**: Production Ready ✅ diff --git a/examples/k8s-configs/basic/01-native-single-node-single-gpu-tools.json b/examples/k8s-configs/basic/01-native-single-node-single-gpu-tools.json new file mode 100644 index 00000000..8acb9127 --- /dev/null +++ b/examples/k8s-configs/basic/01-native-single-node-single-gpu-tools.json @@ -0,0 +1,33 @@ +{ + "_comment": "Single Node, Single GPU with Tools", + "_description": "Single GPU configuration with GPU profiling tools", + "_use_case": "Single GPU benchmarks with monitoring, no distributed execution", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "tools": [{ + "name": "gpu_info_vram_profiler" + }], + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "gpu_count": 1, + + "memory": "16Gi", + "memory_limit": "32Gi", + "cpu": "8", + "cpu_limit": "16", + + "image_pull_policy": "Always", + "backoff_limit": 3 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8" + }, + + "debug": false +} + diff --git a/examples/k8s-configs/basic/01-native-single-node-single-gpu.json b/examples/k8s-configs/basic/01-native-single-node-single-gpu.json new file mode 100644 index 00000000..373c8eea --- /dev/null +++ b/examples/k8s-configs/basic/01-native-single-node-single-gpu.json @@ -0,0 +1,28 @@ +{ + "_comment": "Single Node, Single GPU - Basic Configuration", + "_description": "Configuration for running a model on a single GPU in a Kubernetes cluster", + "_use_case": "Testing, small models, quick benchmarks (single GPU, no distributed execution)", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "gpu_count": 1, + + "memory": "16Gi", + "memory_limit": "32Gi", + "cpu": "8", + "cpu_limit": "16", + + "image_pull_policy": "Always", + "backoff_limit": 3 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8" + }, + + "debug": false +} diff --git a/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu-tools.json b/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu-tools.json new file mode 100644 index 00000000..3c5f80ae --- /dev/null +++ b/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu-tools.json @@ -0,0 +1,57 @@ +{ + "_comment": "Single Node, Multiple GPUs (2 GPUs) with Tools", + "_description": "2 GPU configuration with torchrun and GPU profiling tools", + "_use_case": "Multi-GPU training with performance monitoring on busy clusters", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "tools": [{"name": "gpu_info_power_profiler"}, {"name": "gpu_info_vram_profiler"}, {"name": "rocprof"}, {"name": "rpd"}, {"name": "miopen_trace"}, {"name": "rocblas_trace"}, {"name": "tensile_trace"}], + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "gpu_count": 2, + + "memory": "64Gi", + "memory_limit": "128Gi", + "cpu": "16", + "cpu_limit": "32", + + "image_pull_policy": "Always", + "backoff_limit": 3 + }, + + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 2, + "master_port": 29500 + }, + + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2", + "HSA_ENABLE_SDMA": "0", + "OMP_NUM_THREADS": "8", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "RCCL_ENABLE_HIPGRAPH": "0" + }, + + "_env_var_notes": { + "NCCL_DEBUG": "Changed from INFO to WARN to reduce log verbosity", + "MIOPEN_FIND_MODE": "1 = Use compiled kernels, avoid find-db warnings", + "MIOPEN_USER_DB_PATH": "Writable location for MIOpen cache", + "HSA_FORCE_FINE_GRAIN_PCIE": "Helps with IOMMU-related warnings in containers", + "RCCL_ENABLE_HIPGRAPH": "Disable for compatibility (experimental feature)" + }, + + "debug": false +} diff --git a/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json b/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json new file mode 100644 index 00000000..f198dff7 --- /dev/null +++ b/examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json @@ -0,0 +1,56 @@ +{ + "_comment": "Single Node, Multiple GPUs (2 GPUs) - Multi-GPU Testing", + "_description": "Configuration for running a model on 2 GPUs on a single node with torchrun", + "_use_case": "Multi-GPU training and testing on busy clusters", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "gpu_count": 2, + + "memory": "64Gi", + "memory_limit": "128Gi", + "cpu": "16", + "cpu_limit": "32", + + "image_pull_policy": "Always", + "backoff_limit": 3 + }, + + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 2, + "master_port": 29500 + }, + + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2", + "HSA_ENABLE_SDMA": "0", + "OMP_NUM_THREADS": "8", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "RCCL_ENABLE_HIPGRAPH": "0" + }, + + "_env_var_notes": { + "NCCL_DEBUG": "Changed from INFO to WARN to reduce log verbosity", + "MIOPEN_FIND_MODE": "1 = Use compiled kernels, avoid find-db warnings", + "MIOPEN_USER_DB_PATH": "Writable location for MIOpen cache", + "HSA_FORCE_FINE_GRAIN_PCIE": "Helps with IOMMU-related warnings in containers", + "RCCL_ENABLE_HIPGRAPH": "Disable for compatibility (experimental feature)", + "NCCL_MIN_NCHANNELS": "Removed (warning says ignored for <8 GPUs)" + }, + + "debug": false +} diff --git a/examples/k8s-configs/basic/03-torchrun-multi-node-basic.json b/examples/k8s-configs/basic/03-torchrun-multi-node-basic.json new file mode 100644 index 00000000..0d35cb2b --- /dev/null +++ b/examples/k8s-configs/basic/03-torchrun-multi-node-basic.json @@ -0,0 +1,61 @@ +{ + "_comment": "Multi-Node (2 nodes, 2 GPUs each) - Basic Configuration", + "_description": "Configuration for distributed workload across 2 nodes with 2 GPUs per node (4 GPUs total)", + "_use_case": "Multi-node distributed execution testing on busy clusters", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "gpu_count": 2, + + "memory": "64Gi", + "memory_limit": "128Gi", + "cpu": "16", + "cpu_limit": "32", + + "image_pull_policy": "Always", + "backoff_limit": 3, + "host_ipc": true + }, + + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 2, + "master_port": 29500 + }, + + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_DEBUG_SUBSYS": "INIT,NET", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2", + "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", + "NCCL_TIMEOUT": "600", + "HSA_ENABLE_SDMA": "0", + "OMP_NUM_THREADS": "8", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "HSA_NO_SCRATCH_RECLAIM": "1", + "RCCL_ENABLE_HIPGRAPH": "0" + }, + + "_env_var_notes": { + "NCCL_DEBUG": "Changed to WARN for cleaner logs (use INFO for debugging)", + "MIOPEN_FIND_MODE": "1 = Use compiled kernels, avoid find-db warnings", + "MIOPEN_USER_DB_PATH": "Writable location for MIOpen cache", + "HSA_FORCE_FINE_GRAIN_PCIE": "Helps with IOMMU-related warnings", + "RCCL_ENABLE_HIPGRAPH": "Disable for compatibility", + "NCCL_MIN_NCHANNELS": "Removed (warning says ignored for <8 GPUs)" + }, + + "debug": false +} diff --git a/examples/k8s-configs/basic/04-torchrun-multi-node-advanced.json b/examples/k8s-configs/basic/04-torchrun-multi-node-advanced.json new file mode 100644 index 00000000..5560ffab --- /dev/null +++ b/examples/k8s-configs/basic/04-torchrun-multi-node-advanced.json @@ -0,0 +1,87 @@ +{ + "_comment": "Multi-Node (4 nodes, 2 GPUs each) - Advanced Configuration", + "_description": "Full-featured configuration for large-scale distributed workloads with PVCs, tolerations, and node affinity", + "_use_case": "Multi-node distributed execution with advanced features on busy clusters (8 GPUs total)", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "ml-training", + "gpu_count": 2, + "gpu_resource_name": "amd.com/gpu", + + "memory": "128Gi", + "memory_limit": "192Gi", + "cpu": "24", + "cpu_limit": "32", + + "image_pull_policy": "IfNotPresent", + "backoff_limit": 5, + "host_ipc": true, + + "node_selector": { + "feature.node.kubernetes.io/amd-gpu-mi300x": "true", + "topology.kubernetes.io/zone": "us-west-2a", + "workload-type": "ml-training" + }, + + "tolerations": [ + { + "key": "gpu", + "operator": "Equal", + "value": "amd", + "effect": "NoSchedule" + }, + { + "key": "workload", + "operator": "Equal", + "value": "training", + "effect": "NoSchedule" + } + ], + + "results_pvc": "ml-results-pvc", + "data_pvc": "ml-datasets-pvc", + + "output_dir": "./k8s_manifests/multi-node" + }, + + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "torchrun", + "nnodes": 4, + "nproc_per_node": 2, + "master_port": 29500 + }, + + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_DEBUG_SUBSYS": "INIT,NET,GRAPH", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2", + "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", + "NCCL_TIMEOUT": "600", + "HSA_ENABLE_SDMA": "0", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "OMP_NUM_THREADS": "8", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen", + "RCCL_ENABLE_HIPGRAPH": "0" + }, + + "_env_var_notes": { + "NCCL_DEBUG": "Changed to WARN for cleaner logs (use INFO for debugging)", + "MIOPEN_FIND_MODE": "1 = Use compiled kernels, avoid find-db warnings", + "MIOPEN_USER_DB_PATH": "Writable location for MIOpen cache", + "HSA_FORCE_FINE_GRAIN_PCIE": "Helps with IOMMU-related warnings", + "RCCL_ENABLE_HIPGRAPH": "Disable for compatibility", + "NCCL_MIN_NCHANNELS": "Removed (warning says ignored for <8 GPUs)" + }, + + "debug": false +} diff --git a/examples/k8s-configs/basic/05-torchrun-nvidia-gpu-example.json b/examples/k8s-configs/basic/05-torchrun-nvidia-gpu-example.json new file mode 100644 index 00000000..7c087acc --- /dev/null +++ b/examples/k8s-configs/basic/05-torchrun-nvidia-gpu-example.json @@ -0,0 +1,47 @@ +{ + "_comment": "NVIDIA GPU - Single Node, 4 GPUs", + "_description": "Configuration for running models on NVIDIA GPUs (A100, H100, etc.) with distributed execution", + "_use_case": "NVIDIA-based Kubernetes clusters, multi-GPU training", + + "gpu_vendor": "NVIDIA", + "guest_os": "UBUNTU", + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "gpu_count": 4, + "gpu_resource_name": "nvidia.com/gpu", + + "memory": "128Gi", + "memory_limit": "256Gi", + "cpu": "48", + "cpu_limit": "96", + + "image_pull_policy": "Always", + "backoff_limit": 3, + + "node_selector": { + "accelerator": "nvidia-tesla-a100" + } + }, + + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 4, + "master_port": 29500 + }, + + "env_vars": { + "NCCL_DEBUG": "INFO", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "NCCL_P2P_DISABLE": "0", + "NCCL_P2P_LEVEL": "NVL", + "OMP_NUM_THREADS": "12" + }, + + "debug": false +} diff --git a/examples/k8s-configs/basic/06-data-provider-with-pvc.json b/examples/k8s-configs/basic/06-data-provider-with-pvc.json new file mode 100644 index 00000000..9bd2e47f --- /dev/null +++ b/examples/k8s-configs/basic/06-data-provider-with-pvc.json @@ -0,0 +1,80 @@ +{ + "_comment": "K8s Configuration with Data Provider (Auto-PVC)", + "_description": "Production-ready setup for training with external data (MinIO, S3, NAS, etc.)", + "_use_case": "Models that require data provider (e.g., dummy_torchrun_data_minio)", + "_auto_pvc": "✅ PVC is automatically created - NO manual kubectl commands needed!", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "gpu_count": 2, + + "_comment_pvc": "OPTIONAL - Leave empty for auto-creation (recommended)", + "_pvc_auto": "Auto-created: madengine-shared-data (100Gi, RWO/RWX based on nnodes)", + "_pvc_custom": "To use existing PVC: uncomment and set: \"data_pvc\": \"your-pvc-name\"", + + "memory": "64Gi", + "memory_limit": "128Gi", + "cpu": "16", + "cpu_limit": "32", + + "image_pull_policy": "Always", + "backoff_limit": 3, + "host_ipc": true + }, + + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "torchrun", + + "_comment_single_node": "For single-node: nnodes=1, nproc_per_node=N_GPUs", + "_comment_multi_node": "For multi-node: nnodes=N, nproc_per_node=GPUs_per_node", + "nnodes": 1, + "nproc_per_node": 2, + + "master_port": 29500 + }, + + "env_vars": { + "_comment_mad_datahome": "MAD_DATAHOME points to PVC mount point (default: /data)", + "MAD_DATAHOME": "/data", + + "_comment_nccl": "NCCL/RCCL configuration for AMD GPUs", + "NCCL_DEBUG": "WARN", + "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", + + "_comment_rocm": "ROCm optimizations", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "HSA_ENABLE_SDMA": "0", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen", + "RCCL_ENABLE_HIPGRAPH": "0" + }, + + "_quick_start": { + "step_1": "Build: madengine build --tags dummy_torchrun_data_minio --additional-context-file THIS_FILE", + "step_2": "Run: madengine run --manifest-file build_manifest.json", + "result": "✅ PVC auto-created, data downloaded, training started - all automatic!" + }, + + "_how_it_works": { + "auto_pvc": "madengine creates 'madengine-shared-data' PVC automatically if not found", + "reusable": "PVC persists across runs - data downloads once, reuses forever", + "smart_mode": "Single-node: ReadWriteOnce, Multi-node: ReadWriteMany (auto-selected)", + "verify": "kubectl get pvc madengine-shared-data", + "inspect": "kubectl describe pvc madengine-shared-data" + }, + + "_advanced": { + "custom_pvc": "To use existing PVC: Add \"data_pvc\": \"your-pvc-name\" to k8s config above", + "storage_class": "Auto-PVC uses cluster's default storage class", + "pvc_size": "Default 100Gi - modify code in kubernetes.py if needed" + }, + + "debug": false +} + diff --git a/examples/k8s-configs/basic/megatron-lm-multi-node-basic.json b/examples/k8s-configs/basic/megatron-lm-multi-node-basic.json new file mode 100644 index 00000000..e059ba08 --- /dev/null +++ b/examples/k8s-configs/basic/megatron-lm-multi-node-basic.json @@ -0,0 +1,34 @@ +{ + "_comment": "Megatron-LM Multi-Node Training Configuration", + "_description": "Large-scale transformer training with Megatron-LM on Kubernetes", + "_use_case": "Multi-node Megatron-LM training with tensor and pipeline parallelism", + "_reference": "https://github.com/NVIDIA/Megatron-LM", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 8, + "namespace": "ml-training", + "memory": "128Gi", + "memory_limit": "256Gi", + "cpu": "32", + "cpu_limit": "64", + "image_pull_policy": "IfNotPresent" + }, + + "distributed": { + "launcher": "megatron", + "nnodes": 4, + "nproc_per_node": 8, + "master_port": 29500 + }, + + "env_vars": { + "OMP_NUM_THREADS": "16", + "NCCL_DEBUG": "INFO" + }, + + "debug": false +} + diff --git a/examples/k8s-configs/basic/sglang-disagg-custom-split.json b/examples/k8s-configs/basic/sglang-disagg-custom-split.json new file mode 100644 index 00000000..49aeecb1 --- /dev/null +++ b/examples/k8s-configs/basic/sglang-disagg-custom-split.json @@ -0,0 +1,48 @@ +{ + "_comment": "SGLang Disaggregated K8s Config - Custom Prefill/Decode Split", + "_description": "7 nodes with custom split: 1 proxy + 4 prefill + 2 decode", + "_use_case": "Workload with long prompts requiring more prefill capacity", + "_reference": "https://github.com/sgl-project/sglang", + "_architecture": { + "proxy": "Pod 0 (Load Balancer)", + "prefill": "Pods 1-4 (4 nodes, 57% - custom)", + "decode": "Pods 5-6 (2 nodes, 29% - custom)", + "total": "7 pods total", + "note": "Custom split overrides default 40/60 ratio" + }, + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 8, + "memory": "256Gi", + "memory_limit": "384Gi", + "cpu": "64", + "cpu_limit": "96", + "node_selector": { + "feature.node.kubernetes.io/amd-gpu-mi300x": "true" + } + }, + + "distributed": { + "launcher": "sglang-disagg", + "nnodes": 7, + "nproc_per_node": 8, + "master_port": 29500, + "sglang_disagg": { + "prefill_nodes": 4, + "decode_nodes": 2 + } + }, + + "context": { + "env_vars": { + "SGLANG_ENABLE_RADIX_CACHE": "1", + "SGLANG_RADIX_CACHE_SIZE": "0.9", + "NCCL_TIMEOUT": "600", + "RAY_health_check_timeout_ms": "60000" + } + } +} + diff --git a/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json b/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json new file mode 100644 index 00000000..c16fd342 --- /dev/null +++ b/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json @@ -0,0 +1,44 @@ +{ + "_comment": "SGLang Disaggregated K8s Config - 5 nodes x 8 GPUs", + "_description": "Multi-node SGLang disaggregated with prefill/decode separation", + "_use_case": "Large-scale LLM inference with specialized prefill/decode clusters", + "_reference": "https://github.com/sgl-project/sglang", + "_architecture": { + "proxy": "Pod 0 (Load Balancer)", + "prefill": "Pods 1-2 (2 nodes, ~40%)", + "decode": "Pods 3-4 (2 nodes, ~60%)", + "total": "5 pods total" + }, + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 8, + "memory": "256Gi", + "memory_limit": "384Gi", + "cpu": "64", + "cpu_limit": "96", + "node_selector": { + "feature.node.kubernetes.io/amd-gpu-mi300x": "true" + } + }, + + "distributed": { + "launcher": "sglang-disagg", + "nnodes": 5, + "nproc_per_node": 8, + "master_port": 29500 + }, + + "context": { + "env_vars": { + "SGLANG_ENABLE_RADIX_CACHE": "1", + "SGLANG_RADIX_CACHE_SIZE": "0.9", + "NCCL_TIMEOUT": "600", + "RAY_health_check_timeout_ms": "60000", + "MOONCAKE_TEST_MODE": "0" + } + } +} + diff --git a/examples/k8s-configs/basic/sglang-multi-node-basic.json b/examples/k8s-configs/basic/sglang-multi-node-basic.json new file mode 100644 index 00000000..b693260e --- /dev/null +++ b/examples/k8s-configs/basic/sglang-multi-node-basic.json @@ -0,0 +1,36 @@ +{ + "_comment": "SGLang Multi-Node K8s Config - 2 nodes x 4 GPUs", + "_description": "Multi-node SGLang with native launcher and Ray", + "_use_case": "Distributed LLM inference serving", + "_reference": "https://github.com/sgl-project/sglang", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 4, + "memory": "256Gi", + "memory_limit": "384Gi", + "cpu": "64", + "cpu_limit": "96", + "node_selector": { + "feature.node.kubernetes.io/amd-gpu-mi300x": "true" + } + }, + + "distributed": { + "launcher": "sglang", + "nnodes": 2, + "nproc_per_node": 4, + "master_port": 29500 + }, + + "context": { + "env_vars": { + "SGLANG_KV_CACHE_SIZE": "0.5", + "NCCL_TIMEOUT": "600", + "RAY_health_check_timeout_ms": "60000" + } + } +} + diff --git a/examples/k8s-configs/basic/torchtitan-multi-node-basic.json b/examples/k8s-configs/basic/torchtitan-multi-node-basic.json new file mode 100644 index 00000000..e350605d --- /dev/null +++ b/examples/k8s-configs/basic/torchtitan-multi-node-basic.json @@ -0,0 +1,39 @@ +{ + "_comment": "TorchTitan Multi-Node Config - 4 nodes x 8 GPUs for Llama 3.1 70B", + "_description": "Uses multi-dimensional parallelism (TP + PP + FSDP2)", + "_use_case": "Large-scale LLM pre-training (70B+ models)", + "_reference": "https://github.com/pytorch/torchtitan", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 8, + "memory": "512Gi", + "memory_limit": "768Gi", + "cpu": "96", + "cpu_limit": "128", + "node_selector": { + "feature.node.kubernetes.io/amd-gpu-mi300x": "true" + } + }, + + "distributed": { + "launcher": "torchtitan", + "nnodes": 4, + "nproc_per_node": 8, + "master_port": 29500 + }, + + "context": { + "pre_scripts": [ + "scripts/common/setup_pytorch_env.sh" + ], + "env_vars": { + "PYTORCH_TUNABLEOP_ENABLED": "1", + "PYTORCH_TUNABLEOP_TUNING": "1", + "NCCL_DEBUG": "INFO" + } + } +} + diff --git a/examples/k8s-configs/basic/vllm-multi-node-basic.json b/examples/k8s-configs/basic/vllm-multi-node-basic.json new file mode 100644 index 00000000..4c1b61c9 --- /dev/null +++ b/examples/k8s-configs/basic/vllm-multi-node-basic.json @@ -0,0 +1,38 @@ +{ + "_comment": "vLLM Multi-Node K8s Config - 2 nodes x 4 GPUs (Data Parallelism)", + "_description": "Each pod runs independent vLLM replica for higher throughput", + "_use_case": "High-throughput LLM inference serving", + "_reference": "https://github.com/vllm-project/vllm", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 4, + "memory": "256Gi", + "memory_limit": "384Gi", + "cpu": "64", + "cpu_limit": "96", + "node_selector": { + "feature.node.kubernetes.io/amd-gpu-mi300x": "true" + } + }, + + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4, + "master_port": 29500 + }, + + "context": { + "env_vars": { + "VLLM_KV_CACHE_SIZE": "0.5", + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + "NCCL_TIMEOUT": "600", + "VLLM_ENGINE_ITERATION_TIMEOUT_S": "180", + "RAY_health_check_timeout_ms": "60000" + } + } +} + diff --git a/examples/k8s-configs/minimal/README.md b/examples/k8s-configs/minimal/README.md new file mode 100644 index 00000000..85626aec --- /dev/null +++ b/examples/k8s-configs/minimal/README.md @@ -0,0 +1,227 @@ +# Minimal Kubernetes Configuration Examples + +These are minimal configuration examples that leverage madengine's built-in defaults. + +## 🎯 Philosophy + +With madengine v2.0+, you only need to specify what's unique to your deployment: +- **GPU count** (required) +- **Distributed settings** (if using multiple GPUs) +- **Overrides** (only if you need to change defaults) + +Everything else is automatically configured based on best practices. + +## 🚀 Key Feature: Auto-Inferred Deployment Type + +**No `deploy` field needed!** Deployment type is automatically inferred: +- Presence of `k8s` field → K8s deployment +- Presence of `slurm` field → SLURM deployment +- Neither present → Local execution + +This follows the **Convention over Configuration** principle. + +## 📁 Examples + +### [single-gpu-minimal.json](single-gpu-minimal.json) +**Just 1 field:** GPU count +```json +{ + "k8s": { + "gpu_count": 1 + } +} +``` +**Auto-applied:** +- Memory: 16Gi / 32Gi limit +- CPU: 8 / 16 limit +- AMD optimizations +- Standard env vars + +**Usage:** +```bash +madengine run --tags model \ + --additional-context-file examples/k8s-configs/minimal/single-gpu-minimal.json +``` + +--- + +### [multi-gpu-minimal.json](multi-gpu-minimal.json) +**Multi-GPU training** with minimal config +```json +{ + "k8s": { + "gpu_count": 2 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 2 + } +} +``` +**Auto-applied:** +- Memory: 64Gi / 128Gi limit +- CPU: 16 / 32 limit +- All AMD multi-GPU optimizations +- NCCL/RCCL environment variables +- ROCm performance tuning + +--- + +### [multi-node-minimal.json](multi-node-minimal.json) +**Multi-node distributed** training (2 nodes × 2 GPUs = 4 GPUs total) +```json +{ + "k8s": { + "gpu_count": 2 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 2 + } +} +``` +**Auto-applied:** +- All multi-GPU optimizations +- `host_ipc: true` for shared memory +- Multi-node NCCL settings +- Timeout and async error handling + +--- + +### [nvidia-gpu-minimal.json](nvidia-gpu-minimal.json) +**NVIDIA GPUs** get different optimizations +```json +{ + "gpu_vendor": "NVIDIA", + "k8s": { + "gpu_count": 4 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 4 + } +} +``` +**Auto-applied:** +- `gpu_resource_name: nvidia.com/gpu` +- NVIDIA-specific NCCL settings +- P2P optimizations +- NVLink configuration + +--- + +### [custom-namespace-minimal.json](custom-namespace-minimal.json) +**Override defaults** when needed +```json +{ + "k8s": { + "gpu_count": 1, + "namespace": "ml-team", + "memory": "32Gi" + } +} +``` +**Shows:** You can override any default while keeping others + +--- + +## 🔄 Comparison: Old vs New + +### Before (Full Config Required) +```json +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "deploy": "k8s", + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "gpu_count": 1, + "memory": "16Gi", + "memory_limit": "32Gi", + "cpu": "8", + "cpu_limit": "16", + "image_pull_policy": "Always", + "backoff_limit": 3 + }, + "env_vars": { + "OMP_NUM_THREADS": "8" + }, + "debug": false +} +``` + +### After (Minimal) +```json +{ + "k8s": { + "gpu_count": 1 + } +} +``` + +**Both produce identical results!** + +--- + +## 🚀 Quick Start + +1. **Copy a minimal config:** + ```bash + cp examples/k8s-configs/minimal/single-gpu-minimal.json my-config.json + ``` + +2. **Customize if needed:** + ```bash + # Edit my-config.json to add namespace, memory overrides, etc. + ``` + +3. **Build and run:** + ```bash + MODEL_DIR=tests/fixtures/dummy madengine build \ + --tags my_model \ + --additional-context-file my-config.json + + madengine run \ + --manifest-file build_manifest.json \ + --live-output + ``` + +--- + +## 💡 Tips + +### Use CLI for one-off overrides +```bash +madengine run --tags model \ + --additional-context-file minimal/single-gpu-minimal.json \ + --additional-context '{"debug": true}' +``` + +### View resolved configuration +```bash +madengine config show \ + --additional-context-file my-config.json +``` +(Shows all defaults that will be applied) + +### Start minimal, add as needed +1. Start with minimal config +2. Test and validate +3. Add overrides only when necessary +4. Advanced features (PVCs, tolerations, node selectors) work the same + +--- + +## 📚 See Full Examples + +For advanced use cases with PVCs, tolerations, node selectors, etc., see: +- [../01-single-node-single-gpu.json](../01-single-node-single-gpu.json) +- [../04-multi-node-advanced.json](../04-multi-node-advanced.json) +- [../06-data-provider-with-pvc.json](../06-data-provider-with-pvc.json) + +These full configs still work exactly as before - no breaking changes! + diff --git a/examples/k8s-configs/minimal/custom-namespace-minimal.json b/examples/k8s-configs/minimal/custom-namespace-minimal.json new file mode 100644 index 00000000..fa3747dd --- /dev/null +++ b/examples/k8s-configs/minimal/custom-namespace-minimal.json @@ -0,0 +1,15 @@ +{ + "_comment": "Minimal Config with Custom Namespace", + "_description": "Shows how to override specific defaults", + "_use_case": "Deploying to a specific namespace", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 1, + "namespace": "default", + "memory": "32Gi" + } +} + diff --git a/examples/k8s-configs/minimal/deepspeed-minimal.json b/examples/k8s-configs/minimal/deepspeed-minimal.json new file mode 100644 index 00000000..7bece847 --- /dev/null +++ b/examples/k8s-configs/minimal/deepspeed-minimal.json @@ -0,0 +1,25 @@ +{ + "_comment": "DeepSpeed Minimal Config - Uses bash script with torchrun", + "_description": "DeepSpeed with ZeRO-1 optimization", + "_use_case": "Test DeepSpeed distributed training with bash wrapper", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 2, + "namespace": "default", + "memory": "32Gi", + "cpu": "16" + }, + + "distributed": { + "launcher": "deepspeed", + "nnodes": 1, + "nproc_per_node": 2 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8" + } +} diff --git a/examples/k8s-configs/minimal/megatron-lm-exclude-node.json b/examples/k8s-configs/minimal/megatron-lm-exclude-node.json new file mode 100644 index 00000000..793431a2 --- /dev/null +++ b/examples/k8s-configs/minimal/megatron-lm-exclude-node.json @@ -0,0 +1,42 @@ +{ + "_comment": "Megatron-LM Configuration - Excluding Specific Problem Nodes", + "_description": "Use this if you need to explicitly exclude a node with disk pressure or other issues", + "_use_case": "Temporary config to avoid problematic nodes during maintenance", + "_note": "This uses anti-affinity to exclude banff-pla-r25-05. Update the hostname as needed.", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 2, + "namespace": "default", + + "memory": "32Gi", + "memory_limit": "128Gi", + "cpu": "16", + "cpu_limit": "32", + + "image_pull_policy": "IfNotPresent", + + "node_selector": { + "feature.node.kubernetes.io/amd-gpu": "true" + } + }, + + "distributed": { + "launcher": "megatron", + "nnodes": 1, + "nproc_per_node": 2 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8" + }, + + "_instructions": [ + "To exclude a specific node, add node affinity in the deployment code,", + "or temporarily drain the node: kubectl drain banff-pla-r25-05 --ignore-daemonsets", + "This config ensures scheduling only on nodes with AMD GPUs" + ] +} + diff --git a/examples/k8s-configs/minimal/megatron-lm-minimal.json b/examples/k8s-configs/minimal/megatron-lm-minimal.json new file mode 100644 index 00000000..43266e01 --- /dev/null +++ b/examples/k8s-configs/minimal/megatron-lm-minimal.json @@ -0,0 +1,25 @@ +{ + "_comment": "Megatron-LM Minimal Config - Dedicated launcher support", + "_description": "Megatron-LM with automated tensor/pipeline parallelism setup", + "_use_case": "Large-scale transformer training with Megatron-LM on Kubernetes", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 2, + "namespace": "default", + "memory": "32Gi", + "cpu": "16" + }, + + "distributed": { + "launcher": "megatron", + "nnodes": 1, + "nproc_per_node": 2 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8" + } +} diff --git a/examples/k8s-configs/minimal/megatron-lm-optimized.json b/examples/k8s-configs/minimal/megatron-lm-optimized.json new file mode 100644 index 00000000..29559308 --- /dev/null +++ b/examples/k8s-configs/minimal/megatron-lm-optimized.json @@ -0,0 +1,53 @@ +{ + "_comment": "Optimized Megatron-LM Configuration with Node Selector", + "_description": "Production-ready configuration with resource management and node selection", + "_use_case": "Megatron-LM training with automatic node selection to avoid problematic nodes", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 2, + "namespace": "default", + + "memory": "32Gi", + "memory_limit": "128Gi", + "cpu": "16", + "cpu_limit": "32", + + "image_pull_policy": "IfNotPresent", + "backoff_limit": 3, + + "node_selector": { + "feature.node.kubernetes.io/amd-gpu": "true", + "amd.com/gpu.product-name": "AMD_Instinct_MI300X_OAM" + }, + + "tolerations": [] + }, + + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "megatron", + "nnodes": 1, + "nproc_per_node": 2, + "master_port": 29500 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8", + "NCCL_DEBUG": "WARN", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2", + "HSA_ENABLE_SDMA": "0", + "MIOPEN_FIND_MODE": "1", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "RCCL_ENABLE_HIPGRAPH": "0" + }, + + "debug": false +} + diff --git a/examples/k8s-configs/minimal/sglang-disagg-minimal.json b/examples/k8s-configs/minimal/sglang-disagg-minimal.json new file mode 100644 index 00000000..f0f6ad05 --- /dev/null +++ b/examples/k8s-configs/minimal/sglang-disagg-minimal.json @@ -0,0 +1,19 @@ +{ + "_comment": "Minimal SGLang Disaggregated configuration - 3 nodes minimum", + "_description": "SGLang disaggregated inference with 3 pods (1 proxy + 1 prefill + 1 decode)", + "_architecture": "Pod 0: Proxy, Pod 1: Prefill, Pod 2: Decode", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 3 + }, + + "distributed": { + "launcher": "sglang-disagg", + "nnodes": 3, + "nproc_per_node": 1 + } +} + diff --git a/examples/k8s-configs/minimal/sglang-single-node-minimal.json b/examples/k8s-configs/minimal/sglang-single-node-minimal.json new file mode 100644 index 00000000..5a12b19d --- /dev/null +++ b/examples/k8s-configs/minimal/sglang-single-node-minimal.json @@ -0,0 +1,28 @@ +{ + "_comment": "Minimal SGLang Single-Node K8s Config - 4 GPUs", + "_description": "SGLang inference with Tensor Parallelism for single-node", + "_use_case": "LLM inference serving with SGLang", + "_reference": "https://github.com/sgl-project/sglang", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 4, + "memory": "128Gi", + "cpu": "32" + }, + + "distributed": { + "launcher": "sglang", + "nnodes": 1, + "nproc_per_node": 4 + }, + + "context": { + "env_vars": { + "SGLANG_KV_CACHE_SIZE": "0.7" + } + } +} + diff --git a/examples/k8s-configs/minimal/torchrun-multi-gpu-minimal.json b/examples/k8s-configs/minimal/torchrun-multi-gpu-minimal.json new file mode 100644 index 00000000..49a2ebbf --- /dev/null +++ b/examples/k8s-configs/minimal/torchrun-multi-gpu-minimal.json @@ -0,0 +1,19 @@ +{ + "_comment": "Minimal Multi-GPU Config - 2 GPUs with torchrun", + "_description": "Uses built-in defaults for AMD multi-GPU optimizations", + "_use_case": "Quick multi-GPU training with minimal configuration", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 2 + }, + + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 2 + } +} + diff --git a/examples/k8s-configs/minimal/torchrun-multi-node-minimal.json b/examples/k8s-configs/minimal/torchrun-multi-node-minimal.json new file mode 100644 index 00000000..656ac123 --- /dev/null +++ b/examples/k8s-configs/minimal/torchrun-multi-node-minimal.json @@ -0,0 +1,19 @@ +{ + "_comment": "Minimal Multi-Node Config - 2 nodes x 2 GPUs each", + "_description": "Uses built-in defaults for multi-node distributed workload", + "_use_case": "Quick multi-node testing with 4 GPUs total", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 2 + }, + + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 2 + } +} + diff --git a/examples/k8s-configs/minimal/torchrun-nvidia-gpu-minimal.json b/examples/k8s-configs/minimal/torchrun-nvidia-gpu-minimal.json new file mode 100644 index 00000000..444e037f --- /dev/null +++ b/examples/k8s-configs/minimal/torchrun-nvidia-gpu-minimal.json @@ -0,0 +1,19 @@ +{ + "_comment": "Minimal NVIDIA GPU Config - 4 GPUs with torchrun", + "_description": "Uses built-in NVIDIA optimizations and presets", + "_use_case": "Quick NVIDIA GPU testing with minimal configuration", + + "gpu_vendor": "NVIDIA", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 4 + }, + + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 4 + } +} + diff --git a/examples/k8s-configs/minimal/torchrun-single-gpu-minimal.json b/examples/k8s-configs/minimal/torchrun-single-gpu-minimal.json new file mode 100644 index 00000000..5041003e --- /dev/null +++ b/examples/k8s-configs/minimal/torchrun-single-gpu-minimal.json @@ -0,0 +1,19 @@ +{ + "_comment": "Minimal Single GPU Config - Only Essential Fields", + "_description": "Uses built-in defaults for everything except GPU count", + "_use_case": "Quick single GPU testing with minimal configuration", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 1 + }, + + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 1 + } +} + diff --git a/examples/k8s-configs/minimal/torchtitan-single-node-minimal.json b/examples/k8s-configs/minimal/torchtitan-single-node-minimal.json new file mode 100644 index 00000000..9605f09c --- /dev/null +++ b/examples/k8s-configs/minimal/torchtitan-single-node-minimal.json @@ -0,0 +1,22 @@ +{ + "_comment": "Minimal TorchTitan Single-Node Config - 8 GPUs for Llama 3.1 8B", + "_description": "Uses torchtitan with Tensor Parallelism for single-node training", + "_use_case": "Quick LLM pre-training with torchtitan (8B model)", + "_reference": "https://github.com/pytorch/torchtitan", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 8, + "memory": "256Gi", + "cpu": "64" + }, + + "distributed": { + "launcher": "torchtitan", + "nnodes": 1, + "nproc_per_node": 8 + } +} + diff --git a/examples/k8s-configs/minimal/vllm-single-node-minimal.json b/examples/k8s-configs/minimal/vllm-single-node-minimal.json new file mode 100644 index 00000000..ed0de4ac --- /dev/null +++ b/examples/k8s-configs/minimal/vllm-single-node-minimal.json @@ -0,0 +1,29 @@ +{ + "_comment": "Minimal vLLM Single-Node K8s Config - 4 GPUs", + "_description": "vLLM inference with Tensor Parallelism for single-node", + "_use_case": "LLM inference serving with vLLM", + "_reference": "https://github.com/vllm-project/vllm", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "k8s": { + "gpu_count": 4, + "memory": "128Gi", + "cpu": "32" + }, + + "distributed": { + "launcher": "vllm", + "nnodes": 1, + "nproc_per_node": 4 + }, + + "context": { + "env_vars": { + "VLLM_KV_CACHE_SIZE": "0.7", + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True" + } + } +} + diff --git a/examples/profiling-configs/README.md b/examples/profiling-configs/README.md new file mode 100644 index 00000000..111086a4 --- /dev/null +++ b/examples/profiling-configs/README.md @@ -0,0 +1,314 @@ +# ROCprofv3 Profiling Configurations + +This directory contains pre-configured profiling setups for different AI model benchmarking scenarios using madengine and ROCprofv3. + +## Available Profiles + +### 1. Compute-Bound Profiling (`rocprofv3_compute_bound.json`) + +**Use Case**: Models bottlenecked by ALU operations (e.g., large transformers with dense matrix operations) + +**Collected Metrics**: +- Wave execution and cycles +- VALU (Vector ALU) instructions +- SALU (Scalar ALU) instructions +- Wait states +- GPU power consumption + +**Usage**: +```bash +madengine run --tags your_model \ + --additional-context-file examples/profiling-configs/rocprofv3_compute_bound.json +``` + +### 2. Memory-Bound Profiling (`rocprofv3_memory_bound.json`) + +**Use Case**: Models bottlenecked by memory bandwidth (e.g., large batch sizes, high-resolution inputs) + +**Collected Metrics**: +- L1/L2 cache hit rates +- Memory read/write requests +- Cache efficiency +- VRAM usage over time + +**Usage**: +```bash +madengine run --tags your_model \ + --additional-context-file examples/profiling-configs/rocprofv3_memory_bound.json +``` + +### 3. Multi-GPU Profiling (`rocprofv3_multi_gpu.json`) + +**Use Case**: Multi-GPU training with data parallel or model parallel + +**Collected Metrics**: +- RCCL communication traces +- Inter-GPU memory transfers +- Scratch memory allocation +- Per-GPU power and VRAM + +**Usage**: +```bash +madengine run --tags your_model \ + --additional-context-file examples/profiling-configs/rocprofv3_multi_gpu.json +``` + +### 4. Comprehensive Profiling (`rocprofv3_comprehensive.json`) + +**Use Case**: Full analysis with all available metrics (high overhead!) + +**Collected Metrics**: +- All kernel traces (HIP, HSA, kernel, memory) +- Hardware performance counters +- Library call traces (MIOpen, rocBLAS) +- Power and VRAM monitoring +- Statistical summaries + +**Usage**: +```bash +madengine run --tags your_model \ + --additional-context-file examples/profiling-configs/rocprofv3_comprehensive.json +``` + +**Warning**: This profile has significant overhead. Use for detailed analysis only. + +### 5. Lightweight Profiling (`rocprofv3_lightweight.json`) + +**Use Case**: Production-like workloads with minimal profiling overhead + +**Collected Metrics**: +- Basic HIP and kernel traces +- JSON output format (compact) + +**Usage**: +```bash +madengine run --tags your_model \ + --additional-context-file examples/profiling-configs/rocprofv3_lightweight.json +``` + +### 6. Multi-Node Distributed (`rocprofv3_multinode.json`) + +**Use Case**: Large-scale distributed training on SLURM clusters + +**Collected Metrics**: +- RCCL communication patterns +- Cross-node synchronization +- Per-node power monitoring + +**Usage**: +```bash +# Build phase +madengine build --tags your_model --registry your-registry:5000 + +# Deploy to SLURM +madengine run --manifest-file build_manifest.json \ + --additional-context-file examples/profiling-configs/rocprofv3_multinode.json +``` + +## Direct Tool Usage (Without Config Files) + +### Single GPU - Compute Analysis +```bash +madengine run --tags dummy_prof \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [{"name": "rocprofv3_compute"}] + }' +``` + +### Multi-GPU - Communication Analysis +```bash +madengine run --tags your_model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "all", + "distributed": { + "launcher": "torchrun", + "nproc_per_node": 8 + }, + "tools": [{"name": "rocprofv3_communication"}] + }' +``` + +### Custom ROCprofv3 Command +```bash +madengine run --tags your_model \ + --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [{ + "name": "rocprof", + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --kernel-trace --memory-copy-trace --output-format pftrace -d ./my_traces --", + "env_vars": { + "RCCL_DEBUG": "TRACE", + "HSA_ENABLE_SDMA": "0" + } + }] + }' +``` + +## Best Practices for Custom Commands + +### Always Include the `--` Separator + +When using custom profiling commands with `rocprof_wrapper.sh`, **always include the trailing `--`**: + +```json +{ + "name": "rocprof", + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --sys-trace --" +} +``` + +**Why?** The `--` separator is critical for rocprofv3 (ROCm >= 7.0): +- **rocprofv3** requires: `rocprofv3 [options] -- ` +- **rocprof (legacy)** accepts: `rocprof [options] ` + +The wrapper script auto-detects which profiler is available and formats the command correctly. Without the `--`, rocprofv3 will fail to parse arguments when the application command is appended. + +**❌ Wrong:** +```json +{"cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --sys-trace"} +``` + +**✅ Correct:** +```json +{"cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --sys-trace --"} +``` + +## Available ROCprofv3 Tools + +| Tool Name | Description | Key Options | Overhead | +|-----------|-------------|-------------|----------| +| `rocprofv3_compute` | Compute-bound analysis | Counter collection, VALU/SALU metrics | Medium | +| `rocprofv3_memory` | Memory bandwidth analysis | Cache hits/misses, memory transfers | Medium | +| `rocprofv3_communication` | Multi-GPU communication | RCCL trace, scratch memory | Medium | +| `rocprofv3_full` | Comprehensive profiling | All traces + counters + stats | High | +| `rocprofv3_lightweight` | Minimal overhead | HIP + kernel trace only | Low | +| `rocprofv3_perfetto` | Perfetto visualization | Perfetto-compatible output | Medium | +| `rocprofv3_api_overhead` | API call analysis | HIP/HSA/marker traces with stats | Low | +| `rocprofv3_pc_sampling` | Kernel hotspot analysis | PC sampling at 1000 Hz | Medium | + +## Counter Definition Files + +Counter files are located at `src/madengine/scripts/common/tools/counters/`: + +- **`compute_bound.txt`**: Wave execution, VALU/SALU instructions, wait states +- **`memory_bound.txt`**: Cache metrics, memory controller traffic, LDS usage +- **`communication_bound.txt`**: PCIe traffic, atomic operations, synchronization +- **`full_profile.txt`**: Comprehensive set of all important metrics + +You can create custom counter files and reference them in your profiling commands. + +## Output Files + +After profiling, madengine writes outputs to the working directory: + +``` +rocprof_output/ +├── / +│ ├── *_results.db # ROCprofv3 database (SQLite) +│ ├── kernel_trace.csv # Kernel execution traces +│ ├── hip_api_trace.csv # HIP API calls +│ └── memory_copy_trace.csv # Memory transfers +├── model_trace.pftrace # Perfetto format (if using rocprofv3_perfetto) +└── trace.json # JSON format (if using rocprofv3_lightweight) + +gpu_info_power_profiler_output.csv # Power consumption over time +gpu_info_vram_profiler_output.csv # VRAM usage over time +library_trace.csv # Library API calls (if library tracing enabled) +``` + +## Visualization + +### Perfetto UI (Recommended) +```bash +# If using rocprofv3_perfetto or output-format pftrace +# Upload files to https://ui.perfetto.dev/ +``` + +### Custom Analysis +```python +import sqlite3 +import pandas as pd + +# Parse ROCprofv3 database +conn = sqlite3.connect('rocprof_output//*_results.db') +kernels = pd.read_sql_query("SELECT * FROM kernels", conn) +print(kernels.head()) +``` + +## Best Practices + +1. **Start lightweight**: Use `rocprofv3_lightweight` for initial profiling +2. **Target your bottleneck**: Use specific profiles (compute/memory/communication) based on initial findings +3. **Avoid full profiling in production**: `rocprofv3_full` adds 20-50% overhead +4. **Multi-GPU**: Always enable RCCL tracing for distributed workloads +5. **Sampling rates**: Reduce sampling rates for long-running jobs (e.g., 1.0 instead of 0.1) +6. **Counter multiplexing**: ROCprofv3 may need multiple runs if too many counters are requested + +## Troubleshooting + +### No output files generated +```bash +# Check if rocprofv3 is available +which rocprofv3 +rocprofv3 --version + +# Verify ROCm version (>= 7.0 recommended for rocprofv3) +rocm-smi --version +``` + +### "Counter not available" errors +Some counters may not be available on all GPU architectures. Check available counters: +```bash +rocprofv3-avail +``` + +### High overhead affecting results +Use `rocprofv3_lightweight` or reduce counter collection: +```bash +# Remove counter collection for minimal overhead +madengine run --tags your_model \ + --additional-context '{ + "tools": [{ + "name": "rocprof", + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --kernel-trace --output-format json -d ./traces --" + }] + }' +``` + +## Additional Resources + +- [ROCprofv3 Official Documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3.html) +- [madengine Profiling Guide](../../docs/profiling.md) +- [ROCm Developer Hub](https://rocm.docs.amd.com/) +- [Perfetto Trace Viewer](https://ui.perfetto.dev/) + +## Examples + +### Example 1: Profile LLM Inference (Compute-Bound) +```bash +madengine run --tags pyt_vllm_llama2_7b \ + --additional-context-file examples/profiling-configs/rocprofv3_compute_bound.json +``` + +### Example 2: Profile Multi-GPU Training (Communication-Bound) +```bash +madengine run --tags pyt_torchtitan_llama3_8b \ + --additional-context-file examples/profiling-configs/rocprofv3_multi_gpu.json +``` + +### Example 3: Profile Image Model (Memory-Bound) +```bash +madengine run --tags pyt_torchvision_resnet50 \ + --additional-context-file examples/profiling-configs/rocprofv3_memory_bound.json +``` + +### Example 4: Quick Test with Dummy Model +```bash +madengine run --tags dummy_prof \ + --additional-context-file examples/profiling-configs/rocprofv3_lightweight.json +``` diff --git a/examples/profiling-configs/rocprofv3_comprehensive.json b/examples/profiling-configs/rocprofv3_comprehensive.json new file mode 100644 index 00000000..f5d922e4 --- /dev/null +++ b/examples/profiling-configs/rocprofv3_comprehensive.json @@ -0,0 +1,34 @@ +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + { + "name": "rocprofv3_full", + "env_vars": { + "RCCL_DEBUG": "INFO", + "HSA_ENABLE_SDMA": "0" + } + }, + { + "name": "gpu_info_power_profiler", + "env_vars": { + "POWER_DEVICE": "all", + "POWER_SAMPLING_RATE": "0.1", + "POWER_DUAL_GCD": "false" + } + }, + { + "name": "gpu_info_vram_profiler", + "env_vars": { + "VRAM_DEVICE": "all", + "VRAM_SAMPLING_RATE": "0.1" + } + }, + { + "name": "miopen_trace" + }, + { + "name": "rocblas_trace" + } + ] +} diff --git a/examples/profiling-configs/rocprofv3_compute_bound.json b/examples/profiling-configs/rocprofv3_compute_bound.json new file mode 100644 index 00000000..8d3419c9 --- /dev/null +++ b/examples/profiling-configs/rocprofv3_compute_bound.json @@ -0,0 +1,17 @@ +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + { + "name": "rocprofv3_compute", + "env_vars": {} + }, + { + "name": "gpu_info_power_profiler", + "env_vars": { + "POWER_DEVICE": "all", + "POWER_SAMPLING_RATE": "0.1" + } + } + ] +} diff --git a/examples/profiling-configs/rocprofv3_lightweight.json b/examples/profiling-configs/rocprofv3_lightweight.json new file mode 100644 index 00000000..f1f69e0f --- /dev/null +++ b/examples/profiling-configs/rocprofv3_lightweight.json @@ -0,0 +1,10 @@ +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + { + "name": "rocprofv3_lightweight", + "env_vars": {} + } + ] +} diff --git a/examples/profiling-configs/rocprofv3_memory_bound.json b/examples/profiling-configs/rocprofv3_memory_bound.json new file mode 100644 index 00000000..9b955747 --- /dev/null +++ b/examples/profiling-configs/rocprofv3_memory_bound.json @@ -0,0 +1,17 @@ +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "tools": [ + { + "name": "rocprofv3_memory", + "env_vars": {} + }, + { + "name": "gpu_info_vram_profiler", + "env_vars": { + "VRAM_DEVICE": "all", + "VRAM_SAMPLING_RATE": "0.1" + } + } + ] +} diff --git a/examples/profiling-configs/rocprofv3_multi_gpu.json b/examples/profiling-configs/rocprofv3_multi_gpu.json new file mode 100644 index 00000000..c463e768 --- /dev/null +++ b/examples/profiling-configs/rocprofv3_multi_gpu.json @@ -0,0 +1,31 @@ +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "docker_gpus": "0,1,2,3", + "distributed": { + "launcher": "torchrun", + "nproc_per_node": 4 + }, + "tools": [ + { + "name": "rocprofv3_communication", + "env_vars": { + "RCCL_DEBUG": "INFO" + } + }, + { + "name": "gpu_info_power_profiler", + "env_vars": { + "POWER_DEVICE": "all", + "POWER_SAMPLING_RATE": "0.1" + } + }, + { + "name": "gpu_info_vram_profiler", + "env_vars": { + "VRAM_DEVICE": "all", + "VRAM_SAMPLING_RATE": "0.1" + } + } + ] +} diff --git a/examples/profiling-configs/rocprofv3_multinode.json b/examples/profiling-configs/rocprofv3_multinode.json new file mode 100644 index 00000000..f349c0e1 --- /dev/null +++ b/examples/profiling-configs/rocprofv3_multinode.json @@ -0,0 +1,30 @@ +{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "slurm": { + "partition": "gpu", + "nodes": 4, + "gpus_per_node": 8, + "time": "12:00:00" + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 4, + "nproc_per_node": 8 + }, + "tools": [ + { + "name": "rocprofv3_communication", + "env_vars": { + "RCCL_DEBUG": "INFO" + } + }, + { + "name": "gpu_info_power_profiler", + "env_vars": { + "POWER_DEVICE": "all", + "POWER_SAMPLING_RATE": "1.0" + } + } + ] +} diff --git a/examples/slurm-configs/README.md b/examples/slurm-configs/README.md new file mode 100644 index 00000000..bf09299d --- /dev/null +++ b/examples/slurm-configs/README.md @@ -0,0 +1,757 @@ +# SLURM Configuration Examples + +This directory contains example configurations for deploying madengine workloads on SLURM HPC clusters. + +## 📋 Convention Over Configuration + +**No explicit `deploy` field needed!** The presence of a `slurm` field automatically indicates SLURM deployment: + +```json +{ + "slurm": { + "partition": "amd-rccl", + "nodes": 2, + "gpus_per_node": 8 + } +} +``` + +**⚠️ Important:** The default partition is `amd-rccl` (for AMD RCCL clusters). If your cluster uses a different partition name (e.g., `gpu`, `compute`, `batch`), override it in your configuration: + +```json +{ + "slurm": { + "partition": "your-partition-name" // Override default + } +} +``` + +**Check your cluster's partitions:** +```bash +sinfo -o "%P" # List all available partitions +``` + +The deployment type is **inferred** from the configuration structure: +- ✅ Deployment type (k8s/slurm/local) inferred from config structure +- ✅ Layered defaults: base → profile → user configuration +- ✅ Intelligent profile selection based on node count + +## 📁 Example Configurations + +### Training Configurations (`basic/`) + +| File | Description | Nodes | GPUs | Use Case | +|------|-------------|-------|------|----------| +| `01-torchrun-single-node-single-gpu.json` | Single GPU training | 1 | 1 | Quick tests, small models | +| `02-single-node-multi-gpu.json` | Single node, 8 GPUs | 1 | 8 | Single-node distributed workload | +| `03-multi-node-basic.json` | 2 nodes, 8 GPUs each | 2 | 16 | Multi-node distributed workload | +| `04-multi-node-advanced.json` | 4 nodes, advanced features | 4 | 32 | Production-scale training | + +### vLLM Inference Configurations (`basic/`) + +| File | Description | Nodes | GPUs | Use Case | +|------|-------------|-------|------|----------| +| `05-vllm-single-node.json` | Single node vLLM | 1 | 4 | Single-node LLM inference | +| `06-vllm-multi-node.json` | Multi-node vLLM | 2 | 8 | Multi-node LLM inference with Ray | + +### Minimal Examples (`minimal/`) + +Stripped-down configurations showing only essential fields: +- `single-gpu-minimal.json` - Minimal single GPU config +- `multi-gpu-minimal.json` - Minimal 8 GPU config +- `multi-node-minimal.json` - Minimal 2-node config +- `vllm-single-node-minimal.json` - Minimal vLLM single-node +- `vllm-multi-node-minimal.json` - Minimal vLLM multi-node + +## 🔄 Configuration Workflow + +Understanding how configurations flow through madengine: + +``` +┌──────────────────────────────────────────────────┐ +│ 1. Config File (*.json) │ +│ - Contains: slurm, distributed, env_vars │ +└──────────────────┬───────────────────────────────┘ + │ --additional-context-file + ↓ +┌──────────────────────────────────────────────────┐ +│ 2. madengine build │ +│ - BuildOrchestrator._save_deployment_config() │ +│ - Extracts env_vars, slurm, distributed │ +└──────────────────┬───────────────────────────────┘ + │ + ↓ +┌──────────────────────────────────────────────────┐ +│ 3. build_manifest.json │ +│ - deployment_config.env_vars (saved) │ +│ - deployment_config.slurm (saved) │ +└──────────────────┬───────────────────────────────┘ + │ --manifest-file + ↓ +┌──────────────────────────────────────────────────┐ +│ 4. madengine run │ +│ - RunOrchestrator._execute_*() │ +│ - Loads deployment_config from manifest │ +└──────────────────┬───────────────────────────────┘ + │ + ↓ +┌──────────────────────────────────────────────────┐ +│ 5. Docker Container Environment │ +│ - env_vars passed to container │ +│ - SLURM job submitted with configuration │ +└──────────────────────────────────────────────────┘ +``` + +**Key Points:** +- ✅ **Config files are the source of truth** - Don't edit `build_manifest.json` manually +- ✅ **Build phase embeds configuration** - Configuration is saved during build for use at runtime +- ✅ **Run phase uses manifest** - All settings come from the generated manifest +- ✅ **Environment variables flow automatically** - From config → manifest → Docker + +## 🚀 Quick Start + +### 1. Build-and-Run Workflow (Recommended) + +When using configuration files with `env_vars`, use the two-phase workflow: + +```bash +# SSH to SLURM login node first +ssh user@hpc-cluster.example.com + +# Phase 1: Build with configuration +MODEL_DIR=models/my-model madengine build \ + --tags model_tag \ + --additional-context-file examples/slurm-configs/03-multi-node-basic.json \ + --manifest-output build_manifest.json + +# Phase 2: Run from manifest +MODEL_DIR=models/my-model madengine run \ + --manifest-file build_manifest.json +``` + +**Why two phases?** +- Build phase embeds your `env_vars` and deployment config into the manifest +- Run phase uses the pre-configured manifest +- Ensures consistency across builds and deployments + +### 2. Direct Run (For Simple Cases) + +For quick tests without custom `env_vars`: + +```bash +madengine run --tags model_tag \ + --additional-context-file examples/slurm-configs/minimal/single-gpu-minimal.json +``` + +### 3. CLI Override + +```bash +madengine run --tags model_tag \ + --additional-context '{ + "slurm": { + "partition": "gpu", + "nodes": 2, + "gpus_per_node": 8, + "time": "24:00:00" + } + }' +``` + +### 4. Hybrid Approach (File + CLI Override) + +```bash +# Use base config, override specific fields +madengine run --tags model_tag \ + --additional-context-file examples/slurm-configs/03-multi-node-basic.json \ + --additional-context '{"slurm": {"nodes": 4, "time": "48:00:00"}}' +``` + +## 🔄 Distributed Workload Support + +The SLURM deployment **automatically configures distributed execution** for multi-node and multi-GPU setups (training with torchrun/deepspeed or inference with vLLM/SGLang): + +### How It Works + +1. **Environment Variables**: SLURM sets distributed execution environment (MASTER_ADDR, MASTER_PORT, RANK, etc.) +2. **MAD_MULTI_NODE_RUNNER**: Automatically configured with the appropriate `torchrun` command +3. **Docker Containers**: Environment variables are passed into containers via `docker_env_vars` +4. **Model Scripts**: Use `$MAD_MULTI_NODE_RUNNER` to launch training (see below) + +### Model Script Pattern + +Your model's run script should use the `MAD_MULTI_NODE_RUNNER` environment variable: + +```bash +#!/bin/bash +# Example: scripts/my_model/run.sh + +# MAD_MULTI_NODE_RUNNER is automatically set by madengine for distributed workloads +if [ -z "$MAD_MULTI_NODE_RUNNER" ]; then + # Fallback for standalone execution + N_GPUS="${MAD_RUNTIME_NGPUS:-1}" + MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node=$N_GPUS" +fi + +# Launch your Python training script with torchrun +$MAD_MULTI_NODE_RUNNER train.py --your-args +``` + +### Distributed Environment Variables + +The following variables are automatically available in your containers: + +| Variable | Description | Example | +|----------|-------------|---------| +| `MASTER_ADDR` | Master node address | `node001` | +| `MASTER_PORT` | Master communication port | `29500` | +| `WORLD_SIZE` | Total number of processes | `16` (2 nodes × 8 GPUs) | +| `RANK` | Global process rank | `0`, `1`, ... | +| `LOCAL_RANK` | Local GPU rank on node | `0-7` | +| `NNODES` | Number of nodes | `2` | +| `NPROC_PER_NODE` | GPUs per node | `8` | +| `MAD_MULTI_NODE_RUNNER` | Complete torchrun command | `torchrun --nnodes=2 ...` | + +### Example Configurations + +**Single-Node Multi-GPU (Data Parallel)**: +```json +{ + "slurm": { + "nodes": 1, + "gpus_per_node": 8 + } +} +``` +→ `MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node=8"` + +**Multi-Node Distributed Workload**: +```json +{ + "slurm": { + "nodes": 4, + "gpus_per_node": 8 + } +} +``` +→ `MAD_MULTI_NODE_RUNNER="torchrun --nnodes=4 --nproc_per_node=8 --node_rank=$SLURM_PROCID --master_addr=$MASTER_ADDR --master_port=29500"` + +### Verification + +Check that distributed execution is configured correctly: + +```bash +# In your SLURM output logs, you should see: +Distributed Execution Configuration: + NNODES: 2 + GPUS_PER_NODE: 8 + TOTAL_GPUS: 16 + MASTER_ADDR: node001 + MASTER_PORT: 29500 + NODE_RANK: 0 + Launcher: torchrun (distributed) + MAD_MULTI_NODE_RUNNER: torchrun --nnodes=2 --nproc_per_node=8 ... +``` + +## 🚀 vLLM Inference Configurations + +vLLM is a high-throughput LLM inference engine. madengine provides pre-configured setups for both single-node and multi-node deployments. + +### Memory Management + +vLLM configurations include critical memory management environment variables to prevent OOM (Out of Memory) errors, especially in multi-node deployments with pipeline parallelism. + +#### Key Environment Variables + +**1. `VLLM_KV_CACHE_SIZE`** +- **Purpose**: Limits the percentage of GPU memory allocated for KV cache +- **Default in configs**: `0.8` (80% of available GPU memory) +- **Why needed**: Prevents vLLM from aggressively allocating all available memory, which can cause fragmentation and OOM errors +- **Tuning**: + - Increase (e.g., `0.9`) if you have large memory headroom + - Decrease (e.g., `0.6`, `0.7`) if experiencing OOM errors + +**2. `PYTORCH_CUDA_ALLOC_CONF`** +- **Purpose**: Configures PyTorch's CUDA/HIP memory allocator +- **Value**: `expandable_segments:True` +- **Why needed**: Reduces memory fragmentation by allowing the allocator to expand memory segments dynamically +- **Reference**: [PyTorch Memory Management](https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) + +### vLLM Configuration Files + +**Single-Node Configurations:** +- `05-vllm-single-node.json` - Full single-node config with NCCL settings +- `vllm-single-node-minimal.json` - Minimal single-node config (in `minimal/` directory) + +**Multi-Node Configurations:** +- `06-vllm-multi-node.json` - Full multi-node config with NCCL and Ray settings +- `vllm-multi-node-minimal.json` - Minimal multi-node config (in `minimal/` directory) + +### vLLM Workflow Example + +```bash +# 1. Build with vLLM configuration +MODEL_DIR=models/llama2-70b madengine build \ + --tags vllm \ + --additional-context-file examples/slurm-configs/basic/06-vllm-multi-node.json \ + --manifest-output build_manifest.json + +# 2. Verify memory management env_vars were embedded +grep -A 10 "env_vars" build_manifest.json +# Should show: +# "VLLM_KV_CACHE_SIZE": "0.8" +# "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True" + +# 3. Run the inference job +MODEL_DIR=models/llama2-70b madengine run \ + --manifest-file build_manifest.json +``` + +### vLLM Parallelism Strategies + +vLLM automatically selects parallelism based on your configuration: + +**Single-Node (TP only)**: +```json +{ + "slurm": { + "nodes": 1, + "gpus_per_node": 4 + }, + "distributed": { + "launcher": "vllm", + "nnodes": 1, + "nproc_per_node": 4 + } +} +``` +→ **Tensor Parallelism (TP) = 4** across GPUs + +**Multi-Node (TP + PP)**: +```json +{ + "slurm": { + "nodes": 2, + "gpus_per_node": 4 + }, + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4 + } +} +``` +→ **Tensor Parallelism (TP) = 4** within each node +→ **Pipeline Parallelism (PP) = 2** across nodes +→ **Requires Ray cluster** for multi-node coordination + +## ⚙️ Configuration Layers + +madengine uses intelligent multi-layer configuration merging: + +``` +┌─────────────────────────────────┐ +│ 1. Base Defaults │ ← slurm/defaults.json +│ (partition, time, etc.) │ +├─────────────────────────────────┤ +│ 2. Profile Selection │ ← single-node.json or multi-node.json +│ (auto-selected by nodes) │ (based on nodes count) +├─────────────────────────────────┤ +│ 3. User Configuration │ ← Your config file + CLI args +│ (highest priority) │ +└─────────────────────────────────┘ +``` + +### Profile Auto-Selection + +- **Single-node profile**: Applied when `nodes == 1` +- **Multi-node profile**: Applied when `nodes > 1` + +## 📝 Configuration Reference + +### SLURM Section + +```json +{ + "slurm": { + "partition": "amd-rccl", // SLURM partition name (default: amd-rccl) + "nodes": 2, // Number of nodes + "gpus_per_node": 8, // GPUs per node + "time": "24:00:00", // Wall time (HH:MM:SS) + "output_dir": "./slurm_output", // Local output directory + "results_dir": "/shared/results", // Shared results collection + "shared_workspace": "/shared/workspace", // Shared workspace (NFS/Lustre) + "exclusive": true, // Exclusive node access + "qos": "high", // Quality of Service + "account": "project-name", // SLURM account + "network_interface": "ib0", // Network interface (ib0/eth0) + "modules": ["rocm/5.7.0"] // Environment modules to load + } +} +``` + +### Distributed Execution Section + +```json +{ + "distributed": { + "launcher": "torchrun", // Launcher type: torchrun, vllm, sglang, deepspeed, megatron + "backend": "nccl", // Communication backend (nccl/gloo) + "port": 29500, // Master node port + "nnodes": 2, // Number of nodes (overrides slurm.nodes if set) + "nproc_per_node": 8 // GPUs per node (overrides slurm.gpus_per_node if set) + } +} +``` + +**Supported Launchers:** +- `torchrun`: PyTorch distributed training (default) +- `vllm`: vLLM inference engine (TP/PP parallelism) +- `sglang`: SGLang inference engine +- `deepspeed`: DeepSpeed training framework +- `megatron`: Megatron-LM large model training +- Custom: Set environment variables, model script handles launcher + +**Note**: For vLLM and SGLang, the model script handles process spawning directly. +For torchrun/deepspeed/megatron, use `$MAD_MULTI_NODE_RUNNER` in your model script. + +### Environment Variables + +```json +{ + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_SOCKET_IFNAME": "ib0", + "OMP_NUM_THREADS": "8", + "MIOPEN_FIND_MODE": "1", + "VLLM_KV_CACHE_SIZE": "0.8", + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True" + } +} +``` + +**Note**: Environment variables set in config files are: +1. Saved to `deployment_config.env_vars` during `build` phase +2. Automatically passed to Docker containers during `run` phase +3. Available to your model scripts inside containers + +## 🔍 Common Use Cases + +### Testing on Single GPU + +```bash +madengine run --tags my_model \ + --additional-context-file examples/slurm-configs/minimal/single-gpu-minimal.json +``` + +### Multi-Node Training + +```bash +# Build with config +MODEL_DIR=models/my-model madengine build \ + --tags training \ + --additional-context-file examples/slurm-configs/03-multi-node-basic.json + +# Run from manifest +MODEL_DIR=models/my-model madengine run \ + --manifest-file build_manifest.json +``` + +### vLLM Single-Node Inference + +```bash +# Build with vLLM config +MODEL_DIR=models/llama2-13b madengine build \ + --tags vllm \ + --additional-context-file examples/slurm-configs/basic/05-vllm-single-node.json + +# Run inference +MODEL_DIR=models/llama2-13b madengine run \ + --manifest-file build_manifest.json +``` + +### vLLM Multi-Node Inference + +```bash +# Build with multi-node vLLM config +MODEL_DIR=models/llama2-70b madengine build \ + --tags vllm \ + --additional-context-file examples/slurm-configs/basic/06-vllm-multi-node.json + +# Run multi-node inference +MODEL_DIR=models/llama2-70b madengine run \ + --manifest-file build_manifest.json +``` + +### Production Deployment with Shared Storage + +```bash +madengine build --tags my_model \ + --additional-context-file examples/slurm-configs/04-multi-node-advanced.json + +madengine run --manifest-file build_manifest.json +``` + +### Custom vLLM Memory Settings + +For custom memory configurations, create a new config file: + +```json +{ + "slurm": { + "partition": "amd-rccl", + "nodes": 2, + "gpus_per_node": 8, + "time": "04:00:00" + }, + + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 8 + }, + + "env_vars": { + "VLLM_KV_CACHE_SIZE": "0.7", + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", + "HSA_FORCE_FINE_GRAIN_PCIE": "1" + } +} +``` + +## 🛠️ Advanced Features + +### Custom Environment Modules + +Load specific software versions: + +```json +{ + "slurm": { + "modules": [ + "rocm/5.7.0", + "gcc/11.2.0", + "openmpi/4.1.4" + ] + } +} +``` + +### Shared Filesystem + +Configure shared workspace and data: + +```json +{ + "slurm": { + "shared_workspace": "/lustre/workspace", + "results_dir": "/lustre/results" + }, + "shared_data": "/lustre/datasets" +} +``` + +### Network Configuration + +For InfiniBand clusters: + +```json +{ + "slurm": { + "network_interface": "ib0" + }, + "env_vars": { + "NCCL_SOCKET_IFNAME": "ib0", + "NCCL_IB_DISABLE": "0", + "NCCL_IB_HCA": "mlx5_0:1,mlx5_1:1" + } +} +``` + +## 📊 Monitoring Jobs + +After submission, monitor your SLURM job: + +```bash +# Check job status +squeue -u $USER + +# View job details +scontrol show job + +# View output logs (real-time) +tail -f slurm_output/madengine-*__*.out + +# View error logs +tail -f slurm_output/madengine-*__*.err + +# Cancel job if needed +scancel +``` + +## 🐛 Troubleshooting + +### Job Fails Immediately + +- Check SLURM partition exists: `sinfo` +- Verify GPU resources available: `sinfo -o "%P %.5a %.10l %.6D %.6t %N %G"` +- Check SLURM account/QoS settings +- Review job script: `slurm_output/madengine_*.sh` + +### Out of Memory Errors + +**General OOM**: +- Reduce batch size or model size +- Use gradient accumulation +- Enable CPU offloading +- Check available GPU memory: `rocm-smi` or `amd-smi` + +**vLLM-Specific OOM** (`torch.OutOfMemoryError: HIP out of memory`): + +**Symptom**: Error during vLLM initialization or KV cache allocation: +``` +torch.OutOfMemoryError: HIP out of memory. Tried to allocate 22.14 GiB. +GPU has a total capacity of 191.98 GiB of which 145.02 GiB is free. +``` + +**Root Cause**: Memory fragmentation or aggressive KV cache allocation + +**Solutions**: +1. **Reduce KV cache size**: + ```json + "env_vars": { + "VLLM_KV_CACHE_SIZE": "0.6" // Try 0.6 or 0.7 + } + ``` +2. **Enable expandable segments** (should already be in configs): + ```json + "env_vars": { + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True" + } + ``` +3. **Reduce parallelism**: Use fewer GPUs or nodes for smaller models +4. **Check GPU memory**: `rocm-smi` or `amd-smi` to verify available memory +5. **Rebuild with updated config**: Don't edit `build_manifest.json` - update the source config file and rebuild + +### NCCL/Communication Errors + +- Verify network interface name: `ifconfig` or `ip addr` +- Check InfiniBand status: `ibstat` (if using IB) +- Test connectivity between nodes +- Set correct `NCCL_SOCKET_IFNAME` in `env_vars` + +### vLLM Ray Connection Failures + +**Symptom**: `Failed to connect to GCS at address :6379` + +**Solutions**: +1. Check network connectivity between nodes +2. Ensure Ray port (6379) is accessible +3. Verify NCCL/RCCL environment variables are set correctly +4. For smaller models, consider using tensor parallelism only (single node) + +### Module Load Failures + +- List available modules: `module avail` +- Check module syntax: `module load rocm/5.7.0` (manual test) +- Verify module names match cluster configuration + +## 💡 Best Practices + +### General + +1. **Start Small**: Test on single GPU first, then scale up +2. **Use Configuration Files**: Prefer config files over CLI arguments for reproducibility +3. **Build-Then-Run**: Use two-phase workflow when configs include `env_vars` +4. **Use Shared Storage**: Configure shared workspace for multi-node jobs +5. **Network Configuration**: Properly configure NCCL for your network fabric +6. **Resource Requests**: Request exclusive node access for large jobs +7. **Time Limits**: Set realistic wall times (add buffer for checkpointing) +8. **Output Collection**: Use `results_dir` to collect outputs from all nodes + +### vLLM-Specific + +1. **Memory Management**: Always include `VLLM_KV_CACHE_SIZE` and `PYTORCH_CUDA_ALLOC_CONF` +2. **Start Conservative**: Use `VLLM_KV_CACHE_SIZE: "0.8"` initially, tune if needed +3. **Test Locally First**: Validate vLLM configs on single-node before scaling to multi-node +4. **Monitor Memory**: Check GPU memory usage during initialization +5. **Don't Edit Manifests**: Always modify source config files, not generated `build_manifest.json` +6. **Rebuild After Changes**: Re-run `build` phase when changing `env_vars` + +### Configuration Management + +1. **Version Control**: Keep your config files in git +2. **Naming Convention**: Use descriptive names (e.g., `my-project-vllm-8gpu.json`) +3. **Documentation**: Add `_comment` and `_description` fields to configs +4. **Reusability**: Create base configs and override specific fields +5. **Validation**: Test configs on small scale before production runs + +## 🎯 Example Workflow + +### Standard Training Workflow + +```bash +# 1. SSH to SLURM login node +ssh user@hpc-cluster.example.com + +# 2. Load any required modules (if needed before madengine) +module load python/3.9 + +# 3. Build with configuration +MODEL_DIR=models/my-model madengine build \ + --tags llama2_training \ + --additional-context-file examples/slurm-configs/03-multi-node-basic.json \ + --manifest-output build_manifest.json + +# 4. Run from manifest +MODEL_DIR=models/my-model madengine run \ + --manifest-file build_manifest.json + +# 5. Monitor job +watch squeue -u $USER + +# 6. Check logs when complete +ls -lh slurm_output/ +tail -f slurm_output/madengine-*__*.out +``` + +### vLLM Inference Workflow + +```bash +# 1. SSH to SLURM login node +ssh user@hpc-cluster.example.com + +# 2. Build vLLM image with memory management config +MODEL_DIR=models/llama2-70b madengine build \ + --tags vllm \ + --additional-context-file examples/slurm-configs/basic/06-vllm-multi-node.json \ + --manifest-output build_manifest.json + +# 3. Verify configuration was embedded +grep -A 5 "VLLM_KV_CACHE_SIZE" build_manifest.json + +# 4. Submit inference job +MODEL_DIR=models/llama2-70b madengine run \ + --manifest-file build_manifest.json + +# 5. Monitor for OOM errors +tail -f slurm_output/madengine-*__*.err | grep -i "memory" + +# 6. If OOM occurs, adjust config and rebuild +# Edit your config file to set VLLM_KV_CACHE_SIZE to 0.6 or 0.7 +# Then repeat steps 2-4 +``` + +## 📚 Related Documentation + +- [How to Run Multi-Node](../../docs/how-to-run-multi-node.md) +- [K8s Configuration Examples](../k8s-configs/) +- [SLURM Official Documentation](https://slurm.schedmd.com/) +- [vLLM Documentation](https://docs.vllm.ai/) +- [PyTorch Distributed Training](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) +- [vLLM Distributed Inference](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) +- [SGLang Distributed Serving](https://sgl-project.github.io/) + +--- + +**Note**: All configurations assume you've already SSH'd to the SLURM login node. madengine runs `sbatch` locally on the login node - no remote SSH handling needed. diff --git a/examples/slurm-configs/basic/01-single-node-single-gpu.json b/examples/slurm-configs/basic/01-single-node-single-gpu.json new file mode 100644 index 00000000..52324c1a --- /dev/null +++ b/examples/slurm-configs/basic/01-single-node-single-gpu.json @@ -0,0 +1,25 @@ +{ + "_comment": "Single Node, Single GPU - Basic SLURM Configuration", + "_description": "Configuration for running a model on a single GPU on a SLURM cluster", + "_use_case": "Testing, small models, quick benchmarks (single GPU, no distributed execution)", + "_note": "Using 'amd-rccl' partition. Change to your cluster's partition name if different (e.g., 'gpu', 'compute').", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 1, + "time": "01:00:00", + "output_dir": "./slurm_output", + "exclusive": false + }, + + "env_vars": { + "OMP_NUM_THREADS": "8" + }, + + "debug": false +} + diff --git a/examples/slurm-configs/basic/02-single-node-multi-gpu.json b/examples/slurm-configs/basic/02-single-node-multi-gpu.json new file mode 100644 index 00000000..f72fd0e2 --- /dev/null +++ b/examples/slurm-configs/basic/02-single-node-multi-gpu.json @@ -0,0 +1,33 @@ +{ + "_comment": "Single Node, Multi-GPU (8 GPUs) - SLURM Configuration", + "_description": "Configuration for running a model on 8 GPUs on a single SLURM node", + "_use_case": "Single-node distributed workload, large models requiring multiple GPUs", + "_note": "Using 'amd-rccl' partition. Change to your cluster's partition name if different.", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 8, + "time": "12:00:00", + "output_dir": "./slurm_output", + "exclusive": true + }, + + "distributed": { + "launcher": "torchrun", + "backend": "nccl", + "nnodes": 1, + "nproc_per_node": 8 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8", + "NCCL_DEBUG": "WARN" + }, + + "debug": false +} + diff --git a/examples/slurm-configs/basic/03-multi-node-basic.json b/examples/slurm-configs/basic/03-multi-node-basic.json new file mode 100644 index 00000000..9c1f2de0 --- /dev/null +++ b/examples/slurm-configs/basic/03-multi-node-basic.json @@ -0,0 +1,47 @@ +{ + "_comment": "Multi-Node (2 nodes, 8 GPUs each) - SLURM Configuration", + "_description": "Configuration for distributed workload across 2 nodes with 8 GPUs per node (16 GPUs total)", + "_use_case": "Multi-node distributed execution for large models (training or inference)", + "_note": "Target is auto-detected as 'slurm' from presence of 'slurm' config section", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 2, + "gpus_per_node": 8, + "time": "24:00:00", + "output_dir": "./slurm_output", + "exclusive": true, + "network_interface": "eth0" + }, + + "distributed": { + "launcher": "torchrun", + "backend": "nccl", + "port": 29500, + "nnodes": 2, + "nproc_per_node": 8 + }, + + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_DEBUG_SUBSYS": "INIT,NET", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2", + "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", + "NCCL_TIMEOUT": "600", + "HSA_ENABLE_SDMA": "0", + "OMP_NUM_THREADS": "8", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "RCCL_ENABLE_HIPGRAPH": "0" + }, + + "debug": false +} + diff --git a/examples/slurm-configs/basic/04-multi-node-advanced.json b/examples/slurm-configs/basic/04-multi-node-advanced.json new file mode 100644 index 00000000..bf30fb0b --- /dev/null +++ b/examples/slurm-configs/basic/04-multi-node-advanced.json @@ -0,0 +1,71 @@ +{ + "_comment": "Multi-Node (4 nodes, 8 GPUs each) - Advanced SLURM Configuration", + "_description": "Configuration for large-scale distributed workloads with advanced options", + "_use_case": "Production-scale multi-node training with custom workspace and results collection", + "_note": "Using 'amd-rccl' partition. Adjust for your cluster if needed.", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 4, + "gpus_per_node": 8, + "time": "48:00:00", + "output_dir": "./slurm_output", + "results_dir": "/shared/results", + "shared_workspace": "/shared/workspace", + "exclusive": true, + "qos": "high", + "account": "research-project", + "network_interface": "ib0", + "modules": [ + "rocm/5.7.0", + "gcc/11.2.0", + "openmpi/4.1.4" + ] + }, + + "distributed": { + "launcher": "torchrun", + "backend": "nccl", + "port": 29500, + "nnodes": 4, + "nproc_per_node": 8 + }, + + "env_vars": { + "NCCL_DEBUG": "INFO", + "NCCL_DEBUG_SUBSYS": "INIT,NET", + "NCCL_IB_DISABLE": "0", + "NCCL_IB_HCA": "mlx5_0:1,mlx5_1:1", + "NCCL_SOCKET_IFNAME": "ib0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2", + "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", + "NCCL_TIMEOUT": "1200", + "HSA_ENABLE_SDMA": "0", + "OMP_NUM_THREADS": "16", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "RCCL_ENABLE_HIPGRAPH": "0", + "NCCL_BUFFSIZE": "8388608", + "NCCL_P2P_LEVEL": "NVL" + }, + + "shared_data": "/shared/datasets", + + "_notes": { + "description": "Advanced configuration with InfiniBand, shared storage, and custom SLURM settings", + "modules": "Load required environment modules before job execution", + "qos": "Quality of Service level for job priority", + "account": "SLURM account for resource allocation tracking", + "results_dir": "Shared directory for collecting results from all nodes", + "shared_workspace": "Shared filesystem for job execution (NFS/Lustre)", + "shared_data": "Shared dataset location accessible from all nodes" + }, + + "debug": false +} + diff --git a/examples/slurm-configs/basic/05-vllm-single-node.json b/examples/slurm-configs/basic/05-vllm-single-node.json new file mode 100644 index 00000000..a632bc0d --- /dev/null +++ b/examples/slurm-configs/basic/05-vllm-single-node.json @@ -0,0 +1,37 @@ +{ + "_comment": "vLLM Single Node Multi-GPU - Inference Configuration", + "_description": "vLLM inference with tensor parallelism on single node", + "_use_case": "High-throughput LLM inference on single node with multiple GPUs", + "_note": "vLLM uses tensor parallelism to split model across GPUs", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 4, + "time": "02:00:00", + "output_dir": "./slurm_output", + "exclusive": true + }, + + "distributed": { + "launcher": "vllm", + "nnodes": 1, + "nproc_per_node": 4 + }, + + "pre_scripts": [], + + "env_vars": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", + "VLLM_USE_MODELSCOPE": "False", + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + "VLLM_KV_CACHE_SIZE": "0.7", + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "NCCL_DEBUG": "WARN" + } +} + diff --git a/examples/slurm-configs/basic/06-vllm-multi-node.json b/examples/slurm-configs/basic/06-vllm-multi-node.json new file mode 100644 index 00000000..b76d34b9 --- /dev/null +++ b/examples/slurm-configs/basic/06-vllm-multi-node.json @@ -0,0 +1,66 @@ +{ + "_comment": "vLLM Multi-Node Data Parallelism - Benchmark Configuration", + "_description": "vLLM inference with Data Parallelism across nodes for high throughput", + "_use_case": "Benchmarking vLLM with independent replicas per node", + "_strategy": "Data Parallelism: Each node runs independent replica with Tensor Parallelism", + "_benefits": [ + "Simpler setup - no shared Ray cluster", + "Faster initialization - parallel node startup", + "More robust - nodes are independent", + "Better throughput - parallel processing", + "Ideal for benchmarking and production serving" + ], + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 2, + "gpus_per_node": 4, + "time": "00:45:00", + "output_dir": "./slurm_output", + "exclusive": true, + + "_comment_node_check": "Preflight GPU health check (helps avoid OOM from stale processes)", + "enable_node_check": true, + "auto_cleanup_nodes": false, + "verbose_node_check": false + }, + + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4, + "backend": "nccl", + "port": 29500, + "_note": "Data Parallelism: Each node runs independently, no cross-node communication needed" + }, + + "pre_scripts": [], + + "env_vars": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", + "VLLM_USE_MODELSCOPE": "False", + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + + "_comment_memory": "Higher GPU utilization for Data Parallelism (no PP overhead)", + "VLLM_KV_CACHE_SIZE": "0.8", + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + + "_comment_timeouts": "Reduced timeouts for faster failure detection in DP mode", + "NCCL_TIMEOUT": "300", + "VLLM_ENGINE_ITERATION_TIMEOUT_S": "120", + "RAY_health_check_timeout_ms": "30000", + "RAY_gcs_rpc_server_reconnect_timeout_s": "60", + + "_comment_nccl": "NCCL settings for within-node tensor parallelism", + "NCCL_DEBUG": "WARN", + "NCCL_DEBUG_SUBSYS": "INIT,NET", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1" + } +} + diff --git a/examples/slurm-configs/basic/07-sglang-single-node.json b/examples/slurm-configs/basic/07-sglang-single-node.json new file mode 100644 index 00000000..ad82947a --- /dev/null +++ b/examples/slurm-configs/basic/07-sglang-single-node.json @@ -0,0 +1,41 @@ +{ + "_comment": "SGLang Single Node Multi-GPU - Inference Configuration", + "_description": "SGLang inference with tensor parallelism on single node", + "_use_case": "High-throughput LLM inference on single node with multiple GPUs", + "_note": "SGLang uses tensor parallelism to split model across GPUs", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 4, + "time": "02:00:00", + "output_dir": "./slurm_output", + "exclusive": true + }, + + "distributed": { + "launcher": "sglang", + "nnodes": 1, + "nproc_per_node": 4 + }, + + "env_vars": { + "SGLANG_ALLOW_LONG_MAX_MODEL_LEN": "1", + "SGLANG_USE_MODELSCOPE": "False", + "SGLANG_ENABLE_FLASHINFER": "1", + "SGLANG_ENABLE_RADIX_CACHE": "1", + "SGLANG_RADIX_CACHE_SIZE": "0.9", + "SGLANG_LOGGING_LEVEL": "INFO", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "HSA_ENABLE_SDMA": "0", + "GPU_MAX_HW_QUEUES": "2", + "NCCL_DEBUG": "WARN", + "NCCL_MIN_NCHANNELS": "16", + "RAY_DEDUP_LOGS": "1", + "RAY_BACKEND_LOG_LEVEL": "warning" + } +} + diff --git a/examples/slurm-configs/basic/08-sglang-multi-node.json b/examples/slurm-configs/basic/08-sglang-multi-node.json new file mode 100644 index 00000000..7519b513 --- /dev/null +++ b/examples/slurm-configs/basic/08-sglang-multi-node.json @@ -0,0 +1,47 @@ +{ + "_comment": "SGLang Multi-Node Multi-GPU - Distributed Inference Configuration", + "_description": "SGLang inference with tensor + data parallelism across nodes", + "_use_case": "High-throughput LLM inference requiring multiple nodes", + "_note": "SGLang uses tensor parallelism within nodes and data parallelism across nodes", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 2, + "gpus_per_node": 4, + "time": "04:00:00", + "output_dir": "./slurm_output", + "exclusive": true + }, + + "distributed": { + "launcher": "sglang", + "nnodes": 2, + "nproc_per_node": 4, + "backend": "nccl", + "port": 29500 + }, + + "env_vars": { + "SGLANG_ALLOW_LONG_MAX_MODEL_LEN": "1", + "SGLANG_USE_MODELSCOPE": "False", + "SGLANG_ENABLE_FLASHINFER": "1", + "SGLANG_ENABLE_RADIX_CACHE": "1", + "SGLANG_RADIX_CACHE_SIZE": "0.9", + "SGLANG_LOGGING_LEVEL": "INFO", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "HSA_ENABLE_SDMA": "0", + "GPU_MAX_HW_QUEUES": "2", + "NCCL_DEBUG": "WARN", + "NCCL_DEBUG_SUBSYS": "INIT,NET", + "NCCL_MIN_NCHANNELS": "16", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "RAY_DEDUP_LOGS": "1", + "RAY_BACKEND_LOG_LEVEL": "warning" + } +} + diff --git a/examples/slurm-configs/basic/09-megatron-lm-multi-node.json b/examples/slurm-configs/basic/09-megatron-lm-multi-node.json new file mode 100644 index 00000000..84e3c3f6 --- /dev/null +++ b/examples/slurm-configs/basic/09-megatron-lm-multi-node.json @@ -0,0 +1,34 @@ +{ + "_comment": "Megatron-LM Multi-Node Training Configuration", + "_description": "Large-scale transformer training with Megatron-LM on SLURM", + "_use_case": "Multi-node Megatron-LM training with tensor and pipeline parallelism", + "_reference": "https://github.com/NVIDIA/Megatron-LM", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "gpu", + "account": "research", + "nodes": 4, + "gpus_per_node": 8, + "time": "24:00:00", + "mem": "256G" + }, + + "distributed": { + "launcher": "megatron", + "nnodes": 4, + "nproc_per_node": 8, + "master_port": 29500 + }, + + "env_vars": { + "OMP_NUM_THREADS": "16", + "NCCL_DEBUG": "INFO", + "NCCL_IB_DISABLE": "0" + }, + + "debug": false +} + diff --git a/examples/slurm-configs/basic/cluster-amd-rccl.json b/examples/slurm-configs/basic/cluster-amd-rccl.json new file mode 100644 index 00000000..e15c0a6f --- /dev/null +++ b/examples/slurm-configs/basic/cluster-amd-rccl.json @@ -0,0 +1,41 @@ +{ + "_comment": "AMD RCCL Cluster Configuration (useocpslog-002)", + "_description": "Configuration for the AMD RCCL cluster with correct partition names", + "_cluster_info": { + "hostname": "useocpslog-002", + "partition": "amd-rccl (NOT 'gpu')", + "default_account": "amd-rccl", + "default_qos": "normal", + "discovery_command": "sinfo -o '%P %.5a %.10l %.6D %.6t %N %G'" + }, + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "account": "amd-rccl", + "qos": "normal", + "nodes": 1, + "gpus_per_node": 8, + "time": "12:00:00", + "output_dir": "./slurm_output", + "exclusive": true + }, + + "distributed": { + "backend": "nccl", + "port": 29500 + }, + + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_SOCKET_IFNAME": "eth0", + "OMP_NUM_THREADS": "8", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen" + }, + + "debug": false +} + diff --git a/examples/slurm-configs/basic/sglang-disagg-custom-split.json b/examples/slurm-configs/basic/sglang-disagg-custom-split.json new file mode 100644 index 00000000..291a5938 --- /dev/null +++ b/examples/slurm-configs/basic/sglang-disagg-custom-split.json @@ -0,0 +1,59 @@ +{ + "_comment": "SGLang Disaggregated SLURM Config - Custom Prefill/Decode Split", + "_description": "7 nodes with custom split: 1 proxy + 4 prefill + 2 decode", + "_use_case": "Workload with long prompts requiring more prefill capacity", + "_architecture": { + "proxy": "Node 0 (Load Balancer)", + "prefill": "Nodes 1-4 (4 nodes, 57% - custom)", + "decode": "Nodes 5-6 (2 nodes, 29% - custom)", + "total": "7 nodes total", + "default_would_be": "2 prefill + 4 decode (2/4 split)", + "custom_override": "4 prefill + 2 decode (4/2 split)" + }, + "_note": "Custom split allows optimization for prompt-heavy workloads", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 7, + "gpus_per_node": 8, + "time": "04:00:00", + "output_dir": "./slurm_output", + "exclusive": true + }, + + "distributed": { + "launcher": "sglang-disagg", + "nnodes": 7, + "nproc_per_node": 8, + "backend": "nccl", + "port": 29500, + "sglang_disagg": { + "prefill_nodes": 4, + "decode_nodes": 2 + } + }, + + "env_vars": { + "SGLANG_ALLOW_LONG_MAX_MODEL_LEN": "1", + "SGLANG_USE_MODELSCOPE": "False", + "SGLANG_ENABLE_RADIX_CACHE": "1", + "SGLANG_RADIX_CACHE_SIZE": "0.9", + "SGLANG_LOGGING_LEVEL": "INFO", + "SGLANG_DISAGG_TRANSFER_BACKEND": "mooncake", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "HSA_ENABLE_SDMA": "0", + "GPU_MAX_HW_QUEUES": "2", + "NCCL_DEBUG": "WARN", + "NCCL_MIN_NCHANNELS": "16", + "NCCL_IB_DISABLE": "0", + "NCCL_IB_HCA": "mlx5_0", + "NCCL_SOCKET_IFNAME": "ib0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "RAY_DEDUP_LOGS": "1", + "RAY_BACKEND_LOG_LEVEL": "warning" + } +} + diff --git a/examples/slurm-configs/basic/sglang-disagg-multi-node.json b/examples/slurm-configs/basic/sglang-disagg-multi-node.json new file mode 100644 index 00000000..0c5ec00d --- /dev/null +++ b/examples/slurm-configs/basic/sglang-disagg-multi-node.json @@ -0,0 +1,56 @@ +{ + "_comment": "SGLang Disaggregated Multi-Node - Distributed Inference Configuration", + "_description": "SGLang disaggregated inference with specialized prefill/decode clusters", + "_use_case": "Large-scale LLM inference requiring disaggregated prefill/decode", + "_architecture": { + "proxy": "Node 0 (Load Balancer)", + "prefill": "Nodes 1-2 (2 nodes, ~40% of workers)", + "decode": "Nodes 3-4 (2 nodes, ~60% of workers)", + "total": "5 nodes total", + "tensor_parallel": "8 GPUs per node" + }, + "_note": "SGLang Disaggregated separates prefill and decode into specialized clusters connected via Mooncake", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 5, + "gpus_per_node": 8, + "time": "04:00:00", + "output_dir": "./slurm_output", + "exclusive": true + }, + + "distributed": { + "launcher": "sglang-disagg", + "nnodes": 5, + "nproc_per_node": 8, + "backend": "nccl", + "port": 29500 + }, + + "env_vars": { + "SGLANG_ALLOW_LONG_MAX_MODEL_LEN": "1", + "SGLANG_USE_MODELSCOPE": "False", + "SGLANG_ENABLE_RADIX_CACHE": "1", + "SGLANG_RADIX_CACHE_SIZE": "0.9", + "SGLANG_LOGGING_LEVEL": "INFO", + "SGLANG_DISAGG_TRANSFER_BACKEND": "mooncake", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "HSA_ENABLE_SDMA": "0", + "GPU_MAX_HW_QUEUES": "2", + "NCCL_DEBUG": "WARN", + "NCCL_DEBUG_SUBSYS": "INIT,NET", + "NCCL_MIN_NCHANNELS": "16", + "NCCL_IB_DISABLE": "0", + "NCCL_IB_HCA": "mlx5_0", + "NCCL_SOCKET_IFNAME": "ib0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "RAY_DEDUP_LOGS": "1", + "RAY_BACKEND_LOG_LEVEL": "warning", + "MOONCAKE_TEST_MODE": "0" + } +} + diff --git a/examples/slurm-configs/minimal/deepspeed-minimal.json b/examples/slurm-configs/minimal/deepspeed-minimal.json new file mode 100644 index 00000000..ae105389 --- /dev/null +++ b/examples/slurm-configs/minimal/deepspeed-minimal.json @@ -0,0 +1,25 @@ +{ + "_comment": "DeepSpeed Config - Uses deepspeed launcher", + "_description": "DeepSpeed with ZeRO-1 optimization", + "_use_case": "Test DeepSpeed distributed training on SLURM (training-specific launcher)", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 4, + "time": "02:00:00" + }, + + "distributed": { + "launcher": "deepspeed", + "nnodes": 1, + "nproc_per_node": 4 + }, + + "env_vars": { + "DEEPSPEED_LAUNCHER": "deepspeed" + } +} diff --git a/examples/slurm-configs/minimal/megatron-lm-minimal.json b/examples/slurm-configs/minimal/megatron-lm-minimal.json new file mode 100644 index 00000000..9480359e --- /dev/null +++ b/examples/slurm-configs/minimal/megatron-lm-minimal.json @@ -0,0 +1,25 @@ +{ + "_comment": "Megatron-LM Minimal Config - Dedicated launcher support", + "_description": "Megatron-LM with automated tensor/pipeline parallelism setup", + "_use_case": "Large-scale transformer training with Megatron-LM on SLURM", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 2, + "time": "02:00:00" + }, + + "distributed": { + "launcher": "megatron", + "nnodes": 1, + "nproc_per_node": 2 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8" + } +} diff --git a/examples/slurm-configs/minimal/sglang-disagg-minimal.json b/examples/slurm-configs/minimal/sglang-disagg-minimal.json new file mode 100644 index 00000000..ee4ad9f2 --- /dev/null +++ b/examples/slurm-configs/minimal/sglang-disagg-minimal.json @@ -0,0 +1,22 @@ +{ + "_comment": "Minimal SGLang Disaggregated configuration - 3 nodes minimum", + "_description": "SGLang disaggregated inference with 3 nodes (1 proxy + 1 prefill + 1 decode)", + "_architecture": "Node 0: Proxy, Node 1: Prefill, Node 2: Decode", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "gpu", + "nodes": 3, + "gpus_per_node": 1, + "time": "02:00:00" + }, + + "distributed": { + "launcher": "sglang-disagg", + "nnodes": 3, + "nproc_per_node": 1 + } +} + diff --git a/examples/slurm-configs/minimal/sglang-multi-node-minimal.json b/examples/slurm-configs/minimal/sglang-multi-node-minimal.json new file mode 100644 index 00000000..057b5004 --- /dev/null +++ b/examples/slurm-configs/minimal/sglang-multi-node-minimal.json @@ -0,0 +1,21 @@ +{ + "_comment": "Minimal SGLang multi-node configuration", + "_description": "SGLang inference with 2 nodes, 4 GPUs per node", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 2, + "gpus_per_node": 4, + "time": "04:00:00" + }, + + "distributed": { + "launcher": "sglang", + "nnodes": 2, + "nproc_per_node": 4 + } +} + diff --git a/examples/slurm-configs/minimal/sglang-single-node-minimal.json b/examples/slurm-configs/minimal/sglang-single-node-minimal.json new file mode 100644 index 00000000..7e2eae97 --- /dev/null +++ b/examples/slurm-configs/minimal/sglang-single-node-minimal.json @@ -0,0 +1,21 @@ +{ + "_comment": "Minimal SGLang single-node configuration", + "_description": "SGLang inference with 4 GPUs tensor parallelism", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 4, + "time": "02:00:00" + }, + + "distributed": { + "launcher": "sglang", + "nnodes": 1, + "nproc_per_node": 4 + } +} + diff --git a/examples/slurm-configs/minimal/torchrun-multi-gpu-minimal.json b/examples/slurm-configs/minimal/torchrun-multi-gpu-minimal.json new file mode 100644 index 00000000..c8479d58 --- /dev/null +++ b/examples/slurm-configs/minimal/torchrun-multi-gpu-minimal.json @@ -0,0 +1,17 @@ +{ + "_comment": "Minimal multi-GPU SLURM configuration (8 GPUs, single node)", + "_note": "Using 'amd-rccl' partition (default for this cluster)", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "gpus_per_node": 8, + "time": "12:00:00" + }, + "distributed": { + "launcher": "torchrun" + } +} + diff --git a/examples/slurm-configs/minimal/torchrun-multi-node-minimal.json b/examples/slurm-configs/minimal/torchrun-multi-node-minimal.json new file mode 100644 index 00000000..e00262bf --- /dev/null +++ b/examples/slurm-configs/minimal/torchrun-multi-node-minimal.json @@ -0,0 +1,18 @@ +{ + "_comment": "Minimal multi-node SLURM configuration (2 nodes x 8 GPUs)", + "_note": "Using 'amd-rccl' partition (default for this cluster)", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 2, + "gpus_per_node": 8, + "time": "24:00:00" + }, + "distributed": { + "launcher": "torchrun" + } +} + diff --git a/examples/slurm-configs/minimal/torchrun-single-gpu-minimal.json b/examples/slurm-configs/minimal/torchrun-single-gpu-minimal.json new file mode 100644 index 00000000..4151f94a --- /dev/null +++ b/examples/slurm-configs/minimal/torchrun-single-gpu-minimal.json @@ -0,0 +1,14 @@ +{ + "_comment": "Minimal single GPU SLURM configuration", + "_note": "Using 'amd-rccl' partition (default for this cluster)", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "gpus_per_node": 1, + "time": "01:00:00" + } +} + diff --git a/examples/slurm-configs/minimal/torchtitan-multi-node-minimal.json b/examples/slurm-configs/minimal/torchtitan-multi-node-minimal.json new file mode 100644 index 00000000..0b227a99 --- /dev/null +++ b/examples/slurm-configs/minimal/torchtitan-multi-node-minimal.json @@ -0,0 +1,24 @@ +{ + "_comment": "TorchTitan multi-node SLURM configuration (4 nodes x 8 GPUs)", + "_description": "Llama 3.1 70B pre-training with TP + PP + FSDP2", + "_reference": "https://github.com/pytorch/torchtitan", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 4, + "gpus_per_node": 8, + "time": "72:00:00", + "mem": "512G", + "constraint": "MI300X" + }, + + "distributed": { + "launcher": "torchtitan", + "nnodes": 4, + "nproc_per_node": 8 + } +} + diff --git a/examples/slurm-configs/minimal/torchtitan-single-node-minimal.json b/examples/slurm-configs/minimal/torchtitan-single-node-minimal.json new file mode 100644 index 00000000..4b7f532a --- /dev/null +++ b/examples/slurm-configs/minimal/torchtitan-single-node-minimal.json @@ -0,0 +1,21 @@ +{ + "_comment": "Minimal TorchTitan SLURM configuration (1 node x 8 GPUs)", + "_description": "Llama 3.1 8B pre-training with Tensor Parallelism", + "_reference": "https://github.com/pytorch/torchtitan", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 8, + "time": "24:00:00", + "mem": "256G" + }, + + "distributed": { + "launcher": "torchtitan" + } +} + diff --git a/examples/slurm-configs/minimal/vllm-multi-node-minimal.json b/examples/slurm-configs/minimal/vllm-multi-node-minimal.json new file mode 100644 index 00000000..0a77b5ea --- /dev/null +++ b/examples/slurm-configs/minimal/vllm-multi-node-minimal.json @@ -0,0 +1,33 @@ +{ + "_comment": "Minimal vLLM multi-node configuration", + "_description": "vLLM inference with 2 nodes, 4 GPUs per node", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 2, + "gpus_per_node": 4, + "time": "04:00:00", + "enable_node_check": true, + "auto_cleanup_nodes": false + }, + + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4 + }, + + "env_vars": { + "VLLM_KV_CACHE_SIZE": "0.5", + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", + "NCCL_TIMEOUT": "600", + "VLLM_ENGINE_ITERATION_TIMEOUT_S": "180", + "RAY_health_check_timeout_ms": "60000" + }, + + "pre_scripts": [] +} + diff --git a/examples/slurm-configs/minimal/vllm-single-node-minimal.json b/examples/slurm-configs/minimal/vllm-single-node-minimal.json new file mode 100644 index 00000000..14c9b843 --- /dev/null +++ b/examples/slurm-configs/minimal/vllm-single-node-minimal.json @@ -0,0 +1,28 @@ +{ + "_comment": "Minimal vLLM single-node configuration", + "_description": "vLLM inference with 4 GPUs tensor parallelism", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 4, + "time": "02:00:00" + }, + + "distributed": { + "launcher": "vllm", + "nnodes": 1, + "nproc_per_node": 4 + }, + + "env_vars": { + "VLLM_KV_CACHE_SIZE": "0.7", + "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True" + }, + + "pre_scripts": [] +} + diff --git a/pyproject.toml b/pyproject.toml index 00e9011d..5e2e2ff5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,13 +18,16 @@ dependencies = [ "sqlalchemy", "setuptools-rust", "paramiko", - "mysql-connector-python", - "pymysql", "tqdm", "pytest", "typing-extensions", "pymongo", "toml", + "typer>=0.9.0", + "rich>=13.0.0", + "click>=8.0.0", + "jinja2>=3.0.0", + "pyyaml>=6.0", ] classifiers = [ "Programming Language :: Python :: 3", @@ -33,13 +36,13 @@ classifiers = [ ] [project.scripts] -madengine = "madengine.mad:main" +madengine = "madengine.cli.app:cli_main" [project.urls] Homepage = "https://github.com/ROCm/madengine" Issues = "https://github.com/ROCm/madengine/issues" -[project.extras] +[project.optional-dependencies] dev = [ "pytest", "pytest-cov", @@ -47,19 +50,45 @@ dev = [ "pytest-timeout", "pytest-mock", "pytest-asyncio", + "black>=21.0.0", + "flake8", + "mypy>=0.910", + "isort", + "pre-commit", +] +# Optional dependencies for distributed deployments +# Note: SLURM requires no additional dependencies (uses CLI commands) +kubernetes = [ + "kubernetes>=28.0.0", +] +# Complete development environment (dev + kubernetes deployment) +all = [ + "kubernetes>=28.0.0", + "pytest", + "pytest-cov", + "pytest-xdist", + "pytest-timeout", + "pytest-mock", + "pytest-asyncio", + "black>=21.0.0", + "flake8", + "mypy>=0.910", + "isort", + "pre-commit", ] [tool.hatch.build.targets.wheel] [tool.hatch.build.targets.wheel.force-include] "src/madengine/scripts" = "madengine/scripts" +"src/madengine/deployment/templates" = "madengine/deployment/templates" [tool.hatch.version] source = "versioningit" [tool.versioningit.vcs] method = "git" -default-tag = "v1.0.0" +default-tag = "v2.0.0" [tool.versioningit.tag2version] regex = "v(?P.*)" @@ -68,3 +97,86 @@ regex = "v(?P.*)" distance = "{base_version}.post{distance}+{vcs}{rev}" dirty = "{base_version}+d{build_date:%Y%m%d}" distance-dirty = "{base_version}.post{distance}+{vcs}{rev}.d{build_date:%Y%m%d}" + +# Code formatting and linting configuration +[tool.black] +line-length = 88 +target-version = ['py38', 'py39', 'py310', 'py311'] +include = '\.pyi?$' +extend-exclude = ''' +/( + # directories + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | build + | dist +)/ +''' + +[tool.isort] +profile = "black" +multi_line_output = 3 +line_length = 88 +known_first_party = ["madengine"] +known_third_party = ["pytest", "pandas", "numpy", "sqlalchemy"] + +[tool.mypy] +python_version = "3.8" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +disallow_incomplete_defs = false +check_untyped_defs = true +disallow_untyped_decorators = false +no_implicit_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +warn_no_return = true +warn_unreachable = true +strict_equality = true + +[[tool.mypy.overrides]] +module = [ + "paramiko.*", + "pymongo.*", + "toml.*", + "jsondiff.*", + "git.*", + "kubernetes.*", +] +ignore_missing_imports = true + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_paths = ["src"] +addopts = "-v --tb=short" +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "integration: marks tests as integration tests", +] + +[tool.coverage.run] +source = ["src/madengine"] +omit = [ + "*/tests/*", + "*/test_*", + "*/__pycache__/*", +] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "if self.debug:", + "if settings.DEBUG", + "raise AssertionError", + "raise NotImplementedError", + "if 0:", + "if __name__ == .__main__.:", + "class .*\\bProtocol\\):", + "@(abc\\.)?abstractmethod", +] diff --git a/pytest.ini b/pytest.ini index 3a5aa078..32821037 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,85 @@ [pytest] -testpaths = tests -pythonpath = src \ No newline at end of file +# Pytest configuration for madengine + +# Test discovery +python_files = test_*.py +python_classes = Test* +python_functions = test_* +testpaths = tests/unit tests/integration tests/e2e + +# Output and reporting +addopts = + # Verbose output + -v + # Show local variables in tracebacks + --tb=short + # Show summary of all test outcomes + -ra + # Strict markers (fail on unknown markers) + --strict-markers + # Show warnings + -W default + # Coverage (if pytest-cov is installed) + # --cov=src/madengine + # --cov-report=term-missing + # --cov-report=html + +# Markers for test categorization +markers = + unit: Fast unit tests (no external dependencies) + integration: Integration tests (may be slower, test multiple components) + e2e: End-to-end tests (require full environment, Docker, may be very slow) + slow: Slow tests (can be skipped with -m "not slow") + gpu: Tests that require GPU hardware + amd: Tests specific to AMD GPUs + nvidia: Tests specific to NVIDIA GPUs + cpu: Tests for CPU-only execution + requires_docker: Tests that require Docker daemon + requires_models: Tests that require model fixtures + +# Test execution +# Skip slow tests by default (run with --runslow to include them) +# To run only unit tests: pytest -m unit +# To run integration tests: pytest -m integration +# To exclude GPU tests: pytest -m "not gpu" +# To run AMD-specific tests: pytest -m amd + +# Logging +log_cli = false +log_cli_level = INFO +log_cli_format = %(asctime)s [%(levelname)8s] %(message)s +log_cli_date_format = %Y-%m-%d %H:%M:%S + +# Test timeouts (requires pytest-timeout) +# timeout = 300 +# timeout_method = thread + +# Warnings +filterwarnings = + # Treat warnings as errors (strict mode) + # error + # Ignore specific warnings + ignore::DeprecationWarning + ignore::PendingDeprecationWarning + +# Minimum Python version +minversion = 3.8 + +# Coverage options (requires pytest-cov) +[coverage:run] +source = src/madengine +omit = + */tests/* + */test_*.py + */__pycache__/* + */site-packages/* + +[coverage:report] +exclude_lines = + pragma: no cover + def __repr__ + raise AssertionError + raise NotImplementedError + if __name__ == .__main__.: + if TYPE_CHECKING: + @abstractmethod diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..dab8c8c4 --- /dev/null +++ b/setup.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python3 +""" +Simplified setup.py for madengine + +This setup.py provides compatibility with environments that require traditional +setup.py installations while reading configuration from pyproject.toml. + +For modern installations, prefer: + pip install . + python -m build + pip install -e .[dev] + +For legacy compatibility: + python setup.py install + python setup.py develop + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import sys +from pathlib import Path + +try: + from setuptools import setup, find_packages +except ImportError: + print("setuptools is required for setup.py") + print("Install it using: pip install setuptools") + sys.exit(1) + +def read_readme(readme_file="README.md"): + """Read README.md file for long description.""" + readme_path = Path(__file__).parent / readme_file + if readme_path.exists(): + with open(readme_path, "r", encoding="utf-8") as f: + return f.read() + + # Fallback to README.md if specified file doesn't exist + fallback_path = Path(__file__).parent / "README.md" + if fallback_path.exists() and readme_file != "README.md": + with open(fallback_path, "r", encoding="utf-8") as f: + return f.read() + + return "" + +def get_config_from_pyproject(): + """Read configuration from pyproject.toml.""" + try: + import tomllib + except ImportError: + try: + import tomli as tomllib + except ImportError: + try: + import toml as tomllib_alt + def load(f): + if hasattr(f, 'read'): + content = f.read() + if isinstance(content, bytes): + content = content.decode('utf-8') + return tomllib_alt.loads(content) + else: + return tomllib_alt.load(f) + tomllib.load = load + except ImportError: + print("Warning: No TOML library found. Using fallback configuration.") + return get_fallback_config() + + pyproject_path = Path(__file__).parent / "pyproject.toml" + if not pyproject_path.exists(): + print("Warning: pyproject.toml not found. Using fallback configuration.") + return get_fallback_config() + + try: + with open(pyproject_path, "rb") as f: + data = tomllib.load(f) + + project = data.get("project", {}) + + # Extract configuration + config = { + "name": project.get("name", "madengine"), + "description": project.get("description", "MAD Engine"), + "authors": project.get("authors", []), + "dependencies": project.get("dependencies", []), + "optional_dependencies": project.get("optional-dependencies", {}), + "requires_python": project.get("requires-python", ">=3.8"), + "classifiers": project.get("classifiers", []), + "urls": project.get("urls", {}), + "scripts": project.get("scripts", {}), + "readme": project.get("readme", "README.md"), + } + + return config + + except Exception as e: + print(f"Warning: Could not read pyproject.toml: {e}") + return get_fallback_config() + +def get_fallback_config(): + """Fallback configuration if pyproject.toml cannot be read.""" + return { + "name": "madengine", + "description": "MAD Engine is a set of interfaces to run various AI models from public MAD.", + "authors": [{"name": "Advanced Micro Devices", "email": "mad.support@amd.com"}], + "dependencies": [ + "pandas", "GitPython", "jsondiff", "sqlalchemy", "setuptools-rust", + "paramiko", "tqdm", "pytest", + "typing-extensions", "pymongo", "toml", + ], + "optional_dependencies": { + "dev": [ + "pytest", "pytest-cov", "pytest-xdist", "pytest-timeout", + "pytest-mock", "pytest-asyncio", + ] + }, + "requires_python": ">=3.8", + "classifiers": [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + "urls": { + "Homepage": "https://github.com/ROCm/madengine", + "Issues": "https://github.com/ROCm/madengine/issues", + }, + "scripts": { + "madengine": "madengine.cli.app:cli_main" + }, + } + +def get_version(): + """Get version from git tags or fallback to a default.""" + try: + import subprocess + import re + + # Try to get version from git describe first (more accurate) + try: + result = subprocess.run( + ["git", "describe", "--tags", "--dirty", "--always", "--long"], + capture_output=True, text=True, timeout=10, cwd=Path(__file__).parent + ) + if result.returncode == 0: + version_str = result.stdout.strip() + + # Handle case where there are no tags yet + if not version_str or len(version_str.split('-')) < 3: + # Try to get just the commit hash + result = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, text=True, timeout=10, cwd=Path(__file__).parent + ) + if result.returncode == 0: + commit = result.stdout.strip() + # Check if dirty + dirty_result = subprocess.run( + ["git", "diff-index", "--quiet", "HEAD", "--"], + capture_output=True, cwd=Path(__file__).parent + ) + is_dirty = dirty_result.returncode != 0 + if is_dirty: + return f"2.0.0.dev0+g{commit}.dirty" + else: + return f"2.0.0.dev0+g{commit}" + + # Clean up the version string to be PEP 440 compliant + if version_str.startswith('v'): + version_str = version_str[1:] + + # Handle patterns like "1.0.0-5-g1234567" or "1.0.0-5-g1234567-dirty" + match = re.match(r'^([^-]+)-(\d+)-g([a-f0-9]+)(-dirty)?$', version_str) + if match: + base_version, distance, commit, dirty = match.groups() + if distance == "0": + # Exact tag match + if dirty: + return f"{base_version}+dirty" + else: + return base_version + else: + # Post-release version + version_str = f"{base_version}.post{distance}+g{commit}" + if dirty: + version_str += ".dirty" + return version_str + + # Handle case where we just have a commit hash (no tags) + if re.match(r'^[a-f0-9]+(-dirty)?$', version_str): + clean_hash = version_str.replace('-dirty', '') + if '-dirty' in version_str: + return f"2.0.0.dev0+g{clean_hash}.dirty" + else: + return f"2.0.0.dev0+g{clean_hash}" + + return version_str + + except (subprocess.SubprocessError, FileNotFoundError): + pass + + # Fallback to short commit hash + result = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + capture_output=True, text=True, timeout=10, cwd=Path(__file__).parent + ) + if result.returncode == 0: + commit = result.stdout.strip() + return f"2.0.0.dev0+g{commit}" + + except Exception: + pass + + # Final fallback + return "2.0.0.dev0" + +def main(): + """Main setup function.""" + try: + config = get_config_from_pyproject() + + # Extract author information + authors = config.get("authors", []) + if authors: + author_name = authors[0].get("name", "Advanced Micro Devices") + author_email = authors[0].get("email", "mad.support@amd.com") + else: + author_name = "Advanced Micro Devices" + author_email = "mad.support@amd.com" + + # Extract scripts/entry points + scripts = config.get("scripts", {}) + entry_points = {"console_scripts": []} + for script_name, module_path in scripts.items(): + entry_points["console_scripts"].append(f"{script_name}={module_path}") + + # Find all packages + packages = find_packages(where="src") + if not packages: + print("Warning: No packages found in src/ directory") + # Fallback: look for madengine package specifically + import os + src_path = Path(__file__).parent / "src" + if (src_path / "madengine").exists(): + packages = ["madengine"] + [ + f"madengine.{name}" for name in find_packages(where="src/madengine") + ] + + # Setup package data to include scripts + package_data = {"madengine": ["scripts/**/*"]} + + # Check if scripts directory exists and add patterns accordingly + scripts_path = Path(__file__).parent / "src" / "madengine" / "scripts" + if scripts_path.exists(): + # Add more specific patterns to ensure all script files are included + package_data["madengine"].extend([ + "scripts/*", + "scripts/*/*", + "scripts/*/*/*", + "scripts/*/*/*/*", + ]) + + # Get version + version = get_version() + + # Setup configuration + setup_kwargs = { + "name": config["name"], + "version": version, + "author": author_name, + "author_email": author_email, + "description": config["description"], + "long_description": read_readme(config.get("readme", "README.md")), + "long_description_content_type": "text/markdown", + "url": config["urls"].get("Homepage", "https://github.com/ROCm/madengine"), + "project_urls": config["urls"], + "package_dir": {"": "src"}, + "packages": packages, + "install_requires": config["dependencies"], + "extras_require": config["optional_dependencies"], + "python_requires": config["requires_python"], + "entry_points": entry_points if entry_points["console_scripts"] else None, + "classifiers": config["classifiers"], + "include_package_data": True, + "package_data": package_data, + "zip_safe": False, + "platforms": ["any"], + } + + # Remove None values to avoid setuptools warnings + setup_kwargs = {k: v for k, v in setup_kwargs.items() if v is not None} + + # Print some info for debugging + if len(sys.argv) > 1 and any(arg in sys.argv for arg in ["--version", "--help", "--help-commands"]): + print(f"madengine version: {version}") + print(f"Found {len(packages)} packages") + if entry_points and entry_points["console_scripts"]: + print(f"Console scripts: {', '.join(entry_points['console_scripts'])}") + + setup(**setup_kwargs) + + except Exception as e: + print(f"Error during setup: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/src/madengine/__init__.py b/src/madengine/__init__.py index 8db410f6..f121d08e 100644 --- a/src/madengine/__init__.py +++ b/src/madengine/__init__.py @@ -1,26 +1,22 @@ """ -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -r''' -# What is MADEngine? +madengine - AI Models automation and dashboarding command-line tool. -An AI Models automation and dashboarding command-line tool to run LLMs and Deep Learning models locally or remotelly with CI. -The MADEngine library is to support AI automation having following features: +An AI Models automation and dashboarding command-line tool to run LLMs and Deep Learning +models locally or remotely with CI. The madengine library supports AI automation with: - AI Models run reliably on supported platforms and drive software quality -- Simple, minimalistic, out-of-the-box solution that enable confidence on hardware and software stack +- Simple, minimalistic, out-of-the-box solution that enables confidence on hardware and software stack - Real-time, audience-relevant AI Models performance metrics tracking, presented in clear, intuitive manner - Best-practices for handling internal projects and external open-source projects +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" +from importlib.metadata import version, PackageNotFoundError -.. include:: ../../docs/how-to-build.md -.. include:: ../../docs/how-to-quick-start.md -.. include:: ../../docs/how-to-provide-contexts.md -.. include:: ../../docs/how-to-profile-a-model.md -.. include:: ../../docs/how-to-collect-competitive-library-perf.md -.. include:: ../../docs/how-to-contribute.md - -''' -from importlib.metadata import version +try: + __version__ = version("madengine") +except PackageNotFoundError: + # Package is not installed, use a default version + __version__ = "dev" -__version__ = version("madengine") \ No newline at end of file +__all__ = ["__version__"] diff --git a/src/madengine/cli/__init__.py b/src/madengine/cli/__init__.py new file mode 100644 index 00000000..2ac185c2 --- /dev/null +++ b/src/madengine/cli/__init__.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +""" +CLI Package for madengine + +This package contains the modular CLI implementation split from the +monolithic mad_cli.py for better maintainability. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# Import for backward compatibility +from .app import app, cli_main +from .constants import ExitCode, VALID_GPU_VENDORS, VALID_GUEST_OS +from .constants import ( + DEFAULT_MANIFEST_FILE, + DEFAULT_PERF_OUTPUT, + DEFAULT_DATA_CONFIG, + DEFAULT_TOOLS_CONFIG, + DEFAULT_TIMEOUT, +) +from .utils import ( + setup_logging, + split_comma_separated_tags, + create_args_namespace, + save_summary_with_feedback, + display_results_table, + display_performance_table, +) +from .validators import ( + validate_additional_context, + process_batch_manifest, + process_batch_manifest_entries, +) + +__all__ = [ + "app", + "cli_main", + "ExitCode", + "VALID_GPU_VENDORS", + "VALID_GUEST_OS", + "DEFAULT_MANIFEST_FILE", + "DEFAULT_PERF_OUTPUT", + "DEFAULT_DATA_CONFIG", + "DEFAULT_TOOLS_CONFIG", + "DEFAULT_TIMEOUT", + "setup_logging", + "split_comma_separated_tags", + "create_args_namespace", + "save_summary_with_feedback", + "display_results_table", + "display_performance_table", + "validate_additional_context", + "process_batch_manifest", + "process_batch_manifest_entries", +] + diff --git a/src/madengine/cli/app.py b/src/madengine/cli/app.py new file mode 100644 index 00000000..b9ccfa15 --- /dev/null +++ b/src/madengine/cli/app.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +""" +Main CLI Application for madengine + +This module contains the main Typer app and entry point for the madengine CLI. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import sys + +import typer +from rich.traceback import install + +try: + from typing import Annotated # Python 3.9+ +except ImportError: + from typing_extensions import Annotated # Python 3.8 + +from .commands import build, run, discover, report_app, database +from .constants import ExitCode +from .utils import console + +# Install rich traceback handler for better error displays +install(show_locals=True) + +# Initialize the main Typer app +app = typer.Typer( + name="madengine", + help="🚀 madengine Distributed Orchestrator - Build and run AI models in distributed scenarios", + rich_markup_mode="rich", + add_completion=False, + no_args_is_help=True, +) + +# Register commands +app.command()(build) +app.command()(run) +app.command()(discover) +app.command()(database) +app.add_typer(report_app, name="report") + + +@app.callback(invoke_without_command=True) +def main( + ctx: typer.Context, + version: Annotated[ + bool, typer.Option("--version", help="Show version and exit") + ] = False, +) -> None: + """ + 🚀 madengine Distributed Orchestrator + + Modern CLI for building and running AI models in distributed scenarios. + Built with Typer and Rich for a beautiful, production-ready experience. + """ + if version: + # You might want to get the actual version from your package + console.print( + "🚀 [bold cyan]madengine[/bold cyan] version [green]2.0.0[/green]" + ) + raise typer.Exit() + + # If no command is provided, show help + if ctx.invoked_subcommand is None: + console.print(ctx.get_help()) + ctx.exit() + + +def cli_main() -> None: + """Entry point for the CLI application.""" + try: + app() + except KeyboardInterrupt: + console.print("\n🛑 [yellow]Operation cancelled by user[/yellow]") + sys.exit(ExitCode.FAILURE) + except Exception as e: + console.print(f"💥 [bold red]Unexpected error: {e}[/bold red]") + console.print_exception() + sys.exit(ExitCode.FAILURE) + + +if __name__ == "__main__": + cli_main() + diff --git a/src/madengine/cli/commands/__init__.py b/src/madengine/cli/commands/__init__.py new file mode 100644 index 00000000..f77b432e --- /dev/null +++ b/src/madengine/cli/commands/__init__.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +""" +CLI Commands Package for madengine + +This package contains individual command implementations split from mad_cli.py. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +from .build import build +from .run import run +from .discover import discover +from .report import report_app +from .database import database + +__all__ = ["build", "run", "discover", "report_app", "database"] + diff --git a/src/madengine/cli/commands/build.py b/src/madengine/cli/commands/build.py new file mode 100644 index 00000000..99166a47 --- /dev/null +++ b/src/madengine/cli/commands/build.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +""" +Build command for madengine CLI + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +from typing import List, Optional + +import typer +from rich.panel import Panel +from rich.progress import Progress, SpinnerColumn, TextColumn + +try: + from typing import Annotated # Python 3.9+ +except ImportError: + from typing_extensions import Annotated # Python 3.8 + +from madengine.orchestration.build_orchestrator import BuildOrchestrator +from madengine.core.errors import BuildError, ConfigurationError, DiscoveryError + +from ..constants import ExitCode, DEFAULT_MANIFEST_FILE +from ..utils import ( + console, + setup_logging, + split_comma_separated_tags, + create_args_namespace, + save_summary_with_feedback, + display_results_table, +) +from ..validators import validate_additional_context, process_batch_manifest, process_batch_manifest_entries + + +def build( + tags: Annotated[ + List[str], + typer.Option("--tags", "-t", help="Model tags to build (can specify multiple)"), + ] = [], + target_archs: Annotated[ + List[str], + typer.Option( + "--target-archs", + "-a", + help="Target GPU architectures to build for (e.g., gfx908,gfx90a,gfx942). If not specified, builds single image with MAD_SYSTEM_GPU_ARCHITECTURE from additional_context or detected GPU architecture." + ), + ] = [], + registry: Annotated[ + Optional[str], + typer.Option("--registry", "-r", help="Docker registry to push images to"), + ] = None, + batch_manifest: Annotated[ + Optional[str], + typer.Option( + "--batch-manifest", help="Input batch.json file for batch build mode" + ), + ] = None, + additional_context: Annotated[ + str, + typer.Option( + "--additional-context", "-c", help="Additional context as JSON string" + ), + ] = "{}", + additional_context_file: Annotated[ + Optional[str], + typer.Option( + "--additional-context-file", + "-f", + help="File containing additional context JSON", + ), + ] = None, + clean_docker_cache: Annotated[ + bool, + typer.Option("--clean-docker-cache", help="Rebuild images without using cache"), + ] = False, + manifest_output: Annotated[ + str, + typer.Option("--manifest-output", "-m", help="Output file for build manifest"), + ] = DEFAULT_MANIFEST_FILE, + summary_output: Annotated[ + Optional[str], + typer.Option( + "--summary-output", "-s", help="Output file for build summary JSON" + ), + ] = None, + live_output: Annotated[ + bool, typer.Option("--live-output", "-l", help="Print output in real-time") + ] = False, + verbose: Annotated[ + bool, typer.Option("--verbose", "-v", help="Enable verbose logging") + ] = False, +) -> None: + """ + 🔨 Build Docker images for models in distributed scenarios. + + This command builds Docker images for the specified model tags and optionally + pushes them to a registry. Additional context with gpu_vendor and guest_os + is required for build-only operations. + """ + setup_logging(verbose) + + # Process tags to handle comma-separated values + # Supports both: --tags dummy --tags multi AND --tags dummy,multi + processed_tags = split_comma_separated_tags(tags) + + # Validate mutually exclusive options + if batch_manifest and processed_tags: + console.print( + "❌ [bold red]Error: Cannot specify both --batch-manifest and --tags options[/bold red]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + # Process batch manifest if provided + batch_data = None + effective_tags = processed_tags + batch_build_metadata = None + + # There are 2 scenarios for batch builds and single builds + # - Batch builds: Use the batch manifest to determine which models to build + # - Single builds: Use the tags directly + if batch_manifest: + # Process the batch manifest + if verbose: + console.print(f"[DEBUG] Processing batch manifest: {batch_manifest}") + try: + batch_data = process_batch_manifest(batch_manifest) + if verbose: + console.print(f"[DEBUG] batch_data: {batch_data}") + + effective_tags = batch_data["build_tags"] + # Build a mapping of model_name -> registry_image/registry for build_new models + batch_build_metadata = {} + for model in batch_data["manifest_data"]: + if model.get("build_new", False): + batch_build_metadata[model["model_name"]] = { + "registry_image": model.get("registry_image"), + "registry": model.get("registry"), + } + if verbose: + console.print(f"[DEBUG] batch_build_metadata: {batch_build_metadata}") + + console.print( + Panel( + f"📦 [bold cyan]Batch Build Mode[/bold cyan]\n" + f"Input manifest: [yellow]{batch_manifest}[/yellow]\n" + f"Total models: [yellow]{len(batch_data['all_tags'])}[/yellow]\n" + f"Models to build: [yellow]{len(batch_data['build_tags'])}[/yellow] ({', '.join(batch_data['build_tags']) if batch_data['build_tags'] else 'none'})\n" + f"Registry: [yellow]{registry or 'Local only'}[/yellow]", + title="Batch Build Configuration", + border_style="blue", + ) + ) + except (FileNotFoundError, ValueError) as e: + console.print( + f"❌ [bold red]Error processing batch manifest: {e}[/bold red]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + else: + console.print( + Panel( + f"🔨 [bold cyan]Building Models[/bold cyan]\n" + f"Tags: [yellow]{', '.join(processed_tags) if processed_tags else 'All models'}[/yellow]\n" + f"Registry: [yellow]{registry or 'Local only'}[/yellow]", + title="Build Configuration", + border_style="blue", + ) + ) + + try: + # Validate additional context + validate_additional_context(additional_context, additional_context_file) + + # Create arguments object + args = create_args_namespace( + tags=effective_tags, + target_archs=target_archs, + registry=registry, + additional_context=additional_context, + additional_context_file=additional_context_file, + clean_docker_cache=clean_docker_cache, + manifest_output=manifest_output, + live_output=live_output, + verbose=verbose, + _separate_phases=True, + batch_build_metadata=batch_build_metadata if batch_build_metadata else None, + ) + + # Initialize orchestrator in build-only mode + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console, + ) as progress: + task = progress.add_task("Initializing build orchestrator...", total=None) + + # Use new BuildOrchestrator + orchestrator = BuildOrchestrator(args) + progress.update(task, description="Building models...") + + # Execute build workflow + manifest_file = orchestrator.execute( + registry=registry, + clean_cache=clean_docker_cache, + manifest_output=manifest_output, + batch_build_metadata=batch_build_metadata, + ) + + # Load build summary for display + with open(manifest_output, 'r') as f: + manifest = json.load(f) + build_summary = manifest.get("summary", {}) + + progress.update(task, description="Build completed!") + + # Handle batch manifest post-processing + if batch_data: + with console.status("Processing batch manifest..."): + additional_context_dict = getattr(args, "additional_context", None) + if isinstance(additional_context_dict, str): + additional_context_dict = json.loads(additional_context_dict) + guest_os = ( + additional_context_dict.get("guest_os") if additional_context_dict else None + ) + gpu_vendor = ( + additional_context_dict.get("gpu_vendor") if additional_context_dict else None + ) + process_batch_manifest_entries( + batch_data, manifest_output, registry, guest_os, gpu_vendor + ) + + # Display results + # Check if target_archs was used to show GPU architecture column + show_gpu_arch = bool(target_archs) + display_results_table(build_summary, "Build Results", show_gpu_arch) + + # Save summary + save_summary_with_feedback(build_summary, summary_output, "Build") + + # Check results and exit with appropriate code + failed_builds = len(build_summary.get("failed_builds", [])) + successful_builds = len(build_summary.get("successful_builds", [])) + + if failed_builds == 0: + console.print( + "🎉 [bold green]All builds completed successfully![/bold green]" + ) + raise typer.Exit(ExitCode.SUCCESS) + elif successful_builds > 0: + # Partial success + console.print( + f"⚠️ [bold yellow]Partial success: " + f"{successful_builds} built, {failed_builds} failed[/bold yellow]" + ) + console.print( + "💡 [dim]Successful builds are available in build_manifest.json[/dim]" + ) + raise typer.Exit(ExitCode.BUILD_FAILURE) # Non-zero exit for CI/CD + else: + # All failed + console.print( + f"💥 [bold red]All builds failed[/bold red]" + ) + raise typer.Exit(ExitCode.BUILD_FAILURE) + + except typer.Exit: + raise + except BuildError as e: + # Specific build error handling + console.print(f"💥 [bold red]Build error: {e}[/bold red]") + if hasattr(e, 'suggestions') and e.suggestions: + console.print("\n💡 [cyan]Suggestions:[/cyan]") + for suggestion in e.suggestions: + console.print(f" • {suggestion}") + raise typer.Exit(ExitCode.BUILD_FAILURE) + + except ConfigurationError as e: + # Configuration errors + console.print(f"⚙️ [bold red]Configuration error: {e}[/bold red]") + if hasattr(e, 'suggestions') and e.suggestions: + console.print("\n💡 [cyan]Suggestions:[/cyan]") + for suggestion in e.suggestions: + console.print(f" • {suggestion}") + raise typer.Exit(ExitCode.INVALID_ARGS) + + except DiscoveryError as e: + # Model discovery errors + console.print(f"🔍 [bold red]Discovery error: {e}[/bold red]") + console.print("💡 Check MODEL_DIR or models.json configuration") + raise typer.Exit(ExitCode.FAILURE) + + except KeyboardInterrupt: + console.print("\n🛑 [yellow]Build cancelled by user[/yellow]") + raise typer.Exit(ExitCode.FAILURE) + + except PermissionError as e: + console.print(f"🔒 [bold red]Permission denied: {e}[/bold red]") + console.print("💡 Check file/directory permissions or run with appropriate privileges") + raise typer.Exit(ExitCode.FAILURE) + + except FileNotFoundError as e: + console.print(f"📁 [bold red]File not found: {e}[/bold red]") + console.print("💡 Check that all required files exist") + raise typer.Exit(ExitCode.FAILURE) + + except Exception as e: + console.print(f"💥 [bold red]Unexpected error: {e}[/bold red]") + if verbose: + console.print_exception() + + from madengine.core.errors import handle_error, create_error_context + context = create_error_context( + operation="build", + phase="build", + component="build_command" + ) + handle_error(e, context=context) + raise typer.Exit(ExitCode.FAILURE) + diff --git a/src/madengine/cli/commands/database.py b/src/madengine/cli/commands/database.py new file mode 100644 index 00000000..8f804e06 --- /dev/null +++ b/src/madengine/cli/commands/database.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +""" +Database command for madengine CLI - MongoDB upload. + +Modern implementation with auto-detection and intelligent defaults. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import os +from pathlib import Path + +import typer +from rich.panel import Panel +from rich.console import Console + +try: + from typing import Annotated +except ImportError: + from typing_extensions import Annotated + +from madengine.database.mongodb import ( + upload_file_to_mongodb, + MongoDBConfig, + UploadOptions +) +from ..constants import ExitCode +from ..utils import setup_logging + +console = Console() + + +def database( + file: Annotated[ + str, + typer.Option( + "--file", "-f", + help="Path to file (CSV or JSON, auto-detected)" + ), + ], + database: Annotated[ + str, + typer.Option( + "--database", "--db", + help="MongoDB database name" + ), + ], + collection: Annotated[ + str, + typer.Option( + "--collection", "-c", + help="MongoDB collection name" + ), + ], + unique_key: Annotated[ + str, + typer.Option( + "--unique-key", "-k", + help="Unique field(s) for deduplication (comma-separated, auto-detected if not specified)" + ), + ] = None, + batch_size: Annotated[ + int, + typer.Option( + "--batch-size", + help="Batch size for bulk operations" + ), + ] = 1000, + no_upsert: Annotated[ + bool, + typer.Option( + "--no-upsert", + help="Insert only (don't update existing documents)" + ), + ] = False, + no_index: Annotated[ + bool, + typer.Option( + "--no-index", + help="Skip automatic index creation" + ), + ] = False, + dry_run: Annotated[ + bool, + typer.Option( + "--dry-run", + help="Validate without uploading" + ), + ] = False, + verbose: Annotated[ + bool, + typer.Option( + "--verbose", "-v", + help="Verbose output" + ), + ] = False, +) -> None: + """ + 💾 Upload CSV or JSON files to MongoDB. + + Supports intelligent type preservation, automatic deduplication, + and bulk operations for optimal performance. + + \b + Examples: + # Upload JSON with auto-detection + madengine database -f perf_entry_super.json --db mydb -c perf_super + + # Upload CSV with custom unique key + madengine database -f perf.csv --db test -c results -k model,timestamp + + # Dry run to validate + madengine database -f data.json --db test -c data --dry-run + + \b + Environment Variables: + MONGO_HOST MongoDB host (default: localhost) + MONGO_PORT MongoDB port (default: 27017) + MONGO_USER MongoDB username + MONGO_PASSWORD MongoDB password + """ + + setup_logging(verbose) + + # Display configuration + file_path = Path(file) + + console.print( + Panel( + f"💾 [bold cyan]MongoDB Upload[/bold cyan]\n\n" + f"File: [yellow]{file_path.name}[/yellow]\n" + f"Database: [yellow]{database}[/yellow]\n" + f"Collection: [yellow]{collection}[/yellow]\n" + f"Unique Key: [yellow]{unique_key or 'auto-detect'}[/yellow]\n" + f"Mode: [yellow]{'Dry Run' if dry_run else 'Upload'}[/yellow]", + border_style="cyan", + ) + ) + + # Validate file exists + if not file_path.exists(): + console.print(f"❌ [bold red]File not found: {file}[/bold red]") + raise typer.Exit(ExitCode.FAILURE) + + # Prepare configuration + config = MongoDBConfig.from_env() + + # Parse unique fields + unique_fields = None + if unique_key: + unique_fields = [k.strip() for k in unique_key.split(',')] + + # Prepare options + options = UploadOptions( + unique_fields=unique_fields, + upsert=not no_upsert, + batch_size=batch_size, + create_indexes=not no_index, + dry_run=dry_run + ) + + try: + # Perform upload + result = upload_file_to_mongodb( + file_path=str(file_path), + database_name=database, + collection_name=collection, + config=config, + options=options + ) + + # Display results + console.print() + result.print_summary() + + # Show errors if any + if result.errors and verbose: + console.print("\n⚠️ [yellow]Errors:[/yellow]") + for i, error in enumerate(result.errors[:10], 1): + console.print(f" {i}. {error}") + if len(result.errors) > 10: + console.print(f" ... and {len(result.errors) - 10} more errors") + + # Exit with appropriate code + if result.status == "success": + raise typer.Exit(ExitCode.SUCCESS) + elif result.status == "partial": + raise typer.Exit(ExitCode.SUCCESS if result.documents_inserted + result.documents_updated > 0 else ExitCode.FAILURE) + else: + raise typer.Exit(ExitCode.FAILURE) + + except typer.Exit: + # Re-raise typer.Exit without catching it + raise + except Exception as e: + console.print(f"\n💥 [bold red]Upload failed:[/bold red] {str(e)}") + if verbose: + console.print_exception() + raise typer.Exit(ExitCode.FAILURE) diff --git a/src/madengine/cli/commands/discover.py b/src/madengine/cli/commands/discover.py new file mode 100644 index 00000000..a0fc939c --- /dev/null +++ b/src/madengine/cli/commands/discover.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +""" +Discover command for madengine CLI + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +from typing import List + +import typer +from rich.panel import Panel + +try: + from typing import Annotated # Python 3.9+ +except ImportError: + from typing_extensions import Annotated # Python 3.8 + +from madengine.utils.discover_models import DiscoverModels + +from ..constants import ExitCode +from ..utils import console, setup_logging, split_comma_separated_tags, create_args_namespace + + +def discover( + tags: Annotated[ + List[str], + typer.Option("--tags", "-t", help="Model tags to discover (can specify multiple)"), + ] = [], + verbose: Annotated[ + bool, typer.Option("--verbose", "-v", help="Enable verbose logging") + ] = False, +) -> None: + """ + 🔍 Discover all models in the project. + + This command discovers all available models in the project based on the + specified tags. If no tags are provided, all models will be discovered. + """ + setup_logging(verbose) + + # Process tags to handle comma-separated values + processed_tags = split_comma_separated_tags(tags) + + console.print( + Panel( + f"🔍 [bold cyan]Discovering Models[/bold cyan]\n" + f"Tags: [yellow]{processed_tags if processed_tags else 'All models'}[/yellow]", + title="Model Discovery", + border_style="blue", + ) + ) + + try: + # Create args namespace similar to mad.py + args = create_args_namespace(tags=processed_tags) + + # Use DiscoverModels class + # Note: DiscoverModels prints output directly and returns None + discover_models_instance = DiscoverModels(args=args) + result = discover_models_instance.run() + + console.print("✅ [bold green]Model discovery completed successfully[/bold green]") + + except Exception as e: + console.print(f"💥 [bold red]Model discovery failed: {e}[/bold red]") + if verbose: + console.print_exception() + raise typer.Exit(ExitCode.FAILURE) + diff --git a/src/madengine/cli/commands/report.py b/src/madengine/cli/commands/report.py new file mode 100644 index 00000000..6abbb845 --- /dev/null +++ b/src/madengine/cli/commands/report.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +Report command for madengine CLI + +This module provides report generation commands including CSV to HTML +and CSV to email conversions. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import os +from pathlib import Path + +import typer +from rich.panel import Panel + +try: + from typing import Annotated # Python 3.9+ +except ImportError: + from typing_extensions import Annotated # Python 3.8 + +from madengine.reporting.csv_to_html import ConvertCsvToHtml +from madengine.reporting.csv_to_email import ConvertCsvToEmail + +from ..constants import ExitCode +from ..utils import console, setup_logging, create_args_namespace + + +# Create a sub-app for report commands +report_app = typer.Typer( + name="report", + help="📊 Generate reports from CSV files", + rich_markup_mode="rich", + no_args_is_help=True, +) + + +@report_app.command("to-html") +def to_html( + csv_file: Annotated[ + str, + typer.Option( + "--csv-file", + help="Path to the CSV file to convert to HTML" + ), + ], + verbose: Annotated[ + bool, typer.Option("--verbose", "-v", help="Enable verbose logging") + ] = False, +) -> None: + """ + 📄 Convert a single CSV file to HTML report. + + This command converts a CSV file to an HTML table format, + useful for viewing performance metrics in a web browser. + + Examples: + madengine report to-html --csv-file perf_amd.csv + madengine report to-html --csv-file results/perf_mi300.csv + """ + setup_logging(verbose) + + console.print( + Panel( + f"📄 [bold cyan]Converting CSV to HTML[/bold cyan]\n" + f"Input file: [yellow]{csv_file}[/yellow]", + title="CSV to HTML Report", + border_style="blue", + ) + ) + + # Validate input + if not os.path.exists(csv_file): + console.print(f"❌ [bold red]Error: CSV file not found: {csv_file}[/bold red]") + raise typer.Exit(ExitCode.FAILURE) + + if not os.path.isfile(csv_file): + console.print(f"❌ [bold red]Error: Path is not a file: {csv_file}[/bold red]") + raise typer.Exit(ExitCode.FAILURE) + + if not csv_file.endswith('.csv'): + console.print(f"❌ [bold red]Error: File must be a CSV file: {csv_file}[/bold red]") + raise typer.Exit(ExitCode.FAILURE) + + try: + # Create args namespace for compatibility with existing code + args = create_args_namespace(csv_file_path=csv_file) + + # Use ConvertCsvToHtml class + converter = ConvertCsvToHtml(args=args) + result = converter.run() + + if result: + # Determine output file name + output_file = str(Path(csv_file).with_suffix('.html')) + console.print(f"✅ [bold green]Successfully converted to: {output_file}[/bold green]") + else: + console.print("❌ [bold red]Conversion failed[/bold red]") + raise typer.Exit(ExitCode.FAILURE) + + except Exception as e: + console.print(f"💥 [bold red]Conversion failed: {e}[/bold red]") + if verbose: + console.print_exception() + raise typer.Exit(ExitCode.FAILURE) + + +@report_app.command("to-email") +def to_email( + directory: Annotated[ + str, + typer.Option( + "--directory", + "--dir", + help="Path to directory containing CSV files to consolidate" + ), + ] = ".", + output: Annotated[ + str, + typer.Option( + "--output", + "-o", + help="Output HTML filename" + ), + ] = "run_results.html", + verbose: Annotated[ + bool, typer.Option("--verbose", "-v", help="Enable verbose logging") + ] = False, +) -> None: + """ + 📧 Convert all CSV files in a directory to consolidated email-ready HTML report. + + This command scans a directory for CSV files and combines them into a single + HTML report with sections for each CSV file, suitable for email distribution. + + Examples: + madengine report to-email + madengine report to-email --directory ./results + madengine report to-email --dir ./results --output summary.html + """ + setup_logging(verbose) + + console.print( + Panel( + f"📧 [bold cyan]Converting CSV Files to Email Report[/bold cyan]\n" + f"Input directory: [yellow]{directory}[/yellow]\n" + f"Output file: [yellow]{output}[/yellow]", + title="CSV to Email Report", + border_style="blue", + ) + ) + + # Validate input + if not os.path.exists(directory): + console.print(f"❌ [bold red]Error: Directory not found: {directory}[/bold red]") + raise typer.Exit(ExitCode.FAILURE) + + if not os.path.isdir(directory): + console.print(f"❌ [bold red]Error: Path is not a directory: {directory}[/bold red]") + console.print(f"💡 [cyan]Tip: Use 'to-html' command for single CSV files[/cyan]") + raise typer.Exit(ExitCode.FAILURE) + + try: + # Create args namespace for compatibility with existing code + # The old code expects 'csv_file_path' to be the directory + args = create_args_namespace(csv_file_path=directory, output_file=output) + + # Use ConvertCsvToEmail class + converter = ConvertCsvToEmail(args=args) + result = converter.run() + + if result: + output_path = os.path.join(directory, output) if directory != "." else output + console.print(f"✅ [bold green]Successfully generated email report: {output_path}[/bold green]") + else: + console.print("⚠️ [yellow]No CSV files found to process[/yellow]") + + except Exception as e: + console.print(f"💥 [bold red]Report generation failed: {e}[/bold red]") + if verbose: + console.print_exception() + raise typer.Exit(ExitCode.FAILURE) + + +# Export the report app +def report() -> typer.Typer: + """Return the report sub-app.""" + return report_app + diff --git a/src/madengine/cli/commands/run.py b/src/madengine/cli/commands/run.py new file mode 100644 index 00000000..90fc16f8 --- /dev/null +++ b/src/madengine/cli/commands/run.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python3 +""" +Run command for madengine CLI + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import ast +import json +import os +from typing import List, Optional + +import typer +from rich.panel import Panel +from rich.progress import Progress, SpinnerColumn, TextColumn + +try: + from typing import Annotated # Python 3.9+ +except ImportError: + from typing_extensions import Annotated # Python 3.8 + +from madengine.orchestration.run_orchestrator import RunOrchestrator +from madengine.core.errors import ( + ConfigurationError, + RuntimeError as MADRuntimeError, +) + +from ..constants import ( + ExitCode, + DEFAULT_MANIFEST_FILE, + DEFAULT_PERF_OUTPUT, + DEFAULT_DATA_CONFIG, + DEFAULT_TOOLS_CONFIG, + DEFAULT_TIMEOUT, +) +from ..utils import ( + console, + setup_logging, + split_comma_separated_tags, + create_args_namespace, + save_summary_with_feedback, + display_results_table, + display_performance_table, +) + + +def run( + tags: Annotated[ + List[str], + typer.Option("--tags", "-t", help="Model tags to run (can specify multiple)"), + ] = [], + manifest_file: Annotated[ + str, typer.Option("--manifest-file", "-m", help="Build manifest file path") + ] = "", + registry: Annotated[ + Optional[str], typer.Option("--registry", "-r", help="Docker registry URL") + ] = None, + timeout: Annotated[ + int, + typer.Option( + "--timeout", + help="Timeout for model run in seconds (-1 for default, 0 for no timeout)", + ), + ] = DEFAULT_TIMEOUT, + additional_context: Annotated[ + str, + typer.Option( + "--additional-context", "-c", help="Additional context as JSON string" + ), + ] = "{}", + additional_context_file: Annotated[ + Optional[str], + typer.Option( + "--additional-context-file", + "-f", + help="File containing additional context JSON", + ), + ] = None, + keep_alive: Annotated[ + bool, + typer.Option("--keep-alive", help="Keep Docker containers alive after run"), + ] = False, + keep_model_dir: Annotated[ + bool, typer.Option("--keep-model-dir", help="Keep model directory after run") + ] = False, + clean_docker_cache: Annotated[ + bool, + typer.Option( + "--clean-docker-cache", + help="Rebuild images without using cache (for full workflow)", + ), + ] = False, + manifest_output: Annotated[ + str, + typer.Option( + "--manifest-output", help="Output file for build manifest (full workflow)" + ), + ] = DEFAULT_MANIFEST_FILE, + summary_output: Annotated[ + Optional[str], + typer.Option("--summary-output", "-s", help="Output file for summary JSON"), + ] = None, + live_output: Annotated[ + bool, typer.Option("--live-output", "-l", help="Print output in real-time") + ] = False, + output: Annotated[ + str, typer.Option("--output", "-o", help="Performance output file") + ] = DEFAULT_PERF_OUTPUT, + ignore_deprecated_flag: Annotated[ + bool, typer.Option("--ignore-deprecated", help="Force run deprecated models") + ] = False, + data_config_file_name: Annotated[ + str, typer.Option("--data-config", help="Custom data configuration file") + ] = DEFAULT_DATA_CONFIG, + tools_json_file_name: Annotated[ + str, typer.Option("--tools-config", help="Custom tools JSON configuration") + ] = DEFAULT_TOOLS_CONFIG, + generate_sys_env_details: Annotated[ + bool, + typer.Option("--sys-env-details", help="Generate system config env details"), + ] = True, + force_mirror_local: Annotated[ + Optional[str], + typer.Option("--force-mirror-local", help="Path to force local data mirroring"), + ] = None, + disable_skip_gpu_arch: Annotated[ + bool, + typer.Option( + "--disable-skip-gpu-arch", + help="Disable skipping models based on GPU architecture", + ), + ] = False, + verbose: Annotated[ + bool, typer.Option("--verbose", "-v", help="Enable verbose logging") + ] = False, + cleanup_perf: Annotated[ + bool, + typer.Option( + "--cleanup-perf", + help="Remove intermediate perf_entry files after run (keeps perf.csv and perf_super files)", + ), + ] = False, +) -> None: + """ + 🚀 Run model containers in distributed scenarios. + + If manifest-file is provided and exists, runs execution phase only. + Otherwise runs the complete workflow (build + run). + """ + setup_logging(verbose) + + # Process tags to handle comma-separated values + processed_tags = split_comma_separated_tags(tags) + + # Input validation + if timeout < -1: + console.print( + "❌ [red]Timeout must be -1 (default) or a positive integer[/red]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + # Convert -1 (default) to actual default timeout value (7200 seconds = 2 hours) + if timeout == -1: + timeout = 7200 + + try: + # Check if we're doing execution-only or full workflow + manifest_exists = manifest_file and os.path.exists(manifest_file) + + if manifest_exists: + console.print( + Panel( + f"🚀 [bold cyan]Running Models (Execution Only)[/bold cyan]\n" + f"Manifest: [yellow]{manifest_file}[/yellow]\n" + f"Registry: [yellow]{registry or 'Auto-detected'}[/yellow]\n" + f"Timeout: [yellow]{timeout if timeout != -1 else 'Default'}[/yellow]s", + title="Execution Configuration", + border_style="green", + ) + ) + + # Create arguments object for execution only + args = create_args_namespace( + tags=processed_tags, + manifest_file=manifest_file, + registry=registry, + timeout=timeout, + additional_context=additional_context, + additional_context_file=additional_context_file, + keep_alive=keep_alive, + keep_model_dir=keep_model_dir, + live_output=live_output, + output=output, + ignore_deprecated_flag=ignore_deprecated_flag, + data_config_file_name=data_config_file_name, + tools_json_file_name=tools_json_file_name, + generate_sys_env_details=generate_sys_env_details, + force_mirror_local=force_mirror_local, + disable_skip_gpu_arch=disable_skip_gpu_arch, + verbose=verbose, + cleanup_perf=cleanup_perf, + _separate_phases=True, + ) + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console, + ) as progress: + task = progress.add_task( + "Initializing execution orchestrator...", total=None + ) + + # Use new RunOrchestrator + orchestrator = RunOrchestrator(args) + progress.update(task, description="Running models...") + + execution_summary = orchestrator.execute( + manifest_file=manifest_file, + tags=None, # manifest-only mode + registry=registry, + timeout=timeout, + ) + progress.update(task, description="Execution completed!") + + # Display results summary + display_results_table(execution_summary, "Execution Results") + + # Display detailed performance metrics from CSV (show all historical runs, mark current ones) + perf_csv_path = getattr(args, "output", DEFAULT_PERF_OUTPUT) + session_start_row = execution_summary.get("session_start_row") + display_performance_table(perf_csv_path, session_start_row) + + # Cleanup session marker AFTER display (so display functions can use it) + from madengine.utils.session_tracker import SessionTracker + tracker = SessionTracker(perf_csv_path) + tracker.cleanup_marker() + + # Cleanup intermediate perf files if requested + if cleanup_perf: + from madengine.utils.perf_cleanup import cleanup_perf_intermediates as do_cleanup + console.print("\n🧹 [cyan]Cleaning up intermediate performance files...[/cyan]") + do_cleanup() + + save_summary_with_feedback(execution_summary, summary_output, "Execution") + + failed_runs = len(execution_summary.get("failed_runs", [])) + if failed_runs == 0: + console.print( + "🎉 [bold green]All model executions completed successfully![/bold green]" + ) + raise typer.Exit(ExitCode.SUCCESS) + else: + console.print( + f"💥 [bold red]Execution failed for {failed_runs} models[/bold red]" + ) + raise typer.Exit(ExitCode.RUN_FAILURE) + + else: + # Check if MAD_CONTAINER_IMAGE is provided - this enables local image mode + additional_context_dict = {} + try: + if additional_context and additional_context != "{}": + additional_context_dict = json.loads(additional_context) + except json.JSONDecodeError: + try: + # Try parsing as Python dict literal + additional_context_dict = ast.literal_eval(additional_context) + except (ValueError, SyntaxError): + console.print( + f"❌ [red]Invalid additional_context format: {additional_context}[/red]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + # Load additional context from file if provided + if additional_context_file and os.path.exists(additional_context_file): + try: + with open(additional_context_file, 'r') as f: + file_context = json.load(f) + additional_context_dict.update(file_context) + except json.JSONDecodeError: + console.print( + f"❌ [red]Invalid JSON format in {additional_context_file}[/red]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + # MAD_CONTAINER_IMAGE handling is now done in RunOrchestrator + # Full workflow (may include MAD_CONTAINER_IMAGE mode) + if manifest_file: + console.print( + f"⚠️ Manifest file [yellow]{manifest_file}[/yellow] not found, running complete workflow" + ) + + console.print( + Panel( + f"🔨🚀 [bold cyan]Complete Workflow (Build + Run)[/bold cyan]\n" + f"Tags: [yellow]{', '.join(processed_tags) if processed_tags else 'All models'}[/yellow]\n" + f"Registry: [yellow]{registry or 'Local only'}[/yellow]\n" + f"Timeout: [yellow]{timeout if timeout != -1 else 'Default'}[/yellow]s", + title="Workflow Configuration", + border_style="magenta", + ) + ) + + # Create arguments object for full workflow + args = create_args_namespace( + tags=processed_tags, + registry=registry, + timeout=timeout, + additional_context=additional_context, + additional_context_file=additional_context_file, + keep_alive=keep_alive, + keep_model_dir=keep_model_dir, + clean_docker_cache=clean_docker_cache, + manifest_output=manifest_output, + live_output=live_output, + output=output, + ignore_deprecated_flag=ignore_deprecated_flag, + data_config_file_name=data_config_file_name, + tools_json_file_name=tools_json_file_name, + generate_sys_env_details=generate_sys_env_details, + force_mirror_local=force_mirror_local, + disable_skip_gpu_arch=disable_skip_gpu_arch, + verbose=verbose, + cleanup_perf=cleanup_perf, + _separate_phases=False, # Full workflow uses .live.log (not .run.live.log) + ) + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console, + ) as progress: + task = progress.add_task( + "Initializing workflow orchestrator...", total=None + ) + + # Use new RunOrchestrator (handles build+run automatically when tags provided) + orchestrator = RunOrchestrator(args) + + progress.update(task, description="Building and running models...") + execution_summary = orchestrator.execute( + manifest_file=None, # Triggers build phase + tags=processed_tags, + registry=registry, + timeout=timeout, + ) + progress.update(task, description="Workflow completed!") + + # Load build summary from generated manifest + with open(manifest_output, 'r') as f: + manifest = json.load(f) + build_summary = manifest.get("summary", {}) + + # Combine summaries + workflow_summary = { + "build_phase": build_summary, + "run_phase": execution_summary, + "overall_success": ( + len(build_summary.get("failed_builds", [])) == 0 + and len(execution_summary.get("failed_runs", [])) == 0 + ), + } + + # Display results + display_results_table(build_summary, "Build Results") + display_results_table(execution_summary, "Execution Results") + + # Display detailed performance metrics from CSV (show all historical runs, mark current ones) + perf_csv_path = getattr(args, "output", DEFAULT_PERF_OUTPUT) + session_start_row = execution_summary.get("session_start_row") + display_performance_table(perf_csv_path, session_start_row) + + # Cleanup session marker AFTER display (so display functions can use it) + from madengine.utils.session_tracker import SessionTracker + tracker = SessionTracker(perf_csv_path) + tracker.cleanup_marker() + + # Cleanup intermediate perf files if requested + if cleanup_perf: + from madengine.utils.perf_cleanup import cleanup_perf_intermediates as do_cleanup + console.print("\n🧹 [cyan]Cleaning up intermediate performance files...[/cyan]") + do_cleanup() + + save_summary_with_feedback(workflow_summary, summary_output, "Workflow") + + if workflow_summary["overall_success"]: + console.print( + "🎉 [bold green]Complete workflow finished successfully![/bold green]" + ) + raise typer.Exit(ExitCode.SUCCESS) + else: + failed_runs = len(execution_summary.get("failed_runs", [])) + if failed_runs > 0: + console.print( + f"💥 [bold red]Workflow completed but {failed_runs} model executions failed[/bold red]" + ) + raise typer.Exit(ExitCode.RUN_FAILURE) + else: + console.print( + "💥 [bold red]Workflow failed for unknown reasons[/bold red]" + ) + raise typer.Exit(ExitCode.FAILURE) + + except typer.Exit: + raise + except MADRuntimeError as e: + # Runtime execution errors + console.print(f"💥 [bold red]Runtime error: {e}[/bold red]") + if hasattr(e, 'suggestions') and e.suggestions: + console.print("\n💡 [cyan]Suggestions:[/cyan]") + for suggestion in e.suggestions: + console.print(f" • {suggestion}") + raise typer.Exit(ExitCode.RUN_FAILURE) + + except ConfigurationError as e: + # Configuration errors + console.print(f"⚙️ [bold red]Configuration error: {e}[/bold red]") + if hasattr(e, 'suggestions') and e.suggestions: + console.print("\n💡 [cyan]Suggestions:[/cyan]") + for suggestion in e.suggestions: + console.print(f" • {suggestion}") + raise typer.Exit(ExitCode.INVALID_ARGS) + + except KeyboardInterrupt: + console.print("\n🛑 [yellow]Run cancelled by user[/yellow]") + raise typer.Exit(ExitCode.FAILURE) + + except FileNotFoundError as e: + console.print(f"📁 [bold red]File not found: {e}[/bold red]") + console.print("💡 Check manifest file path and required files") + raise typer.Exit(ExitCode.FAILURE) + + except Exception as e: + console.print(f"💥 [bold red]Run process failed: {e}[/bold red]") + if verbose: + console.print_exception() + + from madengine.core.errors import handle_error, create_error_context + context = create_error_context( + operation="run", + phase="run", + component="run_command" + ) + handle_error(e, context=context) + raise typer.Exit(ExitCode.FAILURE) + diff --git a/src/madengine/cli/constants.py b/src/madengine/cli/constants.py new file mode 100644 index 00000000..f32eb024 --- /dev/null +++ b/src/madengine/cli/constants.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +""" +Constants and configuration for madengine CLI + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + + +# Exit codes +class ExitCode: + """Exit codes for CLI commands.""" + + SUCCESS = 0 + FAILURE = 1 + BUILD_FAILURE = 2 + RUN_FAILURE = 3 + INVALID_ARGS = 4 + + +# Valid values for validation +VALID_GPU_VENDORS = ["AMD", "NVIDIA"] +VALID_GUEST_OS = ["UBUNTU", "CENTOS"] + +# Default file paths and values +DEFAULT_MANIFEST_FILE = "build_manifest.json" +DEFAULT_PERF_OUTPUT = "perf.csv" +DEFAULT_DATA_CONFIG = "data.json" +DEFAULT_TOOLS_CONFIG = "./scripts/common/tools.json" +DEFAULT_TIMEOUT = -1 + diff --git a/src/madengine/cli/utils.py b/src/madengine/cli/utils.py new file mode 100644 index 00000000..500232d7 --- /dev/null +++ b/src/madengine/cli/utils.py @@ -0,0 +1,506 @@ +#!/usr/bin/env python3 +""" +Utility functions for madengine CLI + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import logging +import os +from typing import Dict, List, Optional + +import typer +from rich.console import Console +from rich.logging import RichHandler +from rich.table import Table + +from madengine.core.errors import ErrorHandler, set_error_handler +from .constants import ExitCode + + +# Initialize Rich console +console = Console() + + +def setup_logging(verbose: bool = False) -> None: + """Setup Rich logging configuration and unified error handler.""" + log_level = logging.DEBUG if verbose else logging.INFO + + # Setup rich logging handler + rich_handler = RichHandler( + console=console, + show_time=True, + show_path=verbose, + markup=True, + rich_tracebacks=True, + ) + + logging.basicConfig( + level=log_level, + format="%(message)s", + datefmt="[%X]", + handlers=[rich_handler], + ) + + # Setup unified error handler + error_handler = ErrorHandler(console=console, verbose=verbose) + set_error_handler(error_handler) + + +def split_comma_separated_tags(tags: List[str]) -> List[str]: + """Split comma-separated tags into individual tags. + + Handles both formats: + - Multiple flags: --tags dummy --tags multi → ['dummy', 'multi'] + - Comma-separated: --tags dummy,multi → ['dummy', 'multi'] + + Args: + tags: List of tag strings (may contain comma-separated values) + + Returns: + List of individual tag strings + """ + if not tags: + return [] + + processed_tags = [] + for tag in tags: + # Split by comma and strip whitespace + split_tags = [t.strip() for t in tag.split(',') if t.strip()] + processed_tags.extend(split_tags) + + return processed_tags + + +def create_args_namespace(**kwargs) -> object: + """Create an argparse.Namespace-like object from keyword arguments.""" + + class Args: + def __init__(self, **kwargs): + for key, value in kwargs.items(): + setattr(self, key, value) + + return Args(**kwargs) + + +def save_summary_with_feedback( + summary: Dict, output_path: Optional[str], summary_type: str +) -> None: + """Save summary to file with user feedback.""" + if output_path: + try: + with open(output_path, "w") as f: + json.dump(summary, f, indent=2) + console.print( + f"💾 {summary_type} summary saved to: [cyan]{output_path}[/cyan]" + ) + except IOError as e: + console.print(f"❌ Failed to save {summary_type} summary: [red]{e}[/red]") + raise typer.Exit(ExitCode.FAILURE) + + +def display_results_table(summary: Dict, title: str, show_gpu_arch: bool = False) -> None: + """ + Display results in a formatted table. + + Automatically detects: + - BUILD results: Simple format (no nodes/performance) + - RUN results with nodes: Enhanced per-node breakdown + """ + successful = summary.get("successful_builds", summary.get("successful_runs", [])) + failed = summary.get("failed_builds", summary.get("failed_runs", [])) + + # Detect if this is a RUN result with per-node data (vs BUILD result) + has_node_data = False + for item in successful + failed: + if isinstance(item, dict) and ("nodes" in item or "perf_data" in item): + has_node_data = True + break + + # Create table with appropriate columns based on result type + if has_node_data: + # RUN results - enhanced format with per-node breakdown + table = Table( + title=f"⚡ {title} (Per-Node Breakdown)", + show_header=True, + header_style="bold magenta" + ) + table.add_column("Index", justify="right", style="dim") + table.add_column("Status", style="bold") + table.add_column("Model", style="cyan") + table.add_column("Node", style="yellow") + table.add_column("Performance", justify="right", style="green") + table.add_column("Metric", style="green") + else: + # BUILD results - simple format (no per-node data) + table = Table( + title=f"⚡ {title}", + show_header=True, + header_style="bold magenta" + ) + table.add_column("Index", justify="right", style="dim") + table.add_column("Status", style="bold") + table.add_column("Model", style="cyan") + + # Add GPU Architecture column if multi-arch build was used + if show_gpu_arch: + table.add_column("GPU Architecture", style="blue") + + # Helper function to extract model name from build result + def extract_model_name(item): + if isinstance(item, dict): + # Prioritize direct model name field if available + if "model" in item: + return item["model"] + elif "name" in item: + return item["name"] + # Fallback to extracting from docker_image for backward compatibility + elif "docker_image" in item: + # Extract model name from docker image name + docker_image = item["docker_image"] + if docker_image.startswith("ci-"): + parts = docker_image[3:].split("_") + model_name = parts[0] if len(parts) >= 2 else (parts[0] if parts else docker_image) + else: + model_name = docker_image + return model_name + return str(item)[:20] + + # Helper function to format numbers + def format_number(value): + if value is None or value == "-": + return "-" + try: + return f"{float(value):,.0f}" + except (ValueError, TypeError): + return str(value) + + # Add successful builds/runs + row_index = 1 + job_summaries = [] # For final summary line + + for item in successful: + if isinstance(item, dict): + model_name = extract_model_name(item) + nodes = item.get("nodes", []) + perf_data = item.get("perf_data", {}) + + if has_node_data: + # RUN results - show per-node breakdown + if not nodes: + # Single-node or old format - show one row + status = "✅ Success" + node_str = "node-0" + perf = perf_data.get("performance", "-") + metric = perf_data.get("metric", "-") + + row = [str(row_index), status, model_name, node_str, format_number(perf), metric] + if show_gpu_arch: + row.append(perf_data.get("gpu_architecture", "N/A")) + table.add_row(*row) + row_index += 1 + + job_summaries.append({ + "model": model_name, + "nodes_succeeded": 1, + "nodes_total": 1, + "aggregated_perf": perf, + "metric": metric + }) + else: + # Multi-node - show all nodes + aggregated_perf = perf_data.get("performance") + aggregated_metric = perf_data.get("metric") + + nodes_succeeded = sum(1 for n in nodes if n.get("status") == "SUCCESS") + + for node in nodes: + status_icon = "✅" if node.get("status") == "SUCCESS" else "❌" + status = f"{status_icon} {node.get('status')}" + node_str = f"node-{node['node_id']}" + + # Show node-local performance + perf = node.get("performance", "-") + metric = node.get("metric", "-") + + row = [str(row_index), status, model_name, node_str, format_number(perf) if perf != "-" else "-", metric if metric else "-"] + if show_gpu_arch: + row.append(perf_data.get("gpu_architecture", "N/A")) + table.add_row(*row) + row_index += 1 + + job_summaries.append({ + "model": model_name, + "nodes_succeeded": nodes_succeeded, + "nodes_total": len(nodes), + "aggregated_perf": aggregated_perf, + "metric": aggregated_metric + }) + else: + # BUILD results - simple format (no node/performance columns) + status = "✅ Success" + row = [str(row_index), status, model_name] + if show_gpu_arch: + row.append(item.get("architecture", "N/A")) + table.add_row(*row) + row_index += 1 + else: + # Fallback for non-dict items + model_name = str(item)[:20] + if has_node_data: + row = [str(row_index), "✅ Success", model_name, "node-0", "-", "-"] + else: + row = [str(row_index), "✅ Success", model_name] + if show_gpu_arch: + row.append("N/A") + table.add_row(*row) + row_index += 1 + + # Add failed builds/runs + for item in failed: + if isinstance(item, dict): + model_name = item.get("model", "Unknown") + nodes = item.get("nodes", []) + + if has_node_data: + # RUN results - show per-node failures + if not nodes: + # Single failure + row = [str(row_index), "❌ Failed", model_name, "node-0", "-", item.get("error", "Unknown")] + if show_gpu_arch: + row.append(item.get("architecture", "N/A")) + table.add_row(*row) + row_index += 1 + else: + # Multi-node failure + for node in nodes: + status_icon = "❌" + status = f"{status_icon} {node.get('status', 'FAILED')}" + node_str = f"node-{node['node_id']}" + error = node.get("error", "-") + row = [str(row_index), status, model_name, node_str, "-", error if error else "-"] + if show_gpu_arch: + row.append("N/A") + table.add_row(*row) + row_index += 1 + else: + # BUILD results - simple format + row = [str(row_index), "❌ Failed", model_name] + if show_gpu_arch: + row.append(item.get("architecture", "N/A")) + table.add_row(*row) + row_index += 1 + else: + if has_node_data: + row = [str(row_index), "❌ Failed", str(item), "node-0", "-", "-"] + else: + row = [str(row_index), "❌ Failed", str(item)] + if show_gpu_arch: + row.append("N/A") + table.add_row(*row) + row_index += 1 + + # Show empty state if no results + if not successful and not failed: + if has_node_data: + row = ["1", "ℹ️ No items", "", "", "", ""] + else: + row = ["1", "ℹ️ No items", ""] + if show_gpu_arch: + row.append("") + table.add_row(*row) + + console.print(table) + + # Print job-level summaries for multi-node jobs (RUN results only) + if has_node_data and job_summaries: + console.print("\n💡 [bold]Job Summary:[/bold]") + for js in job_summaries: + if js["nodes_total"] > 1: + console.print( + f" • {js['model']}: {js['nodes_succeeded']}/{js['nodes_total']} nodes succeeded | " + f"Aggregated Performance: {format_number(js['aggregated_perf'])} {js['metric']}" + ) + else: + console.print( + f" • {js['model']}: Single-node | Performance: {format_number(js['aggregated_perf'])} {js['metric']}" + ) + + +def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row: int = None) -> None: + """Display performance metrics from perf.csv file. + + Shows all historical runs with visual markers for current session runs. + + Args: + perf_csv_path: Path to the performance CSV file + session_start_row: Optional row number to filter from (for current session only) + """ + if not os.path.exists(perf_csv_path): + console.print(f"[yellow]⚠️ Performance CSV not found: {perf_csv_path}[/yellow]") + return + + try: + import pandas as pd + from madengine.utils.session_tracker import SessionTracker + + # Read CSV file + df = pd.read_csv(perf_csv_path) + + if df.empty: + console.print("[yellow]⚠️ Performance CSV is empty[/yellow]") + return + + # Get session_start_row to mark current runs (don't filter, just mark) + total_rows = len(df) + + # Try parameter first, then fall back to marker file + if session_start_row is None: + session_start_row = SessionTracker.load_session_marker_for_csv(perf_csv_path) + + # Count current session runs for title + if session_start_row is not None and session_start_row < total_rows: + current_run_count = total_rows - session_start_row + title = f"📊 Performance Results (all {total_rows} runs, {current_run_count} from current session)" + else: + title = f"📊 Performance Results (all {total_rows} runs)" + + # Create performance table + perf_table = Table( + title=title, + show_header=True, + header_style="bold magenta" + ) + + # Add columns (with "Run" marker column as first column) + perf_table.add_column("Run", justify="center", width=4) # Marker column for current session + perf_table.add_column("Index", justify="right", style="dim") + perf_table.add_column("Model", style="cyan") + perf_table.add_column("Topology", justify="center", style="blue") + perf_table.add_column("Launcher", justify="center", style="magenta") # Distributed launcher + perf_table.add_column("Deployment", justify="center", style="cyan") + perf_table.add_column("GPU Arch", style="yellow") + perf_table.add_column("Performance", justify="right", style="green") + perf_table.add_column("Metric", style="green") + perf_table.add_column("Status", style="bold") + perf_table.add_column("Duration", justify="right", style="blue", min_width=8) + perf_table.add_column("Data Name", style="magenta") + perf_table.add_column("Data Provider", style="magenta") + + # Helper function to format duration + def format_duration(duration): + if pd.isna(duration) or duration == "": + return "N/A" + try: + dur = float(duration) + if dur < 1: + return f"{dur*1000:.0f}ms" + elif dur < 60: + return f"{dur:.2f}s" + else: + return f"{dur/60:.1f}m" + except (ValueError, TypeError): + return "N/A" + + # Helper function to format performance + def format_performance(perf): + if pd.isna(perf) or perf == "": + return "N/A" + try: + val = float(perf) + if val >= 1000: + return f"{val:,.0f}" + elif val >= 10: + return f"{val:.1f}" + else: + return f"{val:.2f}" + except (ValueError, TypeError): + return str(perf) + + # Add rows from dataframe + for idx, row in df.iterrows(): + # Determine if this is a current session run + is_current_run = (session_start_row is not None and idx >= session_start_row) + run_marker = "[bold green]➤[/]" if is_current_run else "" # Arrow marker for current runs + + model = str(row.get("model", "Unknown")) + dataname = str(row.get("dataname", "")) if not pd.isna(row.get("dataname")) and row.get("dataname") != "" else "N/A" + data_provider_type = str(row.get("data_provider_type", "")) if not pd.isna(row.get("data_provider_type")) and row.get("data_provider_type") != "" else "N/A" + + # Format topology: Always show "NxG" format for consistency + # Examples: "1N×1G" (single node, single GPU), "1N×4G" (single node, 4 GPUs), "2N×2G" (2 nodes, 2 GPUs each) + n_gpus = row.get("n_gpus", 1) + nnodes = row.get("nnodes", 1) + gpus_per_node = row.get("gpus_per_node", n_gpus) + + # Determine topology display format + try: + nnodes_int = int(nnodes) if not pd.isna(nnodes) and str(nnodes) != "" else 1 + gpus_per_node_int = int(gpus_per_node) if not pd.isna(gpus_per_node) and str(gpus_per_node) != "" else int(n_gpus) if not pd.isna(n_gpus) else 1 + + # Always show NxG format for consistency + topology = f"{nnodes_int}N×{gpus_per_node_int}G" + except (ValueError, TypeError): + # Fallback if parsing fails + topology = "N/A" + + # Get launcher value as-is from the CSV (don't default to "docker" here) + launcher = str(row.get("launcher", "")) if not pd.isna(row.get("launcher")) and row.get("launcher") != "" else "N/A" + deployment_type = str(row.get("deployment_type", "local")) if not pd.isna(row.get("deployment_type")) and row.get("deployment_type") != "" else "local" + gpu_arch = str(row.get("gpu_architecture", "N/A")) + performance = format_performance(row.get("performance", "")) + metric = str(row.get("metric", "")) if not pd.isna(row.get("metric")) else "" + + status = str(row.get("status", "UNKNOWN")) + + # Duration column shows ONLY test/execution time (not build time) + # If test_duration is missing, show N/A + test_dur = row.get("test_duration", "") + if not pd.isna(test_dur) and test_dur != "": + duration = format_duration(test_dur) + else: + duration = "N/A" + + # Color-code status + if status == "SUCCESS": + status_display = "✅ Success" + elif status == "FAILURE": + status_display = "❌ Failed" + else: + status_display = f"⚠️ {status}" + + perf_table.add_row( + run_marker, # Marker column showing ➤ for current runs + str(idx), + model, + topology, + launcher, # Distributed launcher (docker, torchrun, vllm, etc.) + deployment_type, + gpu_arch, + performance, + metric, + status_display, + duration, + dataname, + data_provider_type + ) + + console.print() # Add blank line + console.print(perf_table) + + # Print summary statistics + total_runs = len(df) + successful_runs = len(df[df["status"] == "SUCCESS"]) + failed_runs = len(df[df["status"] == "FAILURE"]) + + console.print() + console.print(f"[bold]Summary:[/bold] {total_runs} total runs, " + f"[green]{successful_runs} successful[/green], " + f"[red]{failed_runs} failed[/red]") + + except ImportError: + console.print("[yellow]⚠️ pandas not installed. Install with: pip install pandas[/yellow]") + except Exception as e: + console.print(f"[red]❌ Error reading performance CSV: {e}[/red]") + diff --git a/src/madengine/cli/validators.py b/src/madengine/cli/validators.py new file mode 100644 index 00000000..6bfc7bdb --- /dev/null +++ b/src/madengine/cli/validators.py @@ -0,0 +1,327 @@ +#!/usr/bin/env python3 +""" +Validation functions for madengine CLI + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import glob +import json +import os +from typing import Dict, List, Optional + +import typer +from rich.console import Console +from rich.panel import Panel + +from madengine.utils.discover_models import DiscoverModels +from .constants import ExitCode, VALID_GPU_VENDORS, VALID_GUEST_OS +from .utils import create_args_namespace + + +# Initialize Rich console +console = Console() + + +def validate_additional_context( + additional_context: str, + additional_context_file: Optional[str] = None, +) -> Dict[str, str]: + """ + Validate and parse additional context. + + Args: + additional_context: JSON string containing additional context + additional_context_file: Optional file containing additional context + + Returns: + Dict containing parsed additional context + + Raises: + typer.Exit: If validation fails + """ + context = {} + + # Load from file first + if additional_context_file: + try: + with open(additional_context_file, "r") as f: + context = json.load(f) + console.print( + f"✅ Loaded additional context from file: [cyan]{additional_context_file}[/cyan]" + ) + except (FileNotFoundError, json.JSONDecodeError) as e: + console.print(f"❌ Failed to load additional context file: [red]{e}[/red]") + raise typer.Exit(ExitCode.INVALID_ARGS) + + # Parse string context (overrides file) + if additional_context and additional_context != "{}": + try: + string_context = json.loads(additional_context) + context.update(string_context) + console.print("✅ Loaded additional context from command line") + except json.JSONDecodeError as e: + console.print(f"❌ Invalid JSON in additional context: [red]{e}[/red]") + console.print("💡 Please provide valid JSON format") + raise typer.Exit(ExitCode.INVALID_ARGS) + + if not context: + console.print("❌ [red]No additional context provided[/red]") + console.print( + "💡 For build operations, you must provide additional context with gpu_vendor and guest_os" + ) + + # Show example usage + example_panel = Panel( + """[bold cyan]Example usage:[/bold cyan] +madengine build --tags dummy --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +[bold cyan]Or using a file:[/bold cyan] +madengine build --tags dummy --additional-context-file context.json + +[bold cyan]Required fields:[/bold cyan] +• gpu_vendor: [green]AMD[/green], [green]NVIDIA[/green] +• guest_os: [green]UBUNTU[/green], [green]CENTOS[/green]""", + title="Additional Context Help", + border_style="blue", + ) + console.print(example_panel) + raise typer.Exit(ExitCode.INVALID_ARGS) + + # Validate required fields + required_fields = ["gpu_vendor", "guest_os"] + missing_fields = [field for field in required_fields if field not in context] + + if missing_fields: + console.print( + f"❌ Missing required fields: [red]{', '.join(missing_fields)}[/red]" + ) + console.print( + "💡 Both gpu_vendor and guest_os are required for build operations" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + # Validate gpu_vendor + gpu_vendor = context["gpu_vendor"].upper() + if gpu_vendor not in VALID_GPU_VENDORS: + console.print(f"❌ Invalid gpu_vendor: [red]{context['gpu_vendor']}[/red]") + console.print( + f"💡 Supported values: [green]{', '.join(VALID_GPU_VENDORS)}[/green]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + # Validate guest_os + guest_os = context["guest_os"].upper() + if guest_os not in VALID_GUEST_OS: + console.print(f"❌ Invalid guest_os: [red]{context['guest_os']}[/red]") + console.print( + f"💡 Supported values: [green]{', '.join(VALID_GUEST_OS)}[/green]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + console.print( + f"✅ Context validated: [green]{gpu_vendor}[/green] + [green]{guest_os}[/green]" + ) + return context + + +def process_batch_manifest(batch_manifest_file: str) -> Dict[str, List[str]]: + """Process batch manifest file and extract model tags based on build_new flag. + + Args: + batch_manifest_file: Path to the input batch.json file + + Returns: + Dict containing 'build_tags' and 'all_tags' lists + + Raises: + FileNotFoundError: If the manifest file doesn't exist + ValueError: If the manifest format is invalid + """ + if not os.path.exists(batch_manifest_file): + raise FileNotFoundError(f"Batch manifest file not found: {batch_manifest_file}") + + try: + with open(batch_manifest_file, "r") as f: + manifest_data = json.load(f) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in batch manifest file: {e}") + + if not isinstance(manifest_data, list): + raise ValueError("Batch manifest must be a list of model objects") + + build_tags = [] # Models that need to be built (build_new=true) + all_tags = [] # All models in the manifest + + for i, model in enumerate(manifest_data): + if not isinstance(model, dict): + raise ValueError(f"Model entry {i} must be a dictionary") + + if "model_name" not in model: + raise ValueError(f"Model entry {i} missing required 'model_name' field") + + model_name = model["model_name"] + build_new = model.get("build_new", False) + + all_tags.append(model_name) + if build_new: + build_tags.append(model_name) + + return { + "build_tags": build_tags, + "all_tags": all_tags, + "manifest_data": manifest_data, + } + + +def process_batch_manifest_entries( + batch_data: Dict, + manifest_output: str, + registry: Optional[str], + guest_os: Optional[str], + gpu_vendor: Optional[str], +) -> None: + """Process batch manifest and add entries for all models to build_manifest.json. + + Args: + batch_data: Processed batch manifest data + manifest_output: Path to the build manifest file + registry: Registry used for the build + guest_os: Guest OS for the build + gpu_vendor: GPU vendor for the build + """ + + # Load the existing build manifest + if os.path.exists(manifest_output): + with open(manifest_output, "r") as f: + build_manifest = json.load(f) + # Remove top-level registry if present + build_manifest.pop("registry", None) + else: + # Create a minimal manifest structure + build_manifest = { + "built_images": {}, + "built_models": {}, + "context": {}, + "credentials_required": [], + } + + # Process each model in the batch manifest + for model_entry in batch_data["manifest_data"]: + model_name = model_entry["model_name"] + build_new = model_entry.get("build_new", False) + model_registry_image = model_entry.get("registry_image", "") + model_registry = model_entry.get("registry", "") + + # If the model was not built (build_new=false), create an entry for it + if not build_new: + # Find the model configuration by discovering models with this tag + try: + # Create a temporary args object to discover the model + temp_args = create_args_namespace( + tags=[model_name], + registry=registry, + additional_context="{}", + additional_context_file=None, + clean_docker_cache=False, + manifest_output=manifest_output, + live_output=False, + verbose=False, + _separate_phases=True, + ) + + discover_models = DiscoverModels(args=temp_args) + models = discover_models.run() + + for model_info in models: + if model_info["name"] == model_name: + # Get dockerfile + dockerfile = model_info.get("dockerfile") + dockerfile_specified = ( + f"{dockerfile}.{guest_os.lower()}.{gpu_vendor.lower()}" + ) + dockerfile_matched_list = glob.glob(f"{dockerfile_specified}.*") + + # Check the matched list + if not dockerfile_matched_list: + console.print( + f"Warning: No Dockerfile found for {dockerfile_specified}" + ) + raise FileNotFoundError( + f"No Dockerfile found for {dockerfile_specified}" + ) + else: + dockerfile_matched = dockerfile_matched_list[0].split("/")[-1].replace(".Dockerfile", "") + + # Create a synthetic image name for this model + synthetic_image_name = f"ci-{model_name}_{dockerfile_matched}" + + # Add to built_images (even though it wasn't actually built) + build_manifest["built_images"][synthetic_image_name] = { + "docker_image": synthetic_image_name, + "dockerfile": model_info.get("dockerfile"), + "base_docker": "", # No base since not built + "docker_sha": "", # No SHA since not built + "build_duration": 0, + "build_command": f"# Skipped build for {model_name} (build_new=false)", + "log_file": f"{model_name}_{dockerfile_matched}.build.skipped.log", + "registry_image": ( + model_registry_image + or f"{model_registry or registry or 'dockerhub'}/{synthetic_image_name}" + if model_registry_image or model_registry or registry + else "" + ), + "registry": model_registry or registry or "dockerhub", + } + + # Add to built_models - include all discovered model fields + model_entry = model_info.copy() # Start with all fields from discovered model + + # Ensure minimum required fields have fallback values + model_entry.setdefault("name", model_name) + model_entry.setdefault("dockerfile", f"docker/{model_name}") + model_entry.setdefault("scripts", f"scripts/{model_name}/run.sh") + model_entry.setdefault("n_gpus", "1") + model_entry.setdefault("owner", "") + model_entry.setdefault("training_precision", "") + model_entry.setdefault("tags", []) + model_entry.setdefault("args", "") + model_entry.setdefault("cred", "") + + build_manifest["built_models"][synthetic_image_name] = model_entry + break + + except Exception as e: + console.print(f"Warning: Could not process model {model_name}: {e}") + # Create a minimal entry anyway + synthetic_image_name = f"ci-{model_name}_{dockerfile_matched}" + build_manifest["built_images"][synthetic_image_name] = { + "docker_image": synthetic_image_name, + "dockerfile": f"docker/{model_name}", + "base_docker": "", + "docker_sha": "", + "build_duration": 0, + "build_command": f"# Skipped build for {model_name} (build_new=false)", + "log_file": f"{model_name}_{dockerfile_matched}.build.skipped.log", + "registry_image": model_registry_image or "", + "registry": model_registry or registry or "dockerhub", + } + build_manifest["built_models"][synthetic_image_name] = { + "name": model_name, + "dockerfile": f"docker/{model_name}", + "scripts": f"scripts/{model_name}/run.sh", + "n_gpus": "1", + "owner": "", + "training_precision": "", + "tags": [], + "args": "", + } + + # Save the updated manifest + with open(manifest_output, "w") as f: + json.dump(build_manifest, f, indent=2) + + console.print( + f"✅ Added entries for all models from batch manifest to {manifest_output}" + ) + diff --git a/src/madengine/core/console.py b/src/madengine/core/console.py index 9340924a..57d7b329 100644 --- a/src/madengine/core/console.py +++ b/src/madengine/core/console.py @@ -8,24 +8,20 @@ # built-in modules import subprocess import typing -# third-party modules -import typing_extensions +import re class Console: """Class to run console commands. - + Attributes: shellVerbose (bool): The shell verbose flag. live_output (bool): The live output flag. """ - def __init__( - self, - shellVerbose: bool=True, - live_output: bool=False - ) -> None: + + def __init__(self, shellVerbose: bool = True, live_output: bool = False) -> None: """Constructor of the Console class. - + Args: shellVerbose (bool): The shell verbose flag. live_output (bool): The live output flag. @@ -33,25 +29,92 @@ def __init__( self.shellVerbose = shellVerbose self.live_output = live_output + def _highlight_docker_operations(self, command: str) -> str: + """Highlight docker push/pull/build/run operations for better visibility. + + Args: + command (str): The command to potentially highlight. + + Returns: + str: The highlighted command if it's a docker operation. + """ + # Check if this is a docker operation + docker_push_pattern = r"^docker\s+push\s+" + docker_pull_pattern = r"^docker\s+pull\s+" + docker_build_pattern = r"^docker\s+build\s+" + docker_run_pattern = r"^docker\s+run\s+" + + if re.match(docker_push_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\n🚀 DOCKER PUSH OPERATION: {command}\n{'='*80}" + elif re.match(docker_pull_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\n📥 DOCKER PULL OPERATION: {command}\n{'='*80}" + elif re.match(docker_build_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\n🔨 DOCKER BUILD OPERATION: {command}\n{'='*80}" + elif re.match(docker_run_pattern, command, re.IGNORECASE): + return f"\n{'='*80}\n🏃 DOCKER RUN OPERATION: {command}\n{'='*80}" + + return command + + def _show_docker_completion(self, command: str, success: bool = True) -> None: + """Show completion message for docker operations. + + Args: + command (str): The command that was executed. + success (bool): Whether the operation was successful. + """ + docker_push_pattern = r"^docker\s+push\s+" + docker_pull_pattern = r"^docker\s+pull\s+" + docker_build_pattern = r"^docker\s+build\s+" + docker_run_pattern = r"^docker\s+run\s+" + + if re.match(docker_push_pattern, command, re.IGNORECASE): + if success: + print(f"✅ DOCKER PUSH COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"❌ DOCKER PUSH FAILED") + print(f"{'='*80}\n") + elif re.match(docker_pull_pattern, command, re.IGNORECASE): + if success: + print(f"✅ DOCKER PULL COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"❌ DOCKER PULL FAILED") + print(f"{'='*80}\n") + elif re.match(docker_build_pattern, command, re.IGNORECASE): + if success: + print(f"✅ DOCKER BUILD COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"❌ DOCKER BUILD FAILED") + print(f"{'='*80}\n") + elif re.match(docker_run_pattern, command, re.IGNORECASE): + if success: + print(f"✅ DOCKER RUN COMPLETED SUCCESSFULLY") + print(f"{'='*80}\n") + else: + print(f"❌ DOCKER RUN FAILED") + print(f"{'='*80}\n") + def sh( - self, - command: str, - canFail: bool=False, - timeout: int=60, - secret: bool=False, - prefix: str="", - env: typing.Optional[typing.Dict[str, str]]=None - ) -> str: + self, + command: str, + canFail: bool = False, + timeout: int = 60, + secret: bool = False, + prefix: str = "", + env: typing.Optional[typing.Dict[str, str]] = None, + ) -> str: """Run shell command. - + Args: command (str): The shell command. canFail (bool): The flag to allow failure. timeout (int): The timeout in seconds. secret (bool): The flag to hide the command. prefix (str): The prefix of the output. - env (typing_extensions.TypedDict): The environment variables. - + env (typing.Optional[typing.Dict[str, str]]): The environment variables. + Returns: str: The output of the shell command. @@ -60,7 +123,8 @@ def sh( """ # Print the command if shellVerbose is True if self.shellVerbose and not secret: - print("> " + command, flush=True) + highlighted_command = self._highlight_docker_operations(command) + print("> " + highlighted_command, flush=True) # Run the shell command proc = subprocess.Popen( @@ -79,18 +143,57 @@ def sh( if not self.live_output: outs, errs = proc.communicate(timeout=timeout) else: - outs = [] - for stdout_line in iter(lambda: proc.stdout.readline().encode('utf-8', errors='replace').decode('utf-8', errors='replace'), ""): - print(prefix + stdout_line, end="") - outs.append(stdout_line) - outs = "".join(outs) - proc.stdout.close() + try: + outs = [] + for stdout_line in iter( + lambda: proc.stdout.readline() + .encode("utf-8", errors="replace") + .decode("utf-8", errors="replace"), + "", + ): + print(prefix + stdout_line, end="") + outs.append(stdout_line) + outs = "".join(outs) + finally: + # Ensure all pipes are properly closed + if proc.stdout and not proc.stdout.closed: + proc.stdout.close() + if proc.stdin and not proc.stdin.closed: + proc.stdin.close() proc.wait(timeout=timeout) except subprocess.TimeoutExpired as exc: proc.kill() + # Wait for process to finish after kill and clean up pipes + try: + proc.communicate(timeout=1) + except subprocess.TimeoutExpired: + # Force terminate if still not dead + proc.terminate() + proc.communicate() raise RuntimeError("Console script timeout") from exc - + finally: + # Final cleanup: ensure all pipes are closed regardless of success/failure + # This prevents ResourceWarning about unclosed files + try: + if proc.stdin and not proc.stdin.closed: + proc.stdin.close() + except (OSError, ValueError): + # Expected errors during cleanup - stdin may already be closed + pass + try: + if proc.stdout and not proc.stdout.closed: + proc.stdout.close() + except (OSError, ValueError): + # Expected errors during cleanup - stdout may already be closed + pass + # Check for failure + success = proc.returncode == 0 + + # Show docker operation completion status + if not secret: + self._show_docker_completion(command, success) + if proc.returncode != 0: if not canFail: if not secret: @@ -102,11 +205,9 @@ def sh( ) else: raise RuntimeError( - "Subprocess '" - + secret - + "' failed with exit code " + "Subprocess '***HIDDEN COMMAND***' failed with exit code " + str(proc.returncode) ) - + # Return the output return outs.strip() diff --git a/src/madengine/core/constants.py b/src/madengine/core/constants.py index c0cbd5c0..f86e51fe 100644 --- a/src/madengine/core/constants.py +++ b/src/madengine/core/constants.py @@ -3,89 +3,228 @@ This module provides the constants used in the MAD Engine. +Environment Variables: + - MAD_VERBOSE_CONFIG: Set to "true" to enable verbose configuration logging + - MAD_SETUP_MODEL_DIR: Set to "true" to enable automatic MODEL_DIR setup during import + - MODEL_DIR: Path to model directory to copy to current working directory + - MAD_MINIO: JSON string with MinIO configuration + - MAD_AWS_S3: JSON string with AWS S3 configuration + - NAS_NODES: JSON string with NAS nodes configuration + - PUBLIC_GITHUB_ROCM_KEY: JSON string with GitHub token configuration + +Configuration Loading: + All configuration constants follow a priority order: + 1. Environment variables (as JSON strings) + 2. credential.json file + 3. Built-in defaults + + Invalid JSON in environment variables will fall back to defaults with error logging. + Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ # built-in modules import os import json +import logging + + +# Utility function for optional verbose logging of configuration +def _log_config_info(message: str, force_print: bool = False): + """Log configuration information either to logger or print if specified.""" + if force_print or os.environ.get("MAD_VERBOSE_CONFIG", "").lower() == "true": + print(message) + else: + logging.debug(message) + + # third-party modules from madengine.core.console import Console -# Get the model directory, if it is not set, set it to None. -MODEL_DIR = os.environ.get("MODEL_DIR") +# Get the model directory, if it is not set, default to "." (current directory) +MODEL_DIR = os.environ.get("MODEL_DIR", ".") + + +def _setup_model_dir(): + """Setup model directory if MODEL_DIR environment variable is set. -# MADEngine update -if MODEL_DIR: - # Copy MODEL_DIR to the current working directory. - cwd_path = os.getcwd() - print(f"Current working directory: {cwd_path}") - console = Console(live_output=True) - # copy the MODEL_DIR to the current working directory - console.sh(f"cp -vLR --preserve=all {MODEL_DIR}/* {cwd_path}") - print(f"Model dir: {MODEL_DIR} copied to current dir: {cwd_path}") - -# MADEngine update + MODEL_DIR defaults to "." (current directory) if not set. + Only copies if MODEL_DIR points to a different directory than current working directory. + """ + # Get absolute paths to compare + model_dir_abs = os.path.abspath(MODEL_DIR) + cwd_abs = os.path.abspath(".") + + # Only copy if MODEL_DIR points to a different directory (not current dir) + if model_dir_abs != cwd_abs: + # Copy MODEL_DIR to the current working directory. + _log_config_info(f"Current working directory: {cwd_abs}") + _log_config_info(f"MODEL_DIR: {MODEL_DIR} (different from current dir)") + console = Console(live_output=True) + # copy the MODEL_DIR to the current working directory + console.sh(f"cp -vLR --preserve=all {MODEL_DIR}/* {cwd_abs}") + _log_config_info(f"Model dir: {MODEL_DIR} copied to current dir: {cwd_abs}") + + +# Only setup model directory if explicitly requested (when not just importing for constants) +if os.environ.get("MAD_SETUP_MODEL_DIR", "").lower() == "true": + _setup_model_dir() + +# madengine credentials configuration CRED_FILE = "credential.json" -try: - # read credentials - with open(CRED_FILE) as f: - CREDS = json.load(f) -except FileNotFoundError: - CREDS = {} -if "NAS_NODES" not in os.environ: - if "NAS_NODES" in CREDS: - NAS_NODES = CREDS["NAS_NODES"] +def _load_credentials(): + """Load credentials from file with proper error handling.""" + try: + # read credentials + with open(CRED_FILE) as f: + creds = json.load(f) + _log_config_info(f"Credentials loaded from {CRED_FILE}") + return creds + except FileNotFoundError: + _log_config_info(f"Credentials file {CRED_FILE} not found, using defaults") + return {} + except json.JSONDecodeError as e: + _log_config_info(f"Error parsing {CRED_FILE}: {e}, using defaults") + return {} + except Exception as e: + _log_config_info(f"Unexpected error loading {CRED_FILE}: {e}, using defaults") + return {} + + +CREDS = _load_credentials() + + +def _get_nas_nodes(): + """Initialize NAS_NODES configuration.""" + if "NAS_NODES" not in os.environ: + _log_config_info("NAS_NODES environment variable is not set.") + if "NAS_NODES" in CREDS: + _log_config_info("NAS_NODES loaded from credentials file.") + return CREDS["NAS_NODES"] + else: + _log_config_info("NAS_NODES is using default values.") + return [ + { + "NAME": "DEFAULT", + "HOST": "localhost", + "PORT": 22, + "USERNAME": "username", + "PASSWORD": "password", + } + ] else: - NAS_NODES = [{ - "NAME": "DEFAULT", - "HOST": "localhost", - "PORT": 22, - "USERNAME": "username", - "PASSWORD": "password", - }] -else: - NAS_NODES = json.loads(os.environ["NAS_NODES"]) - -# Check the MAD_AWS_S3 environment variable which is a dict, if it is not set, set its element to default values. -if "MAD_AWS_S3" not in os.environ: - # Check if the MAD_AWS_S3 is in the credentials.json file. - if "MAD_AWS_S3" in CREDS: - MAD_AWS_S3 = CREDS["MAD_AWS_S3"] + _log_config_info("NAS_NODES is loaded from env variables.") + try: + return json.loads(os.environ["NAS_NODES"]) + except json.JSONDecodeError as e: + _log_config_info( + f"Error parsing NAS_NODES environment variable: {e}, using defaults" + ) + return [ + { + "NAME": "DEFAULT", + "HOST": "localhost", + "PORT": 22, + "USERNAME": "username", + "PASSWORD": "password", + } + ] + + +NAS_NODES = _get_nas_nodes() + + +def _get_mad_aws_s3(): + """Initialize MAD_AWS_S3 configuration.""" + if "MAD_AWS_S3" not in os.environ: + _log_config_info("MAD_AWS_S3 environment variable is not set.") + if "MAD_AWS_S3" in CREDS: + _log_config_info("MAD_AWS_S3 loaded from credentials file.") + return CREDS["MAD_AWS_S3"] + else: + _log_config_info("MAD_AWS_S3 is using default values.") + return { + "USERNAME": None, + "PASSWORD": None, + } else: - MAD_AWS_S3 = { - "USERNAME": None, - "PASSWORD": None, - } -else: - MAD_AWS_S3 = json.loads(os.environ["MAD_AWS_S3"]) + _log_config_info("MAD_AWS_S3 is loaded from env variables.") + try: + return json.loads(os.environ["MAD_AWS_S3"]) + except json.JSONDecodeError as e: + _log_config_info( + f"Error parsing MAD_AWS_S3 environment variable: {e}, using defaults" + ) + return { + "USERNAME": None, + "PASSWORD": None, + } + + +MAD_AWS_S3 = _get_mad_aws_s3() + # Check the MAD_MINIO environment variable which is a dict. -if "MAD_MINIO" not in os.environ: - print("MAD_MINIO environment variable is not set.") - if "MAD_MINIO" in CREDS: - MAD_MINIO = CREDS["MAD_MINIO"] +def _get_mad_minio(): + """Initialize MAD_MINIO configuration.""" + if "MAD_MINIO" not in os.environ: + _log_config_info("MAD_MINIO environment variable is not set.") + if "MAD_MINIO" in CREDS: + _log_config_info("MAD_MINIO loaded from credentials file.") + return CREDS["MAD_MINIO"] + else: + _log_config_info("MAD_MINIO is using default values.") + return { + "USERNAME": None, + "PASSWORD": None, + "MINIO_ENDPOINT": "http://localhost:9000", + "AWS_ENDPOINT_URL_S3": "http://localhost:9000", + } else: - print("MAD_MINIO is using default values.") - MAD_MINIO = { - "USERNAME": None, - "PASSWORD": None, - "MINIO_ENDPOINT": "http://localhost:9000", - "AWS_ENDPOINT_URL_S3": "http://localhost:9000", - } -else: - print("MAD_MINIO is loaded from env variables.") - MAD_MINIO = json.loads(os.environ["MAD_MINIO"]) - -# Check the auth GitHub token environment variable which is a dict, if it is not set, set it to None. -if "PUBLIC_GITHUB_ROCM_KEY" not in os.environ: - if "PUBLIC_GITHUB_ROCM_KEY" in CREDS: - PUBLIC_GITHUB_ROCM_KEY = CREDS["PUBLIC_GITHUB_ROCM_KEY"] + _log_config_info("MAD_MINIO is loaded from env variables.") + try: + return json.loads(os.environ["MAD_MINIO"]) + except json.JSONDecodeError as e: + _log_config_info( + f"Error parsing MAD_MINIO environment variable: {e}, using defaults" + ) + return { + "USERNAME": None, + "PASSWORD": None, + "MINIO_ENDPOINT": "http://localhost:9000", + "AWS_ENDPOINT_URL_S3": "http://localhost:9000", + } + + +MAD_MINIO = _get_mad_minio() + + +def _get_public_github_rocm_key(): + """Initialize PUBLIC_GITHUB_ROCM_KEY configuration.""" + if "PUBLIC_GITHUB_ROCM_KEY" not in os.environ: + _log_config_info("PUBLIC_GITHUB_ROCM_KEY environment variable is not set.") + if "PUBLIC_GITHUB_ROCM_KEY" in CREDS: + _log_config_info("PUBLIC_GITHUB_ROCM_KEY loaded from credentials file.") + return CREDS["PUBLIC_GITHUB_ROCM_KEY"] + else: + _log_config_info("PUBLIC_GITHUB_ROCM_KEY is using default values.") + return { + "username": None, + "token": None, + } else: - PUBLIC_GITHUB_ROCM_KEY = { - "username": None, - "token": None, - } -else: - PUBLIC_GITHUB_ROCM_KEY = json.loads(os.environ["PUBLIC_GITHUB_ROCM_KEY"]) + _log_config_info("PUBLIC_GITHUB_ROCM_KEY is loaded from env variables.") + try: + return json.loads(os.environ["PUBLIC_GITHUB_ROCM_KEY"]) + except json.JSONDecodeError as e: + _log_config_info( + f"Error parsing PUBLIC_GITHUB_ROCM_KEY environment variable: {e}, using defaults" + ) + return { + "username": None, + "token": None, + } + + +PUBLIC_GITHUB_ROCM_KEY = _get_public_github_rocm_key() diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index b1c7c225..ce463abb 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -18,18 +18,21 @@ import os import re import typing + # third-party modules from madengine.core.console import Console -from madengine.utils.gpu_validator import validate_rocm_installation, GPUInstallationError +from madengine.utils.gpu_validator import validate_rocm_installation, GPUInstallationError, GPUVendor +from madengine.utils.gpu_tool_factory import get_gpu_tool_manager +from madengine.utils.gpu_tool_manager import BaseGPUToolManager def update_dict(d: typing.Dict, u: typing.Dict) -> typing.Dict: """Update dictionary. - + Args: d: The dictionary. u: The update dictionary. - + Returns: dict: The updated dictionary. """ @@ -45,11 +48,14 @@ def update_dict(d: typing.Dict, u: typing.Dict) -> typing.Dict: class Context: """Class to determine context. - + Attributes: console: The console. ctx: The context. - + _gpu_context_initialized: Flag to track if GPU context is initialized. + _system_context_initialized: Flag to track if system context is initialized. + _build_only_mode: Flag to indicate if running in build-only mode. + Methods: get_ctx_test: Get context test. get_gpu_vendor: Get GPU vendor. @@ -60,110 +66,284 @@ class Context: get_docker_gpus: Get Docker GPUs. get_gpu_renderD_nodes: Get GPU renderD nodes. set_multi_node_runner: Sets multi-node runner context. + init_system_context: Initialize system-specific context. + init_gpu_context: Initialize GPU-specific context for runtime. + init_build_context: Initialize build-specific context. + init_runtime_context: Initialize runtime-specific context. + ensure_system_context: Ensure system context is initialized. + ensure_runtime_context: Ensure runtime context is initialized. filter: Filter. """ + def __init__( - self, - additional_context: str=None, - additional_context_file: str=None - ) -> None: + self, + additional_context: str = None, + additional_context_file: str = None, + build_only_mode: bool = False, + ) -> None: """Constructor of the Context class. - + Args: additional_context: The additional context. additional_context_file: The additional context file. - + build_only_mode: Whether running in build-only mode (no GPU detection). + Raises: - RuntimeError: If the GPU vendor is not detected. - RuntimeError: If the GPU architecture is not detected. + RuntimeError: If GPU detection fails and not in build-only mode. """ # Initialize the console self.console = Console() + self._gpu_context_initialized = False + self._build_only_mode = build_only_mode + self._system_context_initialized = False + self._gpu_tool_manager = None # Lazy initialization - # Initialize the context + # Initialize base context self.ctx = {} - self.ctx["ctx_test"] = self.get_ctx_test() - self.ctx["host_os"] = self.get_host_os() - self.ctx["numa_balancing"] = self.get_numa_balancing() - # Check if NUMA balancing is enabled or disabled. - if self.ctx["numa_balancing"] == "1": - print("Warning: numa balancing is ON ...") - elif self.ctx["numa_balancing"] == "0": - print("Warning: numa balancing is OFF ...") - else: - print("Warning: unknown numa balancing setup ...") - # Keeping gpu_vendor for filterning purposes, if we filter using file names we can get rid of this attribute. - self.ctx["gpu_vendor"] = self.get_gpu_vendor() - - # Validate ROCm installation if AMD GPU is detected - if self.ctx["gpu_vendor"] == "AMD": - try: - validate_rocm_installation(verbose=False, raise_on_error=True) - except GPUInstallationError as e: - print("\n" + "="*70) - print("ERROR: ROCm Installation Validation Failed") - print("="*70) - print(str(e)) - print("="*70) - raise - - # Initialize the docker context + # Initialize docker contexts as empty - will be populated based on mode + self.ctx["docker_build_arg"] = {} self.ctx["docker_env_vars"] = {} - self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"] - self.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] = self.get_system_ngpus() - self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = self.get_system_gpu_architecture() - self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_PRODUCT_NAME"] = self.get_system_gpu_product_name() - self.ctx['docker_env_vars']['MAD_SYSTEM_HIP_VERSION'] = self.get_system_hip_version() - self.ctx["docker_build_arg"] = { - "MAD_SYSTEM_GPU_ARCHITECTURE": self.get_system_gpu_architecture(), - "MAD_SYSTEM_GPU_PRODUCT_NAME": self.get_system_gpu_product_name() - } - self.ctx["docker_gpus"] = self.get_docker_gpus() - self.ctx["gpu_renderDs"] = self.get_gpu_renderD_nodes() - - # Default multi-node configuration - self.ctx['multi_node_args'] = { - 'RUNNER': 'torchrun', - 'MAD_RUNTIME_NGPUS': self.ctx['docker_env_vars']['MAD_SYSTEM_NGPUS'], # Use system's GPU count - 'NNODES': 1, - 'NODE_RANK': 0, - 'MASTER_ADDR': 'localhost', - 'MASTER_PORT': 6006, - 'HOST_LIST': '', - 'NCCL_SOCKET_IFNAME': '', - 'GLOO_SOCKET_IFNAME': '' - } - - # Read and update MAD SECRETS env variable + + # Read and update MAD SECRETS env variable (can be used for both build and run) mad_secrets = {} for key in os.environ: if "MAD_SECRETS" in key: mad_secrets[key] = os.environ[key] if mad_secrets: - update_dict(self.ctx['docker_build_arg'], mad_secrets) - update_dict(self.ctx['docker_env_vars'], mad_secrets) - - ## ADD MORE CONTEXTS HERE ## + update_dict(self.ctx["docker_build_arg"], mad_secrets) + update_dict(self.ctx["docker_env_vars"], mad_secrets) - # additional contexts provided in file override detected contexts + # Additional contexts provided in file override detected contexts if additional_context_file: with open(additional_context_file) as f: update_dict(self.ctx, json.load(f)) - # additional contexts provided in command-line override detected contexts and contexts in file + # Additional contexts provided in command-line override detected contexts and contexts in file if additional_context: # Convert the string representation of python dictionary to a dictionary. dict_additional_context = ast.literal_eval(additional_context) - update_dict(self.ctx, dict_additional_context) - # Set multi-node runner after context update - self.ctx['docker_env_vars']['MAD_MULTI_NODE_RUNNER'] = self.set_multi_node_runner() + # Initialize context based on mode + # User-provided contexts will not be overridden by detection + if not build_only_mode: + # For full workflow mode, initialize everything (legacy behavior preserved) + self.init_runtime_context() + else: + # For build-only mode, only initialize what's needed for building + self.init_build_context() + + ## ADD MORE CONTEXTS HERE ## + + def init_build_context(self) -> None: + """Initialize build-specific context. + + This method sets up only the context needed for Docker builds, + avoiding GPU detection that would fail on build-only nodes. + System-specific contexts (host_os, numa_balancing, etc.) should be + provided via --additional-context for build-only nodes if needed. + """ + print("Initializing build-only context...") + + # Initialize only essential system contexts if not provided via additional_context + if "ctx_test" not in self.ctx: + try: + self.ctx["ctx_test"] = self.get_ctx_test() + except Exception as e: + print(f"Warning: Could not detect ctx_test on build node: {e}") + + if "host_os" not in self.ctx: + try: + self.ctx["host_os"] = self.get_host_os() + print(f"Detected host OS: {self.ctx['host_os']}") + except Exception as e: + print(f"Warning: Could not detect host OS on build node: {e}") + print( + "Consider providing host_os via --additional-context if needed for build" + ) + + # Don't detect GPU-specific contexts in build-only mode + # These should be provided via additional_context if needed for build args + if "MAD_SYSTEM_GPU_ARCHITECTURE" not in self.ctx.get("docker_build_arg", {}): + print( + "Info: MAD_SYSTEM_GPU_ARCHITECTURE not provided - should be set via --additional-context for GPU-specific builds" + ) + + # Don't initialize NUMA balancing check for build-only nodes + # This is runtime-specific and should be handled on execution nodes + + def init_runtime_context(self) -> None: + """Initialize runtime-specific context. + + This method sets up the full context including system and GPU detection + for nodes that will run containers. + """ + print("Initializing runtime context with system and GPU detection...") + # Initialize system context first + self.init_system_context() + # Initialize GPU context + self.init_gpu_context() + + def init_system_context(self) -> None: + """Initialize system-specific context. + + This method detects system configuration like OS, NUMA balancing, etc. + Should be called on runtime nodes to get actual execution environment context. + """ + if self._system_context_initialized: + return + + print("Detecting system configuration...") + + try: + # Initialize system contexts if not already provided via additional_context + if "ctx_test" not in self.ctx: + self.ctx["ctx_test"] = self.get_ctx_test() + + if "host_os" not in self.ctx: + self.ctx["host_os"] = self.get_host_os() + print(f"Detected host OS: {self.ctx['host_os']}") + + if "numa_balancing" not in self.ctx: + self.ctx["numa_balancing"] = self.get_numa_balancing() + + # Check if NUMA balancing is enabled or disabled. + if self.ctx["numa_balancing"] == "1": + print("Warning: numa balancing is ON ...") + elif self.ctx["numa_balancing"] == "0": + print("Warning: numa balancing is OFF ...") + else: + print("Warning: unknown numa balancing setup ...") + + self._system_context_initialized = True + + except Exception as e: + print(f"Warning: System context detection failed: {e}") + if not self._build_only_mode: + raise RuntimeError( + f"System context detection failed on runtime node: {e}" + ) + + def init_gpu_context(self) -> None: + """Initialize GPU-specific context for runtime. + + This method detects GPU configuration and sets up environment variables + needed for container execution. Should only be called on GPU nodes. + User-provided GPU contexts will not be overridden. + + Raises: + RuntimeError: If GPU detection fails. + """ + if self._gpu_context_initialized: + return + + print("Detecting GPU configuration...") + + try: + # GPU vendor detection - only if not provided by user + if "gpu_vendor" not in self.ctx: + self.ctx["gpu_vendor"] = self.get_gpu_vendor() + print(f"Detected GPU vendor: {self.ctx['gpu_vendor']}") + else: + print(f"Using provided GPU vendor: {self.ctx['gpu_vendor']}") + + # Initialize docker env vars for runtime - only if not already set + if "MAD_GPU_VENDOR" not in self.ctx["docker_env_vars"]: + self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"] + + if "MAD_SYSTEM_NGPUS" not in self.ctx["docker_env_vars"]: + self.ctx["docker_env_vars"][ + "MAD_SYSTEM_NGPUS" + ] = self.get_system_ngpus() + + if "MAD_SYSTEM_GPU_ARCHITECTURE" not in self.ctx["docker_env_vars"]: + self.ctx["docker_env_vars"][ + "MAD_SYSTEM_GPU_ARCHITECTURE" + ] = self.get_system_gpu_architecture() + + if "MAD_SYSTEM_HIP_VERSION" not in self.ctx["docker_env_vars"]: + self.ctx["docker_env_vars"][ + "MAD_SYSTEM_HIP_VERSION" + ] = self.get_system_hip_version() + + if "MAD_SYSTEM_GPU_PRODUCT_NAME" not in self.ctx["docker_env_vars"]: + self.ctx["docker_env_vars"][ + "MAD_SYSTEM_GPU_PRODUCT_NAME" + ] = self.get_system_gpu_product_name() + + # Also add to build args (for runtime builds) - only if not already set + if "MAD_SYSTEM_GPU_ARCHITECTURE" not in self.ctx["docker_build_arg"]: + self.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = self.ctx[ + "docker_env_vars" + ]["MAD_SYSTEM_GPU_ARCHITECTURE"] + + # Docker GPU configuration - only if not already set + if "docker_gpus" not in self.ctx: + self.ctx["docker_gpus"] = self.get_docker_gpus() + + if "gpu_renderDs" not in self.ctx: + self.ctx["gpu_renderDs"] = self.get_gpu_renderD_nodes() + + self._gpu_context_initialized = True + + except Exception as e: + if self._build_only_mode: + print( + f"Warning: GPU detection failed in build-only mode (expected): {e}" + ) + else: + raise RuntimeError(f"GPU detection failed: {e}") + + def ensure_runtime_context(self) -> None: + """Ensure runtime context is initialized. + + This method should be called before any runtime operations + that require system and GPU context. + """ + if not self._system_context_initialized and not self._build_only_mode: + self.init_system_context() + if not self._gpu_context_initialized and not self._build_only_mode: + self.init_gpu_context() + + def ensure_system_context(self) -> None: + """Ensure system context is initialized. + + This method should be called when system context is needed + but may not be initialized (e.g., in build-only mode). + """ + if not self._system_context_initialized: + self.init_system_context() + + def _get_tool_manager(self) -> BaseGPUToolManager: + """Get GPU tool manager for the current vendor (lazy initialization). + + Returns: + GPU tool manager instance + + Raises: + ValueError: If GPU vendor cannot be determined or is unsupported + """ + if self._gpu_tool_manager is None: + # Determine vendor from context or detect automatically + if "MAD_GPU_VENDOR" in self.ctx.get("docker_env_vars", {}): + vendor_str = self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] + if vendor_str == "AMD": + vendor = GPUVendor.AMD + elif vendor_str == "NVIDIA": + vendor = GPUVendor.NVIDIA + else: + vendor = None # Auto-detect + else: + vendor = None # Auto-detect + + self._gpu_tool_manager = get_gpu_tool_manager(vendor) + + return self._gpu_tool_manager def get_ctx_test(self) -> str: """Get context test. - + Returns: str: The output of the shell command. @@ -176,30 +356,61 @@ def get_ctx_test(self) -> str: ) def get_gpu_vendor(self) -> str: - """Get GPU vendor. - + """Get GPU vendor with fallback support (PR #54). + Returns: - str: The output of the shell command. - + str: The GPU vendor ("NVIDIA", "AMD", or error message). + Raises: RuntimeError: If the GPU vendor is unable to detect. - + Note: What types of GPU vendors are supported? - NVIDIA - AMD + + PR #54 Enhancement: + Added fallback to rocm-smi if amd-smi is missing. """ - # Check if the GPU vendor is NVIDIA or AMD, and if it is unable to detect the GPU vendor. - return self.console.sh( - 'bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA"; elif [[ -f /opt/rocm/bin/amd-smi ]]; then echo "AMD"; elif [[ -f /usr/local/bin/amd-smi ]]; then echo "AMD"; else echo "Unable to detect GPU vendor"; fi || true\'' - ) + # Check NVIDIA first (simplest check) + if os.path.exists("/usr/bin/nvidia-smi"): + try: + result = self.console.sh("/usr/bin/nvidia-smi > /dev/null 2>&1 && echo 'NVIDIA' || echo ''", timeout=180) + if result and result.strip() == "NVIDIA": + return "NVIDIA" + except Exception as e: + print(f"Warning: nvidia-smi check failed: {e}") + + # Check AMD - try amd-smi first, fallback to rocm-smi (PR #54) + # Increased timeout to 180s for SLURM compute nodes where GPU initialization may be slow + amd_smi_paths = ["/opt/rocm/bin/amd-smi", "/usr/local/bin/amd-smi"] + for amd_smi_path in amd_smi_paths: + if os.path.exists(amd_smi_path): + try: + # Verify amd-smi actually works (180s timeout for slow GPU initialization) + result = self.console.sh(f"{amd_smi_path} list > /dev/null 2>&1 && echo 'AMD' || echo ''", timeout=180) + if result and result.strip() == "AMD": + return "AMD" + except Exception as e: + print(f"Warning: amd-smi check failed for {amd_smi_path}: {e}") + + # Fallback to rocm-smi (PR #54) + if os.path.exists("/opt/rocm/bin/rocm-smi"): + try: + result = self.console.sh("/opt/rocm/bin/rocm-smi --showid > /dev/null 2>&1 && echo 'AMD' || echo ''", timeout=180) + if result and result.strip() == "AMD": + return "AMD" + except Exception as e: + print(f"Warning: rocm-smi check failed: {e}") + + return "Unable to detect GPU vendor" def get_host_os(self) -> str: """Get host OS. - + Returns: str: The output of the shell command. - + Raises: RuntimeError: If the host OS is unable to detect. @@ -216,7 +427,7 @@ def get_host_os(self) -> str: def get_numa_balancing(self) -> bool: """Get NUMA balancing. - + Returns: bool: The output of the shell command. @@ -225,9 +436,9 @@ def get_numa_balancing(self) -> bool: Note: NUMA balancing is enabled if the output is '1', and disabled if the output is '0'. - + What is NUMA balancing? - Non-Uniform Memory Access (NUMA) is a computer memory design used in multiprocessing, + Non-Uniform Memory Access (NUMA) is a computer memory design used in multiprocessing, where the memory access time depends on the memory location relative to the processor. """ # Check if NUMA balancing is enabled or disabled. @@ -238,50 +449,60 @@ def get_numa_balancing(self) -> bool: return False def get_system_ngpus(self) -> int: - """Get system number of GPUs. - + """Get system number of GPUs using tool manager. + Returns: int: The number of GPUs. - + Raises: RuntimeError: If the GPU vendor is not detected or GPU count cannot be determined. - + Note: What types of GPU vendors are supported? - NVIDIA - AMD + + Enhancement: + Uses version-aware tool manager with automatic fallback (PR #54). """ - number_gpus = 0 - if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD": + vendor = self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] + + if vendor == "AMD": try: - number_gpus = int(self.console.sh("amd-smi list --csv | tail -n +3 | wc -l")) + tool_manager = self._get_tool_manager() + return tool_manager.get_gpu_count() except Exception as e: - # Try fallback to rocm-smi + raise RuntimeError( + f"Unable to determine number of AMD GPUs. " + f"Error: {e}" + ) + elif vendor == "NVIDIA": + try: + tool_manager = self._get_tool_manager() + return tool_manager.get_gpu_count() + except Exception as e: + # Fallback to direct command for NVIDIA (longer timeout for slow compute nodes) try: - number_gpus = int(self.console.sh("rocm-smi --showid --csv | tail -n +2 | wc -l")) + number_gpus = int(self.console.sh("nvidia-smi -L | wc -l", timeout=180)) + return number_gpus except Exception: raise RuntimeError( - f"Unable to determine number of AMD GPUs. " - f"Ensure amd-smi or rocm-smi is installed and GPUs are accessible. " - f"Original error: {e}" + f"Unable to determine number of NVIDIA GPUs. " + f"Error: {e}" ) - elif self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "NVIDIA": - number_gpus = int(self.console.sh("nvidia-smi -L | wc -l")) else: - raise RuntimeError("Unable to determine gpu vendor.") - - return number_gpus + raise RuntimeError(f"Unable to determine gpu vendor: {vendor}") def get_system_gpu_architecture(self) -> str: """Get system GPU architecture. - + Returns: str: The GPU architecture. - + Raises: RuntimeError: If the GPU vendor is not detected. RuntimeError: If the GPU architecture is unable to determine. - + Note: What types of GPU vendors are supported? - NVIDIA @@ -307,7 +528,7 @@ def get_system_gpu_architecture(self) -> str: raise RuntimeError("Unable to determine gpu architecture.") def get_system_gpu_product_name(self) -> str: - """Get system GPU product name. + """Get system GPU product name with fallback (PR #54). Returns: str: The GPU product name (e.g., AMD Instinct MI300X, NVIDIA H100 80GB HBM3). @@ -320,35 +541,86 @@ def get_system_gpu_product_name(self) -> str: What types of GPU vendors are supported? - NVIDIA - AMD + + PR #54 Enhancement: + Added rocm-smi fallback for AMD GPUs when amd-smi unavailable. """ - if self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "AMD": - return self.console.sh("amd-smi static -g 0 | grep MARKET_NAME: | cut -d ':' -f 2") - elif self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] == "NVIDIA": - return self.console.sh("nvidia-smi --query-gpu=name --format=csv,noheader,nounits -i 0") + vendor = self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] + + if vendor == "AMD": + try: + tool_manager = self._get_tool_manager() + return tool_manager.get_gpu_product_name(gpu_id=0) + except Exception as e: + raise RuntimeError( + f"Unable to determine AMD GPU product name. " + f"Error: {e}" + ) + elif vendor == "NVIDIA": + try: + tool_manager = self._get_tool_manager() + return tool_manager.get_gpu_product_name(gpu_id=0) + except Exception as e: + # Fallback to direct command for NVIDIA (longer timeout for slow compute nodes) + try: + return self.console.sh("nvidia-smi --query-gpu=name --format=csv,noheader,nounits -i 0", timeout=180) + except Exception: + raise RuntimeError( + f"Unable to determine NVIDIA GPU product name. " + f"Error: {e}" + ) else: - raise RuntimeError("Unable to determine gpu product name.") + raise RuntimeError(f"Unable to determine gpu product name for vendor: {vendor}") def get_system_hip_version(self): - if self.ctx['docker_env_vars']['MAD_GPU_VENDOR']=='AMD': + """Get HIP/CUDA version using tool manager. + + Returns: + str: Version string (e.g., "6.4" for ROCm, "12.0" for CUDA) + + Raises: + RuntimeError: If version cannot be determined + + Enhancement: + Uses tool manager for robust version detection with multiple fallbacks. + """ + vendor = self.ctx['docker_env_vars']['MAD_GPU_VENDOR'] + + if vendor == 'AMD': try: + tool_manager = self._get_tool_manager() + version_str = tool_manager.get_version() + if version_str: + # Return major.minor only (e.g., "6.4.1" -> "6.4") + parts = version_str.split('.') + if len(parts) >= 2: + return f"{parts[0]}.{parts[1]}" + return version_str + + # Fallback to hipconfig if tool manager fails version = self.console.sh("hipconfig --version | cut -d'.' -f1,2") if not version or version.strip() == "": raise RuntimeError("hipconfig returned empty version") return version + except Exception as e: raise RuntimeError( f"Unable to determine HIP version. " f"Ensure ROCm is installed and hipconfig is accessible. " f"Error: {e}" ) - elif self.ctx['docker_env_vars']['MAD_GPU_VENDOR']=='NVIDIA': - return self.console.sh("nvcc --version | sed -n 's/^.*release \\([0-9]\\+\\.[0-9]\\+\\).*$/\\1/p'") + elif vendor == 'NVIDIA': + try: + tool_manager = self._get_tool_manager() + return tool_manager.get_version() or self.console.sh("nvcc --version | sed -n 's/^.*release \\([0-9]\\+\\.[0-9]\\+\\).*$/\\1/p'") + except Exception: + return self.console.sh("nvcc --version | sed -n 's/^.*release \\([0-9]\\+\\.[0-9]\\+\\).*$/\\1/p'") else: - raise RuntimeError("Unable to determine hip version.") + raise RuntimeError(f"Unable to determine hip version for vendor: {vendor}") def get_docker_gpus(self) -> typing.Optional[str]: """Get Docker GPUs. - + Returns: str: The range of GPUs. """ @@ -360,7 +632,7 @@ def get_docker_gpus(self) -> typing.Optional[str]: def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: """Get GPU renderD nodes from KFD properties. - + Returns: list: The list of GPU renderD nodes, or None if not AMD GPU. @@ -386,37 +658,48 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: return gpu_renderDs try: - # Get ROCm version - rocm_version_str = self.console.sh("cat /opt/rocm/.info/version | cut -d'-' -f1") - if not rocm_version_str or rocm_version_str.strip() == "": - raise RuntimeError("Failed to retrieve ROCm version from /opt/rocm/.info/version") - - # Parse version safely + # Get ROCm version using tool manager for robust detection (PR #54) try: - rocm_version = tuple(map(int, rocm_version_str.strip().split("."))) - except (ValueError, AttributeError) as e: - raise RuntimeError(f"Failed to parse ROCm version '{rocm_version_str}': {e}") + tool_manager = self._get_tool_manager() + rocm_version = tool_manager.get_rocm_version() + if not rocm_version: + raise RuntimeError("Tool manager returned None for ROCm version") + except Exception as e: + # Fallback to direct file read + rocm_version_str = self.console.sh("cat /opt/rocm/.info/version | cut -d'-' -f1") + if not rocm_version_str or rocm_version_str.strip() == "": + raise RuntimeError("Failed to retrieve ROCm version from /opt/rocm/.info/version") + + # Parse version safely + try: + rocm_version = tuple(map(int, rocm_version_str.strip().split("."))) + except (ValueError, AttributeError) as parse_err: + raise RuntimeError(f"Failed to parse ROCm version '{rocm_version_str}': {parse_err}") # Get renderDs from KFD properties - kfd_output = self.console.sh("grep -r drm_render_minor /sys/devices/virtual/kfd/kfd/topology/nodes") - if not kfd_output or kfd_output.strip() == "": - raise RuntimeError("Failed to retrieve KFD properties from /sys/devices/virtual/kfd/kfd/topology/nodes") - - kfd_properties = kfd_output.split("\n") - # Filter out empty lines and CPU entries (renderD value 0) - kfd_properties = [ - line for line in kfd_properties - if line.strip() and line.split() and int(line.split()[-1]) != 0 - ] - - if not kfd_properties: - raise RuntimeError("No valid GPU renderD entries found in KFD properties") - - kfd_renderDs = [int(line.split()[-1]) for line in kfd_properties] - - # Get gpu id - renderD mapping using unique id if ROCm < 6.4.0 and node id otherwise - # node id is more robust but is only available from 6.4.0 - if rocm_version < (6, 4, 0): + # Try KFD topology first (preferred), but gracefully handle permission errors + # On HPC/multi-user systems, KFD topology files may be restricted + kfd_renderDs = None + kfd_properties = [] + try: + kfd_output = self.console.sh("grep -r drm_render_minor /sys/devices/virtual/kfd/kfd/topology/nodes") + if kfd_output and kfd_output.strip(): + kfd_properties = kfd_output.split("\n") + # Filter out empty lines and CPU entries (renderD value 0) + kfd_properties = [ + line for line in kfd_properties + if line.strip() and line.split() and int(line.split()[-1]) != 0 + ] + if kfd_properties: + kfd_renderDs = [int(line.split()[-1]) for line in kfd_properties] + except Exception as kfd_error: + # KFD topology read failed (common on HPC clusters with restricted permissions) + # Will use amd-smi/rocm-smi fallback which provides renderD info directly + print(f"Note: KFD topology not accessible ({kfd_error}), using ROCm tools fallback") + + # Get gpu id - renderD mapping using unique id if ROCm < 6.4.1 and node id otherwise + # node id is more robust but is only available from 6.4.1 (PR #54) + if rocm_version < (6, 4, 1): # Legacy method using unique_id kfd_unique_output = self.console.sh("grep -r unique_id /sys/devices/virtual/kfd/kfd/topology/nodes") if not kfd_unique_output: @@ -446,8 +729,8 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: for unique_id, renderD in zip(kfd_unique_ids, kfd_renderDs) } - # Get GPU ID to unique ID mapping from rocm-smi - rsmi_output = self.console.sh("rocm-smi --showuniqueid | grep 'Unique.*:'") + # Get GPU ID to unique ID mapping from rocm-smi (longer timeout for slow compute nodes) + rsmi_output = self.console.sh("rocm-smi --showuniqueid | grep 'Unique.*:'", timeout=180) if not rsmi_output or rsmi_output.strip() == "": raise RuntimeError("Failed to retrieve unique IDs from rocm-smi") @@ -464,60 +747,90 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: except (IndexError, KeyError) as e: raise RuntimeError(f"Failed to map unique ID from line '{line}': {e}") else: - # Modern method using node_id (ROCm >= 6.4.0) - kfd_nodeids = [] - for line in kfd_properties: - try: - match = re.search(r"\d+", line.split()[0]) - if match: - kfd_nodeids.append(int(match.group())) - else: - print(f"Warning: Could not extract node ID from line: {line}") - except (IndexError, ValueError) as e: - print(f"Warning: Failed to parse node ID from line '{line}': {e}") - continue - - if len(kfd_nodeids) != len(kfd_renderDs): - raise RuntimeError( - f"Mismatch between node IDs count ({len(kfd_nodeids)}) " - f"and renderDs count ({len(kfd_renderDs)})" - ) - - # Map node ids to renderDs - nodeid_renderD_map = { - nodeid: renderD - for nodeid, renderD in zip(kfd_nodeids, kfd_renderDs) - } - - # Get list of GPUs from amd-smi - output = self.console.sh("amd-smi list -e --json") + # Modern method using amd-smi (ROCm >= 6.4.0) + # Get list of GPUs from amd-smi (redirect stderr to filter warnings) + # Longer timeout (180s) for slow GPU initialization on SLURM compute nodes + output = self.console.sh("amd-smi list -e --json 2>/dev/null || amd-smi list -e --json 2>&1", timeout=180) if not output or output.strip() == "": raise ValueError("Failed to retrieve AMD GPU data from amd-smi") + # amd-smi may output warnings before JSON - extract only JSON part + # Look for lines starting with '[' or '{' (JSON start) + json_start = -1 + lines = output.split('\n') + for i, line in enumerate(lines): + if line.strip().startswith('[') or line.strip().startswith('{'): + json_start = i + break + + if json_start >= 0: + json_output = '\n'.join(lines[json_start:]) + else: + json_output = output + try: - data = json.loads(output) + data = json.loads(json_output) except json.JSONDecodeError as e: - raise ValueError(f"Failed to parse amd-smi JSON output: {e}") + raise ValueError(f"Failed to parse amd-smi JSON output: {e}. Output was: {output[:200]}") if not data or not isinstance(data, list): raise ValueError("amd-smi returned empty or invalid data") - # Get gpu id to node id map from amd-smi - gpuid_nodeid_map = {} - for item in data: + # Check if we successfully got KFD renderDs + if kfd_renderDs: + # Original method: Map KFD renderDs via node_id from amd-smi + kfd_nodeids = [] + for line in kfd_properties: + try: + match = re.search(r"\d+", line.split()[0]) + if match: + kfd_nodeids.append(int(match.group())) + else: + print(f"Warning: Could not extract node ID from line: {line}") + except (IndexError, ValueError) as e: + print(f"Warning: Failed to parse node ID from line '{line}': {e}") + continue + + if len(kfd_nodeids) != len(kfd_renderDs): + raise RuntimeError( + f"Mismatch between node IDs count ({len(kfd_nodeids)}) " + f"and renderDs count ({len(kfd_renderDs)})" + ) + + # Map node ids to renderDs + nodeid_renderD_map = { + nodeid: renderD + for nodeid, renderD in zip(kfd_nodeids, kfd_renderDs) + } + + # Get gpu id to node id map from amd-smi + gpuid_nodeid_map = {} + for item in data: + try: + gpuid_nodeid_map[item["gpu"]] = item["node_id"] + except KeyError as e: + raise KeyError(f"Failed to parse node_id from amd-smi data: {e}. Item: {item}") + + # Sort gpu_renderDs based on gpu ids try: - gpuid_nodeid_map[item["gpu"]] = item["node_id"] + gpu_renderDs = [ + nodeid_renderD_map[gpuid_nodeid_map[gpuid]] + for gpuid in sorted(gpuid_nodeid_map.keys()) + ] except KeyError as e: - raise KeyError(f"Failed to parse node_id from amd-smi data: {e}. Item: {item}") - - # Sort gpu_renderDs based on gpu ids - try: - gpu_renderDs = [ - nodeid_renderD_map[gpuid_nodeid_map[gpuid]] - for gpuid in sorted(gpuid_nodeid_map.keys()) - ] - except KeyError as e: - raise RuntimeError(f"Failed to map GPU IDs to renderDs: {e}") + raise RuntimeError(f"Failed to map GPU IDs to renderDs: {e}") + else: + # Fallback method: Get renderD directly from amd-smi (ROCm >= 6.4.1) + # This is actually BETTER - no KFD topology parsing needed! + print("Using amd-smi renderD info directly (cleaner method)") + gpu_renderDs = [] + for item in sorted(data, key=lambda x: x["gpu"]): + try: + render_str = item["render"] # e.g., "renderD128" + render_num = int(render_str.replace("renderD", "")) + gpu_renderDs.append(render_num) + except (KeyError, ValueError) as e: + raise RuntimeError(f"Failed to parse renderD from amd-smi: {e}. Item: {item}") except (RuntimeError, ValueError, KeyError) as e: # Re-raise with context @@ -528,49 +841,12 @@ def get_gpu_renderD_nodes(self) -> typing.Optional[typing.List[int]]: return gpu_renderDs - def set_multi_node_runner(self) -> str: - """ - Sets the `MAD_MULTI_NODE_RUNNER` environment variable based on the selected multi-node - runner (e.g., `torchrun`, `mpirun`, or fallback to `python3`). This method dynamically - generates the appropriate command based on the provided multi-node configuration. - - Returns: - str: The command string for the multi-node runner, including necessary arguments and - environment variable settings. - """ - # NOTE: mpirun is untested - if self.ctx["multi_node_args"]["RUNNER"] == 'mpirun': - if not self.ctx["multi_node_args"]["HOST_LIST"]: - self.ctx["multi_node_args"]["HOST_LIST"] = f"localhost:{self.ctx['multi_node_args']['MAD_RUNTIME_NGPUS']}" - multi_node_runner = ( - f"mpirun -np {self.ctx['multi_node_args']['NNODES'] * self.ctx['multi_node_args']['MAD_RUNTIME_NGPUS']} " - f"--host {self.ctx['multi_node_args']['HOST_LIST']}" - ) - else: - distributed_args = ( - f"--nproc_per_node {self.ctx['multi_node_args']['MAD_RUNTIME_NGPUS']} " - f"--nnodes {self.ctx['multi_node_args']['NNODES']} " - f"--node_rank {self.ctx['multi_node_args']['NODE_RANK']} " - f"--master_addr {self.ctx['multi_node_args']['MASTER_ADDR']} " - f"--master_port {self.ctx['multi_node_args']['MASTER_PORT']}" - ) - multi_node_runner = f"torchrun {distributed_args}" - - # Add NCCL and GLOO interface environment variables - multi_node_runner = ( - f"NCCL_SOCKET_IFNAME={self.ctx['multi_node_args']['NCCL_SOCKET_IFNAME']} " - f"GLOO_SOCKET_IFNAME={self.ctx['multi_node_args']['GLOO_SOCKET_IFNAME']} " - f"{multi_node_runner}" - ) - - return multi_node_runner - def filter(self, unfiltered: typing.Dict) -> typing.Dict: """Filter the unfiltered dictionary based on the context. - + Args: unfiltered: The unfiltered dictionary. - + Returns: dict: The filtered dictionary. """ diff --git a/src/madengine/core/dataprovider.py b/src/madengine/core/dataprovider.py index 29e675fe..0fa9b130 100644 --- a/src/madengine/core/dataprovider.py +++ b/src/madengine/core/dataprovider.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Data Provider module for MADEngine +"""Data Provider module for madengine This module provides data to the models. It can provide data from different sources like local, NAS, AWS, etc. @@ -24,7 +24,7 @@ import time import typing -# MADEngine modules +# madengine modules from madengine.core.console import Console from madengine.core.context import Context from madengine.core.docker import Docker @@ -118,7 +118,7 @@ def prepare_data(self, model_docker: Docker) -> bool: Args: model_docker: The model docker object - + Returns: bool: The status of preparing the data """ @@ -135,23 +135,19 @@ class CustomDataProvider(DataProvider): provider_type = "custom" - def __init__( - self, - dataname: str, - config: typing.Dict - ) -> None: + def __init__(self, dataname: str, config: typing.Dict) -> None: """Constructor of the CustomDataProvider class.""" super().__init__(dataname, config) def check_source(self, config: typing.Dict) -> bool: """Check if the data source is valid - + Args: config (dict): Configuration of the data provider - + Returns: bool: The status of the data source - + Raises: RuntimeError: Raised when the mirrorlocal path is a non-existent path """ @@ -165,7 +161,7 @@ def check_source(self, config: typing.Dict) -> bool: os.makedirs( self.config["mirrorlocal"] + "/" + self.dataname, exist_ok=True ) - + # get the base directory of the current file. BASE_DIR = os.path.dirname(os.path.realpath(__file__)) print("DEBUG - BASE_DIR::", BASE_DIR) @@ -269,7 +265,7 @@ def check_source(self, config): return True else: print(f"Failed to connect to NAS {self.name} at {self.ip}:{self.port}") - + print("Failed to connect to all available NAS nodes.") return False @@ -333,7 +329,7 @@ def prepare_data(self, model_docker): touch ~/.ssh/known_hosts ssh-keyscan -p {port} {ip} >> ~/.ssh/known_hosts echo '#!/bin/bash' > /tmp/ssh.sh - echo 'sshpass -p {password} rsync --progress -avz -e \\\"ssh -p {port} \\\" \\\"\$@\\\"' >> /tmp/ssh.sh + echo 'sshpass -p {password} rsync --progress -avz -e \\"ssh -p {port} \\" \\"\\$@\\"' >> /tmp/ssh.sh cat /tmp/ssh.sh chmod u+x /tmp/ssh.sh timeout --preserve-status {timeout} /tmp/ssh.sh {username}@{ip}:{datapath}/* {datahome} && rm -f /tmp/ssh.sh @@ -371,7 +367,7 @@ def prepare_data(self, model_docker): touch ~/.ssh/known_hosts ssh-keyscan -p {port} {ip} >> ~/.ssh/known_hosts echo '#!/bin/bash' > /tmp/ssh.sh - echo 'sshpass -p {password} ssh -v \$*' >> /tmp/ssh.sh + echo 'sshpass -p {password} ssh -v \\$*' >> /tmp/ssh.sh chmod u+x /tmp/ssh.sh timeout --preserve-status {timeout} mount -t fuse sshfs#{username}@{ip}:{datapath} {datahome} -o ssh_command=/tmp/ssh.sh,port={port} && rm -f /tmp/ssh.sh """ @@ -507,7 +503,7 @@ def check_source(self, config): except Exception as e: print(f"Failed to connect to Minio endpoint ({self.minio_endpoint}): {e}") return False - + return True def get_mountpath(self): @@ -545,7 +541,7 @@ def prepare_data(self, model_docker): datahome=datahome, dataname=self.dataname, ) - + # Measure time taken to copy data from MinIO to local start = time.time() model_docker.sh(cmd, timeout=3600) # 60 min timeout @@ -553,13 +549,13 @@ def prepare_data(self, model_docker): self.duration = end - start print("Copy data from MinIO to local") print("Data Download Duration: {} seconds".format(self.duration)) - + # Get the size of the data of dataname in the path of datahome and store it in the config cmd = f"du -sh {datahome} | cut -f1" data_size = model_docker.sh(cmd) self.size = data_size print("Data Size: ", self.size) - + return True @@ -721,9 +717,11 @@ def find_dataprovider(self, dataname: str) -> typing.Optional[DataProvider]: self.selected_data_provider = { "dataname": dataname, "data_provider_type": data_provider_type, - "data_provider_config": self.data_provider_config[dataname][data_provider_type], + "data_provider_config": self.data_provider_config[dataname][ + data_provider_type + ], "duration": data_provider.duration, - "size": data_provider.size + "size": data_provider.size, } break diff --git a/src/madengine/core/docker.py b/src/madengine/core/docker.py index 7ed4ff36..9d331a6b 100644 --- a/src/madengine/core/docker.py +++ b/src/madengine/core/docker.py @@ -8,6 +8,7 @@ # built-in modules import os import typing + # user-defined modules from madengine.core.console import Console @@ -59,31 +60,33 @@ def __init__( container_name_exists = self.console.sh( "docker container ps -a | grep " + container_name + " | wc -l" ) - # if container name exists, raise error. + # if container name exists, clean it up automatically if container_name_exists != "0": - raise RuntimeError( - "Container with name, " - + container_name - + " already exists. " - + "Please stop (docker stop --time=1 SHA) and remove this (docker rm -f SHA) to proceed..." + print( + f"⚠️ Container '{container_name}' already exists. Cleaning up..." + ) + # Stop the container (with timeout) + self.console.sh( + f"docker stop --timeout=1 {container_name} 2>/dev/null || true" + ) + # Remove the container + self.console.sh( + f"docker rm -f {container_name} 2>/dev/null || true" ) + print(f"✓ Cleaned up existing container '{container_name}'") # run docker command - command = ( - "docker run -t -d -u " - + self.userid - + ":" - + self.groupid - + " " - + dockerOpts - + " " - ) + command = "docker run -t -d " + # Conditionally add -u flag if not already present in dockerOpts + if "-u " not in dockerOpts: + command += f"-u {self.userid}:{self.groupid} " + command += dockerOpts + " " # add mounts if mounts is not None: for mount in mounts: command += "-v " + mount + ":" + mount + " " - + # add current working directory command += "-v " + cwd + ":/myworkspace/ " @@ -91,12 +94,14 @@ def __init__( if envVars is not None: for evar in envVars.keys(): command += "-e " + evar + "=" + envVars[evar] + " " - + command += "--workdir /myworkspace/ " command += "--name " + container_name + " " command += image + " " - - # hack to keep docker open + + # Use 'cat' to keep container alive (blocks waiting for stdin) + # Works reliably across all deployment types (local, k8s, slurm) + # with fresh image pulls preventing corrupted layer issues command += "cat " self.console.sh(command) @@ -105,19 +110,14 @@ def __init__( "docker ps -aqf 'name=" + container_name + "' " ) - def sh( - self, - command: str, - timeout: int=60, - secret: bool=False - ) -> str: + def sh(self, command: str, timeout: int = 60, secret: bool = False) -> str: """Run shell command inside docker. - + Args: command (str): The shell command. timeout (int): The timeout in seconds. secret (bool): The flag to hide the command. - + Returns: str: The output of the shell command. """ diff --git a/src/madengine/core/errors.py b/src/madengine/core/errors.py new file mode 100644 index 00000000..411e19df --- /dev/null +++ b/src/madengine/core/errors.py @@ -0,0 +1,386 @@ +#!/usr/bin/env python3 +""" +Unified Error Handling System for madengine + +This module provides a centralized error handling system with structured +error types and consistent Rich console-based error reporting. +""" + +import logging +import traceback +from dataclasses import dataclass +from typing import Optional, Any, Dict, List +from enum import Enum + +try: + from rich.console import Console + from rich.panel import Panel + from rich.text import Text + from rich.table import Table +except ImportError: + raise ImportError("Rich is required for error handling. Install with: pip install rich") + + +class ErrorCategory(Enum): + """Error category enumeration for classification.""" + + VALIDATION = "validation" + CONNECTION = "connection" + AUTHENTICATION = "authentication" + RUNTIME = "runtime" + BUILD = "build" + DISCOVERY = "discovery" + ORCHESTRATION = "orchestration" + RUNNER = "runner" + CONFIGURATION = "configuration" + TIMEOUT = "timeout" + + +@dataclass +class ErrorContext: + """Context information for errors.""" + + operation: str + phase: Optional[str] = None + component: Optional[str] = None + model_name: Optional[str] = None + node_id: Optional[str] = None + file_path: Optional[str] = None + additional_info: Optional[Dict[str, Any]] = None + + +class MADEngineError(Exception): + """Base exception for all madengine errors.""" + + def __init__( + self, + message: str, + category: ErrorCategory, + context: Optional[ErrorContext] = None, + cause: Optional[Exception] = None, + recoverable: bool = False, + suggestions: Optional[List[str]] = None + ): + super().__init__(message) + self.message = message + self.category = category + self.context = context or ErrorContext(operation="unknown") + self.cause = cause + self.recoverable = recoverable + self.suggestions = suggestions or [] + + +class ValidationError(MADEngineError): + """Validation and input errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.VALIDATION, + context, + recoverable=True, + **kwargs + ) + + +class ConnectionError(MADEngineError): + """Connection and network errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.CONNECTION, + context, + recoverable=True, + **kwargs + ) + + +class AuthenticationError(MADEngineError): + """Authentication and credential errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.AUTHENTICATION, + context, + recoverable=True, + **kwargs + ) + + +class RuntimeError(MADEngineError): + """Runtime execution errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.RUNTIME, + context, + recoverable=False, + **kwargs + ) + + +class BuildError(MADEngineError): + """Build and compilation errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.BUILD, + context, + recoverable=False, + **kwargs + ) + + +class DiscoveryError(MADEngineError): + """Model discovery errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.DISCOVERY, + context, + recoverable=True, + **kwargs + ) + + +class OrchestrationError(MADEngineError): + """Distributed orchestration errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.ORCHESTRATION, + context, + recoverable=False, + **kwargs + ) + + +class RunnerError(MADEngineError): + """Distributed runner errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.RUNNER, + context, + recoverable=True, + **kwargs + ) + + +class ConfigurationError(MADEngineError): + """Configuration and setup errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.CONFIGURATION, + context, + recoverable=True, + **kwargs + ) + + +class TimeoutError(MADEngineError): + """Timeout and duration errors.""" + + def __init__(self, message: str, context: Optional[ErrorContext] = None, **kwargs): + super().__init__( + message, + ErrorCategory.TIMEOUT, + context, + recoverable=True, + **kwargs + ) + + +class ErrorHandler: + """Unified error handler with Rich console integration.""" + + def __init__(self, console: Optional[Console] = None, verbose: bool = False): + self.console = console or Console() + self.verbose = verbose + self.logger = logging.getLogger(__name__) + + def handle_error( + self, + error: Exception, + context: Optional[ErrorContext] = None, + show_traceback: Optional[bool] = None + ) -> None: + """Handle and display errors with rich formatting.""" + + show_tb = show_traceback if show_traceback is not None else self.verbose + + if isinstance(error, MADEngineError): + self._handle_madengine_error(error, show_tb) + else: + self._handle_generic_error(error, context, show_tb) + + def _handle_madengine_error(self, error: MADEngineError, show_traceback: bool) -> None: + """Handle madengine structured errors.""" + + # Determine error emoji and color + category_info = { + ErrorCategory.VALIDATION: ("⚠️", "yellow"), + ErrorCategory.CONNECTION: ("🔌", "blue"), + ErrorCategory.AUTHENTICATION: ("🔒", "red"), + ErrorCategory.RUNTIME: ("💥", "red"), + ErrorCategory.BUILD: ("🔨", "red"), + ErrorCategory.DISCOVERY: ("🔍", "yellow"), + ErrorCategory.ORCHESTRATION: ("⚡", "red"), + ErrorCategory.RUNNER: ("🚀", "red"), + ErrorCategory.CONFIGURATION: ("⚙️", "yellow"), + ErrorCategory.TIMEOUT: ("⏱️", "yellow"), + } + + emoji, color = category_info.get(error.category, ("❌", "red")) + + # Create error panel + title = f"{emoji} {error.category.value.title()} Error" + + # Build error content + content = Text() + content.append(f"{error.message}\n", style=f"bold {color}") + + # Add context information + if error.context: + content.append("\n📋 Context:\n", style="bold cyan") + if error.context.operation: + content.append(f" Operation: {error.context.operation}\n") + if error.context.phase: + content.append(f" Phase: {error.context.phase}\n") + if error.context.component: + content.append(f" Component: {error.context.component}\n") + if error.context.model_name: + content.append(f" Model: {error.context.model_name}\n") + if error.context.node_id: + content.append(f" Node: {error.context.node_id}\n") + if error.context.file_path: + content.append(f" File: {error.context.file_path}\n") + + # Add cause information + if error.cause: + content.append(f"\n🔗 Caused by: {str(error.cause)}\n", style="dim") + + # Add suggestions + if error.suggestions: + content.append("\n💡 Suggestions:\n", style="bold green") + for suggestion in error.suggestions: + content.append(f" • {suggestion}\n", style="green") + + # Add recovery information + if error.recoverable: + content.append("\n♻️ This error may be recoverable", style="bold blue") + + panel = Panel( + content, + title=title, + border_style=color, + expand=False + ) + + self.console.print(panel) + + # Show traceback if requested + if show_traceback and error.cause: + self.console.print("\n📚 [bold]Full Traceback:[/bold]") + self.console.print_exception() + + # Log to file + self.logger.error( + f"{error.category.value}: {error.message}", + extra={ + "context": error.context.__dict__ if error.context else {}, + "recoverable": error.recoverable, + "suggestions": error.suggestions + } + ) + + def _handle_generic_error( + self, + error: Exception, + context: Optional[ErrorContext], + show_traceback: bool + ) -> None: + """Handle generic Python exceptions.""" + + title = f"❌ {type(error).__name__}" + + content = Text() + content.append(f"{str(error)}\n", style="bold red") + + if context: + content.append("\n📋 Context:\n", style="bold cyan") + content.append(f" Operation: {context.operation}\n") + if context.phase: + content.append(f" Phase: {context.phase}\n") + if context.component: + content.append(f" Component: {context.component}\n") + + panel = Panel( + content, + title=title, + border_style="red", + expand=False + ) + + self.console.print(panel) + + if show_traceback: + self.console.print("\n📚 [bold]Full Traceback:[/bold]") + self.console.print_exception() + + # Log to file + self.logger.error(f"{type(error).__name__}: {str(error)}") + + +# Global error handler instance +_global_error_handler: Optional[ErrorHandler] = None + + +def set_error_handler(handler: ErrorHandler) -> None: + """Set the global error handler.""" + global _global_error_handler + _global_error_handler = handler + + +def get_error_handler() -> Optional[ErrorHandler]: + """Get the global error handler.""" + return _global_error_handler + + +def handle_error( + error: Exception, + context: Optional[ErrorContext] = None, + show_traceback: Optional[bool] = None +) -> None: + """Handle error using the global error handler.""" + if _global_error_handler: + _global_error_handler.handle_error(error, context, show_traceback) + else: + # Fallback to basic logging + logging.error(f"Error: {error}") + if show_traceback: + logging.exception("Exception details:") + + +def create_error_context( + operation: str, + phase: Optional[str] = None, + component: Optional[str] = None, + **kwargs +) -> ErrorContext: + """Convenience function to create error context.""" + return ErrorContext( + operation=operation, + phase=phase, + component=component, + **kwargs + ) \ No newline at end of file diff --git a/src/madengine/core/timeout.py b/src/madengine/core/timeout.py index 705a972a..0f72bd84 100644 --- a/src/madengine/core/timeout.py +++ b/src/madengine/core/timeout.py @@ -12,16 +12,14 @@ class Timeout: """Class to handle timeouts. - + Attributes: seconds (int): The timeout in seconds. """ - def __init__( - self, - seconds: int=15 - ) -> None: + + def __init__(self, seconds: int = 15) -> None: """Constructor of the Timeout class. - + Args: seconds (int): The timeout in seconds. """ @@ -29,14 +27,14 @@ def __init__( def handle_timeout(self, signum, frame) -> None: """Handle timeout. - + Args: signum: The signal number. frame: The frame. Returns: None - + Raises: TimeoutError: If the program times out. """ diff --git a/src/madengine/database/README.md b/src/madengine/database/README.md new file mode 100644 index 00000000..2c8e5f9f --- /dev/null +++ b/src/madengine/database/README.md @@ -0,0 +1,114 @@ +# Database Layer (Future MongoDB Ingestion) + +**Status**: Planned for future development +**Purpose**: Modern data ingestion API for local and distributed deployments + +--- + +## 🎯 Objective + +This directory is reserved for a future unified database ingestion layer that will support: +- MongoDB data persistence +- Local result storage +- Distributed data collection from build and run phases +- Unified API for performance metrics ingestion + +--- + +## 📋 Current State + +⚠️ **Not yet implemented**. This directory is a placeholder for future development. + +For current database operations, use the existing `db/` package which handles MySQL operations via SSH. + +--- + +## 🗂️ Legacy MySQL Tools (Removed) + +**MySQL support has been removed from madengine**. The following tools are no longer available: + +| File | Purpose | Status | +|------|---------|--------| +| ~~`tools/create_table_db.py`~~ | MySQL table creation | **REMOVED** | +| ~~`tools/update_table_db.py`~~ | MySQL table updates | **REMOVED** | +| ~~`db/` package~~ | MySQL operations via SSH | **REMOVED** | + +For database operations, use MongoDB via the `database` command in the new CLI or legacy `mad.py`. + +--- + +## 🚀 Future Implementation Plan + +When implemented, this layer will provide: + +### **1. MongoDB Client** (`mongodb_client.py`) +```python +from madengine.database.mongodb_client import MongoDBClient + +# Connect to local or remote MongoDB +client = MongoDBClient(connection_string="mongodb://localhost:27017") + +# Ingest build results +client.ingest_build_results(build_manifest) + +# Ingest run results +client.ingest_run_results(run_summary) +``` + +### **2. Local Storage** (`local_storage.py`) +```python +from madengine.database.local_storage import LocalStorage + +# Store results locally (JSON, Parquet, etc.) +storage = LocalStorage(base_path="./madengine_results") +storage.save_results(results_dict) +``` + +### **3. Unified API** (`api.py`) +```python +from madengine.database import ingest_results + +# Works with both local and distributed deployments +ingest_results( + results=run_summary, + target="mongodb", # or "local", "mysql" + config={"connection": "mongodb://..."} +) +``` + +--- + +## 📦 Difference from `db/` Package (Removed) + +| Aspect | `db/` (Removed) | `database/` (Current) | +|--------|------------------|---------------------| +| **Purpose** | MySQL operations via SSH | MongoDB support | +| **Target** | Remote MySQL server | Local/distributed MongoDB | +| **Transport** | SSH tunnel | Direct connection | +| **Status** | **REMOVED** | Active | + +--- + +## 🔄 Migration Status + +MySQL support has been fully removed from madengine: + +1. ✅ **Phase 1**: Removed `db/` package (MySQL operations) +2. ✅ **Phase 2**: Removed `tools/create_table_db.py` and `tools/update_table_db.py` +3. ✅ **Phase 3**: Removed `utils/ssh_to_db.py` (SSH to MySQL host) +4. ✅ **Phase 4**: Removed MySQL dependencies (`mysql-connector-python`, `pymysql`) + +**Current state**: Only MongoDB support remains via the `database/` package. + +--- + +## 📚 References + +- **MongoDB package**: `src/madengine/database/mongodb.py` +- **CLI database command**: `madengine database --help` + +--- + +**Last Updated**: November 30, 2025 +**Maintainer**: madengine Team + diff --git a/src/madengine/database/__init__.py b/src/madengine/database/__init__.py new file mode 100644 index 00000000..89c630c0 --- /dev/null +++ b/src/madengine/database/__init__.py @@ -0,0 +1,25 @@ +"""Database operations module for madengine. + +This module provides database operations for MongoDB. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +from .mongodb import ( + MongoDBHandler, + upload_csv_to_mongodb, + upload_file_to_mongodb, + MongoDBConfig, + UploadOptions, + UploadResult, +) + +__all__ = [ + "MongoDBHandler", + "upload_csv_to_mongodb", + "upload_file_to_mongodb", + "MongoDBConfig", + "UploadOptions", + "UploadResult", +] + diff --git a/src/madengine/database/mongodb.py b/src/madengine/database/mongodb.py new file mode 100644 index 00000000..8727222c --- /dev/null +++ b/src/madengine/database/mongodb.py @@ -0,0 +1,787 @@ +""" +Modern MongoDB operations for madengine. + +A clean, efficient implementation supporting CSV and JSON uploads with +intelligent type handling, bulk operations, and production-ready features. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import logging +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional +from enum import Enum + +import pandas as pd +import pymongo +from pymongo import UpdateOne +from pymongo.errors import BulkWriteError, ConnectionFailure, PyMongoError +from rich.console import Console +from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn + +logger = logging.getLogger(__name__) +console = Console() + + +# ============================================================================ +# Configuration +# ============================================================================ + +@dataclass +class MongoDBConfig: + """MongoDB connection configuration.""" + + host: str = "localhost" + port: int = 27017 + username: str = "" + password: str = "" + auth_source: str = "admin" + timeout_ms: int = 5000 + + @classmethod + def from_env(cls) -> 'MongoDBConfig': + """Load configuration from environment variables.""" + import os + return cls( + host=os.getenv("MONGO_HOST", "localhost"), + port=int(os.getenv("MONGO_PORT", "27017")), + username=os.getenv("MONGO_USER", ""), + password=os.getenv("MONGO_PASSWORD", ""), + auth_source=os.getenv("MONGO_AUTH_SOURCE", "admin"), + timeout_ms=int(os.getenv("MONGO_TIMEOUT_MS", "5000")) + ) + + @property + def uri(self) -> str: + """Build MongoDB connection URI.""" + if self.username and self.password: + return (f"mongodb://{self.username}:{self.password}@" + f"{self.host}:{self.port}/{self.auth_source}") + return f"mongodb://{self.host}:{self.port}" + + +@dataclass +class UploadOptions: + """Options for document upload.""" + + # Deduplication strategy + unique_fields: Optional[List[str]] = None # Fields to use for uniqueness + upsert: bool = True # Update existing or insert only + + # Performance options + batch_size: int = 1000 # Documents per batch + ordered: bool = False # Continue on error + + # Index creation + create_indexes: bool = True + index_fields: Optional[List[str]] = None # Auto-detect if None + + # Metadata + add_metadata: bool = True + metadata_prefix: str = "_meta" + + # Validation + validate_schema: bool = True + + # Dry run + dry_run: bool = False + + +@dataclass +class UploadResult: + """Result of upload operation.""" + + status: str # success, partial, failed + documents_read: int + documents_processed: int + documents_inserted: int + documents_updated: int + documents_failed: int + errors: List[str] = field(default_factory=list) + duration_seconds: float = 0.0 + + def print_summary(self): + """Print formatted summary.""" + if self.status == "success": + console.print(f"✅ [bold green]Upload successful![/bold green]") + elif self.status == "partial": + console.print(f"⚠️ [bold yellow]Partial success[/bold yellow]") + else: + console.print(f"❌ [bold red]Upload failed[/bold red]") + + console.print(f" 📊 Documents read: {self.documents_read}") + console.print(f" ✨ Documents processed: {self.documents_processed}") + console.print(f" ➕ Inserted: {self.documents_inserted}") + console.print(f" 🔄 Updated: {self.documents_updated}") + if self.documents_failed > 0: + console.print(f" ❌ Failed: {self.documents_failed}") + console.print(f" ⏱️ Duration: {self.duration_seconds:.2f}s") + + +# ============================================================================ +# File Loaders (Strategy Pattern) +# ============================================================================ + +class FileFormat(Enum): + """Supported file formats.""" + CSV = "csv" + JSON = "json" + + +class DocumentLoader(ABC): + """Abstract base class for document loaders.""" + + @abstractmethod + def load(self, file_path: Path) -> List[Dict[str, Any]]: + """Load documents from file.""" + pass + + @abstractmethod + def infer_schema(self, documents: List[Dict[str, Any]]) -> Dict[str, type]: + """Infer schema from documents.""" + pass + + +class JSONLoader(DocumentLoader): + """Loader for JSON files with native type preservation.""" + + def load(self, file_path: Path) -> List[Dict[str, Any]]: + """Load JSON file preserving native types.""" + logger.info(f"Loading JSON file: {file_path}") + + with open(file_path, 'r') as f: + data = json.load(f) + + # Normalize to list + if isinstance(data, dict): + documents = [data] + elif isinstance(data, list): + documents = data + else: + raise ValueError(f"Expected JSON object or array, got {type(data)}") + + # Validate structure + for i, doc in enumerate(documents): + if not isinstance(doc, dict): + raise ValueError(f"Document {i} is not a JSON object: {type(doc)}") + + logger.info(f"Loaded {len(documents)} documents from JSON") + return documents + + def infer_schema(self, documents: List[Dict[str, Any]]) -> Dict[str, type]: + """Infer schema from JSON documents.""" + if not documents: + return {} + + schema = {} + sample_doc = documents[0] + + for key, value in sample_doc.items(): + schema[key] = type(value) + + return schema + + +class CSVLoader(DocumentLoader): + """Loader for CSV files with intelligent type inference.""" + + def load(self, file_path: Path) -> List[Dict[str, Any]]: + """Load CSV file with type inference.""" + logger.info(f"Loading CSV file: {file_path}") + + # Read CSV with pandas (intelligent type inference) + df = pd.read_csv(file_path) + + # Clean column names + df.columns = df.columns.str.strip() + + # Convert to documents with native types preserved + documents = [] + for _, row in df.iterrows(): + doc = {} + for col in df.columns: + value = row[col] + # Handle pandas NA/NaN + if pd.isna(value): + doc[col] = None + # Try to parse JSON strings (for configs, multi_results) + elif isinstance(value, str) and value.strip().startswith(('{', '[')): + try: + doc[col] = json.loads(value) + except json.JSONDecodeError: + doc[col] = value + else: + # Keep native type (int, float, bool, str) + doc[col] = value if not pd.isna(value) else None + + documents.append(doc) + + logger.info(f"Loaded {len(documents)} documents from CSV") + return documents + + def infer_schema(self, documents: List[Dict[str, Any]]) -> Dict[str, type]: + """Infer schema from CSV documents.""" + if not documents: + return {} + + schema = {} + sample_doc = documents[0] + + for key, value in sample_doc.items(): + if value is None: + schema[key] = type(None) + else: + schema[key] = type(value) + + return schema + + +def detect_file_format(file_path: Path) -> FileFormat: + """Detect file format from extension and content.""" + + extension = file_path.suffix.lower() + + if extension == '.json': + return FileFormat.JSON + elif extension == '.csv': + return FileFormat.CSV + + # Content-based detection + try: + with open(file_path, 'r') as f: + first_char = f.read(1).strip() + if first_char in ['{', '[']: + return FileFormat.JSON + else: + return FileFormat.CSV + except Exception: + raise ValueError(f"Cannot detect format for {file_path}") + + +def get_loader(file_format: FileFormat) -> DocumentLoader: + """Get appropriate loader for file format.""" + loaders = { + FileFormat.JSON: JSONLoader(), + FileFormat.CSV: CSVLoader(), + } + return loaders[file_format] + + +# ============================================================================ +# Document Transformer +# ============================================================================ + +class DocumentTransformer: + """Transform and enrich documents before upload.""" + + def __init__(self, options: UploadOptions): + self.options = options + + def transform(self, documents: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Transform documents with metadata and normalization.""" + transformed = [] + + for doc in documents: + # Add metadata + if self.options.add_metadata: + doc = self._add_metadata(doc) + + # Normalize types + doc = self._normalize_types(doc) + + transformed.append(doc) + + return transformed + + def _add_metadata(self, doc: Dict[str, Any]) -> Dict[str, Any]: + """Add metadata fields.""" + prefix = self.options.metadata_prefix + + # Add upload timestamp if not present + if f"{prefix}_uploaded_at" not in doc: + doc[f"{prefix}_uploaded_at"] = datetime.utcnow() + + # Preserve original created_date if present + if "created_date" not in doc: + doc["created_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + + return doc + + def _normalize_types(self, doc: Dict[str, Any]) -> Dict[str, Any]: + """Normalize types for MongoDB compatibility.""" + normalized = {} + + for key, value in doc.items(): + # Handle numpy types (from pandas) + if hasattr(value, 'item'): # numpy scalar + value = value.item() + + # Convert pandas Timestamp to datetime + if hasattr(value, 'to_pydatetime'): + value = value.to_pydatetime() + + # Keep None as None (not empty string) + if pd.isna(value): + value = None + + normalized[key] = value + + return normalized + + def infer_unique_fields(self, documents: List[Dict[str, Any]]) -> List[str]: + """Intelligently infer unique identifier fields.""" + if not documents: + return [] + + # Common unique field patterns + candidate_fields = ['model', 'name', 'id', 'timestamp', 'date', 'pipeline'] + + available_fields = set(documents[0].keys()) + unique_fields = [] + + for field in candidate_fields: + if field in available_fields: + # Check if field has unique values + values = [doc.get(field) for doc in documents[:100]] # Sample + if len(set(str(v) for v in values if v is not None)) == len([v for v in values if v is not None]): + unique_fields.append(field) + break # Found a unique field + + # If no single unique field, try combinations + if not unique_fields and 'model' in available_fields: + unique_fields = ['model'] + if 'timestamp' in available_fields: + unique_fields.append('timestamp') + + return unique_fields + + +# ============================================================================ +# MongoDB Uploader +# ============================================================================ + +class MongoDBUploader: + """Handles MongoDB connection and bulk upload operations.""" + + def __init__(self, config: MongoDBConfig): + self.config = config + self.client: Optional[pymongo.MongoClient] = None + + def __enter__(self): + """Context manager entry.""" + self.connect() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self.disconnect() + + def connect(self): + """Establish MongoDB connection.""" + logger.info(f"Connecting to MongoDB at {self.config.host}:{self.config.port}") + + self.client = pymongo.MongoClient( + self.config.uri, + serverSelectionTimeoutMS=self.config.timeout_ms + ) + + # Test connection + self.client.server_info() + logger.info("✅ Connected to MongoDB") + + def disconnect(self): + """Close MongoDB connection.""" + if self.client: + self.client.close() + logger.info("Disconnected from MongoDB") + + def upload( + self, + documents: List[Dict[str, Any]], + database_name: str, + collection_name: str, + options: UploadOptions + ) -> UploadResult: + """Upload documents to MongoDB with bulk operations.""" + + start_time = datetime.now() + + # Get collection + db = self.client[database_name] + collection = db[collection_name] + + # Create indexes if requested + if options.create_indexes: + self._create_indexes(collection, documents, options) + + # Perform bulk upload + result = self._bulk_upload(collection, documents, options) + + # Calculate duration + result.duration_seconds = (datetime.now() - start_time).total_seconds() + + return result + + def _create_indexes( + self, + collection, + documents: List[Dict[str, Any]], + options: UploadOptions + ): + """Create indexes for efficient querying.""" + if not documents: + return + + # Determine fields to index + index_fields = options.index_fields or [] + + if not index_fields and options.unique_fields: + index_fields = options.unique_fields + + # Auto-detect common index candidates + if not index_fields: + common_index_fields = ['model', 'timestamp', 'date', 'status', 'pipeline'] + available = set(documents[0].keys()) + index_fields = [f for f in common_index_fields if f in available] + + # Create indexes + for field in index_fields: + try: + collection.create_index([(field, pymongo.ASCENDING)]) + logger.info(f"Created index on field: {field}") + except PyMongoError as e: + logger.warning(f"Could not create index on {field}: {e}") + + # Create compound index for unique fields + if options.unique_fields and len(options.unique_fields) > 1: + try: + index_spec = [(f, pymongo.ASCENDING) for f in options.unique_fields] + collection.create_index(index_spec, unique=False, background=True) + logger.info(f"Created compound index on: {options.unique_fields}") + except PyMongoError as e: + logger.warning(f"Could not create compound index: {e}") + + def _bulk_upload( + self, + collection, + documents: List[Dict[str, Any]], + options: UploadOptions + ) -> UploadResult: + """Perform bulk upload with batching.""" + + total_inserted = 0 + total_updated = 0 + total_failed = 0 + errors = [] + + # Prepare bulk operations + if options.upsert and options.unique_fields: + operations = self._build_upsert_operations(documents, options.unique_fields) + else: + # Simple insert_many + try: + result = collection.insert_many(documents, ordered=options.ordered) + total_inserted = len(result.inserted_ids) + except BulkWriteError as e: + total_inserted = e.details.get('nInserted', 0) + total_failed = len(e.details.get('writeErrors', [])) + errors = [err['errmsg'] for err in e.details.get('writeErrors', [])] + + return UploadResult( + status="success" if total_failed == 0 else "partial", + documents_read=len(documents), + documents_processed=total_inserted + total_failed, + documents_inserted=total_inserted, + documents_updated=0, + documents_failed=total_failed, + errors=errors + ) + + # Batched bulk write for upsert operations + batch_size = options.batch_size + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + console=console + ) as progress: + + task = progress.add_task( + f"Uploading to {collection.name}...", + total=len(operations) + ) + + for i in range(0, len(operations), batch_size): + batch = operations[i:i + batch_size] + + try: + result = collection.bulk_write(batch, ordered=options.ordered) + total_inserted += result.upserted_count + total_updated += result.modified_count + + except BulkWriteError as e: + total_inserted += e.details.get('nUpserted', 0) + total_updated += e.details.get('nModified', 0) + write_errors = e.details.get('writeErrors', []) + total_failed += len(write_errors) + errors.extend([err['errmsg'] for err in write_errors[:5]]) # Limit error messages + + progress.update(task, advance=len(batch)) + + status = "success" if total_failed == 0 else ("partial" if total_inserted + total_updated > 0 else "failed") + + return UploadResult( + status=status, + documents_read=len(documents), + documents_processed=total_inserted + total_updated + total_failed, + documents_inserted=total_inserted, + documents_updated=total_updated, + documents_failed=total_failed, + errors=errors + ) + + def _build_upsert_operations( + self, + documents: List[Dict[str, Any]], + unique_fields: List[str] + ) -> List[UpdateOne]: + """Build bulk upsert operations.""" + operations = [] + + for doc in documents: + # Build filter from unique fields + filter_doc = {field: doc[field] for field in unique_fields if field in doc} + + if not filter_doc: + # No unique fields, skip or insert + continue + + # Upsert operation + operations.append( + UpdateOne( + filter_doc, + {"$set": doc}, + upsert=True + ) + ) + + return operations + + +# ============================================================================ +# Main Upload Function +# ============================================================================ + +def upload_file_to_mongodb( + file_path: str, + database_name: str, + collection_name: str, + config: Optional[MongoDBConfig] = None, + options: Optional[UploadOptions] = None +) -> UploadResult: + """ + Upload CSV or JSON file to MongoDB with intelligent handling. + + This is the main entry point for file uploads. + + Args: + file_path: Path to CSV or JSON file + database_name: MongoDB database name + collection_name: MongoDB collection name + config: MongoDB configuration (uses env vars if None) + options: Upload options (uses defaults if None) + + Returns: + UploadResult with operation details + + Raises: + FileNotFoundError: If file doesn't exist + ValueError: If file format is invalid + ConnectionFailure: If MongoDB connection fails + """ + # Setup + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + config = config or MongoDBConfig.from_env() + options = options or UploadOptions() + + # Detect format and load documents + file_format = detect_file_format(file_path) + loader = get_loader(file_format) + + console.print(f"📂 Loading {file_format.value.upper()} file: [cyan]{file_path.name}[/cyan]") + documents = loader.load(file_path) + + if not documents: + raise ValueError(f"No documents found in {file_path}") + + console.print(f"✅ Loaded {len(documents)} documents") + + # Transform documents + transformer = DocumentTransformer(options) + + # Infer unique fields if not specified + if options.unique_fields is None: + options.unique_fields = transformer.infer_unique_fields(documents) + if options.unique_fields: + console.print(f"🔑 Auto-detected unique fields: [yellow]{', '.join(options.unique_fields)}[/yellow]") + + documents = transformer.transform(documents) + + # Handle dry-run before connecting to MongoDB + if options.dry_run: + console.print(f"\n🔍 [yellow]DRY RUN: Would upload {len(documents)} documents[/yellow]") + console.print(f" Database: {database_name}") + console.print(f" Collection: {collection_name}") + if options.unique_fields: + console.print(f" Unique fields: {', '.join(options.unique_fields)}") + console.print(f" Upsert: {options.upsert}") + console.print(f" Create indexes: {options.create_indexes}") + + return UploadResult( + status="success", + documents_read=len(documents), + documents_processed=0, + documents_inserted=0, + documents_updated=0, + documents_failed=0, + duration_seconds=0.0 + ) + + # Upload to MongoDB + with MongoDBUploader(config) as uploader: + result = uploader.upload( + documents=documents, + database_name=database_name, + collection_name=collection_name, + options=options + ) + + return result + + +# ============================================================================ +# Legacy Compatibility +# ============================================================================ + +def upload_csv_to_mongodb( + csv_file_path: str, + database_name: str, + collection_name: str, + mongo_config: Optional[MongoDBConfig] = None +) -> Dict[str, Any]: + """ + Upload CSV data to MongoDB collection. + + DEPRECATED: Use upload_file_to_mongodb() instead. + This function is kept for backward compatibility. + + Args: + csv_file_path: Path to CSV file + database_name: Name of MongoDB database + collection_name: Name of MongoDB collection + mongo_config: MongoDB configuration (uses environment if None) + + Returns: + Dictionary with operation results + """ + logger.warning("upload_csv_to_mongodb is deprecated. Use upload_file_to_mongodb instead.") + + result = upload_file_to_mongodb( + file_path=csv_file_path, + database_name=database_name, + collection_name=collection_name, + config=mongo_config, + options=UploadOptions() + ) + + # Convert UploadResult to legacy dict format + return { + "status": "success" if result.status == "success" else "partial", + "database": database_name, + "collection": collection_name, + "records_processed": result.documents_processed, + } + + +class MongoDBHandler: + """ + Legacy handler class for MongoDB operations. + + DEPRECATED: This class is kept for backward compatibility. + Use upload_file_to_mongodb() directly instead. + """ + + def __init__(self, args): + """Initialize the MongoDBHandler.""" + import argparse + + self.args = args + self.config = MongoDBConfig.from_env() + self.database_name = args.database_name + self.collection_name = args.collection_name + + # Support both old and new parameter names + self.file_path = getattr(args, 'file_path', None) or getattr(args, 'csv_file_path', None) + self.unique_key = getattr(args, 'unique_key', None) + self.return_status = False + + def run(self) -> bool: + """Execute the MongoDB upload operation.""" + logger.warning("MongoDBHandler is deprecated. Use upload_file_to_mongodb instead.") + + print("\n" + "=" * 80) + print("📤 UPLOADING TO MONGODB") + print("=" * 80) + print(f"📂 File: {self.file_path}") + print(f"🗄️ Database: {self.database_name}") + print(f"📊 Collection: {self.collection_name}") + + try: + # Parse unique fields if provided + unique_fields = None + if self.unique_key: + unique_fields = [k.strip() for k in self.unique_key.split(',')] + + options = UploadOptions(unique_fields=unique_fields) + + result = upload_file_to_mongodb( + file_path=self.file_path, + database_name=self.database_name, + collection_name=self.collection_name, + config=self.config, + options=options + ) + + print(f"✅ Successfully processed {result.documents_processed} documents") + print(f" Inserted: {result.documents_inserted}") + print(f" Updated: {result.documents_updated}") + print("=" * 80 + "\n") + + self.return_status = True + + except FileNotFoundError as e: + print(f"❌ Error: {e}") + self.return_status = False + except ConnectionFailure as e: + print(f"❌ MongoDB connection failed: {e}") + print("💡 Tip: Check MONGO_HOST, MONGO_PORT, MONGO_USER, MONGO_PASSWORD") + self.return_status = False + except ValueError as e: + print(f"❌ Invalid file: {e}") + self.return_status = False + except Exception as e: + print(f"❌ Unexpected error: {e}") + logger.exception("MongoDB upload failed") + self.return_status = False + + print("=" * 80 + "\n") + return self.return_status diff --git a/src/madengine/db/base_class.py b/src/madengine/db/base_class.py deleted file mode 100644 index e8ca31ac..00000000 --- a/src/madengine/db/base_class.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -""" Module for creating DB tables interfaces - -This module provides the base class for our own common functionalities among tables - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# third-party modules -from sqlalchemy.ext.declarative import declarative_base - - -BASE = declarative_base() - - -class BaseMixin: - """Base class for our own common functionalities among tables - - This class provides the common functionalities among tables - - Attributes: - __tablename__ (str): The name of the table - __table__ (str): The table object - """ - - @classmethod - def obj_as_list_dict(cls, obj): - """Function to help with printing""" - dict_list = [] - for elem in obj: - # extra elem at top of dict - elem.__dict__.pop("_sa_instance_state", None) - # print(elem.__dict__) - # print(row.__table__.columns) - dict_list.append(elem.__dict__) - return dict_list - - @classmethod - def obj_columns(cls, obj): - """Helper function""" - return obj[0].__table__.columns.keys() - - @classmethod - def obj_as_dict(cls, obj, ommit_ts=False): - """Helper function""" - if "_sa_instance_state" in obj.__dict__.keys(): - obj.__dict__.pop("_sa_instance_state") - if ommit_ts: - obj.__dict__.pop("update_ts") - obj.__dict__.pop("insert_ts") - return obj.__dict__ - - def __repr__(self): - return "Table name: {0}\nTable columns: {1}".format( - self.__table__, self.__table__.columns - ) diff --git a/src/madengine/db/database.py b/src/madengine/db/database.py deleted file mode 100644 index 1e384854..00000000 --- a/src/madengine/db/database.py +++ /dev/null @@ -1,230 +0,0 @@ -"""Module of the MAD Engine database. - -This module provides the functions to create and update tables in the database. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" - -# built-in modules -import os -from datetime import datetime, timezone -# third-party modules -from sqlalchemy import Column, Integer, String, DateTime, TEXT, MetaData, Table -from sqlalchemy.exc import OperationalError -from sqlalchemy import create_engine -from sqlalchemy.orm import mapper, clear_mappers - -# MAD Engine modules -from logger import setup_logger -from base_class import BASE, BaseMixin -from utils import get_env_vars - - -# Create the logger -LOGGER = setup_logger() -# Get the environment variables -ENV_VARS = get_env_vars() - -# Check if the environment variables are set -if ENV_VARS["user_name"] is None or ENV_VARS["user_password"] is None: - raise ValueError("User name or password not set") - -if ENV_VARS["db_hostname"] is None or ENV_VARS["db_port"] is None: - raise ValueError("DB hostname or port not set") - -if ENV_VARS["db_name"] is None: - raise ValueError("DB name not set") - -# Create the engine -ENGINE = create_engine( - "mysql+pymysql://{user_name}:{user_password}@{hostname}:{port}/{db_name}".format( - user_name=ENV_VARS["user_name"], - user_password=ENV_VARS["user_password"], - hostname=ENV_VARS["db_hostname"], - port=ENV_VARS["db_port"], - db_name=ENV_VARS["db_name"], - ) -) - -# Define the path to the SQL file -SQL_FILE_PATH = os.path.join(os.path.dirname(__file__), 'db_table_def.sql') -# Update TABLE_SCHEMA and TABLE_NAME variables -TABLE_SCHEMA = ENV_VARS["db_name"] -TABLE_NAME = None -# get table name from SQL file -with open(SQL_FILE_PATH, 'r') as file: - for line in file: - if 'CREATE TABLE' in line: - TABLE_NAME = line.split(' ')[2].split('(')[0] - TABLE_NAME = TABLE_NAME.replace('`', '') - break - -if TABLE_NAME is None: - raise ValueError("Table name not found in SQL file") - -def read_sql_file(file_path: str) -> str: - """Read the SQL file and return its content.""" - with open(file_path, 'r') as file: - return file.read() - -def parse_table_definition(sql_content: str) -> Table: - """Parse the SQL content and return the table definition.""" - metadata = MetaData() - table = Table(TABLE_NAME, metadata, autoload_with=ENGINE, autoload_replace=True) - return table - -# Read and parse the SQL file -sql_content = read_sql_file(SQL_FILE_PATH) -db_table_definition = parse_table_definition(sql_content) - -# Clear any existing mappers -clear_mappers() - -# Define the DB_TABLE class dynamically -class DB_TABLE(BaseMixin, BASE): - """Represents db job table""" - __tablename__ = db_table_definition.name - __table__ = db_table_definition - - -def connect_db() -> None: - """Create DB if it doesnt exist - - This function creates the database if it does not exist. - - Raises: - OperationalError: An error occurred while creating the database. - """ - db_name = ENV_VARS["db_name"] - user_name = ENV_VARS["user_name"] - - try: - ENGINE.execute("Use {}".format(db_name)) - return - except OperationalError: # as err: - LOGGER.warning( - "Database %s does not exist, attempting to create database", db_name - ) - - try: - ENGINE.execute("Create database if not exists {}".format(db_name)) - except OperationalError as err: - LOGGER.error("Database creation failed %s for username: %s", err, user_name) - - ENGINE.execute("Use {}".format(db_name)) - ENGINE.execute("SET GLOBAL max_allowed_packet=4294967296") - - -def clear_db() -> None: - """Clear DB - - This function clears the database. - - Raises: - OperationalError: An error occurred while clearing the database - """ - db_name = ENV_VARS["db_name"] - - try: - ENGINE.execute("DROP DATABASE IF EXISTS {}".format(db_name)) - return - except OperationalError: # as err: - LOGGER.warning("Database %s could not be dropped", db_name) - - -def show_db() -> None: - """Show DB - - This function shows the database. - - Raises: - OperationalError: An error occurred while showing the database - """ - db_name = ENV_VARS["db_name"] - - try: - result = ENGINE.execute( - "SELECT * FROM {} \ - WHERE {}.created_date= \ - (SELECT MAX(created_date) FROM {}) ;".format(DB_TABLE.__tablename__) - ) - for row in result: - print(row) - return - except OperationalError: # as err: - LOGGER.warning("Database %s could not be shown", db_name) - - -def create_tables() -> bool: - """Function to create or sync DB tables/triggers - - This function creates or syncs the database tables/triggers. - - Returns: - bool: True if the tables are created successfully. - - Raises: - OperationalError: An error occurred while creating the tables. - """ - connect_db() - all_tables = [DB_TABLE] - - for table in all_tables: - if not table.__table__.exists(ENGINE): - try: - table.__table__.create(ENGINE) - LOGGER.info("Created: %s", table.__tablename__) - except OperationalError as err: - LOGGER.warning("Error occurred %s", err) - LOGGER.warning("Failed to create table %s \n", table.__tablename__) - continue - else: - LOGGER.info("Table %s already exists", table.__tablename__) - - return True - - -def trim_column(col_name: str) -> None: - """Trim column - - This function trims the column. - - Args: - col_name: Name of the column to be trimmed. - - Raises: - OperationalError: An error occurred while trimming the column. - """ - ENGINE.execute( - "UPDATE {} \ - SET \ - {} = TRIM({});".format( - DB_TABLE.__tablename__, col_name, col_name - ) - ) - show_db() - - -def get_column_names() -> list: - """Get column names - - This function gets the column names. - - Returns: - list: List of column names. - - Raises: - OperationalError: An error occurred while getting the column names. - """ - db_name = ENV_VARS["db_name"] - - result = ENGINE.execute( - "SELECT `COLUMN_NAME` \ - FROM `INFORMATION_SCHEMA`.`COLUMNS` \ - WHERE `TABLE_SCHEMA`='{}' \ - AND `TABLE_NAME`='{}'".format(db_name, DB_TABLE.__tablename__) - ) - ret = [] - for row in result: - ret.append(row[0]) - return ret diff --git a/src/madengine/db/database_functions.py b/src/madengine/db/database_functions.py deleted file mode 100644 index 97561fc1..00000000 --- a/src/madengine/db/database_functions.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Functions of the MAD Engine database. - -This module contains the functions to interact with the MAD Engine database. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" - -# built-in modules -import typing - -# MAD Engine modules -from database import ENGINE - - -def get_all_gpu_archs() -> typing.List[str]: - """Get all GPU architectures from the database. - - Returns: - typing.List[str]: A list of all GPU architectures in the database. - """ - matching_entries = ENGINE.execute( - "SELECT DISTINCT(gpu_architecture) FROM dlm_table" - ) - - archs = [] - for arch in matching_entries.fetchall(): - arch = arch[0] # return from database is in list - if arch: - archs.append("{}".format(arch)) - - return archs - - -def get_matching_db_entries( - recent_entry: typing.Dict[str, typing.Any], filters: typing.Dict[str, typing.Any] -) -> typing.List[typing.Dict[str, typing.Any]]: - """Get matching entries from the database. - - Args: - recent_entry (typing.Dict[str, typing.Any]): The recent entry to compare. - filters (typing.Dict[str, typing.Any]): The filters to apply. - - Returns: - typing.List[typing.Dict[str, typing.Any]]: The matching entries. - """ - print( - "Looking for entries with {}, {} and {}".format( - recent_entry["model"], - recent_entry["gpu_architecture"], - filters - ) - ) - - # find matching entries to current entry - matching_entries = ENGINE.execute( - "SELECT * FROM dlm_table \ - WHERE model='{}' \ - AND gpu_architecture='{}' \ - ".format( - recent_entry["model"], - recent_entry["gpu_architecture"] - ) - ) - matching_entries = matching_entries.mappings().all() - - # filter db entries - filtered_matching_entries = [] - for m in matching_entries: - should_add = True - for filter, value in filters.items(): - if m[filter] != value: - should_add = False - - if should_add: - filtered_matching_entries.append(m) - - print( - "Found {} similar entries in database filtered down to {} entries".format( - len(matching_entries), - len(filtered_matching_entries) - ) - ) - return filtered_matching_entries diff --git a/src/madengine/db/db_table_def.sql b/src/madengine/db/db_table_def.sql deleted file mode 100644 index bb6e3707..00000000 --- a/src/madengine/db/db_table_def.sql +++ /dev/null @@ -1,23 +0,0 @@ -CREATE TABLE `dlm_table` ( - `id` INT PRIMARY KEY, - `created_date` DATETIME DEFAULT CURRENT_TIMESTAMP, - `model` VARCHAR(128), - `pipeline` VARCHAR(65535), - `n_gpus` VARCHAR(128), - `training_precision` VARCHAR(128), - `args` VARCHAR(128), - `tags` VARCHAR(65535), - `docker_file` VARCHAR(128), - `base_docker` VARCHAR(128), - `docker_sha` VARCHAR(128), - `docker_image` VARCHAR(128), - `git_commit` VARCHAR(128), - `machine_name` VARCHAR(128), - `gpu_architecture` VARCHAR(128), - `performance` VARCHAR(128), - `metric` VARCHAR(128), - `relative_change` TEXT, - `status` VARCHAR(128), - `build_duration` VARCHAR(128), - `test_duration` VARCHAR(128) -); \ No newline at end of file diff --git a/src/madengine/db/logger.py b/src/madengine/db/logger.py deleted file mode 100644 index 8f450013..00000000 --- a/src/madengine/db/logger.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Module of logging functions. - -This module provides the functions to setup the logger for the MAD Engine. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import logging -import os -import sys - - -# Get the log level, if it is not set, set it to INFO. -if "LOG_LEVEL" not in os.environ: - LOG_LEVEL = "INFO" -else: - LOG_LEVEL = os.environ["LOG_LEVEL"] - - -def setup_logger(): - """Setup the logger for the MAD Engine. - - This function sets up the logger for the MAD Engine. - - Returns: - logging.Logger: The logger for the MAD Engine. - """ - logging.basicConfig(level=LOG_LEVEL) - # Create a logger - logger = logging.getLogger("madengine") - # logger.setLevel(logging.INFO) - - # Create a formatter - formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" - ) - - # Create a console handler - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setLevel(logging.INFO) - console_handler.setFormatter(formatter) - logger.propagate = False - logger.addHandler(console_handler) - - # Create a file handler - log_file = os.path.join(os.getcwd(), "madengine.log") - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(logging.INFO) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - - return logger diff --git a/src/madengine/db/relative_perf.py b/src/madengine/db/relative_perf.py deleted file mode 100644 index 93d2569f..00000000 --- a/src/madengine/db/relative_perf.py +++ /dev/null @@ -1,128 +0,0 @@ -"""Module to get the relative performance of the models. - -This module contains functions to get the relative performance of the models. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import argparse -import ast -from statistics import mean -import typing - -# third-party modules -import pandas as pd - -# MAD Engine modules -from database import ENGINE, create_tables, LOGGER -from utils import get_avg_perf, load_perf_csv, dataFrame_to_list -from database_functions import get_all_gpu_archs, get_matching_db_entries - - -def get_baseline_configs( - recent_entry: typing.Dict[str, typing.Any], - baseline_params: typing.Dict[str, typing.Any], -) -> typing.List[typing.Dict[str, typing.Any]]: - """Get the baseline configurations. - - This function gets the baseline configurations from the database. - - Args: - recent_entry (typing.Dict[str, typing.Any]): The recent entry. - baseline_params (typing.Dict[str, typing.Any]): The baseline parameters. - - Returns: - typing.List[typing.Dict[str, typing.Any]]: The baseline configurations. - """ - # create sample_config - sample_baseline_config = recent_entry - for k, v in baseline_params.items(): - sample_baseline_config[k] = v - - # search database for similar configs - last_successful_matching_entries = get_matching_db_entries( - recent_entry, - filters={"status": "SUCCESS", "base_docker": recent_entry["base_docker"]}, - ) - - return last_successful_matching_entries - - -def relative_perf( - data: pd.DataFrame, base_line_params: typing.Dict[str, typing.Any] -) -> pd.DataFrame: - """Get the relative performance. - - This function gets the relative performance of the models. - - Args: - data (pd.DataFrame): The data. - base_line_params (typing.Dict[str, typing.Any]): The baseline parameters. - - Returns: - pd.DataFrame: The data. - """ - LOGGER.info("Checking relative performance against {}".format(base_line_params)) - print(data) - # get the most recent entries - most_recent_entries = dataFrame_to_list(data) - - # compare new data with avg of last succesfull runs in database - for i, recent_entry in enumerate(most_recent_entries): - - # find matching entries to current entry - baseline_configs = get_baseline_configs(recent_entry, base_line_params) - baseline_avg, baseline_perfs = get_avg_perf(baseline_configs, 5) - if recent_entry["performance"] and baseline_avg: - print( - "Current Performance is {} {}".format( - recent_entry["performance"], recent_entry["metric"] - ) - ) - relative_perf = (float(recent_entry["performance"]) / baseline_avg) * 100 - print( - "Relative perf {:.2f}% against {}".format( - relative_perf, base_line_params - ) - ) - else: - relative_perf = None - - entry_relative_change = { - "pct_change": relative_perf, - "baseline_avg": baseline_avg, - "sample_count": len(baseline_perfs) if baseline_perfs else None, - } - - # add pct_change info - if data.loc[i, "relative_change"]: - relative_change = ast.literal_eval(data.loc[i, "relative_change"]) - relative_change[base_line_params["gpu_architecture"]] = ( - entry_relative_change - ) - else: - relative_change = { - base_line_params["gpu_architecture"]: entry_relative_change - } - data.loc[i, "relative_change"] = str(relative_change) - - print(data) - return data - - -def relative_perf_all_configs(data: pd.DataFrame) -> pd.DataFrame: - """Get the relative performance of all configurations. - - This function gets the relative performance of all configurations. - - Args: - data (pd.DataFrame): The data. - - Returns: - pd.DataFrame: The data. - """ - archs = get_all_gpu_archs() - print(archs) - for a in archs: - data = relative_perf(data, {"gpu_architecture": a}) - return data diff --git a/src/madengine/db/upload_csv_to_db.py b/src/madengine/db/upload_csv_to_db.py deleted file mode 100644 index d70d15b5..00000000 --- a/src/madengine/db/upload_csv_to_db.py +++ /dev/null @@ -1,120 +0,0 @@ -"""Script to upload csv files to the database, -and create or update tables in the database. - -This script uploads csv files to the database, and creates or updates tables in the database. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import sys -import argparse -import pandas as pd -import typing -from datetime import datetime -# third-party modules -from tqdm import tqdm -from sqlalchemy.orm import sessionmaker -# MAD Engine modules -from database import ENGINE, create_tables, DB_TABLE, LOGGER -from utils import dataFrame_to_list, load_perf_csv, replace_nans_with_None -from relative_perf import relative_perf_all_configs - - -def add_csv_to_db(data: pd.DataFrame) -> bool: - """Add csv files to the database. - - This function adds csv files to the database. - - Args: - data (pd.DataFrame): The data. - - Returns: - bool: True if data was successfully added, False otherwise - """ - LOGGER.info("adding csv to Database") - # Create the session - session = sessionmaker() - session.configure(bind=ENGINE) - s = session() - - # change nans to None to upload to database - data = replace_nans_with_None(data) - - # Add unique ID column if it doesn't exist - if 'id' not in data.columns: - # Get the max ID from the existing table to ensure uniqueness - try: - max_id_query = s.query(DB_TABLE.id).order_by(DB_TABLE.id.desc()).first() - start_id = 1 if max_id_query is None else max_id_query[0] + 1 - except: - LOGGER.warning('Failed to query max ID, starting from 1') - start_id = 1 - - # Add sequential unique IDs - data['id'] = range(start_id, start_id + len(data)) - - # Explicitly set created_date to current timestamp if not provided - if 'created_date' not in data.columns: - data['created_date'] = datetime.now() - - LOGGER.info("Data:") - LOGGER.info(data) - # add data to databases - success_count = 0 - data_as_list = dataFrame_to_list(data) - total_records = len(data_as_list) - - for model_perf_info in tqdm(data_as_list): - try: - # Ensure created_date is set for each record if not present - if 'created_date' not in model_perf_info or model_perf_info['created_date'] is None: - model_perf_info['created_date'] = datetime.now() - - record = DB_TABLE(**model_perf_info) - s.add(record) - success_count += 1 - except Exception as e: - LOGGER.warning( - 'Failed to add record to table due to %s \n', str(e)) - LOGGER.info(model_perf_info) - s.rollback() - - # commit changes and close sesstion - try: - s.commit() - LOGGER.info('Successfully added %d out of %d records to the database', - success_count, total_records) - success = success_count > 0 - except Exception as e: - LOGGER.error('Failed to commit changes: %s', str(e)) - s.rollback() - success = False - finally: - s.close() - - return success - - -def main() -> None: - """Main script function to upload csv files to the database.""" - # parse arg - parser = argparse.ArgumentParser(description='Upload perf.csv to database') - parser.add_argument("--csv-file-path", type=str) - args = parser.parse_args() - - ret = create_tables() - LOGGER.info('DB creation successful: %s', ret) - - if args.csv_file_path is None: - LOGGER.info("Only creating tables in the database") - return - else: - # load perf.csv to db - LOGGER.info("Loading %s to database", args.csv_file_path) - data = load_perf_csv(args.csv_file_path) - data = relative_perf_all_configs(data) - add_csv_to_db(data) - -if __name__ == '__main__': - main() diff --git a/src/madengine/db/utils.py b/src/madengine/db/utils.py deleted file mode 100644 index 13c6e879..00000000 --- a/src/madengine/db/utils.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python3 -"""Utility module for helper functions - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -from statistics import mean -import typing - -# third-party modules -import pandas as pd -import numpy as np - - -def get_env_vars() -> dict: - """Utility function to get MAD/DLM specific env_vars - - env_vars: - - TUNA_DB_USER_NAME - - TUNA_DB_USER_PASSWORD - - TUNA_DB_HOSTNAME - - TUNA_DB_PORT - - TUNA_DB_NAME - - TUNA_SSH_USER - - TUNA_SSH_PASSWORD - - TUNA_SSH_HOSTNAME - - TUNA_SSH_PORT - - SLURM_CPUS_ON_NODE - - LOG_LEVEL - - MODEL_DIR - - Returns: - dict: Dictionary of DLM specific env_vars - """ - # init env vars - env_vars = {} - - if "TUNA_DB_USER_NAME" in os.environ: - env_vars["user_name"] = os.environ["TUNA_DB_USER_NAME"] - else: - env_vars["user_name"] = "" - if "TUNA_DB_USER_PASSWORD" in os.environ: - env_vars["user_password"] = os.environ["TUNA_DB_USER_PASSWORD"] - else: - env_vars["user_password"] = "" - if "TUNA_DB_HOSTNAME" in os.environ: - env_vars["db_hostname"] = os.environ["TUNA_DB_HOSTNAME"] - else: - env_vars["db_hostname"] = "localhost" - if "TUNA_DB_PORT" in os.environ: - env_vars["db_port"] = str(os.environ["TUNA_DB_PORT"]) - else: - env_vars["db_port"] = "3306" - if "TUNA_DB_NAME" in os.environ: - env_vars["db_name"] = os.environ["TUNA_DB_NAME"] - else: - env_vars["db_name"] = "dlm_db" - if "SLURM_CPUS_ON_NODE" in os.environ: - env_vars["slurm_cpus"] = str(os.environ["SLURM_CPUS_ON_NODE"]) - else: - env_vars["slurm_cpus"] = "0" - if "TUNA_SSH_USER" in os.environ: - env_vars["ssh_user"] = os.environ["TUNA_SSH_USER"] - else: - env_vars["ssh_user"] = "" - if "TUNA_SSH_PASSWORD" in os.environ: - env_vars["ssh_password"] = os.environ["TUNA_SSH_PASSWORD"] - else: - env_vars["ssh_password"] = "" - if "TUNA_SSH_HOSTNAME" in os.environ: - env_vars["ssh_hostname"] = os.environ["TUNA_SSH_HOSTNAME"] - else: - env_vars["ssh_hostname"] = "localhost" - if "TUNA_SSH_PORT" in os.environ: - env_vars["ssh_port"] = str(os.environ["TUNA_SSH_PORT"]) - else: - env_vars["ssh_port"] = "22" - - return env_vars - - -def get_avg_perf( - entry_list: typing.List[dict], - n: int=5 - ) -> typing.Tuple[float, typing.List[float]]: - """Get average performance from the last n entries - - Args: - entry_list (list): List of entries - n (int): Number of entries to consider - - Returns: - tuple: Tuple of average performance and list of performances - """ - perfs = [] - for m in entry_list: - if m["performance"]: - perfs.append(float(m["performance"])) - perfs = perfs[-n:] - - if perfs: - avg = mean(perfs) - print("{} avg from the last {} entries".format(avg, len(perfs))) - return avg, perfs - else: - return None, None - - -def replace_nans_with_None(data: pd.DataFrame) -> pd.DataFrame: - """Replace NaNs with None in the dataframe - - Args: - data (pd.DataFrame): Dataframe to replace NaNs with None - - Returns: - pd.DataFrame: Dataframe with NaNs replaced with None - """ - # change nans to None to avoid errors - # data = data.where((pd.notnull(data)), None) - data = data.replace({np.nan: None}) - return data - - -def load_perf_csv(csv: str) -> pd.DataFrame: - """Load performance csv file - - Args: - csv (str): Path to the performance csv file - - Returns: - pd.DataFrame: Dataframe of the performance csv file - """ - df = pd.read_csv(csv) - df = df.drop(columns=["dataname", "data_provider_type", "data_size", "data_download_duration", "build_number"], errors="ignore") - df.rename(columns=lambda x: x.strip(), inplace=True) - df = df.rename(columns=lambda x: x.strip()) - df = df.where((pd.notnull(df)), None) - - def trim_strings(x): - return x.strip() if isinstance(x, str) else x - - df = df.applymap(trim_strings) - df = replace_nans_with_None(df) - return df - - -def dataFrame_to_list(df: pd.DataFrame) -> typing.List[dict]: - """Convert dataframe to list of dictionaries - - Args: - df (pd.DataFrame): Dataframe to convert - - Returns: - list: List of dictionaries - """ - return df.to_dict(orient="records") diff --git a/src/madengine/deployment/__init__.py b/src/madengine/deployment/__init__.py new file mode 100644 index 00000000..c48e99b8 --- /dev/null +++ b/src/madengine/deployment/__init__.py @@ -0,0 +1,31 @@ +""" +Deployment layer for distributed execution. + +Provides deployment implementations for SLURM and Kubernetes clusters. +Uses Factory pattern for creating appropriate deployment instances. + +Architecture: +- BaseDeployment: Abstract base class defining deployment workflow +- SlurmDeployment: SLURM cluster deployment (uses CLI commands) +- KubernetesDeployment: Kubernetes cluster deployment (uses Python library) +- DeploymentFactory: Factory for creating deployment instances + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +from .base import ( + BaseDeployment, + DeploymentConfig, + DeploymentResult, + DeploymentStatus, +) +from .factory import DeploymentFactory + +__all__ = [ + "BaseDeployment", + "DeploymentConfig", + "DeploymentResult", + "DeploymentStatus", + "DeploymentFactory", +] + diff --git a/src/madengine/deployment/base.py b/src/madengine/deployment/base.py new file mode 100644 index 00000000..33a338a9 --- /dev/null +++ b/src/madengine/deployment/base.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +Base classes for deployment layer. + +Defines abstract base class for all deployment targets (SLURM, Kubernetes). +Implements Template Method pattern for deployment workflow. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import time +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from typing import Any, Dict, List, Optional + +from rich.console import Console + + +class DeploymentStatus(Enum): + """Deployment status enumeration.""" + + PENDING = "pending" + RUNNING = "running" + SUCCESS = "success" + FAILED = "failed" + CANCELLED = "cancelled" + + +@dataclass +class DeploymentConfig: + """Configuration for distributed deployment.""" + + target: str # "slurm", "k8s" (NOT "local" - that uses container_runner) + manifest_file: str + additional_context: Dict[str, Any] = field(default_factory=dict) + timeout: int = 3600 + monitor: bool = True + cleanup_on_failure: bool = True + + +@dataclass +class DeploymentResult: + """Result of deployment operation.""" + + status: DeploymentStatus + deployment_id: str + message: str + metrics: Optional[Dict[str, Any]] = None + logs_path: Optional[str] = None + artifacts: Optional[List[str]] = None + + @property + def is_success(self) -> bool: + """Check if deployment succeeded.""" + return self.status == DeploymentStatus.SUCCESS + + @property + def is_failed(self) -> bool: + """Check if deployment failed.""" + return self.status == DeploymentStatus.FAILED + + +class BaseDeployment(ABC): + """ + Abstract base class for all deployment targets. + + Implements Template Method pattern for deployment workflow. + Subclasses implement specific deployment logic for SLURM, Kubernetes, etc. + + Workflow: + 1. Validate environment and configuration + 2. Prepare deployment artifacts (scripts, manifests) + 3. Deploy to target infrastructure + 4. Monitor until completion (if enabled) + 5. Collect results and metrics + 6. Cleanup (if needed) + """ + + DEPLOYMENT_TYPE: str = "base" + REQUIRED_TOOLS: List[str] = [] # e.g., ["sbatch", "squeue"] for SLURM + + def __init__(self, config: DeploymentConfig): + """ + Initialize deployment. + + Args: + config: Deployment configuration + """ + self.config = config + self.manifest = self._load_manifest(config.manifest_file) + self.console = Console() + + def _load_manifest(self, manifest_file: str) -> Dict: + """ + Load and validate build manifest. + + Args: + manifest_file: Path to build_manifest.json + + Returns: + Loaded manifest dict + + Raises: + FileNotFoundError: If manifest doesn't exist + ValueError: If manifest is invalid + """ + manifest_path = Path(manifest_file) + if not manifest_path.exists(): + raise FileNotFoundError(f"Manifest not found: {manifest_file}") + + with open(manifest_path) as f: + manifest = json.load(f) + + # Validate required fields + required = ["built_images", "built_models", "context"] + missing = [f for f in required if f not in manifest] + if missing: + raise ValueError(f"Invalid manifest, missing: {missing}") + + return manifest + + # Template Method - defines workflow + def execute(self) -> DeploymentResult: + """ + Execute full deployment workflow (Template Method). + + This method orchestrates the entire deployment process by calling + abstract methods that subclasses must implement. + + Returns: + DeploymentResult with status and metrics + """ + try: + # Step 1: Validate + self.console.print( + f"[blue]Validating {self.DEPLOYMENT_TYPE} deployment...[/blue]" + ) + if not self.validate(): + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message=f"{self.DEPLOYMENT_TYPE} validation failed", + ) + + # Step 2: Prepare + self.console.print("[blue]Preparing deployment artifacts...[/blue]") + if not self.prepare(): + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message="Preparation failed", + ) + + # Step 3: Deploy + self.console.print(f"[blue]Deploying to {self.DEPLOYMENT_TYPE}...[/blue]") + result = self.deploy() + + if not result.is_success: + if self.config.cleanup_on_failure: + self.cleanup(result.deployment_id) + return result + + # Step 4: Monitor (optional) + if self.config.monitor: + result = self._monitor_until_complete(result.deployment_id) + + # Step 5: Collect Results (always collect, even on failure to record failed runs) + if result.deployment_id: + try: + metrics = self.collect_results(result.deployment_id) + result.metrics = metrics + except Exception as e: + self.console.print(f"[yellow]Warning: Could not collect results for {result.deployment_id}: {e}[/yellow]") + # Ensure empty metrics dict exists even if collection fails + result.metrics = {"successful_runs": [], "failed_runs": []} + + return result + + except Exception as e: + self.console.print(f"[red]Deployment error: {e}[/red]") + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message=f"Exception: {str(e)}", + ) + + def _monitor_until_complete(self, deployment_id: str) -> DeploymentResult: + """ + Monitor deployment until completion. + + Args: + deployment_id: Deployment ID to monitor + + Returns: + Final deployment status + """ + self.console.print("[blue]Monitoring deployment...[/blue]") + + while True: + status = self.monitor(deployment_id) + + if status.status in [DeploymentStatus.SUCCESS, DeploymentStatus.FAILED]: + return status + + # Still running, wait and check again + self.console.print( + f" Status: {status.status.value} - {status.message}" + ) + time.sleep(30) # Check every 30 seconds + + # Abstract methods to be implemented by subclasses + + @abstractmethod + def validate(self) -> bool: + """ + Validate deployment environment and configuration. + + Should check: + - Required tools are available (sbatch, kubectl, etc.) + - Credentials/access are valid + - Configuration parameters are correct + - Connectivity to target system + + Returns: + True if validation passes, False otherwise + """ + pass + + @abstractmethod + def prepare(self) -> bool: + """ + Prepare deployment artifacts. + + Should generate: + - Deployment scripts (sbatch scripts, K8s Job manifests) + - Configuration files + - Environment setup + + Returns: + True if preparation succeeds, False otherwise + """ + pass + + @abstractmethod + def deploy(self) -> DeploymentResult: + """ + Execute deployment to target infrastructure. + + Should: + - Submit job to scheduler (sbatch, kubectl apply) + - Return immediately with deployment_id + - Not wait for completion (use monitor() for that) + + Returns: + DeploymentResult with status and deployment_id + """ + pass + + @abstractmethod + def monitor(self, deployment_id: str) -> DeploymentResult: + """ + Check deployment status. + + Should query: + - SLURM job status (squeue) + - K8s Job status (kubectl get job) + - etc. + + Args: + deployment_id: ID returned from deploy() + + Returns: + Current deployment status + """ + pass + + @abstractmethod + def collect_results(self, deployment_id: str) -> Dict[str, Any]: + """ + Collect results and metrics from completed deployment. + + Should gather: + - Performance metrics + - Log files + - Output artifacts + - Error information (if any) + + Args: + deployment_id: ID of completed deployment + + Returns: + Dict with metrics and results + """ + pass + + @abstractmethod + def cleanup(self, deployment_id: str) -> bool: + """ + Cleanup deployment resources. + + Should: + - Cancel running jobs + - Delete temporary files + - Release resources + + Args: + deployment_id: ID of deployment to clean up + + Returns: + True if cleanup succeeds, False otherwise + """ + pass + diff --git a/src/madengine/deployment/config_loader.py b/src/madengine/deployment/config_loader.py new file mode 100644 index 00000000..5afbe7b7 --- /dev/null +++ b/src/madengine/deployment/config_loader.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +""" +Configuration loader with multi-layer merging for deployments. + +Layers (low to high priority): +1. System defaults (built-in presets) +2. User file (--additional-context-file) +3. User CLI (--additional-context) + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import os +from pathlib import Path +from typing import Dict, Any, Optional +from copy import deepcopy + + +class ConfigLoader: + """Smart configuration loader with preset support.""" + + PRESET_DIR = Path(__file__).parent / "presets" + + @classmethod + def load_preset(cls, preset_path: str) -> Dict[str, Any]: + """ + Load a preset JSON file. + + Args: + preset_path: Relative path to preset file from PRESET_DIR + + Returns: + Dict containing preset configuration, or empty dict if not found + """ + full_path = cls.PRESET_DIR / preset_path + if not full_path.exists(): + return {} + + try: + with open(full_path, 'r') as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as e: + print(f"Warning: Could not load preset {preset_path}: {e}") + return {} + + @classmethod + def deep_merge(cls, base: Dict, override: Dict) -> Dict: + """ + Deep merge two dictionaries. Override wins conflicts. + Nested dicts are merged, lists/primitives are replaced. + Special handling: env_vars are merged (not replaced). + + Args: + base: Base dictionary + override: Override dictionary + + Returns: + Merged dictionary + """ + result = deepcopy(base) + + for key, value in override.items(): + # Skip documentation/comment fields from base if override has them + if key.startswith('_'): + result[key] = deepcopy(value) + continue + + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + # Recursively merge nested dicts + result[key] = cls.deep_merge(result[key], value) + else: + # Replace with override value + result[key] = deepcopy(value) + + return result + + @classmethod + def detect_profile_needs(cls, config: Dict) -> Dict[str, bool]: + """ + Detect what profiles/optimizations are needed. + + Args: + config: Configuration dictionary + + Returns: + Dict with flags: is_single_gpu, is_multi_gpu, is_multi_node, is_distributed + """ + distributed = config.get("distributed", {}) + gpu_count = config.get("k8s", {}).get("gpu_count", 1) + nnodes = distributed.get("nnodes", 1) + + is_distributed = distributed.get("enabled", False) or distributed.get("launcher") + is_multi_gpu = gpu_count > 1 or is_distributed + is_multi_node = nnodes > 1 + + return { + "is_single_gpu": gpu_count == 1 and not is_distributed, + "is_multi_gpu": is_multi_gpu and not is_multi_node, + "is_multi_node": is_multi_node, + "is_distributed": is_distributed + } + + @classmethod + def select_profile(cls, config: Dict, needs: Dict[str, bool]) -> Optional[str]: + """ + Auto-select k8s profile based on configuration needs. + + Args: + config: Configuration dictionary + needs: Profile needs from detect_profile_needs() + + Returns: + Profile filename or None + """ + if needs["is_multi_node"]: + return "k8s/profiles/multi-node.json" + elif needs["is_multi_gpu"]: + return "k8s/profiles/multi-gpu.json" + elif needs["is_single_gpu"]: + return "k8s/profiles/single-gpu.json" + + return None + + @classmethod + def load_k8s_config(cls, user_config: Dict[str, Any]) -> Dict[str, Any]: + """ + Load complete k8s configuration with multi-layer merging. + + Layers: + 1. Base k8s defaults + 2. GPU vendor base preset + 3. GPU vendor multi-GPU preset (if needed) + 4. Profile preset (single-gpu/multi-gpu/multi-node) + 5. User configuration (already merged from file + CLI) + + Args: + user_config: User-provided configuration (merged from file + CLI) + + Returns: + Complete configuration with all defaults applied + """ + # Layer 1: Base defaults + config = cls.load_preset("k8s/defaults.json") + + # Merge user config temporarily to detect requirements + temp_config = cls.deep_merge(config, user_config) + needs = cls.detect_profile_needs(temp_config) + + # Layer 2: GPU vendor base preset + gpu_vendor = temp_config.get("gpu_vendor", "AMD").upper() + vendor_file = f"k8s/gpu-vendors/{gpu_vendor.lower()}.json" + vendor_preset = cls.load_preset(vendor_file) + config = cls.deep_merge(config, vendor_preset) + + # Layer 3: GPU vendor multi-GPU optimizations (AMD only, when needed) + if gpu_vendor == "AMD" and (needs["is_multi_gpu"] or needs["is_multi_node"]): + amd_multi_preset = cls.load_preset("k8s/gpu-vendors/amd-multi-gpu.json") + config = cls.deep_merge(config, amd_multi_preset) + + # Layer 4: Profile preset based on detected needs + profile_file = cls.select_profile(temp_config, needs) + if profile_file: + profile_preset = cls.load_preset(profile_file) + config = cls.deep_merge(config, profile_preset) + + # Layer 5: User configuration (highest priority) + config = cls.deep_merge(config, user_config) + + return config + + @classmethod + def load_slurm_config(cls, user_config: Dict[str, Any]) -> Dict[str, Any]: + """ + Load complete SLURM configuration with multi-layer merging. + + Layers: + 1. Base SLURM defaults + 2. Profile preset (single-node/multi-node) + 3. User configuration (already merged from file + CLI) + + Args: + user_config: User-provided configuration + + Returns: + Complete configuration with defaults applied + """ + # Layer 1: Base defaults + config = cls.load_preset("slurm/defaults.json") + + # Merge user config temporarily to detect requirements + temp_config = cls.deep_merge(config, user_config) + + # Layer 2: Profile preset based on detected configuration + slurm_config = temp_config.get("slurm", {}) + nodes = slurm_config.get("nodes", 1) + + # Select profile based on node count + if nodes > 1: + profile_preset = cls.load_preset("slurm/profiles/multi-node.json") + config = cls.deep_merge(config, profile_preset) + else: + profile_preset = cls.load_preset("slurm/profiles/single-node.json") + config = cls.deep_merge(config, profile_preset) + + # Layer 3: User configuration (highest priority) + config = cls.deep_merge(config, user_config) + + return config + + @classmethod + def infer_and_validate_deploy_type(cls, user_config: Dict[str, Any]) -> str: + """ + Infer deployment type from config structure and validate for conflicts. + + Convention over Configuration: Presence of k8s/slurm field indicates deployment intent. + + Args: + user_config: User configuration dictionary + + Returns: + Deployment type: "k8s", "slurm", or "local" + + Raises: + ValueError: If conflicting deployment configs present + """ + has_k8s = "k8s" in user_config or "kubernetes" in user_config + has_slurm = "slurm" in user_config + explicit_deploy = user_config.get("deploy", "").lower() + + # Validation Rule 1: Can't have both k8s and slurm configs + if has_k8s and has_slurm: + raise ValueError( + "Conflicting deployment configuration: Both 'k8s' and 'slurm' fields present. " + "Please specify only one deployment target." + ) + + # Validation Rule 2: If explicit deploy set, it must match config presence + if explicit_deploy: + if explicit_deploy in ["k8s", "kubernetes"] and not has_k8s: + raise ValueError( + f"Conflicting deployment: 'deploy' field is '{explicit_deploy}' but no 'k8s' config present. " + "Either add 'k8s' config or remove 'deploy' field." + ) + if explicit_deploy == "slurm" and not has_slurm: + raise ValueError( + f"Conflicting deployment: 'deploy' field is 'slurm' but no 'slurm' config present. " + "Either add 'slurm' config or remove 'deploy' field." + ) + if explicit_deploy == "local" and (has_k8s or has_slurm): + raise ValueError( + f"Conflicting deployment: 'deploy' field is 'local' but k8s/slurm config present. " + "Remove k8s/slurm config for local execution." + ) + + # Infer deployment type from config presence + if has_k8s: + return "k8s" + elif has_slurm: + return "slurm" + else: + return "local" + + @classmethod + def load_config(cls, user_config: Dict[str, Any]) -> Dict[str, Any]: + """ + Load configuration with auto-inferred deploy type and validation. + + Infers deployment type from presence of k8s/slurm fields. + Validates for conflicting configurations. + Applies appropriate defaults based on deployment type. + + Convention over Configuration: + - Presence of "k8s" field → Kubernetes deployment + - Presence of "slurm" field → SLURM deployment + - Neither present → Local execution + - No explicit "deploy" field needed! + + Args: + user_config: User configuration (from file + CLI merge) + + Returns: + Complete configuration with defaults applied (no deploy field added) + + Raises: + ValueError: If conflicting deployment configs present + """ + # Infer and validate deployment type + deploy_type = cls.infer_and_validate_deploy_type(user_config) + + # Apply appropriate defaults based on deployment type + # Note: We do NOT add a "deploy" field - type is inferred from structure + if deploy_type == "k8s": + return cls.load_k8s_config(user_config) + elif deploy_type == "slurm": + return cls.load_slurm_config(user_config) + else: + # Local - return as-is (no deploy field needed) + return user_config + diff --git a/src/madengine/deployment/factory.py b/src/madengine/deployment/factory.py new file mode 100644 index 00000000..9391d3a3 --- /dev/null +++ b/src/madengine/deployment/factory.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Deployment Factory - Creates appropriate deployment instances. + +Implements Factory pattern to dynamically create SLURM or Kubernetes +deployment instances based on target configuration. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +from typing import Dict, Type + +from .base import BaseDeployment, DeploymentConfig + + +class DeploymentFactory: + """ + Factory for creating deployment instances. + + Supports dynamic registration and creation of deployment types. + Currently supports: slurm, k8s/kubernetes + """ + + _deployments: Dict[str, Type[BaseDeployment]] = {} + + @classmethod + def register(cls, deployment_type: str, deployment_class: Type[BaseDeployment]): + """ + Register a deployment type. + + Args: + deployment_type: Name of deployment type (e.g., "slurm", "k8s") + deployment_class: Class implementing BaseDeployment + """ + cls._deployments[deployment_type] = deployment_class + + @classmethod + def create(cls, config: DeploymentConfig) -> BaseDeployment: + """ + Create a deployment instance based on config. + + Args: + config: Deployment configuration with target type + + Returns: + Deployment instance for the specified target + + Raises: + ValueError: If deployment type is not registered + """ + deployment_class = cls._deployments.get(config.target) + + if not deployment_class: + available = ", ".join(cls._deployments.keys()) + raise ValueError( + f"Unknown deployment target: {config.target}. " + f"Available: {available}" + ) + + return deployment_class(config) + + @classmethod + def available_deployments(cls) -> list: + """ + Get list of available deployment types. + + Returns: + List of registered deployment type names + """ + return list(cls._deployments.keys()) + + +def register_default_deployments(): + """ + Register default deployment implementations. + + Called on module import to register built-in deployments. + """ + # Always register SLURM (no optional dependencies) + from .slurm import SlurmDeployment + + DeploymentFactory.register("slurm", SlurmDeployment) + + # Register Kubernetes if library is available + try: + from .kubernetes import KubernetesDeployment + + DeploymentFactory.register("k8s", KubernetesDeployment) + DeploymentFactory.register("kubernetes", KubernetesDeployment) + except ImportError: + # Kubernetes library not installed, skip registration + pass + + +# Auto-register on module import +register_default_deployments() + diff --git a/src/madengine/deployment/kubernetes.py b/src/madengine/deployment/kubernetes.py new file mode 100644 index 00000000..9c374c50 --- /dev/null +++ b/src/madengine/deployment/kubernetes.py @@ -0,0 +1,3731 @@ +#!/usr/bin/env python3 +""" +Kubernetes Deployment - Container orchestration using Jinja2 templates + Python library. + +Uses Jinja2 templates for manifest generation (industry best practice) and +Kubernetes Python client library for applying manifests. +Requires AMD GPU Device Plugin: https://github.com/ROCm/k8s-device-plugin + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import os +import subprocess +import time +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +try: + from kubernetes import client + from kubernetes import config as k8s_config + from kubernetes.client.rest import ApiException + + KUBERNETES_AVAILABLE = True +except ImportError: + KUBERNETES_AVAILABLE = False + +try: + import yaml + + YAML_AVAILABLE = True +except ImportError: + YAML_AVAILABLE = False + +from jinja2 import Environment, FileSystemLoader, Template + +from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus +from .config_loader import ConfigLoader +from madengine.core.dataprovider import Data +from madengine.core.context import Context +from madengine.core.errors import ConfigurationError, create_error_context +from madengine.utils.gpu_config import resolve_runtime_gpus + + +# Valid distributed launchers +VALID_LAUNCHERS = [ + "torchrun", + "torchtitan", + "deepspeed", + "megatron-lm", + "vllm", + "sglang", + "sglang-disagg" +] + + +def normalize_launcher(launcher_type: Optional[str], deployment_type: str) -> str: + """ + Normalize launcher field based on deployment type and launcher value. + + Logic: + - If launcher is in VALID_LAUNCHERS: keep as-is + - If launcher is None/empty/invalid: + * local → "docker" (runs in Docker container) + * slurm → "docker" (typically uses containers on compute nodes) + * kubernetes → "native" (pod itself is the container) + + Args: + launcher_type: Raw launcher type from config (may be None) + deployment_type: "local", "slurm", or "kubernetes" + + Returns: + Normalized launcher string + """ + # If launcher is valid, keep it + if launcher_type and launcher_type in VALID_LAUNCHERS: + return launcher_type + + # Otherwise, default based on deployment type + if deployment_type == "local": + return "docker" + elif deployment_type == "slurm": + return "docker" + elif deployment_type == "kubernetes": + return "native" + else: + # Fallback for unknown deployment types + return "docker" + + +def is_rocprofv3_available() -> bool: + """ + Check if rocprofv3 is available on the system. + + rocprofv3 is required for multi-node profiling with MPI support. + It's part of rocprofiler-sdk package in ROCm >= 6.4.1. + + Returns: + True if rocprofv3 is available and executable, False otherwise + """ + try: + # Note: rocprofv3 doesn't support --version, use --help instead + result = subprocess.run( + ["rocprofv3", "--help"], + capture_output=True, + text=True, + timeout=5 + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired, OSError): + return False + + +def configure_multi_node_profiling( + nnodes: int, + tools_config: List[Dict], + logger +) -> Dict[str, Any]: + """ + Configure profiling for multi-node runs with rocprofv3 support. + + Industry best practice for multi-node profiling: + - Profile ALL nodes to detect stragglers, load imbalances, and communication bottlenecks + - Use rocprofv3 (MPI-aware) for distributed profiling + - Collect per-node outputs for detailed analysis + + Logic: + 1. Single node (nnodes == 1): Use existing tool behavior + 2. Multi-node (nnodes > 1): + a. Check if rocprofv3 is available + b. If available: Enable per-node profiling, upgrade "rocprof" to "rocprofv3" + c. If not available: Log warning and skip profiling + + Args: + nnodes: Number of nodes in the deployment + tools_config: List of tool configurations from user + logger: Logger instance for messages + + Returns: + Dictionary with profiling configuration: + - enabled: bool - Whether profiling is enabled + - mode: str - "single_node", "multi_node", or "multi_node_unsupported" + - tools: List[Dict] - Processed tool configurations + - per_node_collection: bool - Whether to collect from all nodes + """ + if nnodes == 1: + # Single node - existing behavior works fine + return { + "enabled": True, + "mode": "single_node", + "tools": tools_config, + "per_node_collection": False + } + + # Multi-node case - check rocprofv3 availability + if not is_rocprofv3_available(): + logger.warning( + "╔════════════════════════════════════════════════════════════════════════════╗\n" + "║ Multi-Node Profiling Requirements Not Met ║\n" + "╠════════════════════════════════════════════════════════════════════════════╣\n" + "║ Multi-node profiling requires rocprofv3 (MPI-aware profiling support). ║\n" + "║ ║\n" + "║ Current Status: rocprofv3 NOT FOUND on system ║\n" + "║ ║\n" + "║ Profiling will be SKIPPED for this multi-node run. ║\n" + "║ ║\n" + "║ To enable multi-node profiling: ║\n" + "║ • Install rocprofiler-sdk package (ROCm >= 6.4.1) ║\n" + "║ • Command: apt install rocprofiler-sdk ║\n" + "║ • Or upgrade to ROCm 6.4.1 or later ║\n" + "║ ║\n" + "║ Note: Single-node profiling uses rocprof (no rocprofv3 required) ║\n" + "╚════════════════════════════════════════════════════════════════════════════╝" + ) + return { + "enabled": False, + "mode": "multi_node_unsupported", + "tools": [], + "per_node_collection": False + } + + # rocprofv3 is available - enable full multi-node profiling + logger.info(f"✓ Multi-node profiling enabled for {nnodes} nodes (rocprofv3 detected)") + + # Upgrade "rocprof" tools to "rocprofv3" for multi-node compatibility + upgraded_tools = [] + rocprof_upgraded = False + + for tool in tools_config: + tool_name = tool.get("name") + + if tool_name == "rocprof": + # Upgrade to rocprofv3 for multi-node MPI support + logger.info( + f" → Upgrading 'rocprof' to 'rocprofv3' for multi-node MPI compatibility" + ) + upgraded_tool = tool.copy() + upgraded_tool["name"] = "rocprofv3" + upgraded_tools.append(upgraded_tool) + rocprof_upgraded = True + else: + upgraded_tools.append(tool) + + # Log profiling tools being used + if upgraded_tools: + tool_names = [t.get("name") for t in upgraded_tools] + logger.info(f" → Multi-node profiling tools: {', '.join(tool_names)}") + + # Highlight RCCL trace if present (critical for multi-node communication) + if "rccl_trace" in tool_names: + logger.info(" → ✓ rccl_trace enabled (critical for multi-node communication profiling)") + + return { + "enabled": True, + "mode": "multi_node", + "tools": upgraded_tools, + "per_node_collection": True, + "profiler": "rocprofv3", + "wrapper_mode": "launcher" + } + + +class KubernetesDeployment(BaseDeployment): + """ + Kubernetes cluster deployment using Python client library. + + Uses kubernetes Python API for type-safe, production-ready deployment: + - client.BatchV1Api(): Job creation and management + - client.CoreV1Api(): Pod logs and status + + Requires AMD GPU Device Plugin: https://github.com/ROCm/k8s-device-plugin + + **Workflow**: + 1. User has kubeconfig configured (in-cluster or ~/.kube/config) + 2. madengine run --tags model --additional-context '{"deploy": "k8s", ...}' + 3. Creates K8s Job using built Docker image from build phase + 4. Job runs madengine workflow inside container (no docker-in-docker) + """ + + DEPLOYMENT_TYPE = "k8s" + REQUIRED_TOOLS = [] # No CLI tools needed, uses Python library + + def __init__(self, config: DeploymentConfig): + """ + Initialize Kubernetes deployment with Jinja2 templates. + + Args: + config: Deployment configuration + + Raises: + ImportError: If kubernetes or yaml Python libraries not installed + """ + if not KUBERNETES_AVAILABLE: + raise ImportError( + "Kubernetes Python library not installed.\n" + "Install with: pip install madengine[kubernetes]\n" + "Or: pip install kubernetes" + ) + + if not YAML_AVAILABLE: + raise ImportError( + "PyYAML library not installed.\n" + "Install with: pip install pyyaml" + ) + + # Apply intelligent defaults using ConfigLoader + # This merges built-in presets with user configuration + full_config = ConfigLoader.load_k8s_config(config.additional_context) + config.additional_context = full_config + + super().__init__(config) + + # Parse K8s configuration (now with defaults applied) + self.k8s_config = config.additional_context.get("k8s", {}) + if not self.k8s_config: + self.k8s_config = config.additional_context.get("kubernetes", {}) + + self.namespace = self.k8s_config.get("namespace", "default") + self.gpu_resource_name = self.k8s_config.get("gpu_resource_name", "amd.com/gpu") + + # Setup Jinja2 template environment + template_dir = Path(__file__).parent / "templates" / "kubernetes" + self.jinja_env = Environment(loader=FileSystemLoader(str(template_dir))) + + # Register custom Jinja2 filters + self.jinja_env.filters['dirname'] = lambda path: str(Path(path).parent) + + # Initialize data provider (will be used if models need data) + self.data = None + self.context_for_data = None + + # Load Kubernetes configuration + kubeconfig_path = self.k8s_config.get("kubeconfig") + try: + if kubeconfig_path: + k8s_config.load_kube_config(config_file=kubeconfig_path) + else: + # Try in-cluster first, then default kubeconfig + try: + k8s_config.load_incluster_config() + except (k8s_config.ConfigException, FileNotFoundError): + # Not running in-cluster, try default kubeconfig + k8s_config.load_kube_config() + except Exception as e: + raise RuntimeError(f"Failed to load Kubernetes config: {e}") + + # Initialize API clients + self.batch_v1 = client.BatchV1Api() + self.core_v1 = client.CoreV1Api() + + # Generated resources + self.job_name = None + self.configmap_name = None + self.configmap_yaml = None + self.job_yaml = None + self.service_yaml = None + + def validate(self) -> bool: + """Validate Kubernetes cluster access and configuration.""" + try: + # Test cluster connectivity + version = client.VersionApi().get_code() + self.console.print( + f"[green]✓ Connected to K8s cluster (v{version.major}.{version.minor})[/green]" + ) + + # Check if namespace exists + try: + self.core_v1.read_namespace(self.namespace) + self.console.print( + f"[green]✓ Namespace '{self.namespace}' exists[/green]" + ) + except ApiException as e: + if e.status == 404: + self.console.print( + f"[yellow]⚠ Namespace '{self.namespace}' not found[/yellow]" + ) + return False + raise + + # Validate AMD GPU Device Plugin is deployed + nodes = self.core_v1.list_node() + amd_gpu_nodes = [ + n + for n in nodes.items + if self.gpu_resource_name in n.status.allocatable + ] + + if not amd_gpu_nodes: + self.console.print( + f"[yellow]⚠ No nodes with {self.gpu_resource_name} found[/yellow]\n" + f"[yellow] Ensure AMD GPU Device Plugin is deployed:[/yellow]\n" + f"[yellow] kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml[/yellow]" + ) + return False + + self.console.print(f"[green]✓ Found {len(amd_gpu_nodes)} AMD GPU nodes[/green]") + return True + + except Exception as e: + self.console.print(f"[red]✗ Validation failed: {e}[/red]") + return False + + def prepare(self) -> bool: + """Generate K8s manifests from Jinja2 templates.""" + try: + # Get model info + model_keys = list(self.manifest["built_models"].keys()) + if not model_keys: + raise ValueError("No models in manifest") + + model_key = model_keys[0] + model_info = self.manifest["built_models"][model_key] + image_info = self.manifest["built_images"][model_key] + + # Generate resource names (K8s compatible: lowercase, hyphens) + model_name = model_info["name"].lower().replace("_", "-") + self.job_name = f"madengine-{model_name}" + self.configmap_name = f"{self.job_name}-config" + + # Prepare template context + context = self._prepare_template_context(model_info, image_info) + + # Render ConfigMap template + configmap_template = self.jinja_env.get_template("configmap.yaml.j2") + self.configmap_yaml = configmap_template.render(**context) + + # Render Job template + job_template = self.jinja_env.get_template("job.yaml.j2") + self.job_yaml = job_template.render(**context) + + # Optionally render Service template (for multi-node torchrun) + if context.get("create_headless_service"): + service_template = self.jinja_env.get_template("service.yaml.j2") + self.service_yaml = service_template.render(**context) + + # Debug mode: save rendered manifests + if self.config.additional_context.get("debug", False): + self._save_debug_manifests() + + self.console.print( + f"[green]✓ Prepared K8s manifests: {self.job_name}[/green]" + ) + return True + + except Exception as e: + self.console.print(f"[red]✗ Failed to prepare manifests: {e}[/red]") + import traceback + + traceback.print_exc() + return False + + def gather_system_env_details( + self, pre_scripts: List[Dict], model_name: str + ) -> None: + """ + Gather system environment details by adding rocEnvTool to pre-scripts. + + This ensures K8s deployment collects the same system info as local execution. + + Args: + pre_scripts: List of pre-script configurations + model_name: The model name (used for output file naming) + """ + # Add rocEnvTool pre-script with model-specific output name + pre_env_details = { + "path": "scripts/common/pre_scripts/run_rocenv_tool.sh", + "args": model_name.replace("/", "_") + "_env" + } + pre_scripts.append(pre_env_details) + self.console.print(f"[dim]Added rocEnvTool to pre-scripts with args: {pre_env_details['args']}[/dim]") + + def _add_tool_scripts(self, pre_scripts: List[Dict], post_scripts: List[Dict]) -> None: + """ + Add tool pre/post scripts to execution lists (similar to local execution). + + Extracts pre_scripts and post_scripts from tools.json definitions and adds them + to the pre_scripts and post_scripts lists for execution in K8s pods. + + Args: + pre_scripts: List to append tool pre-scripts to + post_scripts: List to append tool post-scripts to + """ + tools_config = self._get_tools_config() + if not tools_config: + return + + # Load tools.json to get pre/post script definitions + tools_json_path = Path(__file__).parent.parent / "scripts" / "common" / "tools.json" + if not tools_json_path.exists(): + return + + with open(tools_json_path, "r") as f: + tools_definitions = json.load(f) + + # Add pre/post scripts from each configured tool + for tool in tools_config: + tool_name = tool.get("name") + if not tool_name or tool_name not in tools_definitions.get("tools", {}): + continue + + tool_def = tools_definitions["tools"][tool_name] + + # Add pre-scripts (at beginning, like local execution) + if "pre_scripts" in tool_def: + pre_scripts[:0] = tool_def["pre_scripts"] + + # Add post-scripts (at end, like local execution) + if "post_scripts" in tool_def: + post_scripts.extend(tool_def["post_scripts"]) + + def _load_common_scripts(self, script_list: List[Dict]) -> Dict[str, str]: + """ + Load common script contents from madengine package for embedding in ConfigMap. + + Since madengine is not installed in model Docker images, we need to embed + the common scripts (pre_scripts, post_scripts, and tool wrapper scripts) in the ConfigMap. + + Args: + script_list: List of script configurations with 'path' field + + Returns: + Dict mapping relative script paths to their contents + """ + import os + script_contents = {} + madengine_root = Path(__file__).parent.parent # Go up to madengine/ directory + + for script_config in script_list: + script_path = script_config.get("path", "") + if not script_path: + continue + + # Convert to absolute path from madengine root + abs_script_path = madengine_root / script_path + + if abs_script_path.exists() and abs_script_path.is_file(): + with open(abs_script_path, "r") as f: + script_contents[script_path] = f.read() + self.console.print(f"[dim]Loaded common script: {script_path}[/dim]") + + # If it's run_rocenv_tool.sh, also load the entire rocEnvTool directory + if "run_rocenv_tool.sh" in script_path: + rocenv_dir = abs_script_path.parent / "rocEnvTool" + if rocenv_dir.exists() and rocenv_dir.is_dir(): + # Load all Python files + for py_file in rocenv_dir.glob("*.py"): + rel_path = f"scripts/common/pre_scripts/rocEnvTool/{py_file.name}" + with open(py_file, "r") as f: + script_contents[rel_path] = f.read() + self.console.print(f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]") + + # Load all JSON files (e.g., env_tags.json) + for json_file in rocenv_dir.glob("*.json"): + rel_path = f"scripts/common/pre_scripts/rocEnvTool/{json_file.name}" + with open(json_file, "r") as f: + script_contents[rel_path] = f.read() + self.console.print(f"[dim]Loaded rocEnvTool file: {rel_path}[/dim]") + else: + self.console.print(f"[yellow]Warning: Script not found: {script_path} (at {abs_script_path})[/yellow]") + + # Load tool wrapper scripts if tools are configured + tools_config = self._get_tools_config() + if tools_config: + self._load_tool_wrapper_scripts(script_contents, tools_config, madengine_root) + + return script_contents + + def _load_tool_wrapper_scripts(self, script_contents: Dict[str, str], + tools_config: List[Dict], madengine_root: Path) -> None: + """ + Load tool wrapper scripts and tools.json for K8s ConfigMap. + + This enables profiling tools like rocprof to work in K8s deployments. + + Args: + script_contents: Dict to populate with script contents + tools_config: List of tool configurations from manifest + madengine_root: Path to madengine package root + """ + # Load tools.json first + tools_json_path = madengine_root / "scripts" / "common" / "tools.json" + if tools_json_path.exists(): + with open(tools_json_path, "r") as f: + tools_definitions = json.load(f) + script_contents["scripts/common/tools.json"] = json.dumps(tools_definitions, indent=2) + self.console.print(f"[dim]Loaded tools.json[/dim]") + else: + self.console.print(f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]") + return + + # Extract and load wrapper scripts referenced in tool commands + for tool in tools_config: + tool_name = tool.get("name") + if not tool_name: + continue + + # Get tool definition from tools.json + if tool_name not in tools_definitions.get("tools", {}): + self.console.print(f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]") + continue + + tool_def = tools_definitions["tools"][tool_name] + + # Extract cmd - could be from tool config override or tool definition + cmd = tool.get("cmd", tool_def.get("cmd", "")) + + # Check if cmd references a script in scripts/common/tools/ + if "scripts/common/tools/" in cmd: + # Parse script path from command (e.g., "bash ../scripts/common/tools/rocprof_wrapper.sh --runtime-trace") + # or "python3 ../scripts/common/tools/gpu_info_profiler.py" + # Extract the path portion + parts = cmd.split() + for part in parts: + if "scripts/common/tools/" in part: + # Remove ../ prefix if present + script_rel_path = part.replace("../", "") + abs_script_path = madengine_root / script_rel_path + + if abs_script_path.exists() and abs_script_path.is_file(): + with open(abs_script_path, "r") as f: + script_contents[script_rel_path] = f.read() + self.console.print(f"[dim]Loaded tool script: {script_rel_path}[/dim]") + + # If it's a Python script, also load utility modules it might depend on + if script_rel_path.endswith('.py'): + tools_dir = abs_script_path.parent + # Load common utility modules that profiling tools depend on + utility_modules = ['amd_smi_utils.py', 'rocm_smi_utils.py', 'pynvml_utils.py'] + for util_file in utility_modules: + util_path = tools_dir / util_file + if util_path.exists(): + util_rel_path = f"scripts/common/tools/{util_file}" + if util_rel_path not in script_contents: + with open(util_path, "r") as f: + script_contents[util_rel_path] = f.read() + self.console.print(f"[dim]Loaded tool utility module: {util_rel_path}[/dim]") + else: + self.console.print(f"[yellow]Warning: Tool script not found: {script_rel_path} (at {abs_script_path})[/yellow]") + break + + # Also load any tool-specific pre_scripts and post_scripts + for script_config in tool_def.get("pre_scripts", []): + script_path = script_config.get("path", "") + if script_path and script_path not in script_contents: + abs_script_path = madengine_root / script_path + if abs_script_path.exists(): + with open(abs_script_path, "r") as f: + script_contents[script_path] = f.read() + self.console.print(f"[dim]Loaded tool pre-script: {script_path}[/dim]") + + for script_config in tool_def.get("post_scripts", []): + script_path = script_config.get("path", "") + if script_path and script_path not in script_contents: + abs_script_path = madengine_root / script_path + if abs_script_path.exists(): + with open(abs_script_path, "r") as f: + script_contents[script_path] = f.read() + self.console.print(f"[dim]Loaded tool post-script: {script_path}[/dim]") + + # NEW: Scan pre-scripts for dependencies on scripts/common/tools/ files + # This handles cases like gpu_info_vram_profiler where the pre-script + # calls python3 scripts/common/tools/gpu_info_profiler.py but the tool + # definition has an empty cmd field + for script_config in tool_def.get("pre_scripts", []): + script_path = script_config.get("path", "") + if script_path: + abs_script_path = madengine_root / script_path + if abs_script_path.exists(): + # Read the pre-script to find any tool script references + with open(abs_script_path, "r") as f: + script_content = f.read() + # Look for references to scripts/common/tools/ in the pre-script + import re + # Use non-capturing group (?:...) to avoid capturing just the ../ part + tool_refs = re.findall(r'(?:\.\./)?scripts/common/tools/[\w_]+\.py', script_content) + for tool_ref in tool_refs: + # Clean up the path + tool_script_path = tool_ref.strip('"\'').replace("../", "") + abs_tool_path = madengine_root / tool_script_path + + if abs_tool_path.exists() and tool_script_path not in script_contents: + with open(abs_tool_path, "r") as tf: + script_contents[tool_script_path] = tf.read() + self.console.print(f"[dim]Loaded tool dependency: {tool_script_path}[/dim]") + + # Also load utility modules for this Python script + if tool_script_path.endswith('.py'): + tools_dir = abs_tool_path.parent + utility_modules = ['amd_smi_utils.py', 'rocm_smi_utils.py', 'pynvml_utils.py'] + for util_file in utility_modules: + util_path = tools_dir / util_file + if util_path.exists(): + util_rel_path = f"scripts/common/tools/{util_file}" + if util_rel_path not in script_contents: + with open(util_path, "r") as uf: + script_contents[util_rel_path] = uf.read() + self.console.print(f"[dim]Loaded utility module (from dependency): {util_rel_path}[/dim]") + + def _prepare_template_context( + self, model_info: Dict, image_info: Dict + ) -> Dict[str, Any]: + """ + Prepare context dictionary for Jinja2 template rendering. + + Args: + model_info: Model configuration from build_manifest.json + image_info: Image information from build_manifest.json + + Returns: + Context dictionary with all template variables + """ + # Use hierarchical GPU resolution: runtime > deployment > model > default + additional_context = self.config.additional_context.copy() + additional_context["k8s"] = self.k8s_config + gpu_count = resolve_runtime_gpus(model_info, additional_context) + model_name = model_info["name"] + + # Load manifest and credential content for ConfigMap + with open(self.config.manifest_file, "r") as f: + manifest_content = f.read() + + credential_content = "{}" + credential_path = Path("credential.json") + if credential_path.exists(): + with open(credential_path, "r") as f: + credential_content = f.read() + + # Load data.json content if exists + data_json_content = None + data_path = Path("data.json") + if data_path.exists(): + with open(data_path, "r") as f: + data_json_content = f.read() + self.console.print(f"[dim]Loaded data.json[/dim]") + + # Load model scripts directory content (entire folder, not just one file) + # This matches local execution which mounts the entire MODEL_DIR/scripts folder + model_script_path = model_info.get("scripts") # e.g., "scripts/dummy/run_data_minio.sh" + model_script_dir = None + model_script_filename = None + model_scripts_contents = {} # Store all scripts in the directory + + if model_script_path: + script_file = Path(model_script_path) + # Extract directory and filename + model_script_dir = str(script_file.parent) # e.g., "scripts/dummy" + model_script_filename = script_file.name # e.g., "run_data_minio.sh" + + # Load ALL scripts from the model's scripts directory + # This is critical for models that have multiple helper scripts + scripts_dir_path = Path(model_script_dir) + if scripts_dir_path.exists() and scripts_dir_path.is_dir(): + for script in scripts_dir_path.glob("*.sh"): + with open(script, "r") as f: + # Use the path directly if relative, otherwise convert to relative + if script.is_absolute(): + relative_path = str(script.relative_to(Path.cwd())) + else: + relative_path = str(script) + model_scripts_contents[relative_path] = f.read() + + # Also check for Python scripts + for script in scripts_dir_path.glob("*.py"): + with open(script, "r") as f: + # Use the path directly if relative, otherwise convert to relative + if script.is_absolute(): + relative_path = str(script.relative_to(Path.cwd())) + else: + relative_path = str(script) + model_scripts_contents[relative_path] = f.read() + + # Also check for JSON config files (e.g., DeepSpeed configs) + for script in scripts_dir_path.glob("*.json"): + with open(script, "r") as f: + # Use the path directly if relative, otherwise convert to relative + if script.is_absolute(): + relative_path = str(script.relative_to(Path.cwd())) + else: + relative_path = str(script) + model_scripts_contents[relative_path] = f.read() + + self.console.print(f"[dim]Loaded {len(model_scripts_contents)} script(s) from {model_script_dir}[/dim]") + elif script_file.exists(): + # Fallback: load single file if directory doesn't exist + with open(script_file, "r") as f: + model_scripts_contents[model_script_path] = f.read() + self.console.print(f"[dim]Loaded single script: {model_script_path}[/dim]") + else: + self.console.print(f"[yellow]Warning: Script not found: {model_script_path}[/yellow]") + + # Load K8s tools configuration + k8s_tools_config = self._load_k8s_tools() + + # Prepare data configuration first + data_config = self._prepare_data_config(model_info) + + # Store for use in deploy() method + self._data_config = data_config + + # K8s best practice: Auto-create shared data PVC if needed + # K8s philosophy: Separate compute (pods) from storage (PVC) + if data_config and not self.k8s_config.get("data_pvc"): + # PVC will be auto-created during deployment + # Use consistent name for reusability across training runs + self.console.print( + f"[cyan]📦 Data provider detected: Will auto-create shared data PVC[/cyan]" + ) + self.console.print( + f"[dim] PVC name: madengine-shared-data (reusable across runs)[/dim]" + ) + self.console.print( + f"[dim] Access mode: RWO for single-node, RWX for multi-node (auto-selected)[/dim]" + ) + self.console.print( + f"[dim] To use existing PVC, add 'data_pvc' to your K8s config[/dim]" + ) + # Set PVC name now so templates are rendered with correct value + self.k8s_config["data_pvc"] = "madengine-shared-data" + + # Determine data provider script if model needs data + data_provider_script = None + data_provider_script_content = None + if data_config: + provider_type = data_config.get("provider_type", "local") + if provider_type in k8s_tools_config.get("data_providers", {}): + data_provider_script = k8s_tools_config["data_providers"][provider_type] + + # Load K8s data provider script content + k8s_script_path = Path(__file__).parent.parent / data_provider_script["script"] + if k8s_script_path.exists(): + with open(k8s_script_path, "r") as f: + data_provider_script_content = f.read() + self.console.print(f"[dim]Loaded K8s data provider: {data_provider_script['script']}[/dim]") + else: + self.console.print(f"[yellow]Warning: K8s script not found: {k8s_script_path}[/yellow]") + + # Get launcher configuration from manifest's deployment_config or additional_context + deployment_config = self.manifest.get("deployment_config", {}) + distributed_config = deployment_config.get("distributed", {}) + launcher_config = self.config.additional_context.get("launcher", {}) + + # Merge manifest and runtime launcher config (runtime overrides) + # Use explicit None checking to handle 0 values correctly + launcher_type = ( + launcher_config.get("type") + if launcher_config.get("type") is not None + else distributed_config.get("launcher") + ) + + nnodes = ( + launcher_config.get("nnodes") + if launcher_config.get("nnodes") is not None + else distributed_config.get("nnodes", 1) + ) + + # Store for use in deploy() method + self._nnodes = nnodes + + nproc_per_node = ( + launcher_config.get("nproc_per_node") + if launcher_config.get("nproc_per_node") is not None + else distributed_config.get("nproc_per_node") + if distributed_config.get("nproc_per_node") is not None + else int(model_info.get("n_gpus", 1)) + ) + + master_port = launcher_config.get("master_port", 29500) + + # Validate configuration + if launcher_type == "torchrun": + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") + + self.console.print(f"[cyan]Configuring torchrun: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + + elif launcher_type == "deepspeed": + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") + + self.console.print(f"[cyan]Configuring DeepSpeed: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + + elif launcher_type == "torchtitan": + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") + + self.console.print(f"[cyan]Configuring TorchTitan: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + + elif launcher_type == "vllm": + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") + + self.console.print(f"[cyan]Configuring vLLM: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + + elif launcher_type == "sglang": + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") + + self.console.print(f"[cyan]Configuring SGLang: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + + elif launcher_type == "megatron": + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"Invalid nnodes: {nnodes}. Must be positive integer >= 1") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"Invalid nproc_per_node: {nproc_per_node}. Must be positive integer >= 1") + + self.console.print(f"[cyan]Configuring Megatron-LM: {nnodes} nodes × {nproc_per_node} GPUs/node[/cyan]") + + # Determine if we need multi-node setup + create_headless_service = False + launcher_command = None + + if launcher_type == "torchrun": + if nnodes > 1: + create_headless_service = True + self.console.print(f"[dim]Multi-node detected: Creating headless service for pod discovery[/dim]") + + # Generate torchrun launcher command + launcher_command = self._generate_torchrun_command( + nnodes=nnodes, + nproc_per_node=nproc_per_node, + master_port=master_port, + model_script=model_info.get("scripts", "run.sh") + ) + + elif launcher_type == "deepspeed": + if nnodes > 1: + create_headless_service = True + self.console.print(f"[dim]Multi-node DeepSpeed: Creating headless service for pod discovery[/dim]") + + model_script = model_info.get("scripts", "run.sh") + + # Check if script is a bash script - if so, execute it directly + # as it will handle the launcher internally + if model_script.endswith('.sh'): + self.console.print(f"[dim]Detected bash script ({model_script}), will execute directly[/dim]") + launcher_command = self._generate_bash_script_command( + nnodes=nnodes, + nproc_per_node=nproc_per_node, + master_port=master_port, + model_script=model_script + ) + else: + # Python script - use DeepSpeed launcher + launcher_command = self._generate_deepspeed_command( + nnodes=nnodes, + nproc_per_node=nproc_per_node, + master_port=master_port, + model_script=model_script + ) + + elif launcher_type == "torchtitan": + if nnodes > 1: + create_headless_service = True + self.console.print(f"[dim]Multi-node TorchTitan: Creating headless service for pod discovery[/dim]") + + # Generate TorchTitan launcher command + launcher_command = self._generate_torchtitan_command( + nnodes=nnodes, + nproc_per_node=nproc_per_node, + master_port=master_port, + model_script=model_info.get("scripts", "run.sh") + ) + + elif launcher_type == "vllm": + if nnodes > 1: + create_headless_service = True + self.console.print(f"[dim]Multi-node vLLM: Creating headless service for Ray cluster[/dim]") + + # Generate vLLM launcher command + launcher_command = self._generate_vllm_command( + nnodes=nnodes, + nproc_per_node=nproc_per_node, + master_port=master_port, + model_script=model_info.get("scripts", "run.sh") + ) + + elif launcher_type == "sglang": + if nnodes > 1: + create_headless_service = True + self.console.print(f"[dim]Multi-node SGLang: Creating headless service for Ray cluster[/dim]") + + # Generate SGLang launcher command + launcher_command = self._generate_sglang_command( + nnodes=nnodes, + nproc_per_node=nproc_per_node, + master_port=master_port, + model_script=model_info.get("scripts", "run.sh") + ) + + elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": + if nnodes < 3: + raise ValueError( + f"SGLang Disaggregated requires minimum 3 nodes " + f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" + ) + + # Always create headless service for disaggregated architecture + create_headless_service = True + self.console.print(f"[dim]SGLang Disaggregated: Creating headless service for {nnodes} pods[/dim]") + self.console.print(f"[dim] Architecture: 1 proxy + {max(1, (nnodes-1)*2//5)} prefill + {nnodes-1-max(1, (nnodes-1)*2//5)} decode[/dim]") + + # Generate SGLang Disaggregated launcher command + launcher_command = self._generate_sglang_disagg_command( + nnodes=nnodes, + nproc_per_node=nproc_per_node, + master_port=master_port, + model_script=model_info.get("scripts", "run.sh") + ) + + elif launcher_type == "megatron": + if nnodes > 1: + create_headless_service = True + self.console.print(f"[dim]Multi-node Megatron-LM: Creating headless service for pod discovery[/dim]") + + # Generate Megatron-LM launcher command + launcher_command = self._generate_megatron_command( + nnodes=nnodes, + nproc_per_node=nproc_per_node, + master_port=master_port, + model_script=model_info.get("scripts", "run.sh") + ) + + # Prepare pre/post scripts (similar to local execution) + pre_scripts = [] + post_scripts = [] + + # Get pre/post scripts from manifest context if available + if "context" in self.manifest: + if "pre_scripts" in self.manifest["context"]: + pre_scripts.extend(self.manifest["context"]["pre_scripts"]) + if "post_scripts" in self.manifest["context"]: + post_scripts.extend(self.manifest["context"]["post_scripts"]) + + # Add system environment collection (rocEnvTool) - same as local execution + # This is controlled by generate_sys_env_details flag (default: True) + generate_sys_env_details = self.config.additional_context.get("generate_sys_env_details", True) + if generate_sys_env_details: + self.gather_system_env_details(pre_scripts, model_info["name"]) + + # Add tool pre/post scripts to the execution lists (like local execution) + self._add_tool_scripts(pre_scripts, post_scripts) + + # Load pre/post script contents for ConfigMap (since madengine not installed in container) + pre_post_script_contents = self._load_common_scripts(pre_scripts + post_scripts) + + # Build complete context + context = { + # Job metadata + "job_name": self.job_name, + "namespace": self.namespace, + "model_name": model_name, + # ConfigMap + "configmap_name": self.configmap_name, + "manifest_content": manifest_content, + "credential_content": credential_content, + "data_json_content": data_json_content, + "model_scripts_contents": model_scripts_contents, # All scripts in directory + "model_script_path": model_script_path, + "model_script_dir": model_script_dir, + "model_script_filename": model_script_filename, + # K8s tools + "data_provider_script": data_provider_script, + "data_provider_script_content": data_provider_script_content, + # Image + "image": image_info["registry_image"], + "image_pull_policy": self.k8s_config.get("image_pull_policy", "Always"), + # Resources + "gpu_resource_name": self.gpu_resource_name, + "gpu_count": gpu_count, + "memory": self.k8s_config.get("memory", "128Gi"), + "memory_limit": self.k8s_config.get("memory_limit", "256Gi"), + "cpu": self.k8s_config.get("cpu", "32"), + "cpu_limit": self.k8s_config.get("cpu_limit", "64"), + # Job spec + "completions": nnodes, + "parallelism": nnodes, + "completion_mode": "Indexed" if nnodes > 1 else None, + "backoff_limit": self.k8s_config.get("backoff_limit", 3), + # Pod spec + "node_selector": self.k8s_config.get("node_selector", {}), + "tolerations": self.k8s_config.get("tolerations", []), + "host_ipc": nnodes > 1, # Enable for multi-node + "subdomain": self.job_name if (launcher_type == "torchrun" and nnodes > 1) else None, + # Execution + "gpu_visibility": ",".join(str(i) for i in range(gpu_count)), # e.g., "0" for 1 GPU, "0,1" for 2 GPUs + "gpu_architecture": self.manifest.get("context", {}).get( + "gpu_architecture", "gfx90a" + ), + "model_script": f"{model_info.get('scripts', 'run.sh')} {model_info.get('args', '')}".strip(), + "launcher_type": launcher_type, + "launcher_command": launcher_command, + "nnodes": nnodes, + "nproc_per_node": nproc_per_node, + "master_port": master_port, + "timeout": self.config.timeout, + # Environment - Merge base env vars with data/tools env vars + "env_vars": self._prepare_env_vars(model_info), + # Volumes + "results_pvc": f"{self.job_name}-results", # Always create a PVC for results + "pvc_name": f"{self.job_name}-results", # PVC name for template + "data_pvc": self.k8s_config.get("data_pvc"), + # Multi-node + "create_headless_service": create_headless_service, + "service_name": self.job_name, + "ports": [29500] if create_headless_service else [], + # Data provider configuration (already prepared above) + "data_config": data_config, + # Tools configuration - from manifest.context or additional_context + "tools_config": self._get_tools_config(), + # Tool command chains (pre-built for template) + # Tool command chains (pre-built for template) + "launcher_tool_chain": self._build_tool_command_chain( + self._get_tools_config(), "bash /tmp/run_launcher.sh" + ) if launcher_command else None, + "direct_script_tool_chain": self._build_tool_command_chain( + self._get_tools_config(), f"bash {model_info.get('scripts', 'run.sh')}" + ), + # Pre/Post scripts - includes rocEnvTool and any user-defined scripts + "pre_scripts": pre_scripts, + "post_scripts": post_scripts, + # Common script contents for ConfigMap (embedded since madengine not in container) + "common_script_contents": pre_post_script_contents, + } + + return context + + def _get_tools_config(self) -> List[Dict]: + """ + Get tools configuration from manifest.context or additional_context. + + Prioritizes runtime additional_context, falls back to manifest.context. + + For multi-node runs: + - Checks rocprofv3 availability (required for MPI profiling) + - Upgrades "rocprof" to "rocprofv3" for multi-node compatibility + - Logs warnings if rocprofv3 not available + + Returns: + List of tool configurations (enriched with cmd from tools.json) + """ + # Cache the result to avoid repeated expensive checks and duplicate warnings + if hasattr(self, '_cached_tools_config'): + return self._cached_tools_config + + # Check runtime additional_context first (allows runtime override) + tools = self.config.additional_context.get("tools", []) + + # Fall back to manifest.context if no runtime tools + if not tools and "context" in self.manifest: + tools = self.manifest["context"].get("tools", []) + + # Apply multi-node profiling logic if applicable + distributed_config = self.config.additional_context.get("distributed", {}) + nnodes = distributed_config.get("nnodes", 1) + + if nnodes > 1 and tools: + # Configure multi-node profiling (handles rocprofv3 detection and tool upgrades) + # Create a simple logger wrapper for configure_multi_node_profiling + class ConsoleLogger: + def __init__(self, console): + self.console = console + def info(self, msg): + self.console.print(f"[cyan]{msg}[/cyan]") + def warning(self, msg): + self.console.print(f"[yellow]{msg}[/yellow]") + def debug(self, msg): + pass # Skip debug messages in console + + profiling_config = configure_multi_node_profiling( + nnodes=nnodes, + tools_config=tools, + logger=ConsoleLogger(self.console) + ) + + if profiling_config["enabled"]: + tools = profiling_config["tools"] + else: + # rocprofv3 not available - skip profiling for multi-node + tools = [] + + # Enrich tools with cmd from tools.json for K8s template usage + result = self._enrich_tools_with_cmd(tools) + + # Cache the result for subsequent calls + self._cached_tools_config = result + return result + + def _build_tool_command_chain(self, tools_config: List[Dict], base_command: str) -> str: + """ + Build a command chain from multiple tools, wrapping the base command. + + Tools are chained from outermost to innermost: + tool_n wraps tool_2 wraps tool_1 wraps base_command + + Each tool's OUTPUT_FILE env var is set inline to avoid conflicts. + + Args: + tools_config: List of enriched tool configurations + base_command: The base command to wrap (e.g., "bash /tmp/run_launcher.sh") + + Returns: + Complete command chain string + """ + if not tools_config: + return base_command + + # Filter tools that have a cmd field + tools_with_cmd = [t for t in tools_config if t.get("cmd")] + + if not tools_with_cmd: + return base_command + + # Build command chain from inside out (reverse order) + cmd_chain = base_command + for tool in reversed(tools_with_cmd): + tool_cmd = tool["cmd"].replace("../scripts/common/", "scripts/common/") + + # Set OUTPUT_FILE inline for this specific tool (if defined in tool's env_vars) + tool_env_vars = tool.get("env_vars", {}) + if "OUTPUT_FILE" in tool_env_vars: + output_file = tool_env_vars["OUTPUT_FILE"] + # Prepend OUTPUT_FILE=value to this tool's command only + cmd_chain = f"OUTPUT_FILE={output_file} {tool_cmd} {cmd_chain}" + else: + cmd_chain = f"{tool_cmd} {cmd_chain}" + + return cmd_chain + + def _enrich_tools_with_cmd(self, tools: List[Dict]) -> List[Dict]: + """ + Enrich tools configuration with cmd field from tools.json. + + This is needed for K8s template to generate the correct encapsulation command. + + Args: + tools: List of tool configurations (may only have 'name' field) + + Returns: + Enriched list with 'cmd' field added from tools.json + """ + if not tools: + return tools + + # Load tools.json + tools_json_path = Path(__file__).parent.parent / "scripts" / "common" / "tools.json" + if not tools_json_path.exists(): + self.console.print(f"[yellow]Warning: tools.json not found at {tools_json_path}[/yellow]") + return tools + + with open(tools_json_path, "r") as f: + tools_definitions = json.load(f) + + enriched_tools = [] + for tool in tools: + tool_name = tool.get("name") + if not tool_name: + enriched_tools.append(tool) + continue + + # Get tool definition from tools.json + if tool_name not in tools_definitions.get("tools", {}): + self.console.print(f"[yellow]Warning: Tool '{tool_name}' not found in tools.json[/yellow]") + enriched_tools.append(tool) + continue + + tool_def = tools_definitions["tools"][tool_name] + + # Create enriched tool config with cmd + enriched_tool = tool.copy() + if "cmd" not in enriched_tool and "cmd" in tool_def: + enriched_tool["cmd"] = tool_def["cmd"] + + # Also copy env_vars if present + if "env_vars" not in enriched_tool and "env_vars" in tool_def: + enriched_tool["env_vars"] = tool_def["env_vars"] + + enriched_tools.append(enriched_tool) + + return enriched_tools + + def _generate_torchrun_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate torchrun launcher command for K8s Indexed Jobs. + + For single-node (nnodes=1), generates standalone torchrun command. + For multi-node (nnodes>1), generates distributed torchrun with headless + service DNS for coordination. + + Uses K8s environment variables for distributed coordination: + - JOB_COMPLETION_INDEX: Pod index (0, 1, 2, ...) + - Headless service DNS for MASTER_ADDR + + CRITICAL FIX: For bash scripts that use ${BASH_SOURCE[0]}, we cd into the + script directory first so relative paths resolve correctly. This fixes the + issue where profiling tool wrappers prevent BASH_SOURCE from resolving. + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port. Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete torchrun command string + + Raises: + ValueError: If any parameter is invalid + """ + from pathlib import Path + + # Validate inputs (defensive programming) + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # Check if model_script is a bash script + # If so, execute it directly as it handles torchrun internally + if model_script.endswith('.sh'): + # For bash scripts, set environment variables and execute script + # The script itself will invoke torchrun with the appropriate Python file + # CRITICAL: cd to script directory first so BASH_SOURCE[0] resolves correctly + script_dir = str(Path(model_script).parent) + script_name = str(Path(model_script).name) + if nnodes == 1: + return f"""export MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node={nproc_per_node}" +export MAD_RUNTIME_NGPUS={nproc_per_node} +cd {script_dir} && bash {script_name}""" + else: + return f"""# Multi-node torchrun setup (Kubernetes Indexed Job) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export MAD_MULTI_NODE_RUNNER="torchrun --nnodes={nnodes} --nproc_per_node={nproc_per_node} --node_rank=${{JOB_COMPLETION_INDEX}} --master_addr=${{MASTER_ADDR}} --master_port={master_port}" +export MAD_RUNTIME_NGPUS={nproc_per_node} +cd {script_dir} && bash {script_name}""" + + # For Python scripts, invoke torchrun directly + # For single-node, simpler standalone command + if nnodes == 1: + return f"""torchrun \\ + --standalone \\ + --nnodes=1 \\ + --nproc_per_node={nproc_per_node} \\ + {model_script}""" + + # Multi-node: Use headless service DNS and JOB_COMPLETION_INDEX + return f"""# Multi-node torchrun setup (Kubernetes Indexed Job) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export RANK=${{JOB_COMPLETION_INDEX}} +export WORLD_SIZE={nnodes} +export LOCAL_RANK=0 +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +echo "Torchrun Configuration:" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " RANK: $RANK" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NPROC_PER_NODE: $NPROC_PER_NODE" + +torchrun \\ + --nnodes={nnodes} \\ + --nproc_per_node={nproc_per_node} \\ + --rdzv_backend=c10d \\ + --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \\ + --rdzv_id={self.job_name} \\ + --role=worker \\ + --tee=3 \\ + {model_script}""" + + def _generate_deepspeed_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate DeepSpeed launcher command for K8s Indexed Jobs. + + DeepSpeed has its own launcher that handles: + - ZeRO optimization stages (ZeRO-1, ZeRO-2, ZeRO-3) + - Gradient accumulation + - Mixed precision training + - Pipeline parallelism + - Hostfile management (handled by K8s in our case) + + For single-node (nnodes=1), uses localhost setup. + For multi-node (nnodes>1), uses headless service DNS for coordination. + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port. Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete DeepSpeed launcher command string + + Raises: + ValueError: If any parameter is invalid + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node + if nnodes == 1: + return f"""# DeepSpeed Single-Node Setup +export MASTER_ADDR=localhost +export MASTER_PORT={master_port} +export RANK=0 +export LOCAL_RANK=0 +export WORLD_SIZE={nproc_per_node} + +echo "DeepSpeed Configuration:" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NUM_GPUS: {nproc_per_node}" + +# DeepSpeed launcher (single-node) +deepspeed --num_gpus={nproc_per_node} \\ + --master_port={master_port} \\ + {model_script}""" + + # Multi-node: Use K8s headless service for coordination + return f"""# Multi-node DeepSpeed setup (Kubernetes Indexed Job) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export RANK=${{JOB_COMPLETION_INDEX}} +export LOCAL_RANK=0 +export WORLD_SIZE={nnodes * nproc_per_node} +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +echo "DeepSpeed Multi-Node Configuration:" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " RANK (Node Rank): $RANK" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NNODES: $NNODES" +echo " NPROC_PER_NODE: $NPROC_PER_NODE" + +# Create hostfile for DeepSpeed (K8s Indexed Job aware) +cat > /tmp/hostfile << EOF +{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local slots={nproc_per_node} +EOF + +# Add all nodes to hostfile +for i in $(seq 1 $((NNODES - 1))); do + echo "{self.job_name}-$i.{self.job_name}.{self.namespace}.svc.cluster.local slots={nproc_per_node}" >> /tmp/hostfile +done + +echo "" +echo "Generated hostfile:" +cat /tmp/hostfile +echo "" + +# DeepSpeed launcher (multi-node with hostfile) +deepspeed --hostfile=/tmp/hostfile \\ + --master_addr=$MASTER_ADDR \\ + --master_port=$MASTER_PORT \\ + --num_nodes={nnodes} \\ + --num_gpus={nproc_per_node} \\ + {model_script}""" + + def _generate_bash_script_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate command to execute a bash script directly. + + This is used when the model script is a .sh file that handles + launcher invocation internally (e.g., using torchrun inside the script). + + Sets up environment variables for distributed training that the bash + script can use. + + Args: + nnodes: Number of nodes (pods) + nproc_per_node: GPUs per node + master_port: Master communication port + model_script: Path to the bash script + + Returns: + Command to execute the bash script with environment setup + """ + # For single-node + if nnodes == 1: + return f"""# Bash Script Execution (Single-Node) +# Setting up environment for script to use +export MASTER_ADDR=localhost +export MASTER_PORT={master_port} +export RANK=0 +export LOCAL_RANK=0 +export WORLD_SIZE={nproc_per_node} +export NNODES=1 +export NPROC_PER_NODE={nproc_per_node} + +echo "Bash Script Configuration:" +echo " Script: {model_script}" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NNODES: $NNODES" +echo " NPROC_PER_NODE: $NPROC_PER_NODE" +echo "" + +# Execute the bash script directly +bash {model_script}""" + + # Multi-node: Use K8s headless service for coordination + return f"""# Bash Script Execution (Multi-Node) +# Setting up environment for script to use +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export RANK=${{JOB_COMPLETION_INDEX}} +export LOCAL_RANK=0 +export WORLD_SIZE={nnodes * nproc_per_node} +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +echo "Bash Script Multi-Node Configuration:" +echo " Script: {model_script}" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " RANK (Node Rank): $RANK" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NNODES: $NNODES" +echo " NPROC_PER_NODE: $NPROC_PER_NODE" +echo "" + +# Execute the bash script directly +bash {model_script}""" + + def _generate_torchtitan_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate TorchTitan launcher command for K8s Indexed Jobs. + + TorchTitan is a PyTorch native platform for large-scale LLM pre-training + that supports multi-dimensional parallelism: + - FSDP2 (Fully Sharded Data Parallel v2) + - Tensor Parallel (TP) + - Pipeline Parallel (PP) + - Context Parallel (CP) + + TorchTitan uses torchrun as its underlying distributed launcher but + requires additional configuration for its parallelism strategies. + + For single-node (nnodes=1): Uses standalone torchrun with TP + For multi-node (nnodes>1): Uses distributed torchrun with TP+PP+FSDP2 + + Uses K8s environment variables for distributed coordination: + - JOB_COMPLETION_INDEX: Pod index (0, 1, 2, ...) + - Headless service DNS for MASTER_ADDR + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port. Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete torchtitan launch command string with environment setup + + Raises: + ValueError: If any parameter is invalid + + Example single-node output: + export TORCHTITAN_TENSOR_PARALLEL_SIZE=8 + export TORCHTITAN_PIPELINE_PARALLEL_SIZE=1 + torchrun --standalone --nproc_per_node=8 train.py --config llama3_8b.toml + + Example multi-node output: + export MASTER_ADDR="job-0.job.namespace.svc.cluster.local" + export TORCHTITAN_TENSOR_PARALLEL_SIZE=8 + export TORCHTITAN_PIPELINE_PARALLEL_SIZE=4 + export TORCHTITAN_FSDP_ENABLED=1 + torchrun --nnodes=4 --nproc_per_node=8 ... train.py --config llama3_405b.toml + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node, use standalone mode with Tensor Parallelism only + if nnodes == 1: + return f"""# TorchTitan single-node setup (Tensor Parallelism) +export TORCHTITAN_TENSOR_PARALLEL_SIZE={nproc_per_node} +export TORCHTITAN_PIPELINE_PARALLEL_SIZE=1 +export TORCHTITAN_FSDP_ENABLED=0 +export TORCHTITAN_CONTEXT_PARALLEL_SIZE=1 + +echo "TorchTitan Configuration (Single Node):" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Pipeline Parallel Size: 1" +echo " Total GPUs: {nproc_per_node}" + +torchrun \\ + --standalone \\ + --nnodes=1 \\ + --nproc_per_node={nproc_per_node} \\ + {model_script}""" + + # Multi-node: Use headless service DNS and enable all parallelism strategies + return f"""# TorchTitan multi-node setup (K8s Indexed Job) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export RANK=${{JOB_COMPLETION_INDEX}} +export WORLD_SIZE={nnodes} +export LOCAL_RANK=0 +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +# TorchTitan multi-dimensional parallelism configuration +# These can be overridden by TOML config file in model script +export TORCHTITAN_TENSOR_PARALLEL_SIZE={nproc_per_node} +export TORCHTITAN_PIPELINE_PARALLEL_SIZE={nnodes} +export TORCHTITAN_FSDP_ENABLED=1 +export TORCHTITAN_CONTEXT_PARALLEL_SIZE=1 + +echo "TorchTitan Configuration (Multi-Node):" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " RANK: $RANK" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Pipeline Parallel Size: {nnodes}" +echo " FSDP: Enabled" +echo " Total GPUs: {nnodes * nproc_per_node}" + +torchrun \\ + --nnodes={nnodes} \\ + --nproc_per_node={nproc_per_node} \\ + --rdzv_backend=c10d \\ + --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \\ + --rdzv_id={self.job_name} \\ + --role=worker \\ + --tee=3 \\ + {model_script}""" + + def _generate_sglang_disagg_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate SGLang Disaggregated launcher command for K8s Indexed Jobs. + + SGLang Disaggregated uses separate node pools for: + - Proxy (index 0): Load balancer and request router + - Prefill (indices 1 to xP): Prompt processing + - Decode (indices xP+1 to end): Token generation + + Communication via Mooncake framework for efficient KV cache transfer. + + Architecture: + - Pod 0: Runs mini_lb (proxy/load balancer) + - Pods 1-xP: Run prefill servers + - Pods xP+1 to N-1: Run decode servers + + Args: + nnodes: Total number of pods (must be >= 3) + nproc_per_node: GPUs per pod + master_port: Port for proxy service + model_script: Path to model launch script + + Returns: + Complete disaggregated launch setup + + Raises: + ValueError: If nnodes < 3 or invalid parameters + """ + # Validate + if not isinstance(nnodes, int) or nnodes < 3: + raise ValueError( + f"SGLang Disaggregated requires minimum 3 nodes, got {nnodes}" + ) + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be >= 1, got {nproc_per_node}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string") + + # Check if custom split is specified in additional_context + sglang_disagg_config = self.config.additional_context.get("distributed", {}).get("sglang_disagg", {}) + prefill_nodes = sglang_disagg_config.get("prefill_nodes") + decode_nodes = sglang_disagg_config.get("decode_nodes") + + if prefill_nodes is not None and decode_nodes is not None: + # User specified custom split - validate + if prefill_nodes < 1 or decode_nodes < 1: + raise ValueError( + f"SGLang Disaggregated requires at least 1 prefill and 1 decode node, " + f"got prefill={prefill_nodes}, decode={decode_nodes}" + ) + if prefill_nodes + decode_nodes + 1 != nnodes: + raise ValueError( + f"Custom split validation failed: " + f"prefill_nodes ({prefill_nodes}) + decode_nodes ({decode_nodes}) + 1 proxy " + f"must equal nnodes ({nnodes}), but got {prefill_nodes + decode_nodes + 1}" + ) + xP = prefill_nodes + yD = decode_nodes + else: + # Default automatic split (can be customized via additional_context) + xP = max(1, (nnodes - 1) * 2 // 5) # ~40% prefill + yD = nnodes - 1 - xP # remaining decode + + # Build prefill and decode server lists + prefill_servers = " ".join([ + f"http://{self.job_name}-{i}.{self.job_name}.{self.namespace}.svc.cluster.local:30000" + for i in range(1, xP + 1) + ]) + + decode_servers = " ".join([ + f"http://{self.job_name}-{i}.{self.job_name}.{self.namespace}.svc.cluster.local:30000" + for i in range(xP + 1, nnodes) + ]) + + return f"""# SGLang Disaggregated K8s Setup +# ============================================ +# Cluster: {nnodes} pods total +# Proxy: Pod 0 +# Prefill: Pods 1-{xP} ({xP} nodes) +# Decode: Pods {xP+1}-{nnodes-1} ({yD} nodes) +# ============================================ + +export POD_INDEX=${{JOB_COMPLETION_INDEX:-0}} +export TOTAL_PODS={nnodes} +export PREFILL_COUNT={xP} +export DECODE_COUNT={yD} +export TP_SIZE={nproc_per_node} + +# Get pod IP +export POD_IP=$(hostname -i | awk '{{print $1}}') + +echo "==========================================" +echo "SGLang Disaggregated Pod Info" +echo "==========================================" +echo "Pod Index: $POD_INDEX" +echo "Pod IP: $POD_IP" +echo "Total Pods: $TOTAL_PODS" +echo "Prefill Pods: $PREFILL_COUNT" +echo "Decode Pods: $DECODE_COUNT" +echo "TP Size: $TP_SIZE" +echo "==========================================" + +# Node role assignment based on pod index +if [ "$POD_INDEX" -eq 0 ]; then + # Proxy Node (Load Balancer) + echo "🔀 This pod is PROXY (Load Balancer)" + + python3 -m sglang.srt.disaggregation.mini_lb \\ + --prefill {prefill_servers} \\ + --decode {decode_servers} \\ + --host 0.0.0.0 \\ + --port {master_port} + +elif [ "$POD_INDEX" -le "{xP}" ]; then + # Prefill Nodes + echo "⚡ This pod is PREFILL Node" + + python3 -m sglang.launch_server \\ + --model-path "$MODEL_PATH" \\ + --disaggregation-mode prefill \\ + --tp-size {nproc_per_node} \\ + --host $POD_IP \\ + --port 30000 \\ + --trust-remote-code \\ + --disaggregation-transfer-backend mooncake + +else + # Decode Nodes + echo "🔤 This pod is DECODE Node" + + python3 -m sglang.launch_server \\ + --model-path "$MODEL_PATH" \\ + --disaggregation-mode decode \\ + --tp-size {nproc_per_node} \\ + --host $POD_IP \\ + --port 30000 \\ + --trust-remote-code \\ + --disaggregation-transfer-backend mooncake +fi + +echo "SGLang Disaggregated setup complete" +""" + + def _generate_vllm_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate vLLM launcher command for K8s Indexed Jobs. + + vLLM is an inference engine with its own process management via Ray. + Unlike training frameworks, vLLM doesn't use torchrun. + + Architecture: + - Single-node: Tensor Parallelism (TP) across GPUs, no Ray needed + - Multi-node: Data Parallelism where each node runs independent vLLM replica + * Each replica uses TP across its local GPUs + * Ray coordinates resources on each node independently + * Benefits: Simpler, more robust, better for inference serving + + For K8s multi-node: + - Each pod runs its own independent vLLM instance + - Uses Ray for local GPU coordination + - NO shared Ray cluster across pods (Data Parallelism mode) + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port (for Ray). Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete vLLM launch setup with environment configuration + + Raises: + ValueError: If any parameter is invalid + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node, simple TP setup (no Ray needed) + if nnodes == 1: + return f"""# vLLM single-node setup (Tensor Parallelism) +export VLLM_TENSOR_PARALLEL_SIZE={nproc_per_node} +export VLLM_PIPELINE_PARALLEL_SIZE=1 +export VLLM_DISTRIBUTED_BACKEND="auto" +export NNODES=1 +export NPROC_PER_NODE={nproc_per_node} +export NODE_RANK=0 + +echo "vLLM Configuration (Single Node):" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Pipeline Parallel Size: 1" +echo " Distributed Backend: auto (no Ray)" +echo " Total GPUs: {nproc_per_node}" + +# vLLM handles process management - just run the script +{model_script}""" + + # Multi-node: Data Parallelism with independent Ray clusters per pod + return f"""# vLLM multi-node setup (K8s Data Parallelism Mode) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export NODE_RANK=${{JOB_COMPLETION_INDEX}} +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +# vLLM Data Parallelism configuration +# Each pod runs INDEPENDENT vLLM replica (no shared Ray cluster) +export VLLM_TENSOR_PARALLEL_SIZE={nproc_per_node} +export VLLM_PIPELINE_PARALLEL_SIZE=1 +export VLLM_DISTRIBUTED_BACKEND="ray" + +# Get current pod IP for Ray +POD_IP=$(hostname -i | awk '{{print $1}}') +export VLLM_HOST_IP="$POD_IP" + +echo "vLLM Configuration (Multi-Node Data Parallelism):" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " NODE_RANK: $NODE_RANK (Pod Index)" +echo " NNODES: $NNODES" +echo " Tensor Parallel Size: {nproc_per_node} (per pod)" +echo " Data Parallel Size: {nnodes} (independent replicas)" +echo " Pod IP: $POD_IP" +echo " Total GPUs: {nnodes * nproc_per_node}" +echo "" +echo "Mode: Each pod runs independent vLLM replica with local Ray" + +# Clean any existing Ray processes +ray stop --force 2>/dev/null || true +pkill -9 -f "ray::" 2>/dev/null || true +sleep 2 + +# Start independent Ray cluster on THIS pod only +echo "Starting Ray cluster on Pod $NODE_RANK..." +ray start --head --port=6379 --node-ip-address="$POD_IP" --num-gpus={nproc_per_node} +sleep 3 + +echo "Ray cluster ready:" +ray status + +# Run vLLM inference script +{model_script} + +# Cleanup Ray on exit +trap "ray stop --force 2>/dev/null || true" EXIT""" + + def _generate_sglang_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate SGLang launcher command for K8s Indexed Jobs. + + SGLang is an inference engine with native launcher (sglang.launch_server). + Similar to vLLM, it manages its own process spawning via Ray. + + Architecture: + - Single-node: Tensor Parallelism (TP) across GPUs + - Multi-node: Uses SGLang's native multi-node launcher with Ray + * TP across GPUs within each node + * Ray for distributed coordination + + For K8s: + - Uses headless service for node discovery (similar to torchrun) + - Each pod knows its rank via JOB_COMPLETION_INDEX + - SGLang native launcher handles Ray cluster setup + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port (for NCCL/Ray). Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete SGLang launch setup with environment configuration + + Raises: + ValueError: If any parameter is invalid + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node, simple TP setup + if nnodes == 1: + return f"""# SGLang single-node setup (Tensor Parallelism) +export SGLANG_TENSOR_PARALLEL_SIZE={nproc_per_node} +export SGLANG_PIPELINE_PARALLEL_SIZE=1 +export NNODES=1 +export NPROC_PER_NODE={nproc_per_node} +export NODE_RANK=0 + +echo "SGLang Configuration (Single Node):" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Total GPUs: {nproc_per_node}" + +# SGLang native launcher handles everything +{model_script}""" + + # Multi-node: Use SGLang's native multi-node support + return f"""# SGLang multi-node setup (K8s Indexed Job) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export NODE_RANK=${{JOB_COMPLETION_INDEX}} +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +# SGLang parallelism configuration +export SGLANG_TENSOR_PARALLEL_SIZE={nproc_per_node} +export SGLANG_PIPELINE_PARALLEL_SIZE=1 + +# Get current pod IP +POD_IP=$(hostname -i | awk '{{print $1}}') +export SGLANG_HOST_IP="$POD_IP" + +echo "SGLang Configuration (Multi-Node):" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " NODE_RANK: $NODE_RANK (Pod Index)" +echo " NNODES: $NNODES" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Pod IP: $POD_IP" +echo " Total GPUs: {nnodes * nproc_per_node}" + +# Clean any existing Ray processes +ray stop --force 2>/dev/null || true +pkill -9 -f "ray::" 2>/dev/null || true +sleep 2 + +# SGLang native launcher will handle Ray cluster coordination +# Pass NCCL init address for multi-node setup +export NCCL_INIT_ADDR="${{MASTER_ADDR}}:${{MASTER_PORT}}" + +echo "Starting SGLang with native multi-node launcher..." +{model_script} + +# Cleanup Ray on exit +trap "ray stop --force 2>/dev/null || true" EXIT""" + + def _generate_megatron_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate Megatron-LM launcher command for K8s Indexed Jobs. + + Megatron-LM is a training framework for large transformers with tensor and pipeline parallelism. + It uses torchrun as the underlying launcher but with Megatron-specific environment variables. + + Architecture: + - Single-node: Tensor Parallelism (TP) across GPUs + - Multi-node: Tensor + Pipeline Parallelism + * TP across GPUs within each node + * PP across nodes + + For K8s: + - Uses headless service for node discovery (like torchrun/deepspeed) + - Each pod knows its rank via JOB_COMPLETION_INDEX + - Sets TENSOR_MODEL_PARALLEL_SIZE and PIPELINE_MODEL_PARALLEL_SIZE (Megatron-Core standard) + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port (for NCCL). Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete Megatron-LM launch setup with environment configuration + + Raises: + ValueError: If any parameter is invalid + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node, use TP only + if nnodes == 1: + return f"""# Megatron-LM single-node setup (Tensor Parallelism) +export TENSOR_MODEL_PARALLEL_SIZE={min(nproc_per_node, 8)} +export PIPELINE_MODEL_PARALLEL_SIZE=1 +export CONTEXT_PARALLEL_SIZE=1 +export NNODES=1 +export NPROC_PER_NODE={nproc_per_node} +export MASTER_ADDR=localhost +export MASTER_PORT={master_port} +export NODE_RANK=0 + +echo "Megatron-LM Configuration (Single-Node):" +echo " Tensor Model Parallel Size: {min(nproc_per_node, 8)}" +echo " Pipeline Model Parallel Size: 1" +echo " Total GPUs: {nproc_per_node}" + +# Launch using torchrun with Megatron configuration +torchrun \\ + --standalone \\ + --nproc_per_node={nproc_per_node} \\ + {model_script}""" + + # Multi-node: TP + PP + else: + # Use headless service for node discovery (set by template) + return f"""# Megatron-LM multi-node setup (Tensor + Pipeline Parallelism) +export TENSOR_MODEL_PARALLEL_SIZE={nproc_per_node} +export PIPELINE_MODEL_PARALLEL_SIZE={nnodes} +export CONTEXT_PARALLEL_SIZE=1 +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} +export NODE_RANK=${{JOB_COMPLETION_INDEX}} +export MASTER_ADDR=${{MASTER_ADDR}} +export MASTER_PORT={master_port} + +echo "Megatron-LM Configuration (Multi-Node):" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " NODE_RANK: $NODE_RANK (Pod Index)" +echo " NNODES: $NNODES" +echo " Tensor Model Parallel Size: {nproc_per_node}" +echo " Pipeline Model Parallel Size: {nnodes}" +echo " Total GPUs: {nnodes * nproc_per_node}" + +# Wait for all pods to be ready (K8s Indexed Job coordination) +echo "Waiting for all {nnodes} pods to be ready..." +sleep 5 + +# Launch using torchrun with Megatron multi-node configuration +torchrun \\ + --nnodes={nnodes} \\ + --nproc_per_node={nproc_per_node} \\ + --node_rank=${{NODE_RANK}} \\ + --master_addr=${{MASTER_ADDR}} \\ + --master_port={master_port} \\ + {model_script}""" + + def _load_k8s_tools(self) -> Dict: + """ + Load K8s-specific tools configuration. + + Returns: + Dict with K8s tools configuration + """ + k8s_tools_file = Path(__file__).parent.parent / "scripts" / "k8s" / "tools.json" + + if k8s_tools_file.exists(): + try: + with open(k8s_tools_file, "r") as f: + return json.load(f) + except Exception as e: + self.console.print(f"[yellow]Warning: Failed to load K8s tools config: {e}[/yellow]") + return {} + else: + self.console.print(f"[yellow]Warning: K8s tools.json not found at {k8s_tools_file}[/yellow]") + return {} + + def _prepare_env_vars(self, model_info: Dict) -> Dict[str, str]: + """ + Prepare environment variables from multiple sources. + + Merges env vars from: + 1. Base additional_context + 2. Data provider + 3. Tools configuration + + Args: + model_info: Model configuration + + Returns: + Merged environment variables dict + """ + env_vars = {} + + # 1. Base environment variables from additional_context + base_env = self.config.additional_context.get("env_vars", {}) + env_vars.update(base_env) + + # 1b. Critical ROCm environment variable (if not already set) + # HSA_NO_SCRATCH_RECLAIM=1 required for AMD MI300X and newer GPUs + # Prevents performance degradation and NCCL errors + if "HSA_NO_SCRATCH_RECLAIM" not in env_vars: + env_vars["HSA_NO_SCRATCH_RECLAIM"] = "1" + + # 2. Data provider environment variables + data_config = self._prepare_data_config(model_info) + if data_config: + if "env_vars" in data_config: + # Exclude MAD_DATAHOME from data provider's env vars (we set it explicitly below for K8s) + data_provider_env = {k: v for k, v in data_config["env_vars"].items() if k != "MAD_DATAHOME"} + env_vars.update(data_provider_env) + # Always set MAD_DATAHOME for K8s (PVC mount point /data, not /data_dlm_0) + if "datahome" in data_config: + env_vars["MAD_DATAHOME"] = data_config["datahome"] + + # 3. Tools configuration environment variables + # Check both additional_context and manifest.context for tools + tools_config = self.config.additional_context.get("tools", []) + if not tools_config and "context" in self.manifest: + tools_config = self.manifest["context"].get("tools", []) + + for tool in tools_config: + if "env_vars" in tool: + # Skip OUTPUT_FILE as it's set inline in command chain to avoid conflicts + tool_env_vars = {k: v for k, v in tool["env_vars"].items() if k != "OUTPUT_FILE"} + env_vars.update(tool_env_vars) + + return env_vars + + def _prepare_data_config(self, model_info: Dict) -> Optional[Dict]: + """ + Prepare data provider configuration for K8s pod. + + Args: + model_info: Model configuration + + Returns: + Data configuration dict or None + """ + if "data" not in model_info or not model_info["data"]: + return None + + # Initialize data provider if needed + if not self.data: + try: + # Create minimal context for data provider + # We only need the data.json file to be present + import os + data_json_file = "data.json" + if os.path.exists(data_json_file): + # Import Context and create minimal instance + # Data provider needs this to function + self.context_for_data = type('obj', (object,), { + 'ctx': {}, + 'sh': lambda cmd: os.popen(cmd).read().strip() + })() + self.data = Data( + self.context_for_data, + filename=data_json_file, + force_mirrorlocal=False + ) + else: + self.console.print("[yellow]Warning: data.json not found, data provider unavailable[/yellow]") + return None + except Exception as e: + self.console.print(f"[yellow]Warning: Could not initialize data provider: {e}[/yellow]") + return None + + try: + # Get data environment variables + data_env = self.data.get_env(model_info["data"]) + + # Find data provider for this data + dp = self.data.find_dataprovider(model_info["data"]) + if not dp: + self.console.print(f"[yellow]Warning: Data provider not found for {model_info['data']}[/yellow]") + return None + + # Get provider type and source path + provider_type = dp.provider_type if hasattr(dp, 'provider_type') else "local" + source_url = dp.config.get("path", "") if hasattr(dp, 'config') else "" + + # K8s best practice: Always use /data (PVC mount point) + # PVC provides persistent, shared storage across all pods/nodes + # Separation of storage (PVC) from compute (pods) is K8s standard + # FORCE datahome to /data for K8s (override data provider's default /data_dlm_0) + + # Filter out MAD_DATAHOME from data provider env vars (will be set explicitly below) + filtered_data_env = {k: v for k, v in (data_env or {}).items() if k != "MAD_DATAHOME"} + # Add MAD_DATAHOME with correct K8s value + filtered_data_env["MAD_DATAHOME"] = "/data" + + return { + "data_name": model_info["data"], + "env_vars": filtered_data_env, + "provider_type": provider_type, + "source_url": source_url, + "datahome": "/data", # Always use PVC mount point for K8s + } + except Exception as e: + self.console.print(f"[yellow]Warning: Could not prepare data config: {e}[/yellow]") + return None + + def _save_debug_manifests(self): + """Save rendered manifests to disk for debugging.""" + output_dir = Path(self.k8s_config.get("output_dir", "./k8s_manifests")) + output_dir.mkdir(parents=True, exist_ok=True) + + # Save ConfigMap + (output_dir / "configmap.yaml").write_text(self.configmap_yaml) + + # Save Job + (output_dir / "job.yaml").write_text(self.job_yaml) + + # Save Service if exists + if self.service_yaml: + (output_dir / "service.yaml").write_text(self.service_yaml) + + self.console.print( + f"[yellow]Debug: Manifests saved to {output_dir}[/yellow]" + ) + + def _create_results_pvc(self) -> str: + """ + Create a PersistentVolumeClaim for results storage. + + Returns: + Name of the created PVC + """ + pvc_name = f"{self.job_name}-results" + + # Render PVC template + template_dir = Path(__file__).parent / "templates" / "kubernetes" + pvc_template = template_dir / "pvc.yaml.j2" + + with open(pvc_template, "r") as f: + pvc_template_str = f.read() + + template = Template(pvc_template_str) + pvc_yaml = template.render( + pvc_name=pvc_name, + namespace=self.namespace, + storage_size=self.k8s_config.get("results_storage_size", "10Gi"), + storage_class=self.k8s_config.get("storage_class") + ) + + # Create PVC + pvc_dict = yaml.safe_load(pvc_yaml) + self.core_v1.create_namespaced_persistent_volume_claim( + namespace=self.namespace, body=pvc_dict + ) + + return pvc_name + + def _create_or_get_data_pvc(self, nnodes: int = 1) -> str: + """ + Create or reuse a shared PersistentVolumeClaim for data storage. + + K8s best practice: Use shared PVC for data (separate from compute pods). + This PVC is reusable across multiple training runs. + + Args: + nnodes: Number of nodes (determines access mode requirements) + + Returns: + Name of the PVC (existing or newly created) + """ + # Use a consistent name for reusability (not job-specific) + pvc_name = "madengine-shared-data" + + # Check if PVC already exists (idempotent) + try: + existing_pvc = self.core_v1.read_namespaced_persistent_volume_claim( + name=pvc_name, + namespace=self.namespace + ) + self.console.print(f"[dim]✓ Using existing data PVC: {pvc_name}[/dim]") + + # Verify access mode for multi-node + if nnodes > 1: + access_modes = existing_pvc.spec.access_modes + if "ReadWriteMany" not in access_modes: + self.console.print( + f"[yellow]⚠️ Warning: PVC {pvc_name} doesn't support ReadWriteMany[/yellow]" + ) + self.console.print( + f"[yellow] Multi-node deployment may fail. Current modes: {access_modes}[/yellow]" + ) + + return pvc_name + + except ApiException as e: + if e.status != 404: + raise # Unexpected error + + # PVC doesn't exist, create it + # Determine access mode based on deployment topology + # RWO (ReadWriteOnce): Single-node - works with most storage classes (local-path, EBS, etc.) + # RWX (ReadWriteMany): Multi-node - requires shared storage (NFS, CephFS, etc.) + access_mode = "ReadWriteMany" if nnodes > 1 else "ReadWriteOnce" + + self.console.print(f"[blue]Creating shared data PVC: {pvc_name}...[/blue]") + self.console.print(f"[dim] Access mode: {access_mode} ({'multi-node' if nnodes > 1 else 'single-node'})[/dim]") + + # Render data PVC template + template_dir = Path(__file__).parent / "templates" / "kubernetes" + pvc_template = template_dir / "pvc-data.yaml.j2" + + with open(pvc_template, "r") as f: + pvc_template_str = f.read() + + template = Template(pvc_template_str) + pvc_yaml = template.render( + pvc_name=pvc_name, + namespace=self.namespace, + access_mode=access_mode, + storage_size=self.k8s_config.get("data_storage_size", "100Gi"), + storage_class=self.k8s_config.get("storage_class") + ) + + # Create PVC + pvc_dict = yaml.safe_load(pvc_yaml) + self.core_v1.create_namespaced_persistent_volume_claim( + namespace=self.namespace, body=pvc_dict + ) + + # Wait for PVC to be bound (important!) + self.console.print(f"[dim]Waiting for PVC to be bound...[/dim]") + for _ in range(30): # Wait up to 30 seconds + try: + pvc = self.core_v1.read_namespaced_persistent_volume_claim( + name=pvc_name, namespace=self.namespace + ) + if pvc.status.phase == "Bound": + self.console.print(f"[green]✓ PVC bound successfully[/green]") + break + except ApiException: + pass + time.sleep(1) + else: + self.console.print( + f"[yellow]⚠️ Warning: PVC created but not bound yet. " + f"Check: kubectl describe pvc {pvc_name}[/yellow]" + ) + + return pvc_name + + def _cleanup_existing_resources(self): + """Delete existing Job, ConfigMap, and Service if they exist.""" + # Delete existing Job + try: + self.batch_v1.delete_namespaced_job( + name=self.job_name, + namespace=self.namespace, + propagation_policy="Background" + ) + self.console.print(f"[dim]Deleted existing Job: {self.job_name}[/dim]") + except ApiException as e: + if e.status != 404: # Ignore not found + pass + + # Delete existing ConfigMap + try: + self.core_v1.delete_namespaced_config_map( + name=self.configmap_name, + namespace=self.namespace + ) + self.console.print(f"[dim]Deleted existing ConfigMap: {self.configmap_name}[/dim]") + except ApiException as e: + if e.status != 404: + pass + + # Delete existing Service + if hasattr(self, 'service_yaml') and self.service_yaml: + try: + self.core_v1.delete_namespaced_service( + name=self.job_name, + namespace=self.namespace + ) + self.console.print(f"[dim]Deleted existing Service: {self.job_name}[/dim]") + except ApiException as e: + if e.status != 404: + pass + + # Delete existing collector pod (must be done before PVC to allow PVC deletion) + collector_pod_name = f"collector-{self.job_name}" + try: + self.core_v1.delete_namespaced_pod( + name=collector_pod_name, + namespace=self.namespace, + grace_period_seconds=0 + ) + self.console.print(f"[dim]Deleted existing collector pod: {collector_pod_name}[/dim]") + # Wait a moment for pod to release the PVC + import time + time.sleep(2) + except ApiException as e: + if e.status != 404: + pass + + # Delete existing PVC + pvc_name = f"{self.job_name}-results" + try: + self.core_v1.delete_namespaced_persistent_volume_claim( + name=pvc_name, + namespace=self.namespace + ) + self.console.print(f"[dim]Deleted existing PVC: {pvc_name}[/dim]") + + # Wait for PVC to be fully deleted (not just marked for deletion) + import time + max_wait = 30 # Maximum 30 seconds + wait_interval = 1 # Check every 1 second + for i in range(max_wait): + try: + self.core_v1.read_namespaced_persistent_volume_claim( + name=pvc_name, + namespace=self.namespace + ) + time.sleep(wait_interval) + except ApiException as e: + if e.status == 404: + # PVC is fully deleted + break + except ApiException as e: + if e.status != 404: + pass + + # Wait a moment for other resources to be deleted + import time + time.sleep(1) + + def deploy(self) -> DeploymentResult: + """Apply rendered manifests using kubernetes Python client.""" + try: + # Clean up any existing resources first + self._cleanup_existing_resources() + + # 1. Create PVC for results storage + self.console.print("[blue]Creating PVC for results storage...[/blue]") + pvc_name = self._create_results_pvc() + self.console.print(f"[green]✓ Created PVC: {pvc_name}[/green]") + + # 1b. Create or reuse data PVC if data provider is configured and auto-creation was flagged + if hasattr(self, '_data_config') and self._data_config: + # Check if we set the PVC name during prepare (auto-creation case) + data_pvc_name = self.k8s_config.get("data_pvc") + if data_pvc_name == "madengine-shared-data": + # Auto-creation mode: create/reuse the PVC + nnodes = getattr(self, '_nnodes', 1) + self._create_or_get_data_pvc(nnodes=nnodes) + + # 2. Create ConfigMap + self.console.print("[blue]Creating ConfigMap...[/blue]") + configmap_dict = yaml.safe_load(self.configmap_yaml) + self.core_v1.create_namespaced_config_map( + namespace=self.namespace, body=configmap_dict + ) + self.console.print( + f"[green]✓ Created ConfigMap: {self.configmap_name}[/green]" + ) + + # 3. Create Service (if needed for multi-node) + if self.service_yaml: + self.console.print("[blue]Creating headless Service...[/blue]") + service_dict = yaml.safe_load(self.service_yaml) + self.core_v1.create_namespaced_service( + namespace=self.namespace, body=service_dict + ) + self.console.print(f"[green]✓ Created Service: {self.job_name}[/green]") + + # 4. Create Job + self.console.print("[blue]Creating Job...[/blue]") + job_dict = yaml.safe_load(self.job_yaml) + job = self.batch_v1.create_namespaced_job( + namespace=self.namespace, body=job_dict + ) + + # Extract image for display + image = job_dict["spec"]["template"]["spec"]["containers"][0]["image"] + + self.console.print(f"[green]✓ Submitted K8s Job: {self.job_name}[/green]") + self.console.print(f" Namespace: {self.namespace}") + self.console.print(f" Image: {image}") + + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=self.job_name, + message=f"Job {self.job_name} created successfully", + ) + + except ApiException as e: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message=f"K8s API error: {e.reason} - {e.body}", + ) + except Exception as e: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message=f"Deployment error: {str(e)}", + ) + + def monitor(self, deployment_id: str) -> DeploymentResult: + """ + Monitor Job status using Python API. + + If live_output is enabled, streams pod logs in real-time. + Otherwise, polls status periodically. + """ + # Check if live output is requested + live_output = self.config.additional_context.get("live_output", False) + + if live_output: + return self._monitor_with_live_logs(deployment_id) + else: + return self._monitor_status_only(deployment_id) + + def _monitor_status_only(self, deployment_id: str) -> DeploymentResult: + """Monitor Job status without streaming logs.""" + try: + job = self.batch_v1.read_namespaced_job_status( + name=deployment_id, namespace=self.namespace + ) + + # Check job conditions + if job.status.succeeded: + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=deployment_id, + message=f"Job {deployment_id} completed successfully", + ) + + if job.status.failed: + # Get pod logs to show error + self._print_pod_logs_on_failure(deployment_id) + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=deployment_id, + message=f"Job {deployment_id} failed", + ) + + if job.status.active: + return DeploymentResult( + status=DeploymentStatus.RUNNING, + deployment_id=deployment_id, + message=f"Job {deployment_id} running ({job.status.active} active pods)", + ) + + return DeploymentResult( + status=DeploymentStatus.PENDING, + deployment_id=deployment_id, + message=f"Job {deployment_id} pending", + ) + + except ApiException as e: + if e.status == 404: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=deployment_id, + message=f"Job {deployment_id} not found", + ) + raise + + def _monitor_with_live_logs(self, deployment_id: str) -> DeploymentResult: + """Monitor Job and stream logs in real-time.""" + import time + + self.console.print(f"\n[cyan]═══ Streaming pod logs (--live-output) ═══[/cyan]\n") + + pod_name = None + log_position = 0 + + while True: + try: + # Check job status + job = self.batch_v1.read_namespaced_job_status( + name=deployment_id, namespace=self.namespace + ) + + # Get pod if we don't have it yet + if not pod_name: + pods = self.core_v1.list_namespaced_pod( + namespace=self.namespace, + label_selector=f"job-name={deployment_id}" + ) + if pods.items: + pod_name = pods.items[0].metadata.name + self.console.print(f"[dim]Following logs from pod: {pod_name}[/dim]\n") + + # Stream logs if we have a pod + if pod_name: + try: + # Get logs from current position + logs = self.core_v1.read_namespaced_pod_log( + name=pod_name, + namespace=self.namespace, + tail_lines=100 if log_position == 0 else None + ) + + # Print new log lines and trigger artifact collection + if logs: + log_lines = logs.split('\n') + if len(log_lines) > log_position: + for line in log_lines[log_position:]: + if line.strip(): + print(line) + log_position = len(log_lines) + + except ApiException as e: + if e.status != 400: # Ignore "container not ready" errors + pass + + # Check if job completed + if job.status.succeeded: + self.console.print(f"\n[green]✓ Job {deployment_id} completed successfully[/green]\n") + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=deployment_id, + message=f"Job {deployment_id} completed successfully", + ) + + if job.status.failed: + self.console.print(f"\n[red]✗ Job {deployment_id} failed[/red]\n") + # Print final logs + if pod_name: + self._print_pod_logs_on_failure(deployment_id) + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=deployment_id, + message=f"Job {deployment_id} failed", + ) + + time.sleep(2) # Poll every 2 seconds + + except ApiException as e: + if e.status == 404: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=deployment_id, + message=f"Job {deployment_id} not found", + ) + raise + + def _print_pod_logs_on_failure(self, deployment_id: str): + """Print pod logs when job fails (for debugging).""" + try: + self.console.print(f"\n[yellow]═══ Pod logs (last 50 lines) ═══[/yellow]\n") + + pods = self.core_v1.list_namespaced_pod( + namespace=self.namespace, + label_selector=f"job-name={deployment_id}" + ) + + for pod in pods.items: + pod_name = pod.metadata.name + try: + logs = self.core_v1.read_namespaced_pod_log( + name=pod_name, + namespace=self.namespace, + tail_lines=50 + ) + self.console.print(f"[dim]Pod: {pod_name}[/dim]") + print(logs) + print() + except ApiException: + pass + except Exception: + pass + + def collect_results(self, deployment_id: str) -> Dict[str, Any]: + """ + Enhanced results collection from K8s pods following vLLM multi-node best practices. + + For Data Parallel deployments (vLLM, SGLang): + - Each pod runs an independent replica + - Only pod-0 reports metrics to avoid duplicates + - Total throughput = pod-0 throughput × num_replicas + + Collects: + 1. Pod logs + 2. File artifacts via kubectl cp (profiling, tracing, env details) + 3. Results from shared PVC (if configured) + + Returns: + Dict with logs, artifacts, and performance results + """ + results = { + "job_name": deployment_id, + "namespace": self.namespace, + "logs": [], + "artifacts": [], + "successful_runs": [], + "failed_runs": [], + } + + # Create results directory for this deployment + results_dir = Path(f"./k8s_results/{deployment_id}") + results_dir.mkdir(parents=True, exist_ok=True) + + self.console.print(f"[cyan]📦 Collecting results from K8s job: {deployment_id}[/cyan]") + + try: + # Get pods for this job + pods = self.core_v1.list_namespaced_pod( + namespace=self.namespace, label_selector=f"job-name={deployment_id}" + ) + + # Get model info and build info from manifest + model_keys = list(self.manifest["built_models"].keys()) + if model_keys: + model_key = model_keys[0] + model_info = self.manifest["built_models"][model_key] + else: + model_info = {} + + # Get build info from built_images + image_keys = list(self.manifest.get("built_images", {}).keys()) + if image_keys: + image_key = image_keys[0] + build_info = self.manifest["built_images"][image_key] + else: + build_info = {} + + # Check if this is a multi-node distributed job + deployment_config = self.manifest.get("deployment_config", {}) + distributed_config = deployment_config.get("distributed", {}) + is_distributed = distributed_config.get("enabled", False) + nnodes = distributed_config.get("nnodes", 1) + is_multinode = is_distributed and nnodes > 1 + + # Determine launcher_type the same way as _prepare_template_context does + # (deployment_config doesn't store launcher_type directly) + launcher_config = self.config.additional_context.get("launcher", {}) + launcher_type = ( + launcher_config.get("type") + if launcher_config.get("type") is not None + else distributed_config.get("launcher") + ) + + # Normalize launcher based on deployment type and validity + launcher_type = normalize_launcher(launcher_type, "kubernetes") + + is_ray_launcher = launcher_type in ["vllm", "sglang"] + + # Sort pods by name to ensure consistent ordering (pod-0 is master) + sorted_pods = sorted(pods.items, key=lambda p: p.metadata.name) + + # ======================================================================== + # NEW: Per-Node Collection Strategy + # Collect logs and artifacts from ALL nodes + # Parse performance from ALL nodes (each reports node-local metrics) + # Aggregate metrics based on type (sum for throughput, etc.) + # ======================================================================== + + per_node_metrics = [] # Store performance from each node + results["nodes"] = [] # Store per-node details for display + + # Special handling for Ray-based launchers (vLLM, SGLang) + # These report per-replica metrics, need scaling + if is_multinode and is_ray_launcher: + self.console.print( + f"[cyan]Multi-node Ray deployment: {nnodes} nodes (Data Parallel mode)[/cyan]" + ) + + # Collect from ALL pods + for pod_index, pod in enumerate(sorted_pods): + pod_name = pod.metadata.name + pod_dir = results_dir / pod_name + pod_dir.mkdir(exist_ok=True) + + # Extract node rank from pod name (e.g., madengine-dummy-torchrun-0 -> 0) + try: + node_rank = int(pod_name.rsplit('-', 1)[-1]) + except (ValueError, IndexError): + node_rank = pod_index + + self.console.print(f"[dim] Collecting from pod: {pod_name} (node-{node_rank})[/dim]") + + try: + # 1. Collect pod logs + log = self.core_v1.read_namespaced_pod_log( + name=pod_name, namespace=self.namespace + ) + log_file = pod_dir / f"{pod_name}.log" + log_file.write_text(log) + results["logs"].append({ + "pod": pod_name, + "log": log, + "file": str(log_file) + }) + + # 2. Parse NODE-LOCAL performance from log + perf_data = self._parse_node_performance(log, model_info, build_info) + + # Get pod exit status + pod_status = pod.status.phase + pod_exit_code = 0 + if pod.status.container_statuses: + container_status = pod.status.container_statuses[0] + if container_status.state.terminated: + pod_exit_code = container_status.state.terminated.exit_code or 0 + + # Store per-node info for display table + node_info = { + "node_id": node_rank, + "pod_name": pod_name, + "status": "SUCCESS" if pod_status == "Succeeded" and pod_exit_code == 0 else "FAILED", + "exit_code": pod_exit_code, + "performance": perf_data.get("performance") if perf_data else None, + "metric": perf_data.get("metric") if perf_data else None, + "duration": perf_data.get("duration") if perf_data else None, + "log_file": str(log_file) + } + results["nodes"].append(node_info) + + if perf_data: + # For Ray launchers, this is per-replica metric + if is_multinode and is_ray_launcher: + perf_data["is_per_replica"] = True + per_node_metrics.append(perf_data) + self.console.print( + f"[green] ✓ Parsed performance: {perf_data['performance']:.2f} " + f"{perf_data['metric']} (node-{node_rank})[/green]" + ) + else: + self.console.print( + f"[dim] No performance metric found in node-{node_rank} log[/dim]" + ) + + except ApiException as e: + self.console.print( + f"[red]✗ Failed to get logs for pod {pod_name}: {e.reason}[/red]" + ) + results["nodes"].append({ + "node_id": node_rank, + "pod_name": pod_name, + "status": "FAILED", + "exit_code": -1, + "performance": None, + "metric": None, + "error": f"Failed to get logs: {e.reason}" + }) + except Exception as e: + self.console.print( + f"[red]✗ Error collecting from pod {pod_name}: {e}[/red]" + ) + results["nodes"].append({ + "node_id": node_rank, + "pod_name": pod_name, + "status": "FAILED", + "exit_code": -1, + "performance": None, + "metric": None, + "error": str(e) + }) + + self.console.print( + f"[green]✓ Collected logs from {len(results['logs'])} pods[/green]" + ) + + # ======================================================================== + # Aggregate per-node metrics + # ======================================================================== + if per_node_metrics: + # Special handling for Ray launchers - multiply by nnodes + if is_multinode and is_ray_launcher: + original_perf = per_node_metrics[0]["performance"] + aggregated_perf = original_perf * nnodes + self.console.print( + f"[green] Per-replica: {original_perf:.1f} req/s[/green]" + ) + self.console.print( + f"[green] Total capacity: {aggregated_perf:.1f} req/s ({nnodes} nodes)[/green]" + ) + + # Create aggregated record manually for Ray + aggregated_record = { + "model": per_node_metrics[0]["model"], + "performance": aggregated_perf, + "metric": per_node_metrics[0]["metric"], + "status": "SUCCESS", + "topology": f"{nnodes}N×{per_node_metrics[0].get('local_gpus', 1)}G", + "nnodes": nnodes, + "launcher": launcher_type or "N/A", + "deployment_type": "kubernetes", + "gpu_architecture": per_node_metrics[0].get("gpu_architecture", "N/A"), + "duration": per_node_metrics[0].get("duration", "N/A"), + "data_name": per_node_metrics[0].get("data_name", "N/A"), + "data_provider": per_node_metrics[0].get("data_provider", "N/A"), + "aggregation_method": "scaled_by_nnodes", + "nodes_contributing": nnodes + } + else: + # Use new aggregation logic for other launchers + aggregated_record = self._aggregate_node_metrics( + per_node_metrics, + nnodes, + launcher_type + ) + + if aggregated_record: + # Write ONE aggregated row to perf.csv (CRITICAL for database) + self._write_to_perf_csv(aggregated_record) + + results["successful_runs"].append({ + "model": model_info.get("name"), + "perf_data": aggregated_record, + "nodes": results["nodes"], # Include per-node details + "per_node_metrics": per_node_metrics # For detailed analysis + }) + + self.console.print( + f"[green]✓ Aggregated performance from {len(per_node_metrics)} nodes[/green]" + ) + self.console.print( + f"[green]✓ Updated local perf.csv[/green]" + ) + else: + # All nodes failed or no performance found + error_msg = "No performance metrics found from any node" + failure_record = self._create_failure_record( + model_info, build_info, deployment_id, error_msg + ) + self._write_to_perf_csv(failure_record) + results["failed_runs"].append({ + "model": model_info.get("name", "Unknown"), + "error": error_msg, + "nodes": results["nodes"] + }) + self.console.print( + f"[yellow]⚠ No performance metrics found, recorded as FAILED[/yellow]" + ) + + # 4. Collect all artifacts from PVC + self._collect_from_pvc(deployment_id, results_dir, results) + + # 5. Generate summary + self._generate_results_summary(results, results_dir) + + except Exception as e: + self.console.print(f"[yellow]⚠ Results collection incomplete: {e}[/yellow]") + + return results + + def _collect_artifacts_immediately(self, deployment_id: str, pod_name: str) -> None: + """ + Collect artifacts immediately from a running pod during the sleep period. + This is called when we detect the "Keeping pod alive" message in logs. + """ + try: + # Create results directory + results_dir = Path("k8s_results") / deployment_id + results_dir.mkdir(parents=True, exist_ok=True) + + pod_dir = results_dir / pod_name + pod_dir.mkdir(exist_ok=True) + + # Collect artifacts + artifacts = self._collect_pod_artifacts(pod_name, pod_dir) + + if artifacts: + self.console.print(f"[green]✓ Collected {len(artifacts)} artifacts from {pod_name}[/green]") + else: + self.console.print(f"[yellow]⚠ No artifacts collected from {pod_name}[/yellow]") + + except Exception as e: + self.console.print(f"[yellow]⚠ Error collecting artifacts: {e}[/yellow]") + + def _collect_pod_artifacts(self, pod_name: str, dest_dir: Path) -> List[Dict]: + """ + Collect file artifacts from pod using kubectl cp. + + Collects: + - perf.csv (performance results) + - *_env.csv (environment details from rocEnvTool) + - profiling outputs (rocprof*, results*, *.db) + - tracing outputs (*_output/ directories) + - tool-specific outputs + + Args: + pod_name: Name of the Kubernetes pod + dest_dir: Local directory to save artifacts + + Returns: + List of collected artifact metadata + """ + artifacts = [] + + # Define artifact patterns to collect + artifact_patterns = [ + {"pattern": "perf.csv", "type": "performance"}, + {"pattern": "*_env.csv", "type": "environment"}, + {"pattern": "results*", "type": "profiling"}, + {"pattern": "*.db", "type": "profiling"}, + {"pattern": "trace.*", "type": "tracing"}, + {"pattern": "prof.csv", "type": "profiling"}, # Raw profiler output before post-script renames it + {"pattern": "gpu_info_*.csv", "type": "profiling"}, + {"pattern": "library_trace.csv", "type": "tracing"}, + ] + + for artifact_def in artifact_patterns: + pattern = artifact_def["pattern"] + artifact_type = artifact_def["type"] + + try: + # Try direct kubectl cp without exec (works during the sleep period) + # For patterns with wildcards, try common specific filenames + if '*' in pattern: + # Expand pattern to specific known files + if pattern == "*_env.csv": + specific_files = ["dummy_prof_env.csv", "dummy_data_minio_env.csv"] + elif pattern == "gpu_info_*.csv": + specific_files = ["gpu_info_power_profiler_output.csv", "gpu_info_vram_profiler_output.csv"] + elif pattern == "results*": + specific_files = ["results.csv", "results.txt", "results.json"] + elif pattern == "trace.*": + specific_files = ["trace.txt", "trace.csv", "trace.json"] + else: + specific_files = [] + + for filename in specific_files: + local_path = dest_dir / filename + cp_cmd = [ + "kubectl", "cp", + f"{self.namespace}/{pod_name}:/workspace/{filename}", + str(local_path) + ] + + cp_result = subprocess.run( + cp_cmd, capture_output=True, text=True, timeout=30 + ) + + if cp_result.returncode == 0 and local_path.exists(): + artifacts.append({ + "pod": pod_name, + "type": artifact_type, + "source": f"/workspace/{filename}", + "local_path": str(local_path), + "size": local_path.stat().st_size + }) + self.console.print( + f"[dim] ✓ Collected {artifact_type}: {filename}[/dim]" + ) + elif cp_result.stderr and "No such file" not in cp_result.stderr: + # Log unexpected errors (but not "file not found") + self.console.print( + f"[yellow] ⚠ Failed to collect {filename}: {cp_result.stderr.strip()}[/yellow]" + ) + else: + # Direct file - try to copy it + local_path = dest_dir / pattern + cp_cmd = [ + "kubectl", "cp", + f"{self.namespace}/{pod_name}:/workspace/{pattern}", + str(local_path) + ] + + cp_result = subprocess.run( + cp_cmd, capture_output=True, text=True, timeout=30 + ) + + if cp_result.returncode == 0 and local_path.exists(): + artifacts.append({ + "pod": pod_name, + "type": artifact_type, + "source": f"/workspace/{pattern}", + "local_path": str(local_path), + "size": local_path.stat().st_size + }) + self.console.print( + f"[dim] ✓ Collected {artifact_type}: {pattern}[/dim]" + ) + elif cp_result.stderr and "No such file" not in cp_result.stderr: + # Log unexpected errors (but not "file not found") + self.console.print( + f"[yellow] ⚠ Failed to collect {pattern}: {cp_result.stderr.strip()}[/yellow]" + ) + + except subprocess.TimeoutExpired: + pass # Timeout - skip this file + except Exception: + pass # File not found or not accessible - this is expected + + # Try to collect known output directories using kubectl cp directly (during sleep period) + output_directories = ["rocprof_output", "rpd_output", "trace_output"] + for dir_name in output_directories: + try: + local_dir = dest_dir / dir_name + cp_cmd = [ + "kubectl", "cp", + f"{self.namespace}/{pod_name}:/workspace/{dir_name}", + str(local_dir) + ] + + cp_result = subprocess.run( + cp_cmd, capture_output=True, text=True, timeout=60 + ) + + if cp_result.returncode == 0 and local_dir.exists(): + # Count files in directory + file_count = sum(1 for _ in local_dir.rglob('*') if _.is_file()) + if file_count > 0: + total_size = sum(f.stat().st_size for f in local_dir.rglob('*') if f.is_file()) + artifacts.append({ + "pod": pod_name, + "type": "tool_output_directory", + "source": f"/workspace/{dir_name}", + "local_path": str(local_dir), + "file_count": file_count, + "size": total_size + }) + self.console.print( + f"[dim] ✓ Collected directory: {dir_name} ({file_count} files, {total_size} bytes)[/dim]" + ) + except Exception: + pass # Directory not found - this is expected + + return artifacts + + def _collect_from_pvc(self, deployment_id: str, results_dir: Path, results: Dict): + """ + Collect all artifacts from the PVC using a temporary busybox pod. + + This is the best practice for collecting results from completed K8s jobs. + kubectl cp doesn't work on completed pods, so we use a helper pod. + + Args: + deployment_id: Job deployment ID + results_dir: Local directory to save results + results: Results dict to update + """ + pvc_name = f"{deployment_id}-results" + + try: + # Create a temporary pod to access PVC + collector_pod_name = f"collector-{deployment_id[:15]}" + + self.console.print(f"[dim]📦 Collecting artifacts from PVC: {pvc_name}[/dim]") + + collector_pod_spec = { + "apiVersion": "v1", + "kind": "Pod", + "metadata": {"name": collector_pod_name, "namespace": self.namespace}, + "spec": { + "restartPolicy": "Never", + "imagePullSecrets": [{"name": "dockerhub-creds"}], + "containers": [{ + "name": "collector", + "image": "busybox:latest", + "command": ["sh", "-c", "sleep 600"], + "volumeMounts": [{"name": "results", "mountPath": "/results"}] + }], + "volumes": [{"name": "results", "persistentVolumeClaim": {"claimName": pvc_name}}] + } + } + + # Delete existing collector pod if it exists (prevents 409 Conflict) + try: + self.core_v1.delete_namespaced_pod( + collector_pod_name, self.namespace, grace_period_seconds=0 + ) + time.sleep(2) # Wait for pod to be deleted + except ApiException as e: + if e.status != 404: # 404 means pod doesn't exist, which is fine + pass + + # Create collector pod + self.core_v1.create_namespaced_pod(self.namespace, collector_pod_spec) + + # Wait for pod to be ready + for _ in range(30): # Wait up to 30 seconds + try: + pod_status = self.core_v1.read_namespaced_pod_status( + collector_pod_name, self.namespace + ) + if pod_status.status.phase == "Running": + break + except ApiException as e: + # Pod not found yet or not ready - this is expected during startup + if e.status != 404: + self.console.print(f"[dim]Waiting for collector pod (status: {e.status})...[/dim]") + time.sleep(1) + else: + raise Exception("Collector pod did not start in time") + + # List pod result directories in PVC + list_cmd = [ + "kubectl", "exec", collector_pod_name, "-n", self.namespace, "--", + "ls", "-1", "/results/" + ] + list_result = subprocess.run(list_cmd, capture_output=True, text=True, timeout=10) + + if list_result.returncode == 0 and list_result.stdout.strip(): + pod_dirs = list_result.stdout.strip().split('\n') + + for pod_dir_name in pod_dirs: + if not pod_dir_name: + continue + + # Copy entire pod directory + local_pod_dir = results_dir / pod_dir_name + local_pod_dir.mkdir(exist_ok=True) + + cp_cmd = [ + "kubectl", "cp", + f"{self.namespace}/{collector_pod_name}:/results/{pod_dir_name}", + str(local_pod_dir) + ] + + cp_result = subprocess.run(cp_cmd, capture_output=True, text=True, timeout=60) + + if cp_result.returncode == 0: + # Count collected files + file_count = sum(1 for _ in local_pod_dir.rglob('*') if _.is_file()) + if file_count > 0: + results["artifacts"].append({ + "source": f"PVC:{pvc_name}/{pod_dir_name}", + "local_path": str(local_pod_dir), + "file_count": file_count, + "type": "pvc_collection" + }) + self.console.print(f"[dim] ✓ Collected {file_count} files from {pod_dir_name}[/dim]") + + self.console.print(f"[green]✓ Collected artifacts from PVC[/green]") + else: + self.console.print(f"[yellow]⚠ No results found in PVC[/yellow]") + + # Cleanup collector pod + self.core_v1.delete_namespaced_pod( + collector_pod_name, self.namespace, grace_period_seconds=0 + ) + + except Exception as e: + self.console.print(f"[yellow]⚠ Could not collect from PVC: {e}[/yellow]") + + def _generate_results_summary(self, results: Dict, results_dir: Path): + """ + Generate a summary JSON of all collected artifacts. + + Args: + results: Results dict with logs and artifacts + results_dir: Directory where results are saved + """ + summary = { + "job_name": results["job_name"], + "namespace": results["namespace"], + "collected_at": datetime.now().isoformat(), + "pods": len(results["logs"]), + "total_artifacts": len(results["artifacts"]), + "artifacts_by_type": {}, + "artifacts": results["artifacts"], + "successful_runs": len(results["successful_runs"]), + "failed_runs": len(results["failed_runs"]), + } + + # Group artifacts by type + for artifact in results["artifacts"]: + artifact_type = artifact.get("type", "unknown") + summary["artifacts_by_type"][artifact_type] = summary["artifacts_by_type"].get(artifact_type, 0) + 1 + + summary_file = results_dir / "results_summary.json" + summary_file.write_text(json.dumps(summary, indent=2)) + + self.console.print(f"[green]✓ Results summary: {summary_file}[/green]") + + # Print summary table if artifacts were collected + if summary["artifacts_by_type"]: + from rich.table import Table + table = Table(title="Collected Artifacts") + table.add_column("Type", style="cyan") + table.add_column("Count", justify="right", style="green") + + for artifact_type, count in sorted(summary["artifacts_by_type"].items()): + table.add_row(artifact_type, str(count)) + + self.console.print(table) + + def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: str, error_msg: str) -> Dict: + """ + Create a failure record for perf.csv when performance metrics are missing. + + Args: + model_info: Model information from manifest + build_info: Build information from manifest + pod_name: Kubernetes pod name + error_msg: Error message describing the failure + + Returns: + Dict with all perf.csv fields marked as FAILED + """ + import os + + # Get topology information for failure record + deployment_config = self.manifest.get("deployment_config", {}) + distributed_config = deployment_config.get("distributed", {}) + nnodes = distributed_config.get("nnodes", 1) + nproc_per_node = distributed_config.get("nproc_per_node") + if nproc_per_node is None: + nproc_per_node = int(model_info.get("n_gpus", 1)) + + # Create a record with the same structure as successful runs + # but with performance=0, metric="", and status="FAILED" + result = { + # Core identification + "model": model_info.get("name", ""), + "n_gpus": str(nnodes * nproc_per_node), + "nnodes": str(nnodes), + "gpus_per_node": str(nproc_per_node), + + # Model configuration + "training_precision": model_info.get("training_precision", ""), + "pipeline": os.environ.get("pipeline", ""), + "args": model_info.get("args", ""), + "tags": model_info.get("tags", ""), + + # Build information + "docker_file": build_info.get("dockerfile", ""), + "base_docker": build_info.get("base_docker", ""), + "docker_sha": build_info.get("docker_sha", ""), + "docker_image": build_info.get("docker_image", ""), + + # Runtime information + "git_commit": "", + "machine_name": pod_name, + "deployment_type": "kubernetes", + "gpu_architecture": "", + + # Performance metrics - FAILED + "performance": "0", + "metric": error_msg, # Store error message in metric field + "relative_change": "", + "status": "FAILURE", # Use "FAILURE" to match CSV schema + + # Timing + "build_duration": build_info.get("build_duration", ""), + "test_duration": "", + + # Data information + "dataname": model_info.get("data", ""), + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + + # Build tracking + "build_number": os.environ.get("BUILD_NUMBER", "0"), + "additional_docker_run_options": model_info.get("additional_docker_run_options", ""), + } + + # Flatten tags if they are in list format + if isinstance(result["tags"], list): + result["tags"] = ",".join(str(item) for item in result["tags"]) + + return result + + def _parse_node_performance( + self, + log_content: str, + model_info: Dict, + build_info: Dict + ) -> Optional[Dict]: + """ + Parse node-local performance from log. + + Expected format in log (from updated run scripts): + performance: + node_id: + local_gpus: + + Args: + log_content: Pod log content + model_info: Model information dict + build_info: Build information dict + + Returns: + Dict with node performance data, or None if parsing failed + """ + import re + + perf_data = None + + # Parse performance line + perf_pattern = r"performance:\s*([\d.]+)\s+(\S+)" + match = re.search(perf_pattern, log_content) + + if match: + value = float(match.group(1)) + metric = match.group(2) + + # Try to extract node_id for validation + node_id_pattern = r"node_id:\s*(\d+)" + node_match = re.search(node_id_pattern, log_content) + node_id = int(node_match.group(1)) if node_match else None + + # Try to extract local_gpus + local_gpus_pattern = r"local_gpus:\s*(\d+)" + gpus_match = re.search(local_gpus_pattern, log_content) + local_gpus = int(gpus_match.group(1)) if gpus_match else 1 + + # Extract duration if available + duration_pattern = r"test_duration:\s*([\d.]+)s" + duration_match = re.search(duration_pattern, log_content) + duration = f"{duration_match.group(1)}s" if duration_match else "N/A" + + # Extract GPU architecture from rocEnvTool runtime detection + # Look for pattern: 🔹 Name : gfx942 or Name : gfx942 + gpu_arch_pattern = r"(?:🔹\s*)?Name\s*:\s*(gfx\w+)" + gpu_arch_match = re.search(gpu_arch_pattern, log_content) + gpu_arch = gpu_arch_match.group(1) if gpu_arch_match else "N/A" + + perf_data = { + "model": model_info.get("name"), + "performance": value, + "metric": metric, + "node_id": node_id, + "local_gpus": local_gpus, + "duration": duration, + "gpu_architecture": gpu_arch, + "data_name": "N/A", + "data_provider": "N/A" + } + + return perf_data + + def _determine_aggregation_method(self, metric_name: str) -> str: + """ + Determine how to aggregate a metric based on its name/type. + + Args: + metric_name: Name of the performance metric + + Returns: + "sum", "average", "max", or "unknown" + """ + metric_lower = metric_name.lower() + + # Throughput metrics - SUM + if any(keyword in metric_lower for keyword in [ + "throughput", "samples_per_second", "tokens_per_second", + "images_per_second", "requests_per_second", "qps", + "bandwidth", "ops_per_second", "samples/sec", "tokens/sec" + ]): + return "sum" + + # Latency metrics - AVERAGE + elif any(keyword in metric_lower for keyword in [ + "latency", "time", "duration", "milliseconds", "seconds", + "ttft", "tpot", "response_time" + ]): + return "average" + + # Accuracy metrics - AVERAGE + elif any(keyword in metric_lower for keyword in [ + "accuracy", "precision", "recall", "f1", "loss" + ]): + return "average" + + # Memory metrics - MAX + elif any(keyword in metric_lower for keyword in [ + "memory", "bytes", "ram", "vram", "gb", "mb" + ]): + return "max" + + else: + # Unknown - default to sum for throughput-like metrics (conservative) + self.console.print(f"[yellow]⚠ Unknown metric type '{metric_name}', using sum aggregation[/yellow]") + return "sum" + + def _aggregate_node_metrics( + self, + per_node_metrics: List[Dict], + nnodes: int, + launcher_type: str + ) -> Optional[Dict]: + """ + Aggregate per-node metrics into single job-level metric. + + Aggregation Strategy: + - Throughput (samples/sec, tokens/sec, images/sec): SUM + - Latency (ms, seconds): AVERAGE + - Accuracy (%, ratio): AVERAGE or LAST + - Memory (bytes, GB): MAX or SUM + + Args: + per_node_metrics: List of performance dicts from each node + nnodes: Number of nodes + launcher_type: Type of launcher (torchrun, deepspeed, etc.) + + Returns: + Dict with aggregated performance data for perf.csv + """ + import statistics + + if not per_node_metrics: + return None + + # Get metric type from first node + first_metric = per_node_metrics[0] + metric_name = first_metric["metric"] + + # Determine aggregation strategy based on metric type + aggregation_method = self._determine_aggregation_method(metric_name) + + if aggregation_method == "sum": + # Sum throughput metrics + aggregated_value = sum(m["performance"] for m in per_node_metrics) + method_desc = "sum_across_nodes" + elif aggregation_method == "average": + # Average latency/accuracy metrics + aggregated_value = statistics.mean(m["performance"] for m in per_node_metrics) + method_desc = "average_across_nodes" + elif aggregation_method == "max": + # Max for memory usage + aggregated_value = max(m["performance"] for m in per_node_metrics) + method_desc = "max_across_nodes" + else: + # Unknown - conservative sum + aggregated_value = sum(m["performance"] for m in per_node_metrics) + method_desc = "sum_across_nodes (default)" + + # Compute statistics for validation + perfs = [m["performance"] for m in per_node_metrics] + if len(perfs) > 1: + statistics_dict = { + "mean": statistics.mean(perfs), + "std_dev": statistics.stdev(perfs), + "min": min(perfs), + "max": max(perfs), + "coefficient_variation": statistics.stdev(perfs) / statistics.mean(perfs) if statistics.mean(perfs) > 0 else 0 + } + else: + statistics_dict = { + "mean": perfs[0], + "std_dev": 0, + "min": perfs[0], + "max": perfs[0], + "coefficient_variation": 0 + } + + # Get GPU architecture from any successful node + gpu_arch = "N/A" + for m in per_node_metrics: + if m.get("gpu_architecture") and m["gpu_architecture"] != "N/A": + gpu_arch = m["gpu_architecture"] + break + + # Get duration (use max across nodes - slowest determines job time) + durations = [m.get("duration", "N/A") for m in per_node_metrics if m.get("duration") != "N/A"] + if durations: + # Extract numeric value and find max + duration_values = [] + for d in durations: + if isinstance(d, str) and d.endswith("s"): + try: + duration_values.append(float(d[:-1])) + except ValueError: + pass + duration = f"{max(duration_values):.2f}s" if duration_values else "N/A" + else: + duration = "N/A" + + # Get total GPUs + total_gpus = sum(m.get("local_gpus", 1) for m in per_node_metrics) + gpus_per_node = per_node_metrics[0].get("local_gpus", 1) if per_node_metrics else 1 + + # Build aggregated record (matches perf.csv schema) + aggregated_record = { + "model": first_metric["model"], + "performance": aggregated_value, + "metric": metric_name, + "status": "SUCCESS", + "topology": f"{nnodes}N×{gpus_per_node}G", + "nnodes": nnodes, + "launcher": launcher_type or "N/A", + "deployment_type": "kubernetes", + "gpu_architecture": gpu_arch, + "test_duration": duration, # FIXED: Must match CSV header name + "data_name": first_metric.get("data_name", "N/A"), + "data_provider": first_metric.get("data_provider", "N/A"), + + # NEW: Aggregation metadata (for results_summary.json) + "aggregation_method": method_desc, + "nodes_contributing": len(per_node_metrics), + "per_node_mean": statistics_dict["mean"], + "per_node_std_dev": statistics_dict["std_dev"], + "per_node_cv": statistics_dict["coefficient_variation"] + } + + return aggregated_record + + def _write_to_perf_csv(self, perf_data: Dict): + """ + Write performance data to local perf.csv file. + + Uses the same format as local execution for consistency. + Matches the schema from container_runner.py's create_run_details_dict(). + """ + import csv + from pathlib import Path + + perf_csv_path = Path("perf.csv") + + # Check if file exists to determine if we need headers + file_exists = perf_csv_path.exists() + + # CSV headers matching local execution format EXACTLY + # This is the same order as in container_runner.py line 69 + # Enhanced with topology fields for multi-node tracking + headers = [ + "model", + "n_gpus", + "nnodes", # NEW: Number of nodes + "gpus_per_node", # NEW: GPUs per node + "training_precision", + "pipeline", + "args", + "tags", + "docker_file", + "base_docker", + "docker_sha", + "docker_image", + "git_commit", + "machine_name", + "deployment_type", + "launcher", # Execution launcher (native, docker, torchrun, etc.) + "gpu_architecture", + "performance", + "metric", + "relative_change", + "status", + "build_duration", + "test_duration", + "dataname", + "data_provider_type", + "data_size", + "data_download_duration", + "build_number", + "additional_docker_run_options", + ] + + # Write to CSV + with open(perf_csv_path, 'a', newline='') as f: + writer = csv.DictWriter(f, fieldnames=headers, extrasaction='ignore') + + # Write headers if new file + if not file_exists: + writer.writeheader() + + # Write data row (only fields in headers will be written) + writer.writerow(perf_data) + + def cleanup(self, deployment_id: str) -> bool: + """Delete Job, ConfigMap, Service and associated pods.""" + success = True + + try: + # Delete Job (propagates to pods) + self.batch_v1.delete_namespaced_job( + name=deployment_id, + namespace=self.namespace, + propagation_policy="Background", + ) + self.console.print(f"[yellow]Deleted K8s Job: {deployment_id}[/yellow]") + except ApiException as e: + if e.status != 404: + self.console.print(f"[yellow]⚠ Job cleanup warning: {e.reason}[/yellow]") + success = False + except Exception as e: + self.console.print(f"[yellow]⚠ Job cleanup error: {e}[/yellow]") + success = False + + # Delete ConfigMap + try: + configmap_name = f"{deployment_id}-config" + self.core_v1.delete_namespaced_config_map( + name=configmap_name, namespace=self.namespace + ) + self.console.print( + f"[yellow]Deleted ConfigMap: {configmap_name}[/yellow]" + ) + except ApiException as e: + if e.status != 404: + self.console.print( + f"[yellow]⚠ ConfigMap cleanup warning: {e.reason}[/yellow]" + ) + except Exception: + pass + + # Delete Service (if exists) + try: + self.core_v1.delete_namespaced_service( + name=deployment_id, namespace=self.namespace + ) + self.console.print(f"[yellow]Deleted Service: {deployment_id}[/yellow]") + except ApiException as e: + if e.status != 404: + pass # Service may not exist for single-node jobs + except Exception: + pass + + return success + diff --git a/src/madengine/deployment/presets/__init__.py b/src/madengine/deployment/presets/__init__.py new file mode 100644 index 00000000..f554fc4f --- /dev/null +++ b/src/madengine/deployment/presets/__init__.py @@ -0,0 +1,6 @@ +""" +Built-in presets for deployment configurations. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + diff --git a/src/madengine/deployment/presets/k8s/__init__.py b/src/madengine/deployment/presets/k8s/__init__.py new file mode 100644 index 00000000..25a33dfa --- /dev/null +++ b/src/madengine/deployment/presets/k8s/__init__.py @@ -0,0 +1,6 @@ +""" +Kubernetes deployment presets. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + diff --git a/src/madengine/deployment/presets/k8s/defaults.json b/src/madengine/deployment/presets/k8s/defaults.json new file mode 100644 index 00000000..5cb54d2c --- /dev/null +++ b/src/madengine/deployment/presets/k8s/defaults.json @@ -0,0 +1,18 @@ +{ + "_comment": "Base Kubernetes defaults - deploy field inferred from presence of k8s field", + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "debug": false, + + "k8s": { + "kubeconfig": "~/.kube/config", + "namespace": "default", + "image_pull_policy": "Always", + "backoff_limit": 3 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8" + } +} + diff --git a/src/madengine/deployment/presets/k8s/gpu-vendors/amd-multi-gpu.json b/src/madengine/deployment/presets/k8s/gpu-vendors/amd-multi-gpu.json new file mode 100644 index 00000000..6e559742 --- /dev/null +++ b/src/madengine/deployment/presets/k8s/gpu-vendors/amd-multi-gpu.json @@ -0,0 +1,16 @@ +{ + "_comment": "AMD multi-GPU optimizations - applied only when distributed/multi-GPU", + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2", + "HSA_ENABLE_SDMA": "0", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "RCCL_ENABLE_HIPGRAPH": "0" + } +} + diff --git a/src/madengine/deployment/presets/k8s/gpu-vendors/amd.json b/src/madengine/deployment/presets/k8s/gpu-vendors/amd.json new file mode 100644 index 00000000..42069620 --- /dev/null +++ b/src/madengine/deployment/presets/k8s/gpu-vendors/amd.json @@ -0,0 +1,7 @@ +{ + "_comment": "AMD GPU - only resource name, minimal env vars for single GPU", + "k8s": { + "gpu_resource_name": "amd.com/gpu" + } +} + diff --git a/src/madengine/deployment/presets/k8s/gpu-vendors/nvidia.json b/src/madengine/deployment/presets/k8s/gpu-vendors/nvidia.json new file mode 100644 index 00000000..f7831f92 --- /dev/null +++ b/src/madengine/deployment/presets/k8s/gpu-vendors/nvidia.json @@ -0,0 +1,15 @@ +{ + "_comment": "NVIDIA GPU configuration", + "k8s": { + "gpu_resource_name": "nvidia.com/gpu" + }, + "env_vars": { + "NCCL_DEBUG": "INFO", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "NCCL_P2P_DISABLE": "0", + "NCCL_P2P_LEVEL": "NVL", + "OMP_NUM_THREADS": "12" + } +} + diff --git a/src/madengine/deployment/presets/k8s/profiles/multi-gpu.json b/src/madengine/deployment/presets/k8s/profiles/multi-gpu.json new file mode 100644 index 00000000..f92df7f6 --- /dev/null +++ b/src/madengine/deployment/presets/k8s/profiles/multi-gpu.json @@ -0,0 +1,16 @@ +{ + "_comment": "Multi-GPU profile - 2-4 GPUs with torchrun", + "k8s": { + "memory": "64Gi", + "memory_limit": "128Gi", + "cpu": "16", + "cpu_limit": "32" + }, + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "torchrun", + "master_port": 29500 + } +} + diff --git a/src/madengine/deployment/presets/k8s/profiles/multi-node.json b/src/madengine/deployment/presets/k8s/profiles/multi-node.json new file mode 100644 index 00000000..3d814f38 --- /dev/null +++ b/src/madengine/deployment/presets/k8s/profiles/multi-node.json @@ -0,0 +1,22 @@ +{ + "_comment": "Multi-node distributed profile", + "k8s": { + "memory": "64Gi", + "memory_limit": "128Gi", + "cpu": "16", + "cpu_limit": "32", + "host_ipc": true + }, + "distributed": { + "enabled": true, + "backend": "nccl", + "launcher": "torchrun", + "master_port": 29500 + }, + "env_vars": { + "NCCL_DEBUG_SUBSYS": "INIT,NET", + "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", + "NCCL_TIMEOUT": "600" + } +} + diff --git a/src/madengine/deployment/presets/k8s/profiles/single-gpu.json b/src/madengine/deployment/presets/k8s/profiles/single-gpu.json new file mode 100644 index 00000000..34106655 --- /dev/null +++ b/src/madengine/deployment/presets/k8s/profiles/single-gpu.json @@ -0,0 +1,11 @@ +{ + "_comment": "Single GPU profile - 1 GPU resources", + "k8s": { + "gpu_count": 1, + "memory": "16Gi", + "memory_limit": "32Gi", + "cpu": "8", + "cpu_limit": "16" + } +} + diff --git a/src/madengine/deployment/presets/slurm/__init__.py b/src/madengine/deployment/presets/slurm/__init__.py new file mode 100644 index 00000000..9d11608c --- /dev/null +++ b/src/madengine/deployment/presets/slurm/__init__.py @@ -0,0 +1,15 @@ +""" +SLURM deployment presets. + +Layered configuration system: +1. defaults.json - Base SLURM defaults +2. profiles/*.json - Workload-specific profiles (single-node, multi-node) +3. User configuration - Highest priority + +Convention over Configuration: +- Presence of "slurm" field → SLURM deployment +- No explicit "deploy" field needed + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + diff --git a/src/madengine/deployment/presets/slurm/defaults.json b/src/madengine/deployment/presets/slurm/defaults.json new file mode 100644 index 00000000..cf4ba978 --- /dev/null +++ b/src/madengine/deployment/presets/slurm/defaults.json @@ -0,0 +1,32 @@ +{ + "_comment": "Base SLURM defaults - deployment type inferred from presence of slurm field", + "_description": "Default configuration for SLURM HPC cluster deployment", + "_note": "Default partition is 'amd-rccl' for AMD RCCL cluster. Override if your cluster uses different partition names.", + "_best_practice": "Use shared storage workspace for multi-node. Single-node auto-detects NFS and uses shared storage when available.", + + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "debug": false, + + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 8, + "time": "24:00:00", + "output_dir": "./slurm_output", + "exclusive": true, + "modules": [] + }, + + "distributed": { + "backend": "nccl", + "port": 29500 + }, + + "env_vars": { + "OMP_NUM_THREADS": "8", + "MIOPEN_FIND_MODE": "1", + "MIOPEN_USER_DB_PATH": "/tmp/.miopen" + } +} + diff --git a/src/madengine/deployment/presets/slurm/profiles/multi-node.json b/src/madengine/deployment/presets/slurm/profiles/multi-node.json new file mode 100644 index 00000000..2e499307 --- /dev/null +++ b/src/madengine/deployment/presets/slurm/profiles/multi-node.json @@ -0,0 +1,30 @@ +{ + "_comment": "Multi-node SLURM profile - optimized for distributed workloads across nodes", + "_description": "Configuration for multi-node distributed execution (training/inference) on SLURM cluster", + + "slurm": { + "nodes": 2, + "gpus_per_node": 8, + "time": "24:00:00" + }, + + "distributed": { + "backend": "nccl", + "port": 29500 + }, + + "env_vars": { + "NCCL_DEBUG": "WARN", + "NCCL_DEBUG_SUBSYS": "INIT,NET", + "NCCL_IB_DISABLE": "1", + "NCCL_SOCKET_IFNAME": "eth0", + "TORCH_NCCL_HIGH_PRIORITY": "1", + "GPU_MAX_HW_QUEUES": "2", + "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", + "NCCL_TIMEOUT": "600", + "HSA_ENABLE_SDMA": "0", + "HSA_FORCE_FINE_GRAIN_PCIE": "1", + "RCCL_ENABLE_HIPGRAPH": "0" + } +} + diff --git a/src/madengine/deployment/presets/slurm/profiles/single-node.json b/src/madengine/deployment/presets/slurm/profiles/single-node.json new file mode 100644 index 00000000..7c62ef7a --- /dev/null +++ b/src/madengine/deployment/presets/slurm/profiles/single-node.json @@ -0,0 +1,15 @@ +{ + "_comment": "Single-node SLURM profile - optimized for single node multi-GPU", + "_description": "Configuration for running on a single SLURM node with multiple GPUs", + + "slurm": { + "nodes": 1, + "gpus_per_node": 8, + "time": "12:00:00" + }, + + "env_vars": { + "NCCL_DEBUG": "WARN" + } +} + diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py new file mode 100644 index 00000000..577be47f --- /dev/null +++ b/src/madengine/deployment/slurm.py @@ -0,0 +1,1255 @@ +#!/usr/bin/env python3 +""" +SLURM Deployment - HPC cluster deployment using CLI commands. + +Uses subprocess to call SLURM CLI commands (sbatch, squeue, scancel). +No Python SLURM library required (zero dependencies). + +**Assumption**: User has already SSH'd to SLURM login node manually. +madengine is executed ON the login node, not remotely. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import os +import subprocess +from pathlib import Path +from typing import Any, Dict + +from jinja2 import Environment, FileSystemLoader + +from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus +from .config_loader import ConfigLoader +from .slurm_node_selector import SlurmNodeSelector +from madengine.utils.gpu_config import resolve_runtime_gpus +from typing import Optional + + +# Valid distributed launchers +VALID_LAUNCHERS = [ + "torchrun", + "torchtitan", + "deepspeed", + "megatron-lm", + "vllm", + "sglang", + "sglang-disagg" +] + + +def normalize_launcher(launcher_type: Optional[str], deployment_type: str) -> str: + """ + Normalize launcher field based on deployment type and launcher value. + + Logic: + - If launcher is in VALID_LAUNCHERS: keep as-is + - If launcher is None/empty/invalid: + * local → "docker" (runs in Docker container) + * slurm → "docker" (typically uses containers on compute nodes) + * kubernetes → "native" (pod itself is the container) + + Args: + launcher_type: Raw launcher type from config (may be None) + deployment_type: "local", "slurm", or "kubernetes" + + Returns: + Normalized launcher string + """ + # If launcher is valid, keep it + if launcher_type and launcher_type in VALID_LAUNCHERS: + return launcher_type + + # Otherwise, default based on deployment type + if deployment_type == "local": + return "docker" + elif deployment_type == "slurm": + return "docker" + elif deployment_type == "kubernetes": + return "native" + else: + # Fallback for unknown deployment types + return "docker" + + +def is_rocprofv3_available() -> bool: + """ + Check if rocprofv3 is available on the system. + + rocprofv3 is required for multi-node profiling with MPI support. + It's part of rocprofiler-sdk package in ROCm >= 6.4.1. + + Returns: + True if rocprofv3 is available and executable, False otherwise + """ + try: + # Note: rocprofv3 doesn't support --version, use --help instead + result = subprocess.run( + ["rocprofv3", "--help"], + capture_output=True, + text=True, + timeout=5 + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired, OSError): + return False + + +def configure_multi_node_profiling( + nnodes: int, + tools_config: list, + logger +) -> Dict[str, Any]: + """ + Configure profiling for multi-node SLURM runs with rocprofv3 support. + + Industry best practice for multi-node profiling: + - Profile ALL nodes to detect stragglers, load imbalances, and communication bottlenecks + - Use rocprofv3 (MPI-aware) for distributed profiling + - Collect per-node outputs for detailed analysis + + Logic: + 1. Single node (nnodes == 1): Use existing tool behavior + 2. Multi-node (nnodes > 1): + a. Check if rocprofv3 is available + b. If available: Enable per-node profiling, upgrade "rocprof" to "rocprofv3" + c. If not available: Log warning and skip profiling + + Args: + nnodes: Number of nodes in the SLURM deployment + tools_config: List of tool configurations from user + logger: Logger instance for messages + + Returns: + Dictionary with profiling configuration: + - enabled: bool - Whether profiling is enabled + - mode: str - "single_node", "multi_node", or "multi_node_unsupported" + - tools: list - Processed tool configurations + - per_node_collection: bool - Whether to collect from all nodes + """ + if nnodes == 1: + # Single node - existing behavior works fine + return { + "enabled": True, + "mode": "single_node", + "tools": tools_config, + "per_node_collection": False + } + + # Multi-node case - check rocprofv3 availability + if not is_rocprofv3_available(): + logger.warning( + "╔════════════════════════════════════════════════════════════════════════════╗\n" + "║ Multi-Node Profiling Requirements Not Met ║\n" + "╠════════════════════════════════════════════════════════════════════════════╣\n" + "║ Multi-node profiling requires rocprofv3 (MPI-aware profiling support). ║\n" + "║ ║\n" + "║ Current Status: rocprofv3 NOT FOUND on system ║\n" + "║ ║\n" + "║ Profiling will be SKIPPED for this multi-node run. ║\n" + "║ ║\n" + "║ To enable multi-node profiling: ║\n" + "║ • Install rocprofiler-sdk package (ROCm >= 6.4.1) ║\n" + "║ • Command: apt install rocprofiler-sdk ║\n" + "║ • Or upgrade to ROCm 6.4.1 or later ║\n" + "║ ║\n" + "║ Note: Single-node profiling uses rocprof (no rocprofv3 required) ║\n" + "╚════════════════════════════════════════════════════════════════════════════╝" + ) + return { + "enabled": False, + "mode": "multi_node_unsupported", + "tools": [], + "per_node_collection": False + } + + # rocprofv3 is available - enable full multi-node profiling + logger.info(f"✓ Multi-node profiling enabled for {nnodes} nodes (rocprofv3 detected)") + + # Upgrade "rocprof" tools to "rocprofv3" for multi-node compatibility + upgraded_tools = [] + rocprof_upgraded = False + + for tool in tools_config: + tool_name = tool.get("name") if isinstance(tool, dict) else None + + if tool_name == "rocprof": + # Upgrade to rocprofv3 for multi-node MPI support + logger.info( + f" → Upgrading 'rocprof' to 'rocprofv3' for multi-node MPI compatibility" + ) + upgraded_tool = tool.copy() if isinstance(tool, dict) else {"name": "rocprofv3"} + upgraded_tool["name"] = "rocprofv3" + upgraded_tools.append(upgraded_tool) + rocprof_upgraded = True + else: + upgraded_tools.append(tool) + + # Log profiling tools being used + if upgraded_tools: + tool_names = [t.get("name") if isinstance(t, dict) else str(t) for t in upgraded_tools] + logger.info(f" → Multi-node profiling tools: {', '.join(filter(None, tool_names))}") + + # Highlight RCCL trace if present (critical for multi-node communication) + if "rccl_trace" in tool_names: + logger.info(" → ✓ rccl_trace enabled (critical for multi-node communication profiling)") + + return { + "enabled": True, + "mode": "multi_node", + "tools": upgraded_tools, + "per_node_collection": True, + "profiler": "rocprofv3", + "wrapper_mode": "launcher" + } + + +class SlurmDeployment(BaseDeployment): + """ + SLURM HPC cluster deployment using CLI commands. + + **Workflow**: + 1. User: ssh login_node@hpc.example.com + 2. User: madengine run --tags model --additional-context '{"deploy": "slurm", ...}' + 3. madengine: Runs sbatch locally (no SSH needed) + + Uses subprocess to call SLURM CLI commands locally: + - sbatch: Submit jobs to SLURM scheduler + - squeue: Monitor job status + - scancel: Cancel jobs + - scontrol: Get cluster info + + No Python SLURM library required (zero dependencies). + No SSH handling needed (user is already on login node). + """ + + DEPLOYMENT_TYPE = "slurm" + REQUIRED_TOOLS = ["sbatch", "squeue", "scontrol"] # Must be available locally + + def __init__(self, config: DeploymentConfig): + """ + Initialize SLURM deployment. + + Args: + config: Deployment configuration + """ + # Apply intelligent defaults using ConfigLoader + # This merges built-in presets with user configuration + full_config = ConfigLoader.load_slurm_config(config.additional_context) + config.additional_context = full_config + + super().__init__(config) + + # Parse SLURM configuration (now with defaults applied) + self.slurm_config = config.additional_context.get("slurm", {}) + self.distributed_config = config.additional_context.get("distributed", {}) + + # SLURM parameters + self.partition = self.slurm_config.get("partition", "gpu") + self.nodes = self.slurm_config.get("nodes", 1) + self.gpus_per_node = self.slurm_config.get("gpus_per_node", 8) + self.time_limit = self.slurm_config.get("time", "24:00:00") + self.output_dir = Path(self.slurm_config.get("output_dir", "./slurm_output")) + + # Setup Jinja2 template engine + template_dir = Path(__file__).parent / "templates" / "slurm" + self.jinja_env = Environment(loader=FileSystemLoader(str(template_dir))) + + # Register custom Jinja2 filters + self.jinja_env.filters['dirname'] = lambda path: str(Path(path).parent) + self.jinja_env.filters['basename'] = lambda path: str(Path(path).name) + + # Generated script path + self.script_path = None + + def validate(self) -> bool: + """Validate SLURM commands are available locally.""" + # Check required SLURM CLI tools + for tool in self.REQUIRED_TOOLS: + result = subprocess.run( + ["which", tool], capture_output=True, timeout=5 + ) + if result.returncode != 0: + self.console.print( + f"[red]✗ Required tool not found: {tool}[/red]\n" + f"[yellow]Make sure you are on a SLURM login node[/yellow]" + ) + return False + + # Verify we can query SLURM cluster + result = subprocess.run(["sinfo", "-h"], capture_output=True, timeout=10) + if result.returncode != 0: + self.console.print("[red]✗ Cannot query SLURM (sinfo failed)[/red]") + return False + + # Validate configuration + if self.nodes < 1: + self.console.print(f"[red]✗ Invalid nodes: {self.nodes}[/red]") + return False + + if self.gpus_per_node < 1: + self.console.print(f"[red]✗ Invalid GPUs per node: {self.gpus_per_node}[/red]") + return False + + self.console.print("[green]✓ SLURM environment validated[/green]") + return True + + def _validate_cli_availability(self) -> bool: + """ + Validate madengine is available before job submission. + + Compute nodes inherit the submission environment, so madengine + must be available in PATH on the submission node. + + Returns: + bool: True if madengine is available and functional + """ + try: + result = subprocess.run( + ["madengine", "--version"], + capture_output=True, + text=True, + timeout=5, + check=False + ) + if result.returncode == 0: + version = result.stdout.strip() or "unknown" + self.console.print( + f"[green]✓[/green] madengine available: [cyan]{version}[/cyan]" + ) + + # Show path for transparency + which_result = subprocess.run( + ["which", "madengine"], + capture_output=True, + text=True, + check=False + ) + if which_result.returncode == 0: + cli_path = which_result.stdout.strip() + self.console.print(f" Path: [dim]{cli_path}[/dim]") + + return True + else: + self.console.print( + "[red]✗ madengine found but returned error[/red]" + ) + if result.stderr: + self.console.print(f" Error: {result.stderr.strip()}") + return False + + except FileNotFoundError: + self.console.print( + "\n[red]✗ ERROR: madengine not found[/red]\n" + ) + self.console.print( + "[yellow]Compute nodes need madengine in PATH.[/yellow]\n" + "\n[bold]To fix:[/bold]\n" + " 1. Activate virtual environment: [cyan]source venv/bin/activate[/cyan]\n" + " 2. Install madengine:\n" + " • Development: [cyan]pip install -e .[/cyan]\n" + " • Production: [cyan]pip install madengine[/cyan]\n" + " 3. Verify: [cyan]madengine --version[/cyan]\n" + ) + return False + except subprocess.TimeoutExpired: + self.console.print("[red]✗ madengine command timed out[/red]") + return False + except Exception as e: + self.console.print(f"[red]✗ Error checking madengine: {e}[/red]") + return False + + def prepare(self) -> bool: + """Generate sbatch script from template.""" + # Validate environment BEFORE generating job scripts + self.console.print("\n[bold]Validating submission environment...[/bold]") + if not self._validate_cli_availability(): + self.console.print( + "\n[yellow]⚠ Tip: Compute nodes inherit your submission environment[/yellow]" + ) + return False + + try: + self.output_dir.mkdir(parents=True, exist_ok=True) + + # Get model info from manifest + model_keys = list(self.manifest["built_models"].keys()) + if not model_keys: + raise ValueError("No models in manifest") + + model_key = model_keys[0] + model_info = self.manifest["built_models"][model_key] + + # Prepare template context + context = self._prepare_template_context(model_info) + + # Render template + template = self.jinja_env.get_template("job.sh.j2") + script_content = template.render(**context) + + # Save script + self.script_path = self.output_dir / f"madengine_{model_info['name']}.sh" + self.script_path.write_text(script_content) + self.script_path.chmod(0o755) + + self.console.print( + f"[green]✓ Generated sbatch script: {self.script_path}[/green]" + ) + return True + + except Exception as e: + self.console.print(f"[red]✗ Failed to generate script: {e}[/red]") + return False + + def _prepare_template_context(self, model_info: Dict) -> Dict[str, Any]: + """Prepare context for Jinja2 template rendering.""" + # Use hierarchical GPU resolution: runtime > deployment > model > default + additional_context = self.config.additional_context.copy() + additional_context["slurm"] = self.slurm_config + resolved_gpus_per_node = resolve_runtime_gpus(model_info, additional_context) + + # Extract launcher configuration + launcher_type = self.distributed_config.get("launcher", "torchrun") # Default to torchrun + + # Normalize launcher based on deployment type and validity + launcher_type = normalize_launcher(launcher_type, "slurm") + + nnodes = self.distributed_config.get("nnodes", self.nodes) + nproc_per_node = self.distributed_config.get("nproc_per_node", resolved_gpus_per_node) + master_port = self.distributed_config.get("port", 29500) + + # Apply multi-node profiling logic if tools are configured + tools = additional_context.get("tools", []) + if nnodes > 1 and tools: + # Configure multi-node profiling (handles rocprofv3 detection and tool upgrades) + # Create a simple logger wrapper for configure_multi_node_profiling + class ConsoleLogger: + def __init__(self, console): + self.console = console + def info(self, msg): + self.console.print(f"[cyan]{msg}[/cyan]") + def warning(self, msg): + self.console.print(f"[yellow]{msg}[/yellow]") + def debug(self, msg): + pass # Skip debug messages in console + + profiling_config = configure_multi_node_profiling( + nnodes=nnodes, + tools_config=tools, + logger=ConsoleLogger(self.console) + ) + + if profiling_config["enabled"]: + tools = profiling_config["tools"] + else: + # rocprofv3 not available - skip profiling for multi-node + tools = [] + + # Update tools in additional_context + additional_context["tools"] = tools + + # Generate launcher-specific command + launcher_command = self._generate_launcher_command( + launcher_type=launcher_type, + nnodes=nnodes, + nproc_per_node=nproc_per_node, + master_port=master_port + ) + + return { + "model_name": model_info["name"], + "manifest_file": os.path.abspath(self.config.manifest_file), + "partition": self.partition, + "nodes": self.nodes, + "gpus_per_node": resolved_gpus_per_node, # Use resolved GPU count + "time_limit": self.time_limit, + "output_dir": str(self.output_dir), + "master_port": master_port, + "distributed_backend": self.distributed_config.get("backend", "nccl"), + "network_interface": self.slurm_config.get("network_interface"), + "exclusive": self.slurm_config.get("exclusive", True), + "exclude": self.slurm_config.get("exclude"), + "constraint": self.slurm_config.get("constraint"), + "qos": self.slurm_config.get("qos"), + "account": self.slurm_config.get("account"), + "modules": self.slurm_config.get("modules", []), + "env_vars": self.config.additional_context.get("env_vars", {}), + "shared_workspace": self.slurm_config.get("shared_workspace"), + "shared_data": self.config.additional_context.get("shared_data"), + "results_dir": self.slurm_config.get("results_dir"), + "timeout": self.config.timeout, + "live_output": self.config.additional_context.get("live_output", False), + "tags": " ".join(model_info.get("tags", [])), + "credential_file": "credential.json" + if Path("credential.json").exists() + else None, + "data_file": "data.json" if Path("data.json").exists() else None, + # Launcher configuration + "launcher_type": launcher_type, + "launcher_command": launcher_command, + "nnodes": nnodes, + "nproc_per_node": nproc_per_node, + # Profiling tools (processed for multi-node compatibility) + "tools": tools, + } + + def _generate_launcher_command( + self, launcher_type: str, nnodes: int, nproc_per_node: int, master_port: int + ) -> str: + """ + Generate launcher-specific command based on launcher type. + + Follows k8s pattern: different launchers have different command generation. + + Args: + launcher_type: Type of launcher (torchrun, vllm, sglang, deepspeed, etc.) + nnodes: Number of nodes + nproc_per_node: GPUs per node + master_port: Master communication port + + Returns: + Launcher-specific environment setup and command string + """ + if launcher_type == "torchrun": + return self._generate_torchrun_command(nnodes, nproc_per_node, master_port) + elif launcher_type == "vllm": + return self._generate_vllm_command(nnodes, nproc_per_node, master_port) + elif launcher_type == "sglang": + return self._generate_sglang_command(nnodes, nproc_per_node, master_port) + elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": + return self._generate_sglang_disagg_command(nnodes, nproc_per_node, master_port) + elif launcher_type == "deepspeed": + return self._generate_deepspeed_command(nnodes, nproc_per_node, master_port) + elif launcher_type == "megatron": + return self._generate_megatron_command(nnodes, nproc_per_node, master_port) + elif launcher_type == "torchtitan": + return self._generate_torchtitan_command(nnodes, nproc_per_node, master_port) + else: + # For unknown launchers, provide basic environment variables + # and let the model script handle launcher invocation + self.console.print( + f"[yellow]Warning: Unknown launcher type '{launcher_type}'. " + f"Using basic environment setup.[/yellow]" + ) + return self._generate_basic_env_command(nnodes, nproc_per_node, master_port) + + def _generate_torchrun_command( + self, nnodes: int, nproc_per_node: int, master_port: int + ) -> str: + """ + Generate torchrun launcher command for SLURM. + + For single-node (nnodes=1): Uses standalone mode + For multi-node (nnodes>1): Uses distributed mode with SLURM environment + + Args: + nnodes: Number of nodes + nproc_per_node: GPUs per node + master_port: Master port + + Returns: + MAD_MULTI_NODE_RUNNER environment variable setup + """ + if nnodes == 1: + return f'export MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node={nproc_per_node}"' + else: + # Multi-node: Build command with SLURM_PROCID for node_rank + return f'''# Multi-node torchrun setup +export MAD_MULTI_NODE_RUNNER="torchrun --nnodes={nnodes} --nproc_per_node={nproc_per_node} --node_rank=${{NODE_RANK}} --master_addr=${{MASTER_ADDR}} --master_port={master_port}"''' + + def _generate_vllm_command( + self, nnodes: int, nproc_per_node: int, master_port: int + ) -> str: + """ + Generate vLLM launcher environment variables. + + vLLM manages its own process spawning - no torchrun needed. + Model script directly invokes vLLM with tensor/pipeline parallelism. + + Args: + nnodes: Number of nodes + nproc_per_node: GPUs per node + master_port: Master port + + Returns: + Environment variable setup for vLLM + """ + if nnodes == 1: + return f'''# vLLM single-node setup (Tensor Parallelism) +export VLLM_TENSOR_PARALLEL_SIZE={nproc_per_node} +export VLLM_PIPELINE_PARALLEL_SIZE=1 +export VLLM_DISTRIBUTED_BACKEND="auto" +# vLLM handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' + else: + return f'''# vLLM multi-node setup (TP + PP with Ray) +export VLLM_TENSOR_PARALLEL_SIZE={nproc_per_node} +export VLLM_PIPELINE_PARALLEL_SIZE={nnodes} +export VLLM_DISTRIBUTED_BACKEND="ray" +# vLLM handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' + + def _generate_sglang_command( + self, nnodes: int, nproc_per_node: int, master_port: int + ) -> str: + """ + Generate SGLang launcher environment variables. + + SGLang similar to vLLM - manages its own process spawning. + + Args: + nnodes: Number of nodes + nproc_per_node: GPUs per node + master_port: Master port + + Returns: + Environment variable setup for SGLang + """ + if nnodes == 1: + return f'''# SGLang single-node setup (Tensor Parallelism) +export SGLANG_TENSOR_PARALLEL_SIZE={nproc_per_node} +export SGLANG_PIPELINE_PARALLEL_SIZE=1 +# SGLang handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' + else: + return f'''# SGLang multi-node setup (TP + PP with Ray) +export SGLANG_TENSOR_PARALLEL_SIZE={nproc_per_node} +export SGLANG_PIPELINE_PARALLEL_SIZE={nnodes} +# SGLang handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' + + def _generate_sglang_disagg_command( + self, nnodes: int, nproc_per_node: int, master_port: int + ) -> str: + """ + Generate SGLang Disaggregated launcher environment for SLURM. + + SGLang Disaggregated Architecture: + - Node 0: Proxy (load balancer) + - Nodes 1 to xP: Prefill nodes + - Nodes xP+1 to xP+yD: Decode nodes + + Minimum cluster: 3 nodes (1 proxy + 1 prefill + 1 decode) + + Args: + nnodes: Total number of nodes (must be >= 3) + nproc_per_node: GPUs per node (tensor parallel size) + master_port: Master port for coordination + + Returns: + Environment setup with node role assignment + + Raises: + ValueError: If nnodes < 3 (minimum for disagg) + """ + if nnodes < 3: + raise ValueError( + f"SGLang Disaggregated requires minimum 3 nodes " + f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" + ) + + # Check if custom split is specified in additional_context + sglang_disagg_config = self.config.additional_context.get("distributed", {}).get("sglang_disagg", {}) + prefill_nodes = sglang_disagg_config.get("prefill_nodes") + decode_nodes = sglang_disagg_config.get("decode_nodes") + + if prefill_nodes is not None and decode_nodes is not None: + # User specified custom split - validate + if prefill_nodes < 1 or decode_nodes < 1: + raise ValueError( + f"SGLang Disaggregated requires at least 1 prefill and 1 decode node, " + f"got prefill={prefill_nodes}, decode={decode_nodes}" + ) + if prefill_nodes + decode_nodes + 1 != nnodes: + raise ValueError( + f"Custom split validation failed: " + f"prefill_nodes ({prefill_nodes}) + decode_nodes ({decode_nodes}) + 1 proxy " + f"must equal nnodes ({nnodes}), but got {prefill_nodes + decode_nodes + 1}" + ) + xP = prefill_nodes + yD = decode_nodes + else: + # Default split: use golden ratio for prefill/decode + # For N total nodes: 1 proxy + ~40% prefill + ~60% decode + xP = max(1, (nnodes - 1) * 2 // 5) # ~40% of worker nodes + yD = nnodes - 1 - xP # remaining nodes + + return f'''# SGLang Disaggregated multi-node setup +# ============================================ +# Cluster Configuration: +# Total Nodes: {nnodes} +# Proxy: 1 node (NODE_RANK=0) +# Prefill: {xP} nodes (NODE_RANK=1 to {xP}) +# Decode: {yD} nodes (NODE_RANK={xP+1} to {nnodes-1}) +# ============================================ + +# Export cluster topology +export SGLANG_DISAGG_MODE="enabled" +export SGLANG_DISAGG_PREFILL_NODES={xP} +export SGLANG_DISAGG_DECODE_NODES={yD} +export SGLANG_DISAGG_TOTAL_NODES={nnodes} +export SGLANG_TP_SIZE={nproc_per_node} + +# Master coordination +export MASTER_PORT={master_port} + +# Build node IP list from SLURM +SLURM_NODE_IPS=$(scontrol show hostname ${{SLURM_JOB_NODELIST}} | while read node; do + getent hosts "$node" | awk '{{print $1}}' +done | tr '\\n' ',' | sed 's/,$//') + +export SGLANG_NODE_IPS="$SLURM_NODE_IPS" +export SGLANG_NODE_RANK=${{SLURM_PROCID}} + +echo "==========================================" +echo "SGLang Disaggregated Cluster Info" +echo "==========================================" +echo "Node Rank: $SGLANG_NODE_RANK" +echo "Node IPs: $SGLANG_NODE_IPS" +echo "Prefill Nodes: {xP}" +echo "Decode Nodes: {yD}" +echo "TP Size: {nproc_per_node}" +echo "==========================================" + +# No MAD_MULTI_NODE_RUNNER - SGLang disagg handles process management +# Model script should detect SGLANG_DISAGG_MODE and launch appropriately''' + + def _generate_deepspeed_command( + self, nnodes: int, nproc_per_node: int, master_port: int + ) -> str: + """ + Generate DeepSpeed launcher command. + + DeepSpeed has its own launcher similar to torchrun. + + Args: + nnodes: Number of nodes + nproc_per_node: GPUs per node + master_port: Master port + + Returns: + MAD_MULTI_NODE_RUNNER with deepspeed launcher + """ + if nnodes == 1: + return f'''# DeepSpeed single-node setup +export MAD_MULTI_NODE_RUNNER="deepspeed --num_gpus={nproc_per_node}"''' + else: + return f'''# DeepSpeed multi-node setup +# Generate hostfile dynamically from SLURM +cat > /tmp/deepspeed_hostfile_${{SLURM_JOB_ID}}.txt << EOF +$(scontrol show hostnames $SLURM_JOB_NODELIST | awk -v slots={nproc_per_node} '{{print $1" slots="slots}}') +EOF +export MAD_MULTI_NODE_RUNNER="deepspeed --hostfile=/tmp/deepspeed_hostfile_${{SLURM_JOB_ID}}.txt --master_addr=${{MASTER_ADDR}} --master_port={master_port}"''' + + def _generate_megatron_command( + self, nnodes: int, nproc_per_node: int, master_port: int + ) -> str: + """ + Generate Megatron-LM launcher command. + + Megatron-LM typically uses torchrun but with specific environment variables. + + Args: + nnodes: Number of nodes + nproc_per_node: GPUs per node + master_port: Master port + + Returns: + MAD_MULTI_NODE_RUNNER with megatron-specific setup + """ + # Megatron uses torchrun with Megatron-Core standard environment variables + if nnodes == 1: + return f'''# Megatron-LM single-node setup +export TENSOR_MODEL_PARALLEL_SIZE={min(nproc_per_node, 8)} +export PIPELINE_MODEL_PARALLEL_SIZE=1 +export CONTEXT_PARALLEL_SIZE=1 +export MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node={nproc_per_node}"''' + else: + return f'''# Megatron-LM multi-node setup +export TENSOR_MODEL_PARALLEL_SIZE={nproc_per_node} +export PIPELINE_MODEL_PARALLEL_SIZE={nnodes} +export CONTEXT_PARALLEL_SIZE=1 +export MAD_MULTI_NODE_RUNNER="torchrun --nnodes={nnodes} --nproc_per_node={nproc_per_node} --node_rank=${{NODE_RANK}} --master_addr=${{MASTER_ADDR}} --master_port={master_port}"''' + + def _generate_torchtitan_command( + self, nnodes: int, nproc_per_node: int, master_port: int + ) -> str: + """ + Generate TorchTitan launcher command for SLURM. + + TorchTitan is a PyTorch native platform for LLM pre-training that uses + torchrun as its underlying launcher but requires additional configuration + for multi-dimensional parallelism (FSDP2, Tensor Parallel, Pipeline Parallel). + + Key TorchTitan features: + - Uses TOML configuration files for training setup + - Supports FSDP2, Tensor Parallel, Pipeline Parallel, Context Parallel + - Built on top of torchrun for distributed coordination + + For single-node (nnodes=1): Uses standalone torchrun mode + For multi-node (nnodes>1): Uses distributed torchrun with SLURM environment + + Args: + nnodes: Number of nodes + nproc_per_node: GPUs per node + master_port: Master port + + Returns: + MAD_MULTI_NODE_RUNNER with torchtitan-specific setup + """ + if nnodes == 1: + return f'''# TorchTitan single-node setup +# TorchTitan uses torchrun as underlying launcher +export TORCHTITAN_TENSOR_PARALLEL_SIZE={nproc_per_node} +export TORCHTITAN_PIPELINE_PARALLEL_SIZE=1 +export MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node={nproc_per_node}"''' + else: + # Multi-node: Use torchrun with SLURM coordination + # TorchTitan will detect multi-node and enable appropriate parallelism + return f'''# TorchTitan multi-node setup +# Configure multi-dimensional parallelism for TorchTitan +export TORCHTITAN_TENSOR_PARALLEL_SIZE={nproc_per_node} +export TORCHTITAN_PIPELINE_PARALLEL_SIZE={nnodes} +export TORCHTITAN_FSDP_ENABLED=1 +export TORCHTITAN_CONTEXT_PARALLEL_SIZE=1 + +# Use torchrun as launcher (TorchTitan built on top of it) +export MAD_MULTI_NODE_RUNNER="torchrun --nnodes={nnodes} --nproc_per_node={nproc_per_node} --node_rank=${{NODE_RANK}} --master_addr=${{MASTER_ADDR}} --master_port={master_port}"''' + + def _generate_basic_env_command( + self, nnodes: int, nproc_per_node: int, master_port: int + ) -> str: + """ + Generate basic environment variables for unknown launchers. + + Provides standard distributed execution environment variables + and lets the model script handle launcher invocation. + + Args: + nnodes: Number of nodes + nproc_per_node: GPUs per node + master_port: Master port + + Returns: + Basic environment variable setup + """ + return f'''# Basic distributed environment (custom launcher) +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} +export MASTER_PORT={master_port} +# Model script should handle launcher invocation''' + + def deploy(self) -> DeploymentResult: + """Submit sbatch script to SLURM scheduler (locally).""" + if not self.script_path or not self.script_path.exists(): + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message="Script not generated. Run prepare() first.", + ) + + # ==================== PREFLIGHT NODE SELECTION ==================== + # For multi-node jobs with Ray/vLLM, check for clean nodes first + # to avoid OOM errors from stale processes + enable_preflight = self.slurm_config.get("enable_node_check", True) + auto_cleanup = self.slurm_config.get("auto_cleanup_nodes", False) + + if enable_preflight and self.nodes > 1: + try: + selector = SlurmNodeSelector( + console=self.console, + auto_cleanup=auto_cleanup, + verbose=self.slurm_config.get("verbose_node_check", False), + ) + + # Select clean nodes and get updated exclude list + clean_nodes, updated_exclude = selector.select_nodes( + partition=self.partition, + nodes_needed=self.nodes, + exclude=self.slurm_config.get("exclude"), + constraint=self.slurm_config.get("constraint"), + ) + + # Update exclude list if dirty nodes found + if updated_exclude and updated_exclude != self.slurm_config.get("exclude", ""): + self.console.print( + f"[dim]Updated exclude list for sbatch: {updated_exclude}[/dim]\n" + ) + # Re-generate script with updated exclude list + self.slurm_config["exclude"] = updated_exclude + self.prepare() # Re-generate sbatch script + + except Exception as e: + # Don't fail deployment if preflight fails + self.console.print( + f"[yellow]⚠ Node health check failed: {e}[/yellow]" + ) + self.console.print("[dim]Continuing with job submission[/dim]\n") + # ==================== END PREFLIGHT ==================== + + try: + # Submit job to SLURM (runs locally on login node) + result = subprocess.run( + ["sbatch", str(self.script_path)], + capture_output=True, + text=True, + timeout=30, + ) + + if result.returncode == 0: + # Parse job ID: "Submitted batch job 12345" + job_id = result.stdout.strip().split()[-1] + + self.console.print(f"[green]✓ Submitted SLURM job: {job_id}[/green]") + self.console.print(f" Nodes: {self.nodes} x {self.gpus_per_node} GPUs") + self.console.print(f" Partition: {self.partition}") + + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=job_id, + message=f"SLURM job {job_id} submitted successfully", + logs_path=str(self.output_dir), + ) + else: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message=f"sbatch failed: {result.stderr}", + ) + + except subprocess.TimeoutExpired: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message="sbatch submission timed out", + ) + except Exception as e: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id="", + message=f"Deployment error: {str(e)}", + ) + + def monitor(self, deployment_id: str) -> DeploymentResult: + """Check SLURM job status (locally).""" + try: + # Query job status using squeue (runs locally) + result = subprocess.run( + ["squeue", "-j", deployment_id, "-h", "-o", "%T"], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode != 0 or not result.stdout.strip(): + # Job not found in queue - likely completed or failed + return self._check_job_completion(deployment_id) + + status = result.stdout.strip().upper() + + # Check if live output is enabled + live_output = self.config.additional_context.get("live_output", False) + + # Stream work node output if live_output is enabled and job is running + if status == "RUNNING" and live_output: + self._stream_job_output(deployment_id) + + if status in ["RUNNING", "PENDING", "CONFIGURING", "COMPLETING"]: + # COMPLETING is a transient state before COMPLETED - treat as running + return DeploymentResult( + status=DeploymentStatus.RUNNING, + deployment_id=deployment_id, + message=f"Job {deployment_id} is {status.lower()}", + ) + elif status in ["COMPLETED"]: + # Show final output only if live_output is enabled + if live_output: + self._stream_job_output(deployment_id, final=True) + else: + self._show_log_summary(deployment_id, success=True) + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=deployment_id, + message=f"Job {deployment_id} completed successfully", + ) + else: # FAILED, CANCELLED, TIMEOUT, NODE_FAIL, etc. + # Show output on failure or show summary + if live_output: + self._stream_job_output(deployment_id, final=True) + else: + self._show_log_summary(deployment_id, success=False) + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=deployment_id, + message=f"Job {deployment_id} {status.lower()}", + ) + + except Exception as e: + self.console.print(f"[red]Monitor exception for job {deployment_id}: {e}[/red]") + import traceback + self.console.print(f"[dim red]{traceback.format_exc()}[/dim red]") + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=deployment_id, + message=f"Monitor error: {str(e)}", + ) + + def _stream_job_output(self, job_id: str, final: bool = False): + """Stream output from SLURM job output file.""" + # Track last position read from output file + if not hasattr(self, '_output_positions'): + self._output_positions = {} + + # Find output file + output_dir = self.slurm_config.get("output_dir", "./slurm_output") + output_pattern = f"{output_dir}/madengine-*_{job_id}_*.out" + + try: + import glob + output_files = glob.glob(output_pattern) + + if not output_files: + return # Output file not created yet + + output_file = output_files[0] # Use first match + + # Read new content from file + try: + with open(output_file, 'r') as f: + # Seek to last position + last_pos = self._output_positions.get(job_id, 0) + f.seek(last_pos) + + # Read new lines + new_content = f.read() + + if new_content: + # Print new output with prefix + for line in new_content.splitlines(): + if line.strip(): # Skip empty lines + self.console.print(f"[dim cyan]│[/dim cyan] {line}") + + # Update position + self._output_positions[job_id] = f.tell() + + except FileNotFoundError: + pass # File not ready yet + + except Exception as e: + # Silently ignore streaming errors to not disrupt monitoring + if final: + self.console.print(f"[dim yellow]Note: Could not stream output: {e}[/dim yellow]") + + def _show_log_summary(self, job_id: str, success: bool = True): + """Show a summary with pointers to log files instead of streaming verbose output.""" + output_dir = self.slurm_config.get("output_dir", "./slurm_output") + + try: + import glob + # Find output and error files for this job + output_files = glob.glob(f"{output_dir}/madengine-*_{job_id}_*.out") + error_files = glob.glob(f"{output_dir}/madengine-*_{job_id}_*.err") + + if output_files or error_files: + status_symbol = "✓" if success else "✗" + status_color = "green" if success else "red" + + self.console.print(f"[{status_color}]{status_symbol}[/{status_color}] SLURM job {job_id} logs saved to:") + + for out_file in output_files: + self.console.print(f" [cyan]→[/cyan] Output: {out_file}") + + for err_file in error_files: + # Check if error file has content + if os.path.exists(err_file) and os.path.getsize(err_file) > 0: + self.console.print(f" [yellow]→[/yellow] Errors: {err_file}") + + if not success and error_files: + # Show last few lines of error file for failed jobs + for err_file in error_files: + if os.path.exists(err_file) and os.path.getsize(err_file) > 0: + self.console.print(f"\n[yellow]Last 10 lines of error log:[/yellow]") + try: + with open(err_file, 'r') as f: + lines = f.readlines() + for line in lines[-10:]: + if line.strip(): + self.console.print(f" {line.rstrip()}") + except Exception: + pass + break # Only show first error file + else: + self.console.print(f"[dim yellow]Note: Log files for job {job_id} not found in {output_dir}[/dim yellow]") + + except Exception as e: + self.console.print(f"[dim yellow]Note: Could not locate log files: {e}[/dim yellow]") + + def _check_job_completion(self, job_id: str) -> DeploymentResult: + """Check completed job status using sacct (locally).""" + try: + result = subprocess.run( + ["sacct", "-j", job_id, "-n", "-X", "-o", "State"], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + status = result.stdout.strip().upper() + self.console.print(f"[dim]SLURM job {job_id} final status: {status}[/dim]") + + # Check if live output is enabled + live_output = self.config.additional_context.get("live_output", False) + + if "COMPLETED" in status: + # Show final output or summary based on live_output flag + if live_output: + self._stream_job_output(job_id, final=True) + else: + self._show_log_summary(job_id, success=True) + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=job_id, + message=f"Job {job_id} completed successfully", + ) + else: + # Show output on failure or summary + if live_output: + self._stream_job_output(job_id, final=True) + else: + self._show_log_summary(job_id, success=False) + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=job_id, + message=f"Job {job_id} failed: {status}", + ) + + # Fallback - assume completed + self.console.print(f"[dim yellow]Warning: Could not get status for job {job_id}, assuming success[/dim yellow]") + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=job_id, + message=f"Job {job_id} completed (assumed)", + ) + + except Exception as e: + self.console.print(f"[dim yellow]Warning: Exception checking job {job_id}: {e}[/dim yellow]") + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=job_id, + message=f"Job {job_id} completed (status unavailable)", + ) + + def collect_results(self, deployment_id: str) -> Dict[str, Any]: + """Collect performance results from SLURM output files. + + NOTE: Current implementation works with single-node jobs where perf.csv + is written to shared storage. For multi-node jobs with per-node metrics, + this would need enhancement to: + 1. Read all node output files (madengine-*_jobid_noderank.out) + 2. Parse per-node metrics from each file + 3. Aggregate using _aggregate_node_metrics() (similar to kubernetes.py) + 4. Write aggregated result to perf.csv + + Args: + deployment_id: SLURM job ID + """ + # Get session_start_row from config (passed from orchestrator) + session_start_row = self.config.additional_context.get("session_start_row") + + results = { + "job_id": deployment_id, + "nodes": self.nodes, + "gpus_per_node": self.gpus_per_node, + "perf_files": [], + "logs": [], + "successful_runs": [], + "failed_runs": [], + "session_start_row": session_start_row, # Track for downstream filtering + } + + try: + # Find output files + output_pattern = f"madengine-*_{deployment_id}_*.out" + output_files = list(self.output_dir.glob(output_pattern)) + + results["logs"] = [str(f) for f in output_files] + + # Find performance CSV files + # Strategy 1: Check results_dir if configured + if self.slurm_config.get("results_dir"): + results_dir = Path(self.slurm_config["results_dir"]) + perf_pattern = f"perf_{deployment_id}_*.csv" + perf_files = list(results_dir.glob(perf_pattern)) + results["perf_files"] = [str(f) for f in perf_files] + + # Strategy 2: Check shared workspace (NFS) for perf.csv + # When using shared storage, perf.csv is written directly to workspace + if not results["perf_files"]: + workspace_perf = Path("perf.csv") + if workspace_perf.exists(): + results["perf_files"] = [str(workspace_perf)] + self.console.print("[dim]Note: Using perf.csv from shared workspace[/dim]") + + # Parse perf.csv to populate successful_runs and failed_runs + # Filter based on session_start_row passed as parameter (no external files!) + if results["perf_files"]: + perf_file = Path(results["perf_files"][0]) + try: + import csv + + with open(perf_file, 'r') as f: + reader = csv.DictReader(f) + rows = list(reader) + + # Filter to only include rows from current session if session_start_row provided + if session_start_row is not None and session_start_row < len(rows): + rows = rows[session_start_row:] + self.console.print(f"[cyan]📊 Filtered to current session: {len(rows)} runs (from row {session_start_row} of {len(rows) + session_start_row} total)[/cyan]") + elif session_start_row is not None: + # Session start equals or exceeds current rows - no new runs yet + self.console.print(f"[yellow]⚠️ No new runs in this session (session started at row {session_start_row}, CSV has {len(rows)} rows)[/yellow]") + rows = [] + else: + # No session info provided - show all rows (for backward compatibility) + self.console.print(f"[dim]Showing all {len(rows)} runs from perf.csv (no session filtering)[/dim]") + + for row in rows: + run_data = { + "model": row.get("model", ""), + "status": row.get("status", ""), + "performance": row.get("performance", ""), + "metric": row.get("metric", ""), + "duration": row.get("test_duration", ""), + "gpu_arch": row.get("gpu_architecture", ""), + "deployment": row.get("deployment_type", ""), + "machine": row.get("machine_name", ""), + } + + if row.get("status") == "SUCCESS": + results["successful_runs"].append(run_data) + else: + results["failed_runs"].append(run_data) + except Exception as parse_error: + import traceback + self.console.print(f"[red]ERROR parsing perf.csv: {parse_error}[/red]") + self.console.print(f"[dim]{traceback.format_exc()}[/dim]") + + self.console.print( + f"[green]✓ Collected results: {len(results['perf_files'])} perf files, " + f"{len(results['logs'])} log files[/green]" + ) + + except Exception as e: + self.console.print(f"[yellow]⚠ Results collection incomplete: {e}[/yellow]") + + return results + + def cleanup(self, deployment_id: str) -> bool: + """Cancel SLURM job if still running (locally).""" + try: + subprocess.run( + ["scancel", deployment_id], capture_output=True, timeout=10 + ) + self.console.print(f"[yellow]Cancelled SLURM job: {deployment_id}[/yellow]") + return True + + except Exception as e: + self.console.print(f"[yellow]⚠ Cleanup warning: {e}[/yellow]") + return False + diff --git a/src/madengine/deployment/slurm_node_selector.py b/src/madengine/deployment/slurm_node_selector.py new file mode 100644 index 00000000..b52f53d7 --- /dev/null +++ b/src/madengine/deployment/slurm_node_selector.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 +""" +SLURM Node Selector with GPU Cleanup + +Helps SLURM select clean GPU nodes by checking for stale processes before +job submission. Prevents "out of memory" errors in multi-node vLLM/Ray jobs. + +Uses srun (not SSH) to check and clean nodes - works from SLURM login node. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import subprocess +import time +from dataclasses import dataclass +from enum import Enum +from typing import List, Optional, Tuple + +from rich.console import Console +from rich.table import Table + + +class NodeHealth(Enum): + """Health status of a compute node.""" + CLEAN = "clean" # No stale processes, ready to use + DIRTY = "dirty" # Has stale Ray/vLLM processes + UNREACHABLE = "unreachable" # Cannot connect to node + UNKNOWN = "unknown" # Status check failed + + +@dataclass +class NodeStatus: + """Status of a compute node's GPUs.""" + node: str + health: NodeHealth + gpu_memory_used_gb: float + gpu_memory_total_gb: float + process_count: int + error_message: Optional[str] = None + + @property + def memory_free_gb(self) -> float: + """Calculate free GPU memory.""" + return self.gpu_memory_total_gb - self.gpu_memory_used_gb + + @property + def memory_usage_percent(self) -> float: + """Calculate memory usage percentage.""" + if self.gpu_memory_total_gb == 0: + return 0.0 + return (self.gpu_memory_used_gb / self.gpu_memory_total_gb) * 100 + + +class SlurmNodeSelector: + """ + Selects clean GPU nodes for SLURM job allocation. + + Checks candidate nodes for stale Ray/vLLM processes that would cause + OOM errors. Can automatically clean dirty nodes or recommend exclusion. + """ + + # Memory threshold: nodes with >50GB used are considered dirty + MEMORY_THRESHOLD_GB = 50.0 + + # Process patterns that indicate stale processes + STALE_PATTERNS = ["ray::", "RayWorkerWrapper", "raylet", "vllm"] + + def __init__( + self, + console: Optional[Console] = None, + auto_cleanup: bool = False, + verbose: bool = False, + timeout: int = 30, + ): + """ + Initialize node selector. + + Args: + console: Rich console for output + auto_cleanup: Automatically clean dirty nodes + verbose: Enable verbose logging + timeout: Timeout for srun commands (seconds) + """ + self.console = console or Console() + self.auto_cleanup = auto_cleanup + self.verbose = verbose + self.timeout = timeout + + def get_candidate_nodes( + self, + partition: str, + count: int, + exclude: Optional[str] = None, + constraint: Optional[str] = None, + ) -> Optional[List[str]]: + """ + Query SLURM for candidate nodes in partition. + + Args: + partition: SLURM partition name + count: Number of nodes needed + exclude: Comma-separated nodes to exclude + constraint: SLURM constraint filter + + Returns: + List of candidate node names (2x count for redundancy) + """ + cmd = [ + "sinfo", + "-p", partition, + "-N", # Node-oriented format + "-h", # No header + "-o", "%N", # Node name only + "-t", "idle,alloc,mix", # Available states + ] + + if constraint: + cmd.extend(["-C", constraint]) + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode != 0: + if self.verbose: + self.console.print( + f"[yellow]⚠ sinfo failed: {result.stderr}[/yellow]" + ) + return None + + # Parse nodes + all_nodes = set() + for line in result.stdout.strip().split('\n'): + line = line.strip() + if line: + # Handle node ranges like "node[01-04]" + all_nodes.add(line) + + # Remove excluded nodes + if exclude: + excluded = set(exclude.split(',')) + all_nodes -= excluded + + # Return 2x count for redundancy (check more nodes than needed) + candidates = sorted(list(all_nodes))[:(count * 2)] + + return candidates + + except subprocess.TimeoutExpired: + self.console.print("[yellow]⚠ sinfo timed out[/yellow]") + return None + except Exception as e: + if self.verbose: + self.console.print(f"[yellow]⚠ Query failed: {e}[/yellow]") + return None + + def check_node_health(self, node: str) -> NodeStatus: + """ + Check GPU health on a node using srun. + + Uses srun to execute GPU check on the node without SSH. + Checks for stale Ray/vLLM processes and GPU memory usage. + + Args: + node: Node name to check + + Returns: + NodeStatus with health information + """ + # GPU check script (runs on compute node) + check_script = """ +set -e + +# Try amd-smi first, then rocm-smi +if command -v amd-smi &> /dev/null; then + GPU_TOOL="amd-smi" + GPU_INFO=$(amd-smi list 2>/dev/null || echo "GPU_CHECK_FAILED") +elif command -v rocm-smi &> /dev/null; then + GPU_TOOL="rocm-smi" + GPU_INFO=$(rocm-smi 2>/dev/null || echo "GPU_CHECK_FAILED") +else + echo "NO_GPU_TOOL_FOUND" + exit 1 +fi + +echo "===GPU_INFO===" +echo "$GPU_INFO" +echo "===END_GPU_INFO===" + +# Check for stale processes +echo "===PROCESSES===" +ps aux | grep -E "(ray::|RayWorkerWrapper|raylet|vllm)" | grep -v grep || echo "NO_PROCESSES" +echo "===END_PROCESSES===" +""" + + try: + # Use srun to execute check on specific node + result = subprocess.run( + [ + "srun", + f"--nodelist={node}", + "--ntasks=1", + "--time=00:01:00", + "--overlap", # Allow overlap with running jobs + "--quiet", + "bash", "-c", check_script + ], + capture_output=True, + text=True, + timeout=self.timeout, + ) + + if result.returncode != 0: + return NodeStatus( + node=node, + health=NodeHealth.UNREACHABLE, + gpu_memory_used_gb=0.0, + gpu_memory_total_gb=0.0, + process_count=0, + error_message=f"srun failed: {result.stderr[:100]}", + ) + + # Parse output + output = result.stdout + + # Extract GPU info + gpu_info = self._extract_section(output, "===GPU_INFO===", "===END_GPU_INFO===") + processes = self._extract_section(output, "===PROCESSES===", "===END_PROCESSES===") + + # Parse GPU memory (simplified - in production would parse actual output) + # For MI300X: typically 192GB per GPU + total_memory_gb = 192.0 * 4 # Assume 4 GPUs + + # Count processes + process_count = 0 + if processes and "NO_PROCESSES" not in processes: + process_count = len([l for l in processes.split('\n') if l.strip()]) + + # Estimate memory usage + # Rough heuristic: each process uses ~45GB (observed from Job 2437) + used_memory_gb = process_count * 45.0 + + # Determine health + if process_count == 0: + health = NodeHealth.CLEAN + elif used_memory_gb > self.MEMORY_THRESHOLD_GB: + health = NodeHealth.DIRTY + else: + health = NodeHealth.CLEAN # Minor processes, should be OK + + return NodeStatus( + node=node, + health=health, + gpu_memory_used_gb=used_memory_gb, + gpu_memory_total_gb=total_memory_gb, + process_count=process_count, + ) + + except subprocess.TimeoutExpired: + return NodeStatus( + node=node, + health=NodeHealth.UNREACHABLE, + gpu_memory_used_gb=0.0, + gpu_memory_total_gb=0.0, + process_count=0, + error_message="Timeout", + ) + except Exception as e: + return NodeStatus( + node=node, + health=NodeHealth.UNKNOWN, + gpu_memory_used_gb=0.0, + gpu_memory_total_gb=0.0, + process_count=0, + error_message=str(e)[:100], + ) + + def cleanup_node(self, node: str) -> bool: + """ + Clean up stale processes on a node using srun. + + Args: + node: Node name to clean + + Returns: + True if cleanup successful + """ + # Cleanup script (consolidated from bash scripts) + cleanup_script = """ +# Kill Ray processes +pkill -9 -f "ray::" 2>/dev/null || true +pkill -9 -f "RayWorkerWrapper" 2>/dev/null || true +pkill -9 -f "raylet" 2>/dev/null || true + +# Kill vLLM processes +pkill -9 -f "vllm" 2>/dev/null || true + +# Kill Ray Python workers +pgrep -f "ray/_private/workers" | xargs -r kill -9 2>/dev/null || true + +# Give processes time to die +sleep 2 + +echo "CLEANUP_OK" +""" + + try: + result = subprocess.run( + [ + "srun", + f"--nodelist={node}", + "--ntasks=1", + "--time=00:01:00", + "--overlap", + "--quiet", + "bash", "-c", cleanup_script + ], + capture_output=True, + text=True, + timeout=self.timeout, + ) + + success = result.returncode == 0 and "CLEANUP_OK" in result.stdout + + if success and self.verbose: + self.console.print(f"[green] ✓ Cleaned {node}[/green]") + + return success + + except Exception as e: + if self.verbose: + self.console.print(f"[yellow] ⚠ Cleanup failed for {node}: {e}[/yellow]") + return False + + def select_nodes( + self, + partition: str, + nodes_needed: int, + exclude: Optional[str] = None, + constraint: Optional[str] = None, + ) -> Tuple[List[str], str]: + """ + Select clean nodes for SLURM job. + + This is the main entry point. Checks candidate nodes and returns + a list of clean nodes plus an updated exclude list. + + Args: + partition: SLURM partition name + nodes_needed: Number of nodes required for job + exclude: Current exclude list (comma-separated) + constraint: SLURM constraint filter + + Returns: + Tuple of (clean_nodes, updated_exclude_list) + - clean_nodes: List of clean node names (may be empty) + - updated_exclude_list: Comma-separated list to pass to sbatch + """ + self.console.print("\n[bold cyan]🔍 Checking GPU Node Health[/bold cyan]") + self.console.print( + f"Partition: [cyan]{partition}[/cyan] | " + f"Nodes needed: [cyan]{nodes_needed}[/cyan]\n" + ) + + # Get candidate nodes + candidates = self.get_candidate_nodes(partition, nodes_needed, exclude, constraint) + + if not candidates: + self.console.print( + "[yellow]⚠ Cannot query candidate nodes, skipping preflight check[/yellow]\n" + ) + return [], exclude or "" + + if self.verbose: + self.console.print(f"[dim]Checking {len(candidates)} candidate nodes...[/dim]\n") + + # Check health of each candidate + statuses = [] + for node in candidates: + if self.verbose: + self.console.print(f" Checking {node}...", end="") + + status = self.check_node_health(node) + statuses.append(status) + + if self.verbose: + emoji = { + NodeHealth.CLEAN: "✓", + NodeHealth.DIRTY: "⚠", + NodeHealth.UNREACHABLE: "✗", + NodeHealth.UNKNOWN: "?", + }[status.health] + self.console.print(f" {emoji}") + + # Display summary table + self._display_status_table(statuses) + + # Identify dirty nodes + dirty_nodes = [s for s in statuses if s.health == NodeHealth.DIRTY] + clean_nodes = [s.node for s in statuses if s.health == NodeHealth.CLEAN] + + # Handle dirty nodes + if dirty_nodes: + self.console.print( + f"\n[yellow]⚠ Found {len(dirty_nodes)} dirty node(s) " + f"with stale Ray/vLLM processes[/yellow]" + ) + + if self.auto_cleanup: + self.console.print("[yellow]Running automatic cleanup...[/yellow]\n") + + for status in dirty_nodes: + self.console.print(f" Cleaning {status.node}...") + if self.cleanup_node(status.node): + # Re-check after cleanup + time.sleep(2) + new_status = self.check_node_health(status.node) + if new_status.health == NodeHealth.CLEAN: + clean_nodes.append(new_status.node) + self.console.print(f" [green]✓ {status.node} is now clean[/green]") + else: + self.console.print(f" [red]✗ {status.node} still dirty[/red]") + else: + self.console.print(f" [red]✗ Cleanup failed[/red]") + + # Update dirty nodes list + dirty_nodes = [s for s in statuses + if s.health == NodeHealth.DIRTY and s.node not in clean_nodes] + + # Build updated exclude list + dirty_node_names = [s.node for s in dirty_nodes] + existing_exclude = set(exclude.split(',')) if exclude else set() + existing_exclude.update(dirty_node_names) + updated_exclude = ','.join(sorted(existing_exclude)) + + if dirty_node_names: + self.console.print( + f"\n[yellow]Adding dirty nodes to exclude list: " + f"{', '.join(dirty_node_names)}[/yellow]" + ) + else: + updated_exclude = exclude or "" + + # Final summary + if len(clean_nodes) >= nodes_needed: + self.console.print( + f"\n[bold green]✅ Found {len(clean_nodes)} clean nodes " + f"(need {nodes_needed})[/bold green]\n" + ) + elif len(clean_nodes) > 0: + self.console.print( + f"\n[yellow]⚠ Only {len(clean_nodes)} clean nodes found " + f"(need {nodes_needed})[/yellow]" + ) + self.console.print("[yellow]Job may wait for additional nodes to become available[/yellow]\n") + else: + self.console.print( + "\n[red]❌ No clean nodes available[/red]" + ) + self.console.print( + "[yellow]Recommendation: Wait for nodes to be cleaned or run manual cleanup[/yellow]\n" + ) + + return clean_nodes, updated_exclude + + def _extract_section(self, text: str, start_marker: str, end_marker: str) -> str: + """Extract section between markers.""" + try: + start = text.index(start_marker) + len(start_marker) + end = text.index(end_marker, start) + return text[start:end].strip() + except ValueError: + return "" + + def _display_status_table(self, statuses: List[NodeStatus]): + """Display node status in a table.""" + table = Table(title="Node Health Status") + + table.add_column("Node", style="cyan", no_wrap=True) + table.add_column("Health", style="bold") + table.add_column("Memory Used", justify="right") + table.add_column("Processes", justify="right") + table.add_column("Notes", style="dim") + + for status in statuses: + health_style = { + NodeHealth.CLEAN: "green", + NodeHealth.DIRTY: "yellow", + NodeHealth.UNREACHABLE: "red", + NodeHealth.UNKNOWN: "dim", + }[status.health] + + health_text = { + NodeHealth.CLEAN: "✓ Clean", + NodeHealth.DIRTY: "⚠ Dirty", + NodeHealth.UNREACHABLE: "✗ Unreachable", + NodeHealth.UNKNOWN: "? Unknown", + }[status.health] + + memory_text = f"{status.gpu_memory_used_gb:.0f} GB" if status.gpu_memory_used_gb > 0 else "-" + processes_text = str(status.process_count) if status.process_count > 0 else "-" + notes = status.error_message if status.error_message else "" + + table.add_row( + status.node, + f"[{health_style}]{health_text}[/{health_style}]", + memory_text, + processes_text, + notes, + ) + + self.console.print(table) + self.console.print() + diff --git a/src/madengine/deployment/templates/kubernetes/configmap.yaml.j2 b/src/madengine/deployment/templates/kubernetes/configmap.yaml.j2 new file mode 100644 index 00000000..4267119a --- /dev/null +++ b/src/madengine/deployment/templates/kubernetes/configmap.yaml.j2 @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ configmap_name }} + namespace: {{ namespace }} + labels: + app: madengine + model: {{ model_name }} +data: + build_manifest.json: | +{{ manifest_content | indent(4, first=True) }} + credential.json: | +{{ credential_content | indent(4, first=True) }} + {% if data_json_content %} + data.json: | +{{ data_json_content | indent(4, first=True) }} + {% endif %} + {% if model_scripts_contents %} + # Model scripts directory (all .sh, .py, and .json files from scripts folder) + {% for script_path, script_content in model_scripts_contents.items() %} + {{ script_path | replace("/", "-") }}: | +{{ script_content | indent(4, first=True) }} + {% endfor %} + {% endif %} + {% if data_provider_script_content %} + data_provider.sh: | +{{ data_provider_script_content | indent(4, first=True) }} + {% endif %} + {% if common_script_contents %} + # Common scripts (pre_scripts, post_scripts) embedded since madengine not in container + {% for script_path, script_content in common_script_contents.items() %} + {{ script_path | replace("/", "-") }}: | +{{ script_content | indent(4, first=True) }} + {% endfor %} + {% endif %} + diff --git a/src/madengine/deployment/templates/kubernetes/job.yaml.j2 b/src/madengine/deployment/templates/kubernetes/job.yaml.j2 new file mode 100644 index 00000000..bd3e27e1 --- /dev/null +++ b/src/madengine/deployment/templates/kubernetes/job.yaml.j2 @@ -0,0 +1,624 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ job_name }} + namespace: {{ namespace }} + labels: + app: madengine + model: {{ model_name }} + madengine-job: "true" +spec: + completions: {{ completions }} + parallelism: {{ parallelism }} + {% if completion_mode %} + completionMode: {{ completion_mode }} + {% endif %} + backoffLimit: {{ backoff_limit }} + template: + metadata: + labels: + app: madengine + job-name: {{ job_name }} + model: {{ model_name }} + spec: + {% if subdomain %} + subdomain: {{ subdomain }} # Required for DNS in headless service + {% endif %} + restartPolicy: Never + terminationGracePeriodSeconds: 60 + {% if subdomain %} + subdomain: {{ subdomain }} + {% endif %} + {% if node_selector %} + nodeSelector: + {% for key, value in node_selector.items() %} + {{ key }}: "{{ value }}" + {% endfor %} + {% endif %} + {% if host_ipc %} + hostIPC: true + {% endif %} + + # Image pull secrets for private registries + imagePullSecrets: + - name: dockerhub-creds + + # Init container extracts madengine scripts from package + initContainers: + - name: extract-scripts + image: {{ image }} + command: ["/bin/bash", "-c"] + args: + - | + set -e + echo "=== Extracting madengine scripts ===" + + # Extract common scripts from ConfigMap (since madengine not installed in container) + {% if common_script_contents %} + echo "Extracting common scripts from ConfigMap..." + {% for script_path, script_content in common_script_contents.items() %} + mkdir -p /workspace/{{ script_path | dirname }} + cp /config/{{ script_path | replace("/", "-") }} /workspace/{{ script_path }} + chmod +x /workspace/{{ script_path }} 2>/dev/null || true + {% endfor %} + echo "✓ Extracted {{ common_script_contents | length }} common script(s)" + {% else %} + echo "No common scripts to extract" + {% endif %} + + # Copy K8s data provider script from ConfigMap if it exists + if [ -f /config/data_provider.sh ]; then + echo "Copying data_provider.sh to /workspace/data_provider.sh" + cp /config/data_provider.sh /workspace/data_provider.sh + chmod +x /workspace/data_provider.sh + echo "✓ Copied K8s data provider script" + fi + + # Extract model scripts directory (all .sh, .py, and .json files) + {% if model_scripts_contents %} + echo "Extracting model scripts directory..." + {% for script_path, _ in model_scripts_contents.items() %} + {% set config_key = script_path | replace("/", "-") %} + {% set script_dir = script_path | dirname %} + mkdir -p /workspace/{{ script_dir }} + if [ -f /config/{{ config_key }} ]; then + cp /config/{{ config_key }} /workspace/{{ script_path }} + # Only chmod executable files (.sh, .py), not config files (.json) + {% if script_path.endswith('.sh') or script_path.endswith('.py') %} + chmod +x /workspace/{{ script_path }} + {% endif %} + echo " ✓ {{ script_path }}" + fi + {% endfor %} + echo "✓ Extracted {{ model_scripts_contents | length }} model script(s)" + {% else %} + echo "Warning: No model scripts configured" + {% endif %} + + echo "✓ Script extraction complete" + volumeMounts: + - name: workspace + mountPath: /workspace + - name: config + mountPath: /config + readOnly: true + + # Main container runs benchmark + containers: + - name: {{ job_name }} + image: {{ image }} + imagePullPolicy: {{ image_pull_policy }} + workingDir: /workspace + command: ["/bin/bash", "-c"] + args: + - | + set -e + echo "===================================================================" + echo "madengine Kubernetes Benchmark Job" + echo "Model: {{ model_name }}" + echo "Pod: $HOSTNAME" + {% if launcher_type %} + echo "Launcher: {{ launcher_type }}" + {% endif %} + echo "===================================================================" + + # Copy config files from ConfigMap to workspace + cp /config/build_manifest.json /workspace/ + cp /config/credential.json /workspace/ 2>/dev/null || true + cp /config/data.json /workspace/ 2>/dev/null || true + + # GPU Information + if command -v rocm-smi &> /dev/null; then + echo "" + echo "=== AMD GPU Information ===" + rocm-smi || true + fi + + # Set GPU visibility for ROCm/CUDA + # CRITICAL: Ray (vLLM, SGLang) requires ONLY ONE visibility variable + # - AMD GPUs: Use ONLY HIP_VISIBLE_DEVICES + # - NVIDIA GPUs: Use ONLY CUDA_VISIBLE_DEVICES + # Setting both HIP_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES simultaneously + # causes Ray error: "Inconsistent values found" + {% if launcher_type == "vllm" or launcher_type == "sglang" %} + # Ray-based launchers: Detect GPU vendor and set appropriate variable + if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then + # AMD GPU detected - use HIP_VISIBLE_DEVICES ONLY + # CRITICAL: Unset RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES which is set by rocm/vllm image + # This variable tells Ray to ignore HIP_VISIBLE_DEVICES, causing conflicts + unset RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES + export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-{{ gpu_visibility }}} + unset ROCR_VISIBLE_DEVICES # Unset to avoid Ray conflicts + unset CUDA_VISIBLE_DEVICES # Unset to avoid "Inconsistent values" error + echo "🔧 GPU Config (AMD): HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES" + else + # NVIDIA GPU - use CUDA_VISIBLE_DEVICES ONLY + export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-{{ gpu_visibility }}} + unset HIP_VISIBLE_DEVICES # Unset to avoid Ray conflicts + unset ROCR_VISIBLE_DEVICES + echo "🔧 GPU Config (NVIDIA): CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" + fi + {% else %} + # Non-Ray launchers: Set both HIP and ROCR for broader compatibility + export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-{{ gpu_visibility }}} + export ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES:-{{ gpu_visibility }}} + export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-{{ gpu_visibility }}} + {% endif %} + export MAD_SYSTEM_GPU_ARCHITECTURE={{ gpu_architecture }} + + # K8s environment + export MAD_K8S_POD_NAME=$HOSTNAME + export MAD_K8S_NAMESPACE={{ namespace }} + export MAD_K8S_JOB=true + export MAD_DEPLOYMENT_TYPE=kubernetes + + {% if launcher_type == "torchrun" or launcher_type == "deepspeed" or launcher_type == "megatron" %} + # {{ launcher_type }} distributed environment (auto-configured from K8s) + {% if nnodes > 1 %} + # Multi-node {{ launcher_type }} (Indexed Job) + export JOB_COMPLETION_INDEX=${JOB_COMPLETION_INDEX:-0} + export POD_INDEX=$JOB_COMPLETION_INDEX + {% else %} + # Single-node {{ launcher_type }} + export JOB_COMPLETION_INDEX=0 + {% endif %} + {% endif %} + + # Data provider environment variables + {% if data_config %} + echo "" + echo "=== Setting up data environment ===" + export MAD_DATANAME="{{ data_config.data_name }}" + {% for key, value in data_config.env_vars.items() %} + export {{ key }}="{{ value }}" + {% endfor %} + echo "✓ Data environment configured for: {{ data_config.data_name }}" + {% endif %} + + # Tools configuration environment variables + {% if tools_config %} + echo "" + echo "=== Applying tools configuration ===" + {% for tool in tools_config %} + echo "Tool: {{ tool.name }}" + {% if tool.env_vars %} + {% for key, value in tool.env_vars.items() %} + export {{ key }}="{{ value }}" + {% endfor %} + {% endif %} + {% endfor %} + echo "✓ Tools configuration applied" + {% endif %} + + {% if launcher_command %} + # Launcher-based execution with tools + echo "" + echo "=== Starting benchmark with {{ launcher_type }} ===" + + cd /workspace + + # Download data if data provider is configured + {% if data_provider_script and data_config %} + echo "" + echo "=== Data Provider: {{ data_config.provider_type }} ===" + echo "Data name: {{ data_config.data_name }}" + echo "Source: {{ data_config.source_url }}" + echo "Target: {{ data_config.datahome }}" + + # Use K8s data provider script (loaded from ConfigMap) + if [ -f /workspace/data_provider.sh ]; then + bash /workspace/data_provider.sh \ + "{{ data_config.data_name }}" \ + "{{ data_config.source_url }}" \ + "{{ data_config.datahome }}" + + # Source metrics if available + if [ -f /tmp/mad_metrics.env ]; then + source /tmp/mad_metrics.env + echo "✓ Data metrics: Duration=${MAD_DATA_DOWNLOAD_DURATION}s, Size=${MAD_DATA_SIZE}" + fi + else + echo "Error: Data provider script not found at /workspace/data_provider.sh" + exit 1 + fi + {% endif %} + + # Run pre-scripts (like local execution) + {% if pre_scripts %} + echo "" + echo "=== Running pre-scripts ===" + {% for script in pre_scripts %} + # Execute: {{ script.path }} + if [ -f "{{ script.path }}" ]; then + echo "Executing: {{ script.path }} {% if script.args %}{{ script.args }}{% endif %}" + bash {{ script.path }} {% if script.args %}{{ script.args }}{% endif %} || echo "Warning: {{ script.path }} failed with exit code $?" + else + echo "Warning: Script not found: {{ script.path }}" + fi + {% endfor %} + echo "✓ Pre-scripts completed" + {% else %} + echo "No pre-scripts configured" + {% endif %} + + # Clear MIOpen cache to prevent "Duplicate ID" warnings + echo "" + echo "=== Clearing MIOpen cache ===" + if [ -d "${MIOPEN_USER_DB_PATH:-/tmp/.miopen}" ]; then + rm -rf "${MIOPEN_USER_DB_PATH:-/tmp/.miopen}"/* + echo "✓ Cleared MIOpen cache directory" + fi + mkdir -p "${MIOPEN_USER_DB_PATH:-/tmp/.miopen}" + + # Create wrapper script for launcher + echo "" + echo "=== Running model benchmark with launcher ===" + cat > /tmp/run_launcher.sh << 'LAUNCHER_EOF' + #!/bin/bash + {{ launcher_command | indent(12, first=False) }} + LAUNCHER_EOF + chmod +x /tmp/run_launcher.sh + + {% if tools_config and tools_config|length > 0 %} + # Run with profiling tools + {% for tool in tools_config %} + {% if tool.cmd %} + echo "Using profiling tool: {{ tool.name }}" + {% endif %} + {% endfor %} + {% endif %} + + # Execute launcher with tool chain + MODEL_START_TIME=$(date +%s.%N) + {% if launcher_tool_chain and launcher_tool_chain != "bash /tmp/run_launcher.sh" %} + {{ launcher_tool_chain }} + {% else %} + bash /tmp/run_launcher.sh + {% endif %} + MODEL_EXIT_CODE=$? + MODEL_END_TIME=$(date +%s.%N) + MODEL_DURATION=$(awk "BEGIN {printf \"%.6f\", $MODEL_END_TIME - $MODEL_START_TIME}") + echo "test_duration: ${MODEL_DURATION}s" + + # Run post-scripts (like local execution) + {% if post_scripts %} + echo "" + echo "=== Running post-scripts ===" + {% for script in post_scripts %} + # Execute: {{ script.path }} + if [ -f "{{ script.path }}" ]; then + echo "Executing: {{ script.path }} {% if script.args %}{{ script.args }}{% endif %}" + bash {{ script.path }} {% if script.args %}{{ script.args }}{% endif %} || echo "Warning: {{ script.path }} failed with exit code $?" + else + echo "Warning: Script not found: {{ script.path }}" + fi + {% endfor %} + echo "✓ Post-scripts completed" + {% else %} + echo "No post-scripts configured" + {% endif %} + + # Copy artifacts to PVC shared storage (always enabled) + echo "" + echo "=== Copying artifacts to PVC storage ===" + mkdir -p /results/${HOSTNAME} + + # Copy performance results + if [ -f "perf.csv" ]; then + cp perf.csv /results/${HOSTNAME}/perf.csv + echo "✓ Copied perf.csv" + fi + + # Copy environment details + if ls *_env.csv 1> /dev/null 2>&1; then + cp *_env.csv /results/${HOSTNAME}/ + echo "✓ Copied environment CSV files" + fi + + # Copy profiling outputs (rocprof, rocprofv3) + if ls results* 1> /dev/null 2>&1; then + cp -r results* /results/${HOSTNAME}/ 2>/dev/null || true + echo "✓ Copied profiling results" + fi + if ls *.db 1> /dev/null 2>&1; then + cp *.db /results/${HOSTNAME}/ 2>/dev/null || true + echo "✓ Copied profiling database files" + fi + # Copy rocprofv3 UUID directories + for dir in */; do + if [ -d "$dir" ] && [ -f "${dir}"*_results.db ] 2>/dev/null; then + cp -r "$dir" /results/${HOSTNAME}/ + echo "✓ Copied rocprofv3 directory: $dir" + fi + done + + # Copy tool-specific outputs + if ls gpu_info_*.csv 1> /dev/null 2>&1; then + cp gpu_info_*.csv /results/${HOSTNAME}/ + echo "✓ Copied GPU profiler outputs" + fi + if ls *_trace_output.csv 1> /dev/null 2>&1; then + cp *_trace_output.csv /results/${HOSTNAME}/ + echo "✓ Copied library trace outputs" + fi + if ls prof.csv 1> /dev/null 2>&1; then + cp prof.csv /results/${HOSTNAME}/ + echo "✓ Copied prof.csv" + fi + + echo "✓ All artifacts copied to PVC: /results/${HOSTNAME}/" + + echo "=== Benchmark job completed with exit code $MODEL_EXIT_CODE ===" + exit $MODEL_EXIT_CODE + {% else %} + # Direct script execution + cd /workspace + + # Download data if data provider is configured + {% if data_provider_script and data_config %} + echo "" + echo "=== Data Provider: {{ data_config.provider_type }} ===" + echo "Data name: {{ data_config.data_name }}" + echo "Source: {{ data_config.source_url }}" + echo "Target: {{ data_config.datahome }}" + + # Use K8s data provider script (loaded from ConfigMap) + if [ -f /workspace/data_provider.sh ]; then + bash /workspace/data_provider.sh \ + "{{ data_config.data_name }}" \ + "{{ data_config.source_url }}" \ + "{{ data_config.datahome }}" + + # Source metrics if available + if [ -f /tmp/mad_metrics.env ]; then + source /tmp/mad_metrics.env + echo "✓ Data metrics: Duration=${MAD_DATA_DOWNLOAD_DURATION}s, Size=${MAD_DATA_SIZE}" + fi + else + echo "Error: Data provider script not found at /workspace/data_provider.sh" + exit 1 + fi + {% endif %} + + # Run pre-scripts (like local execution) + {% if pre_scripts %} + echo "" + echo "=== Running pre-scripts ===" + {% for script in pre_scripts %} + # Execute: {{ script.path }} + if [ -f "{{ script.path }}" ]; then + echo "Executing: {{ script.path }} {% if script.args %}{{ script.args }}{% endif %}" + bash {{ script.path }} {% if script.args %}{{ script.args }}{% endif %} || echo "Warning: {{ script.path }} failed with exit code $?" + else + echo "Warning: Script not found: {{ script.path }}" + fi + {% endfor %} + echo "✓ Pre-scripts completed" + {% else %} + echo "No pre-scripts configured" + {% endif %} + + # Clear MIOpen cache to prevent "Duplicate ID" warnings + echo "" + echo "=== Clearing MIOpen cache ===" + if [ -d "${MIOPEN_USER_DB_PATH:-/tmp/.miopen}" ]; then + rm -rf "${MIOPEN_USER_DB_PATH:-/tmp/.miopen}"/* + echo "✓ Cleared MIOpen cache directory" + fi + mkdir -p "${MIOPEN_USER_DB_PATH:-/tmp/.miopen}" + + # Run main model script + echo "" + echo "=== Running model benchmark script ===" + if [ -f "{{ model_script }}" ]; then + {% if tools_config and tools_config|length > 0 %} + # Run with profiling tools + {% for tool in tools_config %} + {% if tool.cmd %} + echo "Using profiling tool: {{ tool.name }}" + {% endif %} + {% endfor %} + {% endif %} + + # Execute script with tool chain + MODEL_START_TIME=$(date +%s.%N) + {% if direct_script_tool_chain and direct_script_tool_chain != "bash " ~ model_script %} + {{ direct_script_tool_chain }} + {% else %} + bash {{ model_script }} + {% endif %} + MODEL_EXIT_CODE=$? + MODEL_END_TIME=$(date +%s.%N) + MODEL_DURATION=$(awk "BEGIN {printf \"%.6f\", $MODEL_END_TIME - $MODEL_START_TIME}") + echo "test_duration: ${MODEL_DURATION}s" + else + echo "ERROR: Script not found: {{ model_script }}" + echo "Available files in /workspace:" + ls -la /workspace/ + echo "" + echo "Available files in /workspace/scripts:" + ls -la /workspace/scripts/ 2>/dev/null || echo "scripts/ directory not found" + exit 1 + fi + + # Run post-scripts (like local execution) + {% if post_scripts %} + echo "" + echo "=== Running post-scripts ===" + {% for script in post_scripts %} + # Execute: {{ script.path }} + if [ -f "{{ script.path }}" ]; then + echo "Executing: {{ script.path }} {% if script.args %}{{ script.args }}{% endif %}" + bash {{ script.path }} {% if script.args %}{{ script.args }}{% endif %} || echo "Warning: {{ script.path }} failed with exit code $?" + else + echo "Warning: Script not found: {{ script.path }}" + fi + {% endfor %} + echo "✓ Post-scripts completed" + {% else %} + echo "No post-scripts configured" + {% endif %} + + # Copy artifacts to PVC shared storage (always enabled) + echo "" + echo "=== Copying artifacts to PVC storage ===" + mkdir -p /results/${HOSTNAME} + + # Copy performance results + if [ -f "perf.csv" ]; then + cp perf.csv /results/${HOSTNAME}/perf.csv + echo "✓ Copied perf.csv" + fi + + # Copy environment details + if ls *_env.csv 1> /dev/null 2>&1; then + cp *_env.csv /results/${HOSTNAME}/ + echo "✓ Copied environment CSV files" + fi + + # Copy profiling outputs (rocprof, rocprofv3) + if ls results* 1> /dev/null 2>&1; then + cp -r results* /results/${HOSTNAME}/ 2>/dev/null || true + echo "✓ Copied profiling results" + fi + if ls *.db 1> /dev/null 2>&1; then + cp *.db /results/${HOSTNAME}/ 2>/dev/null || true + echo "✓ Copied profiling database files" + fi + # Copy rocprofv3 UUID directories + for dir in */; do + if [ -d "$dir" ] && [ -f "${dir}"*_results.db ] 2>/dev/null; then + cp -r "$dir" /results/${HOSTNAME}/ + echo "✓ Copied rocprofv3 directory: $dir" + fi + done + + # Copy tool-specific outputs + if ls -d *_output 1> /dev/null 2>&1; then + cp -r *_output /results/${HOSTNAME}/ 2>/dev/null || true + echo "✓ Copied tool output directories" + fi + + # Copy GPU profiler outputs + if ls gpu_info_*.csv 1> /dev/null 2>&1; then + cp gpu_info_*.csv /results/${HOSTNAME}/ + echo "✓ Copied GPU profiler outputs" + fi + + # Copy library trace outputs + if ls *_trace_output.csv 1> /dev/null 2>&1; then + cp *_trace_output.csv /results/${HOSTNAME}/ + echo "✓ Copied library trace outputs" + fi + if [ -f "library_trace.csv" ]; then + cp library_trace.csv /results/${HOSTNAME}/library_trace.csv + echo "✓ Copied library_trace.csv" + fi + + # Copy tracing outputs + if ls trace.* 1> /dev/null 2>&1; then + cp trace.* /results/${HOSTNAME}/ 2>/dev/null || true + echo "✓ Copied tracing files" + fi + + echo "✓ All artifacts copied to PVC: /results/${HOSTNAME}/" + + echo "" + echo "=== Benchmark job completed with exit code ${MODEL_EXIT_CODE:-0} ===" + exit ${MODEL_EXIT_CODE:-0} + {% endif %} + + resources: + requests: + {{ gpu_resource_name }}: "{{ gpu_count }}" + memory: "{{ memory }}" + cpu: "{{ cpu }}" + limits: + {{ gpu_resource_name }}: "{{ gpu_count }}" + memory: "{{ memory_limit }}" + cpu: "{{ cpu_limit }}" + + env: + {% for key, value in env_vars.items() %} + - name: {{ key }} + value: "{{ value }}" + {% endfor %} + + volumeMounts: + - name: workspace + mountPath: /workspace + - name: config + mountPath: /config + readOnly: true + - name: shm + mountPath: /dev/shm + - name: results + mountPath: /results + {% if data_pvc %} + - name: data + mountPath: /data + readOnly: false # Must be writable for data provider downloads + {% endif %} + + securityContext: + capabilities: + add: + - SYS_PTRACE + seccompProfile: + type: Unconfined + + {% if tolerations %} + tolerations: + {% for toleration in tolerations %} + - key: {{ toleration.key }} + {% if toleration.operator %} + operator: {{ toleration.operator }} + {% endif %} + {% if toleration.value %} + value: "{{ toleration.value }}" + {% endif %} + {% if toleration.effect %} + effect: {{ toleration.effect }} + {% endif %} + {% endfor %} + {% endif %} + + volumes: + - name: workspace + emptyDir: {} + - name: config + configMap: + name: {{ configmap_name }} + - name: shm + emptyDir: + medium: Memory + sizeLimit: 16Gi # Increased for Ray/vLLM (should be >30% of RAM, recommended 16Gi+) + - name: results + persistentVolumeClaim: + claimName: {{ results_pvc }} + {% if data_pvc %} + - name: data + persistentVolumeClaim: + claimName: {{ data_pvc }} + {% endif %} + diff --git a/src/madengine/deployment/templates/kubernetes/pvc-data.yaml.j2 b/src/madengine/deployment/templates/kubernetes/pvc-data.yaml.j2 new file mode 100644 index 00000000..68a1934b --- /dev/null +++ b/src/madengine/deployment/templates/kubernetes/pvc-data.yaml.j2 @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ pvc_name }} + namespace: {{ namespace }} + labels: + app: madengine + purpose: shared-data + madengine-pvc: "true" + annotations: + description: "Shared data storage for madengine (auto-created)" +spec: + accessModes: + # RWO for single-node (broader storage class support) + # RWX for multi-node (requires NFS or similar) + - {{ access_mode | default("ReadWriteOnce") }} + resources: + requests: + storage: {{ storage_size | default("100Gi") }} + {% if storage_class %} + storageClassName: {{ storage_class }} + {% endif %} + diff --git a/src/madengine/deployment/templates/kubernetes/pvc.yaml.j2 b/src/madengine/deployment/templates/kubernetes/pvc.yaml.j2 new file mode 100644 index 00000000..2852a355 --- /dev/null +++ b/src/madengine/deployment/templates/kubernetes/pvc.yaml.j2 @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ pvc_name }} + namespace: {{ namespace }} + labels: + app: madengine + madengine-pvc: "true" +spec: + accessModes: + - ReadWriteOnce # Single-node access is sufficient for per-job results collection + resources: + requests: + storage: {{ storage_size | default("10Gi") }} + {% if storage_class %} + storageClassName: {{ storage_class }} + {% endif %} + diff --git a/src/madengine/deployment/templates/kubernetes/service.yaml.j2 b/src/madengine/deployment/templates/kubernetes/service.yaml.j2 new file mode 100644 index 00000000..51ba9720 --- /dev/null +++ b/src/madengine/deployment/templates/kubernetes/service.yaml.j2 @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ service_name }} + namespace: {{ namespace }} + labels: + app: madengine + model: {{ model_name }} +spec: + clusterIP: None # Headless service for torchrun coordination + selector: + job-name: {{ job_name }} + ports: + {% for port in ports %} + - name: port-{{ port }} + port: {{ port }} + targetPort: {{ port }} + protocol: TCP + {% endfor %} + diff --git a/src/madengine/deployment/templates/slurm/job.sh.j2 b/src/madengine/deployment/templates/slurm/job.sh.j2 new file mode 100644 index 00000000..c7fe35e8 --- /dev/null +++ b/src/madengine/deployment/templates/slurm/job.sh.j2 @@ -0,0 +1,804 @@ +#!/bin/bash +#SBATCH --job-name=madengine-{{ model_name }} +#SBATCH --output={{ output_dir }}/madengine-{{ model_name }}_%j_%t.out +#SBATCH --error={{ output_dir }}/madengine-{{ model_name }}_%j_%t.err +#SBATCH --partition={{ partition }} +#SBATCH --nodes={{ nodes }} +#SBATCH --ntasks={{ nodes }} +#SBATCH --ntasks-per-node=1 +#SBATCH --gpus-per-node={{ gpus_per_node }} +#SBATCH --time={{ time_limit }} +{% if exclude %} +#SBATCH --exclude={{ exclude }} +{% endif %} +{% if constraint %} +#SBATCH --constraint={{ constraint }} +{% endif %} +{% if exclusive %} +#SBATCH --exclusive +{% endif %} +{% if qos %} +#SBATCH --qos={{ qos }} +{% endif %} +{% if account %} +#SBATCH --account={{ account }} +{% endif %} + +# ============================================================================= +# SLURM Job Configuration Generated by madengine +# Model: {{ model_name }} +# Deployment: {{ nodes }} nodes x {{ gpus_per_node }} GPUs +# ============================================================================= + +# Load required modules +{% for module in modules %} +module load {{ module }} +{% endfor %} + +# ============================================================================= +# Environment Setup (Standard ML Environment Variables) +# ============================================================================= + +# Distributed execution environment (auto-configured from SLURM) +export MASTER_ADDR=$(scontrol show hostname $SLURM_NODELIST | head -n 1) +export MASTER_PORT={{ master_port | default(29500) }} +export WORLD_SIZE=$SLURM_NTASKS +# NOTE: RANK is set per-task inside srun context (not here in main script) +# export RANK=$SLURM_PROCID # <-- DO NOT SET HERE: will be 0 for all tasks +export LOCAL_RANK=$SLURM_LOCALID +export NNODES={{ nodes }} +export GPUS_PER_NODE={{ gpus_per_node }} + +# GPU visibility (ROCm/CUDA) +# IMPORTANT: Ray (vLLM, SGLang) requires HIP_VISIBLE_DEVICES for AMD GPUs +# Do NOT set both HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES together +GPU_LIST=$(seq -s, 0 $(({{ gpus_per_node }}-1))) +{% if launcher_type == "vllm" or launcher_type == "sglang" %} +# Ray-based launchers: Detect GPU vendor and set appropriate variable +# CRITICAL: Do NOT set both HIP_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES together +if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then + # AMD GPU detected - use HIP_VISIBLE_DEVICES ONLY + # CRITICAL: Unset RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES which is set by rocm/vllm image + # This variable tells Ray to ignore HIP_VISIBLE_DEVICES, causing conflicts + unset RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES + export HIP_VISIBLE_DEVICES=$GPU_LIST + unset ROCR_VISIBLE_DEVICES # Unset to avoid Ray conflicts + unset CUDA_VISIBLE_DEVICES # Unset to avoid "Inconsistent values" error + echo "🔧 GPU Config (AMD): HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES" +else + # NVIDIA GPU - use CUDA_VISIBLE_DEVICES ONLY + export CUDA_VISIBLE_DEVICES=$GPU_LIST + unset HIP_VISIBLE_DEVICES # Unset to avoid Ray conflicts + unset ROCR_VISIBLE_DEVICES + echo "🔧 GPU Config (NVIDIA): CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" +fi +{% else %} +# Non-Ray launchers: Set both for broader compatibility +export HIP_VISIBLE_DEVICES=$GPU_LIST +export ROCR_VISIBLE_DEVICES=$GPU_LIST +export CUDA_VISIBLE_DEVICES=$GPU_LIST +{% endif %} + +# Network configuration +{% if network_interface %} +export NCCL_SOCKET_IFNAME={{ network_interface }} +export GLOO_SOCKET_IFNAME={{ network_interface }} +{% endif %} + +# Distributed backend configuration +{% if distributed_backend %} +export DISTRIBUTED_BACKEND={{ distributed_backend }} +{% endif %} + +# Application-specific environment variables +{% for key, value in env_vars.items() %} +{% if key == 'MIOPEN_USER_DB_PATH' %} +# MIOPEN_USER_DB_PATH will be set per-process in the model script to avoid conflicts +# export {{ key }}="{{ value }}" # Commented out - set per-process instead +{% else %} +export {{ key }}="{{ value }}" +{% endif %} +{% endfor %} + +# madengine environment +export MAD_DEPLOYMENT_TYPE=slurm +export MAD_SLURM_JOB_ID=$SLURM_JOB_ID +export MAD_NODE_RANK=$SLURM_NODEID +export MAD_TOTAL_NODES={{ nodes }} + +# ============================================================================= +# Workspace Setup +# ============================================================================= + +# Determine workspace strategy based on configuration and node count +{% if shared_workspace %} +# Explicitly configured shared workspace (NFS/Lustre) +WORKSPACE={{ shared_workspace }} +WORKSPACE_TYPE="shared-explicit" +cd $WORKSPACE +{% else %} +{% if nodes > 1 %} +# ============================================================================= +# Multi-node: Per-Node Setup (executed by srun on each task) +# ============================================================================= +# For multi-node jobs, workspace setup must happen INSIDE srun context +# where SLURM_PROCID is properly set for each task. +# We'll create a setup script that srun will execute on each node. + +echo "Multi-node deployment detected ({{ nodes }} nodes)" +echo "Per-node setup will be executed by srun on each task" +echo "Submission directory: {{ manifest_file | dirname }}" + +# Note: Workspace setup happens later in srun context +# Skip to distributed execution configuration +{% else %} +# Single-node: Prefer shared storage (submission dir), with local fallback if needed +# Check if submission directory is on shared filesystem +SUBMIT_DIR={{ manifest_file | dirname }} +if df -T "$SUBMIT_DIR" 2>/dev/null | grep -qE '\bnfs\b|\blustre\b|\bgpfs\b|\bceph\b'; then + # Submission directory is on shared storage - use it directly (best practice) + WORKSPACE=$SUBMIT_DIR + WORKSPACE_TYPE="shared-nfs" + echo "Using shared NFS workspace: $WORKSPACE" +else + # Submission directory is local - use node scratch (rare case) + if [ -n "$SLURM_TMPDIR" ] && [ -d "$SLURM_TMPDIR" ] && [ -w "$SLURM_TMPDIR" ]; then + WORKSPACE=$SLURM_TMPDIR + WORKSPACE_TYPE="local-slurm" + else + WORKSPACE=/tmp/madengine_job_${SLURM_JOB_ID:-$$} + mkdir -p $WORKSPACE + WORKSPACE_TYPE="local-tmp" + fi + echo "Using local node workspace: $WORKSPACE" +fi +{% endif %} +{% endif %} + +{% if nodes > 1 %} +# Multi-node: Workspace setup happens in task script (executed by srun) +{% else %} +# Single-node: Setup workspace now +echo "Workspace type: $WORKSPACE_TYPE" +echo "Working directory: $WORKSPACE" +cd $WORKSPACE + +# File handling based on workspace type +# Single-node: Use shared files if available, copy only if using local workspace +if [ "$WORKSPACE_TYPE" = "shared-nfs" ] || [ "$WORKSPACE_TYPE" = "shared-auto" ] || [ "$WORKSPACE_TYPE" = "shared-explicit" ]; then + # Using shared workspace - reference files directly + echo "Using files from shared storage (no copy needed)" +{% if manifest_file %} + MANIFEST_FILE={{ manifest_file }} +{% endif %} +{% if credential_file %} + CREDENTIAL_FILE={{ manifest_file | dirname }}/{{ credential_file }} +{% endif %} +{% if data_file %} + DATA_FILE={{ manifest_file | dirname }}/{{ data_file }} +{% endif %} +else + # Using local workspace - copy files from shared storage + echo "Copying files to local workspace" + SUBMIT_DIR={{ manifest_file | dirname }} +{% if manifest_file %} + cp {{ manifest_file }} $WORKSPACE/build_manifest.json + MANIFEST_FILE=$WORKSPACE/build_manifest.json +{% endif %} +{% if credential_file %} + if [ -f "$SUBMIT_DIR/{{ credential_file }}" ]; then + cp $SUBMIT_DIR/{{ credential_file }} $WORKSPACE/credential.json + CREDENTIAL_FILE=$WORKSPACE/credential.json + fi +{% endif %} +{% if data_file %} + if [ -f "$SUBMIT_DIR/{{ data_file }}" ]; then + cp $SUBMIT_DIR/{{ data_file }} $WORKSPACE/data.json + DATA_FILE=$WORKSPACE/data.json + fi +{% endif %} +fi +{% endif %} + +{% if nodes == 1 %} +# ============================================================================= +# Single-node: Verify madengine availability +# ============================================================================= + +# Verify madengine availability +# Note: We rely on the submission environment being inherited by compute nodes +echo "" +echo "Verifying madengine availability..." +if command -v madengine >/dev/null 2>&1; then + MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown") + MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown") + + echo " ✓ madengine available" + echo " Version: $MAD_CLI_VERSION" + echo " Path: $MAD_CLI_PATH" + + # Verify it's executable + if madengine --help >/dev/null 2>&1; then + export MAD_CLI_COMMAND="madengine" + else + echo " ❌ ERROR: madengine found but not functional!" + exit 1 + fi +else + echo " ❌ ERROR: madengine not found in PATH" + echo "" + echo " To fix:" + echo " • Activate your virtual environment: source venv/bin/activate" + echo " • Install madengine: pip install -e . (for development)" + echo " • Verify before submission: madengine --version" + echo "" + exit 1 +fi +echo "" + +# ============================================================================= +# Single-node: Create local execution manifest +# ============================================================================= + +{% if manifest_file %} +# Create a local-execution manifest by modifying deployment_config +ORIGINAL_MANIFEST="{{ manifest_file | basename }}" +LOCAL_MANIFEST="build_manifest_local.json" + +echo "Creating local execution manifest from: $ORIGINAL_MANIFEST" + +python3 -c " +import json +manifest_file = '$ORIGINAL_MANIFEST' +output_file = '$LOCAL_MANIFEST' +with open(manifest_file, 'r') as f: + manifest = json.load(f) + +# Keep built_images for Docker execution +# Only modify deployment_config to run on this node (not via SLURM scheduler again) +if 'deployment_config' in manifest: + gpus_per_node = None + if 'slurm' in manifest['deployment_config']: + gpus_per_node = manifest['deployment_config']['slurm'].get('gpus_per_node') + + # Set to 'docker' instead of 'local' to force container execution + manifest['deployment_config']['target'] = 'docker' + + # Remove scheduler configs (but keep built_images!) + manifest['deployment_config'].pop('slurm', None) + manifest['deployment_config'].pop('k8s', None) + manifest['deployment_config'].pop('kubernetes', None) + + if gpus_per_node: + manifest['deployment_config']['gpus_per_node'] = gpus_per_node + +with open(output_file, 'w') as f: + json.dump(manifest, f, indent=2) +print('Created Docker execution manifest for SLURM compute node') +" + +if [ $? -eq 0 ]; then + echo "✓ Forced Docker execution in manifest: $LOCAL_MANIFEST" + EXEC_MANIFEST="$LOCAL_MANIFEST" +else + echo "⚠ Failed to modify manifest, using original" + EXEC_MANIFEST="$ORIGINAL_MANIFEST" +fi +{% else %} +EXEC_MANIFEST="" +{% endif %} +{% endif %} + +# SLURM GPU Environment Check +# SLURM already sets CUDA_VISIBLE_DEVICES, ROCR_VISIBLE_DEVICES, GPU_DEVICE_ORDINAL +echo "SLURM GPU allocation:" +echo " Allocated GPUs: ${SLURM_GPUS_ON_NODE:-unknown}" +echo " CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-not set}" +echo " ROCR_VISIBLE_DEVICES: ${ROCR_VISIBLE_DEVICES:-not set}" +echo " Node: ${SLURM_NODEID}/${SLURM_NNODES} (Rank ${SLURM_PROCID}/${SLURM_NTASKS})" + +# Set deployment environment flags +export MAD_IN_SLURM_JOB=1 +export MAD_DEPLOYMENT_TYPE=slurm +{% if launcher_type %} +export MAD_LAUNCHER_TYPE="{{ launcher_type }}" +{% else %} +export MAD_LAUNCHER_TYPE="torchrun" +{% endif %} + +# ============================================================================= +# Configure Distributed Execution Launcher +# ============================================================================= +echo "" +echo "Distributed Execution Configuration:" +echo " NNODES: ${NNODES}" +echo " GPUS_PER_NODE: ${GPUS_PER_NODE}" +echo " TOTAL_GPUS: $((NNODES * GPUS_PER_NODE))" +echo " MASTER_ADDR: ${MASTER_ADDR}" +echo " MASTER_PORT: ${MASTER_PORT}" +echo " WORLD_SIZE: ${WORLD_SIZE}" +echo " Launcher: ${MAD_LAUNCHER_TYPE}" +echo "" + +# Note: For multi-node jobs, node-specific variables (RANK, NODE_RANK, MAD_MULTI_NODE_RUNNER) +# are set inside each task where SLURM_PROCID is properly available per-node + +# Set network interface for NCCL/GLOO if not already set +{% if network_interface %} +export NCCL_SOCKET_IFNAME={{ network_interface }} +export GLOO_SOCKET_IFNAME={{ network_interface }} +{% else %} +# Try to auto-detect InfiniBand or high-speed network interface +if [ -z "${NCCL_SOCKET_IFNAME}" ]; then + # Check for InfiniBand interfaces + if ip link show | grep -q "ib[0-9]"; then + export NCCL_SOCKET_IFNAME=ib0 + export GLOO_SOCKET_IFNAME=ib0 + echo " Network: InfiniBand (ib0)" + else + # Fallback to first non-loopback interface + DEFAULT_IFACE=$(ip route | grep default | awk '{print $5}' | head -n1) + export NCCL_SOCKET_IFNAME=${DEFAULT_IFACE:-eth0} + export GLOO_SOCKET_IFNAME=${DEFAULT_IFACE:-eth0} + echo " Network: ${NCCL_SOCKET_IFNAME}" + fi +fi +{% endif %} + +{% if nodes > 1 %} +# ============================================================================= +# Multi-node: Execute with per-task setup +# ============================================================================= +# For multi-node distributed execution: +# 1. Each srun task runs on a separate node with unique SLURM_PROCID +# 2. All nodes participate in workload via launcher (torchrun/vLLM/SGLang/etc.) +# 3. Global metrics are computed via all_reduce (identical on all nodes) +# 4. Only master node (SLURM_PROCID=0) collects/reports final metrics +# +# This approach follows distributed execution best practices: +# - Avoids duplicate data in perf.csv +# - Prevents race conditions in metric extraction +# - Ensures worker nodes exit cleanly after workload execution +# ============================================================================= + +# Create a wrapper script that each srun task will execute +# This ensures workspace setup happens with correct SLURM_PROCID + +# Use submission directory (shared filesystem) for task script +# /tmp is local to each node and won't be accessible by srun on other nodes +TASK_SCRIPT="{{ manifest_file | dirname }}/slurm_output/madengine_task_${SLURM_JOB_ID}.sh" + +cat > "$TASK_SCRIPT" << 'TASK_SCRIPT_EOF' +#!/bin/bash +set -e + +echo "=========================================================================" +echo "Task started on node: $(hostname)" +echo "SLURM_PROCID: ${SLURM_PROCID}" +echo "SLURM_JOB_ID: ${SLURM_JOB_ID}" +echo "=========================================================================" + +# Configure MAD_MULTI_NODE_RUNNER for this specific node +# CRITICAL: This must be done HERE where SLURM_PROCID is unique for each task +{% if nodes > 1 %} +# Verify SLURM_PROCID is set +if [ -z "${SLURM_PROCID}" ]; then + echo "ERROR: SLURM_PROCID not set! Cannot determine node rank." + exit 1 +fi + +# Capture node rank explicitly +NODE_RANK=${SLURM_PROCID} +export NODE_RANK + +# Debug output +echo "==========================================" +echo "🔧 Node-Specific Launcher Setup" +echo "==========================================" +echo " Launcher: {{ launcher_type|default('torchrun') }}" +echo " SLURM_PROCID: ${SLURM_PROCID}" +echo " NODE_RANK: ${NODE_RANK}" +echo " NNODES: {{ nnodes }}" +echo " NPROC_PER_NODE: {{ nproc_per_node }}" +echo " MASTER_ADDR: ${MASTER_ADDR}" +echo " MASTER_PORT: {{ master_port | default(29500) }}" +echo "==========================================" + +# Generate launcher-specific command +{{ launcher_command }} + +{% if launcher_type in ['torchrun', 'deepspeed', 'megatron'] %} +echo " MAD_MULTI_NODE_RUNNER: ${MAD_MULTI_NODE_RUNNER}" +{% endif %} +echo "==========================================" +{% else %} +# Single-node setup +echo "Single-node {{ launcher_type|default('torchrun') }} setup" +{{ launcher_command }} + +{% if launcher_type in ['torchrun', 'deepspeed', 'megatron'] %} +echo " MAD_MULTI_NODE_RUNNER: ${MAD_MULTI_NODE_RUNNER}" +{% endif %} +{% endif %} +echo "" + +# Setup workspace (SLURM_PROCID is now available) +if [ -n "$SLURM_TMPDIR" ] && [ -d "$SLURM_TMPDIR" ] && [ -w "$SLURM_TMPDIR" ]; then + WORKSPACE=$SLURM_TMPDIR/madengine_node_${SLURM_PROCID} +else + WORKSPACE=/tmp/madengine_job_${SLURM_JOB_ID}_node_${SLURM_PROCID} +fi +mkdir -p $WORKSPACE +WORKSPACE_TYPE="local-multinode" +echo "Multi-node: Node ${SLURM_PROCID} using local workspace: $WORKSPACE" +cd $WORKSPACE + +# Copy entire project to local workspace +echo "Copying entire project to local workspace" +SUBMISSION_DIR={{ manifest_file | dirname }} + +echo " Copying from: $SUBMISSION_DIR" +echo " Copying to: $WORKSPACE" +rsync -a --quiet \ + --exclude='.git' \ + --exclude='__pycache__' \ + --exclude='*.pyc' \ + --exclude='**/__pycache__' \ + --exclude='*.egg-info' \ + --exclude='.pytest_cache' \ + --exclude='venv' \ + --exclude='.venv' \ + --exclude='env' \ + --exclude='.env' \ + --exclude='slurm_output/*.out' \ + --exclude='slurm_output/*.log' \ + "$SUBMISSION_DIR/" "$WORKSPACE/" + +echo " ✓ Project copied to local workspace" +echo "" + +# ============================================================================= +# Verify madengine Availability +# ============================================================================= +# Note: We rely on the submission environment being inherited by compute nodes. +# The submission node MUST have madengine available before job submission. +# This is validated pre-flight by the Python deployment code. + +echo "Verifying madengine availability..." + +if command -v madengine >/dev/null 2>&1; then + MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown") + MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown") + + echo "✓ madengine available" + echo " Version: $MAD_CLI_VERSION" + echo " Path: $MAD_CLI_PATH" + + # Verify it's executable + if madengine --help >/dev/null 2>&1; then + echo " ✓ Verified: madengine is functional" + MAD_CLI_COMMAND="madengine" + else + echo "❌ ERROR: madengine found but not functional!" + echo " Please check your installation on the submission node" + exit 1 + fi +else + echo "❌ ERROR: madengine not found in PATH" + echo "" + echo "This means:" + echo " 1. madengine is not installed, OR" + echo " 2. Virtual environment not activated on submission node, OR" + echo " 3. Environment not properly inherited by SLURM" + echo "" + echo "To fix:" + echo " • Activate your virtual environment: source venv/bin/activate" + echo " • Install madengine: pip install -e . (for development)" + echo " • Verify before submission: madengine --version" + echo "" + echo "Current PATH: $PATH" + echo "" + exit 1 +fi + +# Create local execution manifest +ORIGINAL_MANIFEST="{{ manifest_file | basename }}" +LOCAL_MANIFEST="build_manifest_local.json" + +echo "" +echo "Creating local execution manifest: $LOCAL_MANIFEST" + +python3 -c " +import json +manifest_file = '$ORIGINAL_MANIFEST' +output_file = '$LOCAL_MANIFEST' +with open(manifest_file, 'r') as f: + manifest = json.load(f) + +# Keep built_images for Docker execution +# Only modify deployment_config to run on this node (not via SLURM scheduler again) +if 'deployment_config' in manifest: + gpus_per_node = None + if 'slurm' in manifest['deployment_config']: + gpus_per_node = manifest['deployment_config']['slurm'].get('gpus_per_node') + + # Set to 'docker' instead of 'local' to force container execution + manifest['deployment_config']['target'] = 'docker' + + # Remove scheduler configs (but keep built_images!) + manifest['deployment_config'].pop('slurm', None) + manifest['deployment_config'].pop('k8s', None) + manifest['deployment_config'].pop('kubernetes', None) + + if gpus_per_node: + manifest['deployment_config']['gpus_per_node'] = gpus_per_node + +with open(output_file, 'w') as f: + json.dump(manifest, f, indent=2) +print('✓ Created Docker execution manifest for SLURM compute node') +" + +if [ $? -eq 0 ] && [ -f "$LOCAL_MANIFEST" ]; then + EXEC_MANIFEST="$LOCAL_MANIFEST" + echo "✓ Manifest ready for Docker execution: $EXEC_MANIFEST" +else + echo "⚠ Using original manifest" + EXEC_MANIFEST="$ORIGINAL_MANIFEST" +fi + +# Show configuration +echo "" +echo "Node ${SLURM_PROCID} ready:" +echo " Workspace: $WORKSPACE" +echo " Manifest: $EXEC_MANIFEST" +echo " Command: $MAD_CLI_COMMAND" +echo "" + +# Execute madengine +echo "Executing madengine in LOCAL mode..." + +# Set RANK to node rank for this task (SLURM_PROCID) +export RANK=${SLURM_PROCID} + +# Set environment variable to control metric collection +# Only master node (SLURM_PROCID=0) should collect and report metrics +if [ "${SLURM_PROCID}" = "0" ]; then + export MAD_COLLECT_METRICS="true" +else + export MAD_COLLECT_METRICS="false" +fi + +# Export all environment variables that need to be passed to Docker +# This ensures they're inherited by the madengine process and Docker containers +export MASTER_ADDR="${MASTER_ADDR}" +export MASTER_PORT="${MASTER_PORT}" +export WORLD_SIZE="${WORLD_SIZE}" +export NNODES="{{ nodes }}" +export GPUS_PER_NODE="{{ gpus_per_node }}" + +# Set per-process MIOpen cache to avoid database conflicts in multi-GPU workloads +# Use LOCAL_RANK (set by launcher) to create unique directory per GPU process +# This prevents "Duplicate ID" errors and database corruption +export MIOPEN_USER_DB_PATH="/tmp/.miopen/node_${SLURM_PROCID}_rank_\${LOCAL_RANK:-0}" +# Note: Directory creation happens in the model script after LOCAL_RANK is set + +# Debug: Show environment variables being passed +echo "Environment variables for Docker container:" +echo " MASTER_ADDR: ${MASTER_ADDR}" +echo " MASTER_PORT: ${MASTER_PORT}" +echo " WORLD_SIZE: ${WORLD_SIZE}" +echo " RANK (node rank): ${RANK}" +echo " NODE_RANK: ${NODE_RANK}" +echo " NNODES: ${NNODES}" +echo " NPROC_PER_NODE: ${GPUS_PER_NODE}" +echo " MAD_MULTI_NODE_RUNNER: ${MAD_MULTI_NODE_RUNNER}" +if [ "${SLURM_PROCID}" = "0" ]; then + echo " MAD_IS_MASTER_NODE: true (will collect performance metrics)" +else + echo " MAD_IS_MASTER_NODE: false (execution only, no metric collection)" +fi +echo "" + +# Create node-specific log files in results directory +RESULTS_DIR={{ manifest_file | dirname }} +NODE_LOG_OUT="${RESULTS_DIR}/slurm_output/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${SLURM_PROCID}.out" +NODE_LOG_ERR="${RESULTS_DIR}/slurm_output/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${SLURM_PROCID}.err" + +echo "Node ${SLURM_PROCID} logs:" +echo " stdout: ${NODE_LOG_OUT}" +echo " stderr: ${NODE_LOG_ERR}" +echo "" + +# Determine if this node should skip performance collection +if [ "${SLURM_PROCID}" != "0" ]; then + export MAD_SKIP_PERF_COLLECTION="true" +else + export MAD_SKIP_PERF_COLLECTION="false" +fi + +# Run madengine with output redirected to node-specific log files +# Environment variables (MASTER_ADDR, MAD_MULTI_NODE_RUNNER, etc.) are inherited +$MAD_CLI_COMMAND run \ + --manifest-file "$EXEC_MANIFEST" \ + --timeout {{ timeout | default(3600) }} \ + {% if shared_data %}--force-mirror-local {{ shared_data }}{% endif %} \ + {% if live_output %}--live-output{% endif %} \ + > "${NODE_LOG_OUT}" 2> "${NODE_LOG_ERR}" + +TASK_EXIT=$? +echo "" +echo "Task completed with exit code: $TASK_EXIT" + +# ============================================================================= +# Multi-Node Result Collection (Best Practice: Master Node Only) +# ============================================================================= +# For distributed workloads, only the master node (SLURM_PROCID=0) should +# collect and report performance metrics to avoid: +# - Duplicate data in perf.csv +# - Race conditions in metric extraction +# - Failures from non-master nodes trying to report identical global metrics +# +# This follows distributed execution best practices where only rank 0 +# reports final metrics. +# ============================================================================= + +if [ $TASK_EXIT -eq 0 ]; then + if [ "${SLURM_PROCID}" = "0" ]; then + # Master node: Collect and report results + RESULTS_DIR={{ manifest_file | dirname }} + echo "" + echo "========================================================================" + echo "Master Node (SLURM_PROCID=0): Collecting results" + echo "========================================================================" + echo "Copying results back to: $RESULTS_DIR" + + # Copy performance results (main metric file) + if [ -f "$WORKSPACE/perf.csv" ]; then + cp "$WORKSPACE/perf.csv" "$RESULTS_DIR/perf.csv" 2>/dev/null || true + echo " ✓ Copied: perf.csv (global metrics)" + fi + + # Copy log files + for log in "$WORKSPACE"/*.log; do + if [ -f "$log" ]; then + log_basename=$(basename "$log") + cp "$log" "$RESULTS_DIR/${log_basename}" 2>/dev/null || true + echo " ✓ Copied: ${log_basename}" + fi + done + + # Copy any workload results files + if [ -f "$WORKSPACE/results.txt" ]; then + cp "$WORKSPACE/results.txt" "$RESULTS_DIR/" 2>/dev/null || true + echo " ✓ Copied: results.txt" + fi + # Legacy support for training_results.txt + if [ -f "$WORKSPACE/training_results.txt" ]; then + cp "$WORKSPACE/training_results.txt" "$RESULTS_DIR/" 2>/dev/null || true + echo " ✓ Copied: training_results.txt" + fi + + echo " ✓ Master node results collection complete" + echo "========================================================================" + else + # Worker nodes: Exit cleanly without collecting results + echo "" + echo "========================================================================" + echo "Worker Node (SLURM_PROCID=${SLURM_PROCID}): Exiting cleanly" + echo "========================================================================" + echo " Note: Performance metrics collected by master node only (best practice)" + echo "========================================================================" + fi +else + echo "" + echo "========================================================================" + echo "Task FAILED with exit code: $TASK_EXIT" + echo " Node: SLURM_PROCID=${SLURM_PROCID}" + echo "========================================================================" +fi + +exit $TASK_EXIT +TASK_SCRIPT_EOF + +chmod +x "$TASK_SCRIPT" + +echo "Launching tasks on {{ nodes }} nodes..." +srun bash "$TASK_SCRIPT" +EXIT_CODE=$? + +# Cleanup task script +rm -f "$TASK_SCRIPT" + +{% else %} +# ============================================================================= +# Single-node: Execute directly +# ============================================================================= +# Configure MAD_MULTI_NODE_RUNNER for single-node +export MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node={{ gpus_per_node }}" +export RANK=0 # Single node always has rank 0 +export NODE_RANK=0 + +echo "==========================================" +echo "🔧 Single-node Execution Setup" +echo "==========================================" +echo " NPROC_PER_NODE: {{ gpus_per_node }}" +echo " MAD_MULTI_NODE_RUNNER: ${MAD_MULTI_NODE_RUNNER}" +echo "==========================================" +echo "" + +echo "Executing madengine in LOCAL mode (inside SLURM job)" +echo " Command: $MAD_CLI_COMMAND" +echo "" + +# Environment variables (MASTER_ADDR, MAD_MULTI_NODE_RUNNER, etc.) are inherited +$MAD_CLI_COMMAND run \ + {% if manifest_file %}--manifest-file "$EXEC_MANIFEST"{% else %}--tags {{ tags }}{% endif %} \ + --timeout {{ timeout | default(3600) }} \ + {% if shared_data %}--force-mirror-local {{ shared_data }}{% endif %} \ + {% if live_output %}--live-output{% endif %} + +EXIT_CODE=$? +{% endif %} + +# ============================================================================= +# Job Completion +# ============================================================================= +# Note: For multi-node jobs, only the master node (SLURM_PROCID=0) collects +# and reports performance metrics. This follows distributed execution best +# practices where: +# - Global metrics are identical across all nodes (computed via all_reduce) +# - Only rank 0 should report to avoid duplicate/conflicting data +# - Worker nodes exit cleanly after workload completes + +echo "" +if [ $EXIT_CODE -eq 0 ]; then + echo "========================================================================" + echo "✅ SLURM Job Completed Successfully" + echo "========================================================================" + echo " Job ID: ${SLURM_JOB_ID}" + echo " Nodes: {{ nodes }}" + echo " GPUs per node: {{ gpus_per_node }}" + echo " Total GPUs: $(({{ nodes }} * {{ gpus_per_node }}))" + echo " Results: {{ manifest_file | dirname }}/perf.csv" + {% if nodes > 1 %} + echo "" + echo " 📋 Individual Node Logs ({{ nodes }} nodes):" + echo " ─────────────────────────────────────────────" + for i in $(seq 0 $(({{ nodes }} - 1))); do + NODE_OUT="{{ output_dir }}/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${i}.out" + NODE_ERR="{{ output_dir }}/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${i}.err" + if [ -f "$NODE_OUT" ]; then + OUT_SIZE=$(du -h "$NODE_OUT" 2>/dev/null | cut -f1) + ERR_SIZE=$(du -h "$NODE_ERR" 2>/dev/null | cut -f1) + echo " Node $i:" + echo " stdout: ${NODE_OUT} (${OUT_SIZE})" + echo " stderr: ${NODE_ERR} (${ERR_SIZE})" + fi + done + {% endif %} + echo "========================================================================" +else + echo "========================================================================" + echo "❌ SLURM Job Failed" + echo "========================================================================" + echo " Job ID: ${SLURM_JOB_ID}" + echo " Exit Code: $EXIT_CODE" + {% if nodes > 1 %} + echo "" + echo " 📋 Check Individual Node Logs:" + echo " ─────────────────────────────────" + for i in $(seq 0 $(({{ nodes }} - 1))); do + NODE_OUT="{{ output_dir }}/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${i}.out" + NODE_ERR="{{ output_dir }}/madengine-{{ model_name }}_${SLURM_JOB_ID}_node_${i}.err" + if [ -f "$NODE_OUT" ] || [ -f "$NODE_ERR" ]; then + echo " Node $i: ${NODE_OUT}" + fi + done + {% else %} + echo " Check logs: {{ output_dir }}/madengine-{{ model_name }}_${SLURM_JOB_ID}_*.out" + {% endif %} + echo "========================================================================" +fi + +exit $EXIT_CODE + diff --git a/src/madengine/execution/README.md b/src/madengine/execution/README.md new file mode 100644 index 00000000..1277cfa3 --- /dev/null +++ b/src/madengine/execution/README.md @@ -0,0 +1,211 @@ +# Execution Layer + +**Status**: Active +**Purpose**: Local Docker execution primitives for building and running containers + +--- + +## 🎯 Responsibility + +This layer handles low-level Docker operations: +- **Building** Docker images from Dockerfiles +- **Running** Docker containers locally +- **Managing** Docker lifecycle (create, start, stop, cleanup) + +Used by the orchestration layer to execute Docker operations. + +--- + +## 📦 Components + +### **`docker_builder.py`** + +Builds Docker images for models. + +**Key Features:** +- Multi-architecture builds (GPU-specific compilation) +- Build argument injection (ROCm/CUDA versions, architectures) +- Registry push support (DockerHub, local registries) +- Build manifest generation +- Credential management + +**Usage:** +```python +from madengine.execution.docker_builder import DockerBuilder + +builder = DockerBuilder(context, console) + +# Build single model +result = builder.build_image( + model_info={"name": "model1", "dockerfile": "docker/model1.Dockerfile"}, + dockerfile="docker/model1.Dockerfile", + phase_suffix="gfx90a" +) + +# Build all models +results = builder.build_all_models( + models_list=[model1, model2, model3], + target_archs=["gfx90a", "gfx942"] +) + +# Export build manifest +builder.export_build_manifest(output_file="build_manifest.json") +``` + +### **`container_runner.py`** + +Runs Docker containers locally for model execution. + +**Key Features:** +- GPU passthrough (ROCm, CUDA) +- Volume mounting (data, scripts, results) +- Resource limits (GPU, CPU, memory) +- Timeout management +- Performance metrics collection +- Container cleanup + +**Usage:** +```python +from madengine.execution.container_runner import ContainerRunner + +runner = ContainerRunner(context, data, console) + +# Run model in container +result = runner.run_container( + model_info=model_dict, + model_docker=docker_client, + gpu_ids="0,1", + timeout=3600 +) + +# Result includes status, metrics, logs +print(result["status"]) # "successful", "failed", "timeout" +print(result["duration"]) +``` + +--- + +## 🏗️ Architecture + +``` +┌─────────────────────────────────────┐ +│ Orchestration Layer │ +│ (build_orchestrator.py, │ +│ run_orchestrator.py) │ +└─────────────┬───────────────────────┘ + │ uses + ┌─────────┴─────────┐ + │ │ +┌───▼──────────┐ ┌─────▼──────────┐ +│ docker_builder│ │container_runner│ ← This Layer +│ (build) │ │ (run) │ +└───┬──────────┘ └─────┬──────────┘ + │ │ + └─────────┬─────────┘ + │ uses + ┌─────────▼─────────┐ + │ Core Layer │ + │ (docker.py, │ + │ context.py) │ + └───────────────────┘ +``` + +--- + +## 🔄 Workflow + +### **Build Phase** + +1. `BuildOrchestrator` discovers models +2. `BuildOrchestrator` calls `DockerBuilder.build_all_models()` +3. `DockerBuilder` builds each model with target architectures +4. `DockerBuilder` generates `build_manifest.json` + +### **Run Phase** + +1. `RunOrchestrator` loads `build_manifest.json` +2. `RunOrchestrator` calls `ContainerRunner.run_container()` +3. `ContainerRunner` executes model in Docker container +4. `ContainerRunner` collects metrics and writes results +5. Performance data saved via `reporting/update_perf_csv.py` + +--- + +## 🎯 Design Principles + +1. **Single Responsibility**: Each component does ONE thing + - `docker_builder.py` = Build images + - `container_runner.py` = Run containers + +2. **Separation from Logic**: This layer is **execution only** + - ❌ No workflow decisions (that's orchestration) + - ❌ No model discovery (that's utils) + - ✅ Pure Docker operations + +3. **Reusability**: Can be used by: + - Modern `madengine` CLI (via orchestrators) + - Future automation scripts + +4. **Testability**: Mock Docker client for unit tests + +--- + +## 🧪 Testing + +```bash +# Test docker builder +pytest tests/test_docker_builder.py -v + +# Test container runner +pytest tests/test_container_runner.py -v + +# Test multi-GPU architecture support +pytest tests/test_multi_gpu_arch.py -v +``` + +--- + +## 📚 Related Components + +| Component | Location | Purpose | +|-----------|----------|---------| +| **Orchestration** | `orchestration/` | High-level workflow coordination | +| **Deployment** | `deployment/` | Distributed execution (SLURM, K8s) | +| **Core** | `core/` | Docker client, Context, Console | +| **Utils** | `utils/` | GPU tools, validators | + +--- + +## 🔍 Key Differences + +**Execution vs Deployment:** + +| Aspect | Execution Layer | Deployment Layer | +|--------|----------------|------------------| +| **Scope** | Local Docker | Distributed systems | +| **Examples** | Build image, run container | SLURM jobs, K8s pods | +| **Location** | `execution/` | `deployment/` | +| **Complexity** | Simple (direct Docker) | Complex (cluster orchestration) | + +--- + +## ⚙️ Configuration + +Both components use `Context` for configuration: + +```python +# GPU vendor, architecture, ROCm version +context.get_gpu_vendor() # "AMD" or "NVIDIA" +context.get_system_gpu_architecture() # "gfx90a", "sm_80" + +# Docker settings +context.ctx["docker_env_vars"] # Environment variables +context.ctx["docker_build_arg"] # Build arguments +context.ctx["docker_mounts"] # Volume mounts +``` + +--- + +**Last Updated**: November 30, 2025 +**Maintainer**: madengine Team + diff --git a/src/madengine/execution/__init__.py b/src/madengine/execution/__init__.py new file mode 100644 index 00000000..c7be268e --- /dev/null +++ b/src/madengine/execution/__init__.py @@ -0,0 +1,12 @@ +""" +Execution layer for local container execution. + +Provides Docker container execution capabilities for single-node local runs. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +from .container_runner import ContainerRunner + +__all__ = ["ContainerRunner"] + diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py new file mode 100644 index 00000000..ba011e81 --- /dev/null +++ b/src/madengine/execution/container_runner.py @@ -0,0 +1,1642 @@ +#!/usr/bin/env python3 +""" +Docker Container Runner Module for madengine + +This module handles the Docker container execution phase separately from building, +enabling distributed workflows where containers are run on remote nodes +using pre-built images. +""" + +import os +import time +import json +import typing +import warnings +import re +from rich.console import Console as RichConsole +from contextlib import redirect_stdout, redirect_stderr +from madengine.core.console import Console +from madengine.core.context import Context +from madengine.core.docker import Docker +from madengine.core.timeout import Timeout +from madengine.core.dataprovider import Data +from madengine.utils.ops import PythonicTee, file_print +from madengine.reporting.update_perf_csv import update_perf_csv, flatten_tags +from madengine.reporting.update_perf_super import update_perf_super_json, update_perf_super_csv +from madengine.utils.gpu_config import resolve_runtime_gpus +from madengine.utils.config_parser import ConfigParser + + +class ContainerRunner: + """Class responsible for running Docker containers with models.""" + + def __init__( + self, + context: Context = None, + data: Data = None, + console: Console = None, + live_output: bool = False, + additional_context: typing.Dict = None, + ): + """Initialize the Container Runner. + + Args: + context: The madengine context + data: The data provider instance + console: Optional console instance + live_output: Whether to show live output + additional_context: Additional configuration context (for GPU resolution) + """ + self.context = context + self.data = data + self.console = console or Console(live_output=live_output) + self.live_output = live_output + self.rich_console = RichConsole() + self.credentials = None + self.perf_csv_path = "perf.csv" # Default output path + self.additional_context = additional_context or {} + + # Ensure runtime context is initialized for container operations + if self.context: + self.context.ensure_runtime_context() + + def set_perf_csv_path(self, path: str): + """Set the path for the performance CSV output file. + + Args: + path: Path to the performance CSV file + """ + self.perf_csv_path = path + + def ensure_perf_csv_exists(self): + """Ensure the performance CSV file exists with proper headers.""" + if not os.path.exists(self.perf_csv_path): + file_print( + "model,n_gpus,nnodes,gpus_per_node,training_precision,pipeline,args,tags,docker_file,base_docker,docker_sha,docker_image,git_commit,machine_name,deployment_type,launcher,gpu_architecture,performance,metric,relative_change,status,build_duration,test_duration,dataname,data_provider_type,data_size,data_download_duration,build_number,additional_docker_run_options", + filename=self.perf_csv_path, + mode="w", + ) + print(f"Created performance CSV file: {self.perf_csv_path}") + + def create_run_details_dict( + self, model_info: typing.Dict, build_info: typing.Dict, run_results: typing.Dict + ) -> typing.Dict: + """Create a run details dictionary similar to RunDetails class in run_models.py. + + Args: + model_info: Model information dictionary + build_info: Build information from manifest + run_results: Container execution results + + Returns: + dict: Run details dictionary for CSV generation + """ + import os + + # Resolve GPU count using hierarchical resolution + resolved_gpu_count = resolve_runtime_gpus(model_info, self.additional_context) + + # Convert -1 (all GPUs) to actual system GPU count for accurate reporting + if resolved_gpu_count == -1 and self.context: + try: + system_ngpus = int(self.context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"]) + resolved_gpu_count = system_ngpus + print(f"ℹ️ Converted n_gpus=-1 to actual system GPU count: {system_ngpus}") + except (KeyError, ValueError, TypeError): + # If system GPU count not available, keep -1 + pass + + # Determine number of nodes and GPUs per node + # Priority: 1. SLURM env vars, 2. additional_context, 3. model_info, 4. default (1) + nnodes = "1" # Default for local execution + gpus_per_node = str(resolved_gpu_count) + + # Check for SLURM multi-node environment + if os.environ.get("MAD_DEPLOYMENT_TYPE") == "slurm": + # Get from SLURM environment variables (most accurate for SLURM jobs) + slurm_nnodes = os.environ.get("NNODES") or os.environ.get("SLURM_NNODES") + slurm_gpus_per_node = os.environ.get("GPUS_PER_NODE") or os.environ.get("SLURM_GPUS_PER_NODE") + + if slurm_nnodes: + nnodes = str(slurm_nnodes) + print(f"ℹ️ Detected SLURM multi-node: {nnodes} nodes") + + if slurm_gpus_per_node: + gpus_per_node = str(slurm_gpus_per_node) + print(f"ℹ️ GPUs per node: {gpus_per_node}") + + # Fallback to additional_context (for non-SLURM or if env vars not set) + if nnodes == "1" and self.additional_context: + slurm_config = self.additional_context.get("slurm", {}) + if slurm_config: + ctx_nodes = slurm_config.get("nodes") + ctx_gpus = slurm_config.get("gpus_per_node") + if ctx_nodes: + nnodes = str(ctx_nodes) + if ctx_gpus: + gpus_per_node = str(ctx_gpus) + + # Final fallback to model_info + if nnodes == "1": + nnodes = model_info.get("nnodes", "1") + + # Calculate total GPUs + try: + total_gpus = int(nnodes) * int(gpus_per_node) + except (ValueError, TypeError): + total_gpus = resolved_gpu_count + + # Extract launcher from multiple sources in priority order: + # 1. additional_context (passed via --additional-context CLI arg) + # 2. model_info distributed config (in models.json) + # 3. MAD_LAUNCHER environment variable + # 4. Default to 'docker' for local deployments + launcher = "" + + # Check additional_context first (highest priority) + if self.additional_context: + distributed_config = self.additional_context.get("distributed", {}) + launcher = distributed_config.get("launcher", "") + if launcher: + print(f"🚀 Launcher from additional_context: {launcher}") + + # Check model_info distributed config + if not launcher and model_info.get("distributed"): + launcher = model_info["distributed"].get("launcher", "") + if launcher: + print(f"🚀 Launcher from model_info: {launcher}") + + # Fallback to environment variable + if not launcher: + launcher = os.environ.get("MAD_LAUNCHER", "") + if launcher: + print(f"🚀 Launcher from MAD_LAUNCHER env: {launcher}") + + # Apply deployment-specific defaults if no launcher specified + deployment_type = os.environ.get("MAD_DEPLOYMENT_TYPE", "local") + if not launcher: + if deployment_type == "kubernetes": + launcher = "native" + print(f"🚀 Launcher defaulted to 'native' for kubernetes deployment") + elif deployment_type == "slurm": + # For SLURM, try to get launcher type from environment or default to torchrun + # Note: "slurm" is the deployment type, not the launcher + launcher = os.environ.get("MAD_LAUNCHER_TYPE", "torchrun") + print(f"🚀 Launcher defaulted to '{launcher}' for slurm deployment") + elif deployment_type == "local": + launcher = "docker" + print(f"🚀 Launcher defaulted to 'docker' for local deployment") + + # Print final launcher selection + if launcher: + print(f"✅ Final launcher selected: '{launcher}' (deployment_type: {deployment_type})") + else: + print(f"⚠️ No launcher specified (deployment_type: {deployment_type})") + + # Create run details dict with all required fields + run_details = { + "model": model_info["name"], + "n_gpus": str(total_gpus), # Total GPUs across all nodes + "nnodes": nnodes, + "gpus_per_node": gpus_per_node, + "training_precision": model_info.get("training_precision", ""), + "pipeline": os.environ.get("pipeline", ""), + "args": model_info.get("args", ""), + "tags": model_info.get("tags", ""), + "docker_file": build_info.get("dockerfile", ""), + "base_docker": build_info.get("base_docker", ""), + "docker_sha": build_info.get("docker_sha", ""), + "docker_image": build_info.get("docker_image", ""), + "git_commit": run_results.get("git_commit", ""), + "machine_name": run_results.get("machine_name", ""), + "deployment_type": os.environ.get("MAD_DEPLOYMENT_TYPE", "local"), # local, slurm, etc. + "launcher": launcher, # Distributed launcher: torchrun, vllm, sglang, deepspeed, etc. + "gpu_architecture": ( + self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] + if self.context + else "" + ), + "performance": run_results.get("performance", ""), + "metric": run_results.get("metric", ""), + "relative_change": "", + "status": run_results.get("status", "FAILURE"), + "build_duration": build_info.get("build_duration", ""), + "test_duration": run_results.get("test_duration", ""), + "dataname": run_results.get("dataname", ""), + "data_provider_type": run_results.get("data_provider_type", ""), + "data_size": run_results.get("data_size", ""), + "data_download_duration": run_results.get("data_download_duration", ""), + "build_number": os.environ.get("BUILD_NUMBER", "0"), + "additional_docker_run_options": model_info.get( + "additional_docker_run_options", "" + ), + } + + # Flatten tags if they are in list format + flatten_tags(run_details) + + # Parse and load config file if present in args for perf_entry_super.json + try: + scripts_path = model_info.get("scripts", "") + scripts_base_dir = os.path.dirname(scripts_path) if scripts_path else None + config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) + run_details["configs"] = config_parser.parse_and_load( + model_info.get("args", ""), + scripts_path + ) + except Exception as e: + print(f"⚠️ Warning: Could not parse config file: {e}") + run_details["configs"] = None + + return run_details + + def load_build_manifest( + self, manifest_file: str = "build_manifest.json" + ) -> typing.Dict: + """Load build manifest from file. + + Args: + manifest_file: Path to build manifest file + + Returns: + dict: Build manifest data + """ + with open(manifest_file, "r") as f: + manifest = json.load(f) + + print(f"Loaded build manifest from: {manifest_file}") + return manifest + + def login_to_registry(self, registry: str, credentials: typing.Dict = None) -> None: + """Login to a Docker registry for pulling images. + + Args: + registry: Registry URL (e.g., "localhost:5000", "docker.io") + credentials: Optional credentials dictionary containing username/password + """ + if not credentials: + self.rich_console.print("[yellow]No credentials provided for registry login[/yellow]") + return + + # Check if registry credentials are available + registry_key = registry if registry else "dockerhub" + + # Handle docker.io as dockerhub + if registry and registry.lower() == "docker.io": + registry_key = "dockerhub" + + if registry_key not in credentials: + error_msg = f"No credentials found for registry: {registry_key}" + if registry_key == "dockerhub": + error_msg += f"\nPlease add dockerhub credentials to credential.json:\n" + error_msg += "{\n" + error_msg += ' "dockerhub": {\n' + error_msg += ' "repository": "your-repository",\n' + error_msg += ' "username": "your-dockerhub-username",\n' + error_msg += ' "password": "your-dockerhub-password-or-token"\n' + error_msg += " }\n" + error_msg += "}" + else: + error_msg += ( + f"\nPlease add {registry_key} credentials to credential.json:\n" + ) + error_msg += "{\n" + error_msg += f' "{registry_key}": {{\n' + error_msg += f' "repository": "your-repository",\n' + error_msg += f' "username": "your-{registry_key}-username",\n' + error_msg += f' "password": "your-{registry_key}-password"\n' + error_msg += " }\n" + error_msg += "}" + print(error_msg) + raise RuntimeError(error_msg) + + creds = credentials[registry_key] + + if "username" not in creds or "password" not in creds: + error_msg = f"Invalid credentials format for registry: {registry_key}" + error_msg += f"\nCredentials must contain 'username' and 'password' fields" + print(error_msg) + raise RuntimeError(error_msg) + + # Ensure credential values are strings + username = str(creds["username"]) + password = str(creds["password"]) + + # Perform docker login + login_command = f"echo '{password}' | docker login" + + if registry and registry.lower() not in ["docker.io", "dockerhub"]: + login_command += f" {registry}" + + login_command += f" --username {username} --password-stdin" + + try: + self.console.sh(login_command, secret=True) + self.rich_console.print(f"[green]✅ Successfully logged in to registry: {registry or 'DockerHub'}[/green]") + except Exception as e: + self.rich_console.print(f"[red]❌ Failed to login to registry {registry}: {e}[/red]") + # Don't raise exception here, as public images might still be pullable + + def pull_image( + self, + registry_image: str, + local_name: str = None, + registry: str = None, + credentials: typing.Dict = None, + ) -> str: + """Pull an image from registry. + + Args: + registry_image: Full registry image name + local_name: Optional local name to tag the image + registry: Optional registry URL for authentication + credentials: Optional credentials dictionary for authentication + + Returns: + str: Local image name + """ + # Login to registry if credentials are provided + if registry and credentials: + self.login_to_registry(registry, credentials) + + self.rich_console.print(f"\n[bold blue]📥 Starting docker pull from registry...[/bold blue]") + print(f"📍 Registry: {registry or 'Default'}") + print(f"🏷️ Image: {registry_image}") + + # Force fresh pull on SLURM compute nodes to avoid corrupted cached layers + # This prevents "permission denied" errors from corrupted image layers + deployment_type = os.environ.get("MAD_DEPLOYMENT_TYPE", "local") + in_slurm_job = os.environ.get("MAD_IN_SLURM_JOB", "0") == "1" + + if deployment_type == "slurm" and in_slurm_job: + print(f"🔄 Using fresh pull policy for SLURM compute node (prevents cached layer corruption)") + # Remove any existing cached image to force fresh pull + try: + self.console.sh(f"docker rmi -f {registry_image} 2>/dev/null || true") + print(f"✓ Removed cached image layers") + except: + pass # It's okay if image doesn't exist + + try: + self.console.sh(f"docker pull {registry_image}") + + if local_name: + self.console.sh(f"docker tag {registry_image} {local_name}") + print(f"🏷️ Tagged as: {local_name}") + self.rich_console.print(f"[bold green]✅ Successfully pulled and tagged image[/bold green]") + self.rich_console.print(f"[dim]{'='*80}[/dim]") + return local_name + + self.rich_console.print(f"[bold green]✅ Successfully pulled image:[/bold green] [cyan]{registry_image}[/cyan]") + self.rich_console.print(f"[dim]{'='*80}[/dim]") + return registry_image + + except Exception as e: + self.rich_console.print(f"[red]❌ Failed to pull image {registry_image}: {e}[/red]") + raise + + def get_gpu_arg(self, requested_gpus: str) -> str: + """Get the GPU arguments for docker run. + + Args: + requested_gpus: The requested GPUs. + + Returns: + str: The GPU arguments. + """ + gpu_arg = "" + gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] + n_system_gpus = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] + gpu_strings = self.context.ctx["docker_gpus"].split(",") + + # Parse GPU string, example: '{0-4}' -> [0,1,2,3,4] + docker_gpus = [] + for gpu_string in gpu_strings: + if "-" in gpu_string: + gpu_range = gpu_string.split("-") + docker_gpus += [ + item for item in range(int(gpu_range[0]), int(gpu_range[1]) + 1) + ] + else: + docker_gpus.append(int(gpu_string)) + docker_gpus.sort() + + # Check GPU range is valid for system + if requested_gpus == "-1": + print("NGPUS requested is ALL (" + ",".join(map(str, docker_gpus)) + ").") + requested_gpus = len(docker_gpus) + + print( + "NGPUS requested is " + + str(requested_gpus) + + " out of " + + str(n_system_gpus) + ) + + if int(requested_gpus) > int(n_system_gpus) or int(requested_gpus) > len( + docker_gpus + ): + raise RuntimeError( + f"Too many gpus requested({requested_gpus}). System has {n_system_gpus} gpus. Context has {len(docker_gpus)} gpus." + ) + + # Expose number of requested gpus + self.context.ctx["docker_env_vars"]["MAD_RUNTIME_NGPUS"] = str(requested_gpus) + + # Create docker arg to assign requested GPUs + if gpu_vendor.find("AMD") != -1: + gpu_arg = "--device=/dev/kfd " + gpu_renderDs = self.context.ctx["gpu_renderDs"] + if gpu_renderDs is not None: + for idx in range(0, int(requested_gpus)): + gpu_arg += ( + f"--device=/dev/dri/renderD{gpu_renderDs[docker_gpus[idx]]} " + ) + + elif gpu_vendor.find("NVIDIA") != -1: + gpu_str = "" + for idx in range(0, int(requested_gpus)): + gpu_str += str(docker_gpus[idx]) + "," + gpu_arg += f"--gpus '\"device={gpu_str}\"' " + else: + raise RuntimeError("Unable to determine gpu vendor.") + + print(f"GPU arguments: {gpu_arg}") + return gpu_arg + + def get_cpu_arg(self) -> str: + """Get the CPU arguments for docker run.""" + if "docker_cpus" not in self.context.ctx: + return "" + cpus = self.context.ctx["docker_cpus"].replace(" ", "") + return f"--cpuset-cpus {cpus} " + + def get_env_arg(self, run_env: typing.Dict) -> str: + """Get the environment arguments for docker run.""" + env_args = "" + + # Add custom environment variables + if run_env: + for env_arg in run_env: + env_args += f"--env {env_arg}='{str(run_env[env_arg])}' " + + # Add context environment variables + if "docker_env_vars" in self.context.ctx: + for env_arg in self.context.ctx["docker_env_vars"].keys(): + env_args += f"--env {env_arg}='{str(self.context.ctx['docker_env_vars'][env_arg])}' " + + print(f"Env arguments: {env_args}") + return env_args + + def get_mount_arg(self, mount_datapaths: typing.List) -> str: + """Get the mount arguments for docker run.""" + mount_args = "" + + # Mount data paths + if mount_datapaths: + for mount_datapath in mount_datapaths: + if mount_datapath: + mount_args += ( + f"-v {mount_datapath['path']}:{mount_datapath['home']}" + ) + if ( + "readwrite" in mount_datapath + and mount_datapath["readwrite"] == "true" + ): + mount_args += " " + else: + mount_args += ":ro " + + # Mount context paths + if "docker_mounts" in self.context.ctx: + for mount_arg in self.context.ctx["docker_mounts"].keys(): + mount_args += ( + f"-v {self.context.ctx['docker_mounts'][mount_arg]}:{mount_arg} " + ) + + return mount_args + + def apply_tools( + self, + pre_encapsulate_post_scripts: typing.Dict, + run_env: typing.Dict, + tools_json_file: str, + ) -> None: + """Apply tools configuration to the runtime environment.""" + if "tools" not in self.context.ctx: + return + + # Read tool settings from tools.json + with open(tools_json_file) as f: + tool_file = json.load(f) + + # Track commands that have been added to avoid duplicates + # Some tools (like trace tools) share the same wrapper script + added_cmds = set() + + # Iterate over tools in context, apply tool settings + for ctx_tool_config in self.context.ctx["tools"]: + tool_name = ctx_tool_config["name"] + tool_config = tool_file["tools"][tool_name] + + if "cmd" in ctx_tool_config: + tool_config.update({"cmd": ctx_tool_config["cmd"]}) + + if "env_vars" in ctx_tool_config: + for env_var in ctx_tool_config["env_vars"]: + tool_config["env_vars"].update( + {env_var: ctx_tool_config["env_vars"][env_var]} + ) + + print(f"Selected Tool, {tool_name}. Configuration : {str(tool_config)}.") + + # Setup tool before other existing scripts + if "pre_scripts" in tool_config: + pre_encapsulate_post_scripts["pre_scripts"] = ( + tool_config["pre_scripts"] + + pre_encapsulate_post_scripts["pre_scripts"] + ) + # Cleanup tool after other existing scripts + if "post_scripts" in tool_config: + pre_encapsulate_post_scripts["post_scripts"] += tool_config[ + "post_scripts" + ] + # Update environment variables (always apply, even if cmd is duplicate) + if "env_vars" in tool_config: + run_env.update(tool_config["env_vars"]) + + # Only add cmd if it hasn't been added yet + # This prevents duplicate wrappers like get_library_trace.py + if "cmd" in tool_config: + cmd = tool_config["cmd"] + if cmd not in added_cmds: + # Prepend encapsulate cmd + pre_encapsulate_post_scripts["encapsulate_script"] = ( + cmd + + " " + + pre_encapsulate_post_scripts["encapsulate_script"] + ) + added_cmds.add(cmd) + else: + print(f" Note: Command '{cmd}' already added by another tool, skipping duplicate.") + + def run_pre_post_script( + self, model_docker: Docker, model_dir: str, pre_post: typing.List + ) -> None: + """Run pre/post scripts in the container.""" + for script in pre_post: + script_path = script["path"].strip() + model_docker.sh( + f"cp -vLR --preserve=all {script_path} {model_dir}", timeout=600 + ) + script_name = os.path.basename(script_path) + script_args = "" + if "args" in script: + script_args = script["args"].strip() + model_docker.sh( + f"cd {model_dir} && bash {script_name} {script_args}", timeout=600 + ) + + def gather_system_env_details( + self, pre_encapsulate_post_scripts: typing.Dict, model_name: str + ) -> None: + """Gather system environment details. + + Args: + pre_encapsulate_post_scripts: The pre, encapsulate and post scripts. + model_name: The model name. + + Returns: + None + + Raises: + Exception: An error occurred while gathering system environment details. + + Note: + This function is used to gather system environment details. + """ + # initialize pre_env_details + pre_env_details = {} + pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh" + pre_env_details["args"] = model_name.replace("/", "_") + "_env" + pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details) + print(f"pre encap post scripts: {pre_encapsulate_post_scripts}") + + def run_container( + self, + model_info: typing.Dict, + docker_image: str, + build_info: typing.Dict = None, + keep_alive: bool = False, + keep_model_dir: bool = False, + timeout: int = 7200, + tools_json_file: str = "scripts/common/tools.json", + phase_suffix: str = "", + generate_sys_env_details: bool = True, + ) -> typing.Dict: + """Run a model in a Docker container. + + Args: + model_info: Model information dictionary + docker_image: Docker image name to run + build_info: Optional build information from manifest + keep_alive: Whether to keep container alive after execution + keep_model_dir: Whether to keep model directory after execution + timeout: Execution timeout in seconds + tools_json_file: Path to tools configuration file + phase_suffix: Suffix for log file name (e.g., ".run" or "") + generate_sys_env_details: Whether to collect system environment details + + Returns: + dict: Execution results including performance metrics + """ + self.rich_console.print(f"[bold green]🏃 Running model:[/bold green] [bold cyan]{model_info['name']}[/bold cyan] [dim]in container[/dim] [yellow]{docker_image}[/yellow]") + + # Apply timeout logic: model timeout can override default timeout + # If model has a timeout in models.json and CLI timeout is default (7200), use model's timeout + # If CLI timeout is explicitly set (not default), it overrides model timeout + if "timeout" in model_info and model_info["timeout"] is not None and model_info["timeout"] > 0 and timeout == 7200: + # Model has a timeout and CLI is using default, so use model's timeout + timeout = model_info["timeout"] + + # Create log file for this run + # Extract dockerfile part from docker image name (remove "ci-" prefix and model name prefix) + image_name_without_ci = docker_image.replace("ci-", "") + model_name_clean = model_info["name"].replace("/", "_").lower() + + # Remove model name from the beginning to get the dockerfile part + if image_name_without_ci.startswith(model_name_clean + "_"): + dockerfile_part = image_name_without_ci[len(model_name_clean + "_") :] + else: + dockerfile_part = image_name_without_ci + + log_file_path = ( + model_info["name"].replace("/", "_") + + "_" + + dockerfile_part + + phase_suffix + + ".live.log" + ) + # Replace / with _ in log file path (already done above, but keeping for safety) + log_file_path = log_file_path.replace("/", "_") + + print(f"Run log will be written to: {log_file_path}") + + # get machine name + machine_name = self.console.sh("hostname") + print(f"MACHINE NAME is {machine_name}") + + # Initialize results + run_results = { + "model": model_info["name"], + "docker_image": docker_image, + "status": "FAILURE", + "performance": "", + "metric": "", + "test_duration": 0, + "machine_name": machine_name, + "log_file": log_file_path, + } + + # If build info provided, merge it + if build_info: + run_results.update(build_info) + + # Prepare docker run options + gpu_vendor = self.context.ctx["gpu_vendor"] + docker_options = "" + + if gpu_vendor.find("AMD") != -1: + docker_options = ( + "--network host -u root --group-add video " + "--cap-add=SYS_PTRACE --cap-add SYS_ADMIN --device /dev/fuse " + "--security-opt seccomp=unconfined --security-opt apparmor=unconfined --ipc=host " + ) + elif gpu_vendor.find("NVIDIA") != -1: + docker_options = ( + "-u root --cap-add=SYS_PTRACE --cap-add SYS_ADMIN --cap-add SYS_NICE --device /dev/fuse " + "--security-opt seccomp=unconfined --security-opt apparmor=unconfined " + "--network host --ipc=host " + ) + else: + raise RuntimeError("Unable to determine gpu vendor.") + + # Initialize scripts + pre_encapsulate_post_scripts = { + "pre_scripts": [], + "encapsulate_script": "", + "post_scripts": [], + } + + if "pre_scripts" in self.context.ctx: + pre_encapsulate_post_scripts["pre_scripts"] = self.context.ctx[ + "pre_scripts" + ] + if "post_scripts" in self.context.ctx: + pre_encapsulate_post_scripts["post_scripts"] = self.context.ctx[ + "post_scripts" + ] + if "encapsulate_script" in self.context.ctx: + pre_encapsulate_post_scripts["encapsulate_script"] = self.context.ctx[ + "encapsulate_script" + ] + + # Add environment variables + docker_options += f"--env MAD_MODEL_NAME='{model_info['name']}' " + docker_options += ( + f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' " + ) + + # Gather data and environment + run_env = {} + mount_datapaths = None + + # Merge docker_env_vars from additional_context into context + # Also check shell environment for SLURM-passed variables + if "docker_env_vars" not in self.context.ctx: + self.context.ctx["docker_env_vars"] = {} + + # For SLURM jobs, check shell environment and populate additional_context with GPU info + # This ensures GPU resolution works correctly + if os.environ.get("MAD_DEPLOYMENT_TYPE") == "slurm": + if "NPROC_PER_NODE" in os.environ or "GPUS_PER_NODE" in os.environ: + gpus_per_node_str = os.environ.get("NPROC_PER_NODE") or os.environ.get("GPUS_PER_NODE") + if gpus_per_node_str: + try: + gpus = int(gpus_per_node_str) + # Add gpus_per_node to additional_context for GPU resolution + # resolve_runtime_gpus looks for this field name + if not self.additional_context: + self.additional_context = {} + if "gpus_per_node" not in self.additional_context: + self.additional_context["gpus_per_node"] = gpus + print(f"ℹ️ SLURM GPU override: {gpus} GPUs per node (from shell environment)") + except ValueError: + pass + + # List of environment variables to pass from shell to Docker (for SLURM jobs) + slurm_env_vars = [ + 'MASTER_ADDR', 'MASTER_PORT', 'WORLD_SIZE', 'RANK', 'NODE_RANK', + 'NNODES', 'NPROC_PER_NODE', 'MAD_MULTI_NODE_RUNNER', + 'MAD_COLLECT_METRICS', 'NCCL_SOCKET_IFNAME', 'GLOO_SOCKET_IFNAME', + 'NCCL_DEBUG', 'NCCL_IB_DISABLE', 'NCCL_NET_GDR_LEVEL', + # GPU visibility variables for Ray-based launchers (vLLM, SGLang) + # CRITICAL: These must be passed to Docker for proper GPU device mapping + 'HIP_VISIBLE_DEVICES', 'ROCR_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES' + ] + + # Check shell environment and add to docker_env_vars + merged_from_env = 0 + for var_name in slurm_env_vars: + if var_name in os.environ: + self.context.ctx["docker_env_vars"][var_name] = os.environ[var_name] + merged_from_env += 1 + + # CRITICAL FIX for rocm/vllm image: Override RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES + # The rocm/vllm Docker image has RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 baked in, + # which tells Ray to IGNORE HIP_VISIBLE_DEVICES. We must explicitly override it. + # This is only needed if HIP_VISIBLE_DEVICES is set (indicating AMD GPU usage with Ray) + if 'HIP_VISIBLE_DEVICES' in self.context.ctx["docker_env_vars"]: + # Set to empty string to disable Ray's behavior of ignoring HIP_VISIBLE_DEVICES + self.context.ctx["docker_env_vars"]['RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES'] = '' + print("ℹ️ Overriding RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES to enable HIP_VISIBLE_DEVICES") + + if merged_from_env > 0: + print(f"ℹ️ Inherited {merged_from_env} environment variables from shell for Docker") + + # Also merge from additional_context if present + if self.additional_context and "docker_env_vars" in self.additional_context: + merged_count = 0 + for key, value in self.additional_context["docker_env_vars"].items(): + self.context.ctx["docker_env_vars"][key] = value + merged_count += 1 + if merged_count > 0: + print(f"ℹ️ Merged {merged_count} environment variables from additional_context") + + if "data" in model_info and model_info["data"] != "" and self.data: + mount_datapaths = self.data.get_mountpaths(model_info["data"]) + model_dataenv = self.data.get_env(model_info["data"]) + if model_dataenv is not None: + run_env.update(model_dataenv) + run_env["MAD_DATANAME"] = model_info["data"] + + # Add credentials to environment + if "cred" in model_info and model_info["cred"] != "" and self.credentials: + if model_info["cred"] not in self.credentials: + raise RuntimeError(f"Credentials({model_info['cred']}) not found") + for key_cred, value_cred in self.credentials[model_info["cred"]].items(): + run_env[model_info["cred"] + "_" + key_cred.upper()] = value_cred + + # Apply tools if configured + if os.path.exists(tools_json_file): + self.apply_tools(pre_encapsulate_post_scripts, run_env, tools_json_file) + + # Add system environment collection script to pre_scripts (equivalent to generate_sys_env_details) + # This ensures distributed runs have the same system environment logging as standard runs + if generate_sys_env_details or self.context.ctx.get("gen_sys_env_details"): + self.gather_system_env_details( + pre_encapsulate_post_scripts, model_info["name"] + ) + + # Build docker options + # Use hierarchical GPU resolution: runtime > deployment > model > default + resolved_gpu_count = resolve_runtime_gpus(model_info, self.additional_context) + docker_options += self.get_gpu_arg(str(resolved_gpu_count)) + docker_options += self.get_cpu_arg() + + # Filter out MIOPEN_USER_DB_PATH from run_env if it exists + # It should be passed via docker_env_vars in context instead + if "MIOPEN_USER_DB_PATH" in run_env: + del run_env["MIOPEN_USER_DB_PATH"] + print("ℹ️ Removed MIOPEN_USER_DB_PATH from run_env (will use context.docker_env_vars)") + + # Add MIOPEN_USER_DB_PATH from shell environment to context.docker_env_vars + # This is set by SLURM script with ${LOCAL_RANK} variable for per-process paths + if "MIOPEN_USER_DB_PATH" in os.environ and "MIOPEN_USER_DB_PATH" not in self.context.ctx["docker_env_vars"]: + self.context.ctx["docker_env_vars"]["MIOPEN_USER_DB_PATH"] = os.environ["MIOPEN_USER_DB_PATH"] + print(f"ℹ️ Added MIOPEN_USER_DB_PATH to docker_env_vars: {os.environ['MIOPEN_USER_DB_PATH']}") + + docker_options += self.get_env_arg(run_env) + docker_options += self.get_mount_arg(mount_datapaths) + docker_options += f" {model_info.get('additional_docker_run_options', '')}" + + # Generate container name + base_container_name = "container_" + re.sub( + ".*:", "", docker_image.replace("/", "_").replace(":", "_") + ) + + # For multi-node SLURM jobs, add node rank to avoid name conflicts + node_rank = os.environ.get("SLURM_PROCID") or os.environ.get("RANK") + if node_rank is not None: + container_name = f"{base_container_name}_node{node_rank}" + else: + container_name = base_container_name + + print(f"Docker options: {docker_options}") + + self.rich_console.print(f"\n[bold blue]🏃 Starting Docker container execution...[/bold blue]") + print(f"🏷️ Image: {docker_image}") + print(f"📦 Container: {container_name}") + print(f"📝 Log file: {log_file_path}") + print(f"🎮 GPU Vendor: {gpu_vendor}") + self.rich_console.print(f"[dim]{'='*80}[/dim]") + + # Run the container with logging + try: + with open(log_file_path, mode="w", buffering=1) as outlog: + with redirect_stdout( + PythonicTee(outlog, self.live_output) + ), redirect_stderr(PythonicTee(outlog, self.live_output)): + # set timeout (print inside log redirection so it appears in log file) + print(f"⏰ Setting timeout to {str(timeout)} seconds.") + + with Timeout(timeout): + model_docker = Docker( + docker_image, + container_name, + docker_options, + keep_alive=keep_alive, + console=self.console, + ) + + # Check user + whoami = model_docker.sh("whoami") + print(f"👤 Running as user: {whoami}") + + # Show GPU info with version-aware tool selection (PR #54) + if gpu_vendor.find("AMD") != -1: + print(f"🎮 Checking AMD GPU status...") + # Use version-aware SMI tool selection + # Note: Use amd-smi without arguments to show full status table (same as legacy madengine) + try: + tool_manager = self.context._get_tool_manager() + preferred_tool = tool_manager.get_preferred_smi_tool() + if preferred_tool == "amd-smi": + model_docker.sh("/opt/rocm/bin/amd-smi || /opt/rocm/bin/rocm-smi || true") + else: + model_docker.sh("/opt/rocm/bin/rocm-smi || /opt/rocm/bin/amd-smi || true") + except Exception: + # Fallback: try both tools + model_docker.sh("/opt/rocm/bin/amd-smi || /opt/rocm/bin/rocm-smi || true") + elif gpu_vendor.find("NVIDIA") != -1: + print(f"🎮 Checking NVIDIA GPU status...") + model_docker.sh("/usr/bin/nvidia-smi || true") + + # Prepare model directory + model_dir = "run_directory" + if "url" in model_info and model_info["url"] != "": + model_dir = model_info["url"].rstrip("/").split("/")[-1] + + # Validate model_dir + special_char = r"[^a-zA-Z0-9\-\_]" + if re.search(special_char, model_dir) is not None: + warnings.warn( + "Model url contains special character. Fix url." + ) + + model_docker.sh(f"rm -rf {model_dir}", timeout=240) + model_docker.sh( + "git config --global --add safe.directory /myworkspace" + ) + + # Clone model repo if needed + if "url" in model_info and model_info["url"] != "": + if ( + "cred" in model_info + and model_info["cred"] != "" + and self.credentials + ): + print(f"Using credentials for {model_info['cred']}") + + if model_info["url"].startswith("ssh://"): + model_docker.sh( + f"git -c core.sshCommand='ssh -l {self.credentials[model_info['cred']]['username']} " + f"-i {self.credentials[model_info['cred']]['ssh_key_file']} -o IdentitiesOnly=yes " + f"-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no' " + f"clone {model_info['url']}", + timeout=240, + ) + else: # http or https + model_docker.sh( + f"git clone -c credential.helper='!f() {{ echo username={self.credentials[model_info['cred']]['username']}; " + f"echo password={self.credentials[model_info['cred']]['password']}; }};f' " + f"{model_info['url']}", + timeout=240, + secret=f"git clone {model_info['url']}", + ) + else: + model_docker.sh( + f"git clone {model_info['url']}", timeout=240 + ) + + model_docker.sh( + f"git config --global --add safe.directory /myworkspace/{model_dir}" + ) + run_results["git_commit"] = model_docker.sh( + f"cd {model_dir} && git rev-parse HEAD" + ) + print(f"MODEL GIT COMMIT is {run_results['git_commit']}") + model_docker.sh( + f"cd {model_dir}; git submodule update --init --recursive" + ) + else: + model_docker.sh(f"mkdir -p {model_dir}") + + # Run pre-scripts + if pre_encapsulate_post_scripts["pre_scripts"]: + self.run_pre_post_script( + model_docker, + model_dir, + pre_encapsulate_post_scripts["pre_scripts"], + ) + + # Prepare script execution + scripts_arg = model_info["scripts"] + if scripts_arg.endswith(".sh"): + # Shell script specified directly + dir_path = os.path.dirname(scripts_arg) + script_name = "bash " + os.path.basename(scripts_arg) + elif scripts_arg.endswith(".py"): + # Python script specified directly + dir_path = os.path.dirname(scripts_arg) + script_name = "python3 " + os.path.basename(scripts_arg) + else: + # Directory specified (legacy behavior) + dir_path = model_info["scripts"] + script_name = "bash run.sh" + + # Add script prepend command + script_name = ( + pre_encapsulate_post_scripts["encapsulate_script"] + + " " + + script_name + ) + + # print repo hash + commit = model_docker.sh( + f"cd {dir_path}; git rev-parse HEAD || true" + ) + print("======================================================") + print("MODEL REPO COMMIT: ", commit) + print("======================================================") + + # Copy scripts to model directory + model_docker.sh( + f"cp -vLR --preserve=all {dir_path}/. {model_dir}/" + ) + + # Prepare data if needed + if ( + "data" in model_info + and model_info["data"] != "" + and self.data + ): + self.data.prepare_data(model_info["data"], model_docker) + + # Capture data provider information from selected_data_provider + if ( + hasattr(self.data, "selected_data_provider") + and self.data.selected_data_provider + ): + if "dataname" in self.data.selected_data_provider: + run_results["dataname"] = self.data.selected_data_provider["dataname"] + if "data_provider_type" in self.data.selected_data_provider: + run_results["data_provider_type"] = self.data.selected_data_provider["data_provider_type"] + if "duration" in self.data.selected_data_provider: + run_results["data_download_duration"] = self.data.selected_data_provider["duration"] + if "size" in self.data.selected_data_provider: + run_results["data_size"] = self.data.selected_data_provider["size"] + print( + f"Data Provider Details: {run_results.get('dataname', '')}, " + f"{run_results.get('data_provider_type', '')}, " + f"{run_results.get('data_size', '')}, " + f"{run_results.get('data_download_duration', '')}s" + ) + + # Set permissions + model_docker.sh(f"chmod -R a+rw {model_dir}") + + # Run the model + test_start_time = time.time() + self.rich_console.print("[bold blue]Running model...[/bold blue]") + + model_args = self.context.ctx.get( + "model_args", model_info["args"] + ) + # Use the container timeout (default 7200s) for script execution + # to prevent indefinite hangs + model_output = model_docker.sh( + f"cd {model_dir} && {script_name} {model_args}", + timeout=timeout, + ) + # Print output to ensure it gets captured in log file + print(model_output) + + run_results["test_duration"] = time.time() - test_start_time + print(f"Test Duration: {run_results['test_duration']} seconds") + + # Run post-scripts + if pre_encapsulate_post_scripts["post_scripts"]: + self.run_pre_post_script( + model_docker, + model_dir, + pre_encapsulate_post_scripts["post_scripts"], + ) + + # Extract performance metrics from logs + # Look for performance data in the log output similar to original run_models.py + try: + # Check if multiple results file is specified in model_info + multiple_results = model_info.get("multiple_results", None) + + if multiple_results: + run_results["performance"] = multiple_results + # Validate multiple results file format using proper CSV parsing + try: + import csv + with open(multiple_results, "r") as f: + csv_reader = csv.DictReader(f) + + # Check if 'performance' column exists + if 'performance' not in csv_reader.fieldnames: + print("Error: 'performance' column not found in multiple results file.") + run_results["performance"] = None + else: + # Check if at least one row has a non-empty performance value + has_valid_perf = False + for row in csv_reader: + if row.get('performance', '').strip(): + has_valid_perf = True + break + + if not has_valid_perf: + run_results["performance"] = None + print("Error: Performance metric is empty in all rows of multiple results file.") + except Exception as e: + self.rich_console.print( + f"[yellow]Warning: Could not validate multiple results file: {e}[/yellow]" + ) + run_results["performance"] = None + else: + # Match the actual output format: "performance: 14164 samples_per_second" + # Simple pattern to capture number and metric unit + + # Extract from log file + try: + # Note: re and os are already imported at module level (lines 10, 15) + + # Verify log file exists and is readable + if not os.path.exists(log_file_path): + print(f"Warning: Log file not found: {log_file_path}") + run_results["performance"] = None + run_results["metric"] = None + else: + # Read the log file once (avoids rocprofv3 crash from shell pipelines) + # This approach matches the Kubernetes implementation pattern + with open(log_file_path, 'r', encoding='utf-8', errors='ignore') as f: + log_content = f.read() + + # Try multiple patterns to match different log formats + + # Pattern 1: "performance: 12345 metric_name" (original expected format) + perf_pattern = r'performance:\s+([0-9][0-9.eE+-]*)\s+([a-zA-Z_][a-zA-Z0-9_]*)' + match = re.search(perf_pattern, log_content) + + if match: + run_results["performance"] = match.group(1).strip() + run_results["metric"] = match.group(2).strip() + print(f"✓ Extracted performance: {run_results['performance']} {run_results['metric']}") + else: + # Pattern 2: HuggingFace format - "'train_samples_per_second': 4.23" or "train_samples_per_second = 4.23" + # This matches the actual output from HuggingFace Trainer + hf_pattern = r'train_samples_per_second[\'"\s:=]+([0-9][0-9.eE+-]*)' + hf_match = re.search(hf_pattern, log_content) + + if hf_match: + run_results["performance"] = hf_match.group(1).strip() + run_results["metric"] = "samples_per_second" + print(f"✓ Extracted performance (HuggingFace format): {run_results['performance']} {run_results['metric']}") + else: + # No performance metrics found + print("Warning: Performance metric not found in expected format 'performance: NUMBER METRIC' or 'train_samples_per_second'") + run_results["performance"] = None + run_results["metric"] = None + + except Exception as e: + print(f"Warning: Error extracting performance metrics: {e}") + run_results["performance"] = None + run_results["metric"] = None + # Performance extraction is optional - don't fail the entire run + except Exception as e: + print( + f"Warning: Could not extract performance metrics: {e}" + ) + + # Set status based on performance and error patterns + # First check for obvious failure patterns in the logs + try: + # Check for common failure patterns in the log file + # Note: Patterns should be specific enough to avoid false positives + # from profiling tools (rocprof, etc.) that use "Error:" as log level + error_patterns = [ + "OutOfMemoryError", + "HIP out of memory", + "CUDA out of memory", + "RuntimeError:", # More specific with colon + "AssertionError:", + "ValueError:", + "SystemExit", + "failed \\(exitcode:", # Escape parenthesis for grep + "Traceback \\(most recent call last\\)", # Python tracebacks + "FAILED", + "Exception:", + "ImportError:", + "ModuleNotFoundError:", + ] + + has_errors = False + if log_file_path and os.path.exists(log_file_path): + try: + # Define benign patterns to exclude from error detection + # These are known warnings/info messages that should not trigger failures + benign_patterns = [ + "Failed to establish connection to the metrics exporter agent", + "RpcError: Running out of retries to initialize the metrics agent", + "Metrics will not be exported", + "FutureWarning", + # ROCProf/glog logging patterns (E/W/I prefixes are log levels, not errors) + r"^E[0-9]{8}.*generateRocpd\.cpp", # ROCProf error-level logs + r"^W[0-9]{8}.*simple_timer\.cpp", # ROCProf warning-level logs + r"^W[0-9]{8}.*generateRocpd\.cpp", # ROCProf warning-level logs + r"^E[0-9]{8}.*tool\.cpp", # ROCProf tool logs + "Opened result file:", # ROCProf result file messages + "SQLite3 generation ::", # ROCProf SQLite messages + r"\[rocprofv3\]", # ROCProf v3 messages + "rocpd_op:", # ROCProf operation logs + "rpd_tracer:", # ROCProf tracer logs + ] + + # Check for error patterns in the log (exclude our own grep commands, output messages, and benign patterns) + for pattern in error_patterns: + # Build exclusion regex: our own commands, output messages, and benign patterns + exclusions = f"(grep -q.*{pattern}|Found error pattern.*{pattern}" + for benign in benign_patterns: + # Escape special regex characters in benign patterns + escaped_benign = benign.replace(".", r"\.").replace("(", r"\(").replace(")", r"\)") + exclusions += f"|{escaped_benign}" + exclusions += ")" + + # Use grep with -v to exclude false positives + error_check_cmd = f"grep -v -E '{exclusions}' {log_file_path} | grep -q '{pattern}' && echo 'FOUND' || echo 'NOT_FOUND'" + result = self.console.sh( + error_check_cmd, canFail=True + ) + if result.strip() == "FOUND": + has_errors = True + print( + f"Found error pattern '{pattern}' in logs" + ) + break + except Exception: + pass # Error checking is optional + + # Status logic: Must have performance AND no errors to be considered success + # Exception: Worker nodes in multi-node training (MAD_COLLECT_METRICS=false) + # are not expected to report global performance metrics + performance_value = run_results.get("performance") + has_performance = ( + performance_value + and performance_value.strip() + and performance_value.strip() != "N/A" + ) + + # Check if this is a worker node (not collecting metrics) + is_worker_node = os.environ.get("MAD_COLLECT_METRICS", "true").lower() == "false" + + if has_errors: + run_results["status"] = "FAILURE" + self.rich_console.print( + f"[red]Status: FAILURE (error patterns detected in logs)[/red]" + ) + elif has_performance: + run_results["status"] = "SUCCESS" + self.rich_console.print( + f"[green]Status: SUCCESS (performance metrics found, no errors)[/green]" + ) + elif is_worker_node: + # Worker nodes don't report global performance metrics - this is expected + run_results["status"] = "SUCCESS" + self.rich_console.print( + f"[green]Status: SUCCESS (worker node, no errors detected)[/green]" + ) + else: + run_results["status"] = "FAILURE" + self.rich_console.print(f"[red]Status: FAILURE (no performance metrics)[/red]") + + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Error in status determination: {e}[/yellow]") + # Fallback to simple performance check + # Worker nodes don't need performance metrics + is_worker_node = os.environ.get("MAD_COLLECT_METRICS", "true").lower() == "false" + run_results["status"] = ( + "SUCCESS" + if run_results.get("performance") or is_worker_node + else "FAILURE" + ) + + print( + f"{model_info['name']} performance is {run_results.get('performance', 'N/A')} {run_results.get('metric', '')}" + ) + + # ============================================================================= + # Multi-Node Performance Collection (Master Node Only) + # ============================================================================= + # For distributed training, only master node should collect metrics + # Check skip_perf_collection flag from additional_context + skip_perf = self.additional_context.get("skip_perf_collection", False) + + if skip_perf: + self.rich_console.print( + "[cyan]ℹ️ Worker node: Skipping performance metric collection " + "(master node will collect results)[/cyan]" + ) + else: + # Generate performance results and update perf.csv + self.ensure_perf_csv_exists() + try: + # Create run details dictionary for CSV generation + run_details_dict = self.create_run_details_dict( + model_info, build_info, run_results + ) + + # Handle multiple results if specified + multiple_results = model_info.get("multiple_results", None) + if ( + multiple_results + and run_results.get("status") == "SUCCESS" + ): + # Generate common info JSON for multiple results + common_info = run_details_dict.copy() + # Remove model-specific fields for common info + for key in ["model", "performance", "metric", "status"]: + common_info.pop(key, None) + + with open("common_info.json", "w") as f: + json.dump(common_info, f) + + # Update perf.csv with multiple results + update_perf_csv( + multiple_results=multiple_results, + perf_csv=self.perf_csv_path, + model_name=run_details_dict["model"], + common_info="common_info.json", + ) + print( + f"Updated perf.csv with multiple results for {model_info['name']}" + ) + + # Update perf_super.json with multiple results + try: + scripts_path = model_info.get("scripts", "") + scripts_base_dir = os.path.dirname(scripts_path) if scripts_path else None + + # Reuse common_info.json for super files (no need for duplicate) + num_entries = update_perf_super_json( + multiple_results=multiple_results, + perf_super_json="perf_super.json", + model_name=run_details_dict["model"], + common_info="common_info.json", + scripts_base_dir=scripts_base_dir, + ) + + # Generate CSV and JSON files from perf_super.json + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv", + num_entries=num_entries + ) + except Exception as e: + print(f"⚠️ Warning: Could not update perf_super files: {e}") + else: + # Generate single result JSON + with open("perf_entry.json", "w") as f: + json.dump(run_details_dict, f) + + # Update perf.csv with single result + if run_results.get("status") == "SUCCESS": + update_perf_csv( + single_result="perf_entry.json", + perf_csv=self.perf_csv_path, + ) + else: + update_perf_csv( + exception_result="perf_entry.json", + perf_csv=self.perf_csv_path, + ) + print( + f"Updated perf.csv with result for {model_info['name']}" + ) + + # Update perf_super.json with single result + try: + scripts_path = model_info.get("scripts", "") + scripts_base_dir = os.path.dirname(scripts_path) if scripts_path else None + + # Use perf_entry.json as input (already created above) + if run_results.get("status") == "SUCCESS": + num_entries = update_perf_super_json( + single_result="perf_entry.json", + perf_super_json="perf_super.json", + scripts_base_dir=scripts_base_dir, + ) + else: + num_entries = update_perf_super_json( + exception_result="perf_entry.json", + perf_super_json="perf_super.json", + scripts_base_dir=scripts_base_dir, + ) + + # Generate CSV and JSON files from perf_super.json + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv", + num_entries=num_entries + ) + except Exception as e: + print(f"⚠️ Warning: Could not update perf_super files: {e}") + + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Could not update perf.csv: {e}[/yellow]") + + # Copy profiler/trace output files from run_directory to base directory before cleanup + # This ensures test files like gpu_info_power_profiler_output.csv and library_trace.csv are accessible + try: + model_docker.sh(f"cp {model_dir}/*_profiler_output.csv . 2>/dev/null || true") + model_docker.sh(f"cp {model_dir}/*_output.csv . 2>/dev/null || true") + model_docker.sh(f"cp {model_dir}/*_trace.csv . 2>/dev/null || true") + model_docker.sh(f"cp {model_dir}/library_trace.csv . 2>/dev/null || true") + except Exception as e: + # Ignore errors if no profiler/trace output files exist + pass + + # Cleanup if not keeping alive and not keeping model directory + if not keep_alive and not keep_model_dir: + model_docker.sh(f"rm -rf {model_dir}", timeout=240) + else: + model_docker.sh(f"chmod -R a+rw {model_dir}") + reason = "keep_alive" if keep_alive else "keep_model_dir" + print( + f"{reason} specified; model_dir({model_dir}) is not removed" + ) + + # Explicitly delete model docker to stop the container + del model_docker + + except Exception as e: + self.rich_console.print("[bold red]===== EXCEPTION =====[/bold red]") + self.rich_console.print(f"[red]Exception: {e}[/red]") + import traceback + + traceback.print_exc() + self.rich_console.print("[bold red]=============== =====[/bold red]") + run_results["status"] = "FAILURE" + + # Also update perf.csv for failures + self.ensure_perf_csv_exists() + try: + # Create run details dictionary for failed runs + run_details_dict = self.create_run_details_dict( + model_info, build_info, run_results + ) + + # Generate exception result JSON + with open("perf_entry.json", "w") as f: + json.dump(run_details_dict, f) + + # Update perf.csv with exception result + update_perf_csv( + exception_result="perf_entry.json", + perf_csv=self.perf_csv_path, + ) + print( + f"Updated perf.csv with exception result for {model_info['name']}" + ) + + # Update perf_super.json with exception result + try: + scripts_path = model_info.get("scripts", "") + scripts_base_dir = os.path.dirname(scripts_path) if scripts_path else None + + # Use perf_entry.json as input (already created above) + num_entries = update_perf_super_json( + exception_result="perf_entry.json", + perf_super_json="perf_super.json", + scripts_base_dir=scripts_base_dir, + ) + + # Generate CSV and JSON files from perf_super.json + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv", + num_entries=num_entries + ) + except Exception as e: + print(f"⚠️ Warning: Could not update perf_super files: {e}") + + except Exception as csv_e: + self.rich_console.print(f"[yellow]Warning: Could not update perf.csv with exception: {csv_e}[/yellow]") + + return run_results + + def set_credentials(self, credentials: typing.Dict) -> None: + """Set credentials for model execution. + + Args: + credentials: Credentials dictionary + """ + self.credentials = credentials + + def run_models_from_manifest( + self, + manifest_file: str, + registry: str = None, + timeout: int = 7200, + keep_alive: bool = False, + keep_model_dir: bool = False, + phase_suffix: str = "", + ) -> typing.Dict: + """Run all models from a build manifest file. + + This is the main entry point for running pre-built containers from a manifest. + + Args: + manifest_file: Path to build_manifest.json + registry: Optional registry override + timeout: Execution timeout per model in seconds + keep_alive: Whether to keep containers alive after execution + keep_model_dir: Whether to keep model directory after execution + phase_suffix: Suffix for log files (e.g., ".run") + + Returns: + dict: Execution summary with successful and failed runs + """ + self.rich_console.print(f"[bold blue]📦 Loading manifest:[/bold blue] {manifest_file}") + + # Load manifest + manifest = self.load_build_manifest(manifest_file) + built_images = manifest.get("built_images", {}) + built_models = manifest.get("built_models", {}) + + # Load deployment_config from manifest for GPU resolution + if "deployment_config" in manifest and not self.additional_context: + self.additional_context = {"deployment_config": manifest["deployment_config"]} + + if not built_images: + self.rich_console.print("[yellow]⚠️ No images found in manifest[/yellow]") + return {"successful_runs": [], "failed_runs": []} + + self.rich_console.print(f"[green]Found {len(built_images)} image(s) to run[/green]\n") + + # Login to registry if needed + if registry or any(img.get("registry") for img in built_images.values()): + effective_registry = registry or next( + (img.get("registry") for img in built_images.values() if img.get("registry")), + None + ) + if effective_registry: + try: + self.login_to_registry(effective_registry, self.credentials) + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Registry login failed: {e}[/yellow]") + self.rich_console.print("[yellow]Proceeding with local images only[/yellow]\n") + + # Track results + successful_runs = [] + failed_runs = [] + + # Run each model + for image_name, build_info in built_images.items(): + model_info = built_models.get(image_name, {}) + if not model_info: + self.rich_console.print(f"[yellow]⚠️ No model info for {image_name}, skipping[/yellow]") + continue + + try: + # Handle different image sources + if build_info.get("local_image"): + # Local image mode (MAD_CONTAINER_IMAGE): Use the provided image directly + run_image = build_info.get("docker_image") + self.rich_console.print(f"[yellow]🏠 Using local image: {run_image}[/yellow]") + + # Verify image exists + try: + self.console.sh(f"docker image inspect {run_image} > /dev/null 2>&1") + except (subprocess.CalledProcessError, RuntimeError) as e: + self.rich_console.print(f"[yellow]⚠️ Image {run_image} not found, attempting to pull...[/yellow]") + try: + self.pull_image(run_image) + except Exception as e: + raise RuntimeError(f"Failed to find or pull local image {run_image}: {e}") + + elif build_info.get("registry_image"): + # Registry image: Pull from registry + try: + self.pull_image(build_info["registry_image"]) + # Update docker_image to use registry image + run_image = build_info["registry_image"] + except Exception as pull_error: + self.rich_console.print(f"[yellow]Warning: Could not pull from registry, using local image[/yellow]") + run_image = image_name + else: + # Normal built image: Use the image name directly + run_image = image_name + + # Run the container + run_results = self.run_container( + model_info=model_info, + docker_image=run_image, + build_info=build_info, + keep_alive=keep_alive, + keep_model_dir=keep_model_dir, + timeout=timeout, + phase_suffix=phase_suffix, + ) + + # Check actual status and track accordingly + status = run_results.get("status", "SUCCESS") + if status == "SUCCESS": + successful_runs.append({ + "model": model_info["name"], + "image": run_image, + "status": status, + "performance": run_results.get("performance"), + "duration": run_results.get("test_duration"), + }) + else: + # Status is FAILURE - track as failed + failed_runs.append({ + "model": model_info["name"], + "image": run_image, + "status": status, + "error": "Container execution failed - check logs for details", + }) + self.rich_console.print(f"[red]❌ Run failed for {model_info['name']}: Status={status}[/red]") + + except Exception as e: + self.rich_console.print(f"[red]❌ Failed to run {model_info['name']}: {e}[/red]") + failed_runs.append({ + "model": model_info.get("name", image_name), + "image": image_name, + "error": str(e), + }) + + # Summary + self.rich_console.print(f"\n[bold]📊 Execution Summary:[/bold]") + self.rich_console.print(f" [green]✓ Successful:[/green] {len(successful_runs)}") + self.rich_console.print(f" [red]✗ Failed:[/red] {len(failed_runs)}") + + return { + "successful_runs": successful_runs, + "failed_runs": failed_runs, + "total_runs": len(successful_runs) + len(failed_runs), + } diff --git a/src/madengine/execution/docker_builder.py b/src/madengine/execution/docker_builder.py new file mode 100644 index 00000000..3901d864 --- /dev/null +++ b/src/madengine/execution/docker_builder.py @@ -0,0 +1,1116 @@ +#!/usr/bin/env python3 +""" +Docker Image Builder Module for madengine + +This module handles the Docker image building phase separately from execution, +enabling distributed workflows where images are built on a central host +and then distributed to remote nodes for execution. +""" + +import os +import time +import json +import re +import typing +from contextlib import redirect_stdout, redirect_stderr +from rich.console import Console as RichConsole +from madengine.core.console import Console +from madengine.core.context import Context +from madengine.utils.ops import PythonicTee + + +class DockerBuilder: + """Class responsible for building Docker images for models.""" + + # GPU architecture variables used in MAD/DLM Dockerfiles + GPU_ARCH_VARIABLES = [ + "MAD_SYSTEM_GPU_ARCHITECTURE", + "PYTORCH_ROCM_ARCH", + "GPU_TARGETS", + "GFX_COMPILATION_ARCH", + "GPU_ARCHS" + ] + + def __init__( + self, context: Context, console: Console = None, live_output: bool = False + ): + """Initialize the Docker Builder. + + Args: + context: The madengine context + console: Optional console instance + live_output: Whether to show live output + """ + self.context = context + self.console = console or Console(live_output=live_output) + self.live_output = live_output + self.rich_console = RichConsole() + self.built_images = {} # Track built images + self.built_models = {} # Track built models + + def get_context_path(self, info: typing.Dict) -> str: + """Get the context path for Docker build. + + Args: + info: The model info dict. + + Returns: + str: The context path. + """ + if "dockercontext" in info and info["dockercontext"] != "": + return info["dockercontext"] + else: + return "./docker" + + def get_build_arg(self, run_build_arg: typing.Dict = {}) -> str: + """Get the build arguments. + + Args: + run_build_arg: The run build arguments. + + Returns: + str: The build arguments. + """ + if not run_build_arg and "docker_build_arg" not in self.context.ctx: + return "" + + build_args = "" + for build_arg in self.context.ctx["docker_build_arg"].keys(): + build_args += ( + "--build-arg " + + build_arg + + "='" + + self.context.ctx["docker_build_arg"][build_arg] + + "' " + ) + + if run_build_arg: + for key, value in run_build_arg.items(): + build_args += "--build-arg " + key + "='" + value + "' " + + return build_args + + def build_image( + self, + model_info: typing.Dict, + dockerfile: str, + credentials: typing.Dict = None, + clean_cache: bool = False, + phase_suffix: str = "", + additional_build_args: typing.Dict[str, str] = None, + override_image_name: str = None, + ) -> typing.Dict: + """Build a Docker image for the given model. + + Args: + model_info: The model information dictionary + dockerfile: Path to the Dockerfile + credentials: Optional credentials dictionary + clean_cache: Whether to use --no-cache + phase_suffix: Suffix for log file name (e.g., ".build" or "") + additional_build_args: Additional build arguments to pass to Docker + override_image_name: Override the generated image name + + Returns: + dict: Build information including image name, build duration, etc. + """ + # Generate image name first + if override_image_name: + docker_image = override_image_name + else: + image_docker_name = ( + model_info["name"].replace("/", "_").lower() + + "_" + + os.path.basename(dockerfile).replace(".Dockerfile", "") + ) + docker_image = "ci-" + image_docker_name + + # Create log file for this build + cur_docker_file_basename = os.path.basename(dockerfile).replace( + ".Dockerfile", "" + ) + log_file_path = ( + model_info["name"].replace("/", "_") + + "_" + + cur_docker_file_basename + + phase_suffix + + ".live.log" + ) + # Replace / with _ in log file path (already done above, but keeping for safety) + log_file_path = log_file_path.replace("/", "_") + + self.rich_console.print(f"\n[bold green]🔨 Starting Docker build for model:[/bold green] [bold cyan]{model_info['name']}[/bold cyan]") + print(f"📁 Dockerfile: {dockerfile}") + print(f"🏷️ Target image: {docker_image}") + print(f"📝 Build log: {log_file_path}") + self.rich_console.print(f"[dim]{'='*80}[/dim]") + + # Get docker context + docker_context = self.get_context_path(model_info) + + # Prepare build args + run_build_arg = {} + if "cred" in model_info and model_info["cred"] != "" and credentials: + if model_info["cred"] not in credentials: + raise RuntimeError( + f"Credentials({model_info['cred']}) not found for model {model_info['name']}" + ) + # Add cred to build args + for key_cred, value_cred in credentials[model_info["cred"]].items(): + run_build_arg[model_info["cred"] + "_" + key_cred.upper()] = value_cred + + # Add additional build args if provided (for multi-architecture builds) + if additional_build_args: + run_build_arg.update(additional_build_args) + + build_args = self.get_build_arg(run_build_arg) + + use_cache_str = "--no-cache" if clean_cache else "" + + # Build the image with logging + build_start_time = time.time() + + build_command = ( + f"docker build {use_cache_str} --network=host " + f"-t {docker_image} --pull -f {dockerfile} " + f"{build_args} {docker_context}" + ) + + # Execute build with log redirection + with open(log_file_path, mode="w", buffering=1) as outlog: + with redirect_stdout( + PythonicTee(outlog, self.live_output) + ), redirect_stderr(PythonicTee(outlog, self.live_output)): + print(f"🔨 Executing build command...") + self.console.sh(build_command, timeout=None) + + build_duration = time.time() - build_start_time + + print(f"⏱️ Build Duration: {build_duration:.2f} seconds") + print(f"🏷️ MAD_CONTAINER_IMAGE is {docker_image}") + self.rich_console.print(f"[bold green]✅ Docker build completed successfully[/bold green]") + self.rich_console.print(f"[dim]{'='*80}[/dim]") + + # Get base docker info + base_docker = "" + if ( + "docker_build_arg" in self.context.ctx + and "BASE_DOCKER" in self.context.ctx["docker_build_arg"] + ): + base_docker = self.context.ctx["docker_build_arg"]["BASE_DOCKER"] + else: + base_docker = self.console.sh( + f"grep '^ARG BASE_DOCKER=' {dockerfile} | sed -E 's/ARG BASE_DOCKER=//g'" + ) + + print(f"BASE DOCKER is {base_docker}") + + # Get docker SHA + docker_sha = "" + try: + docker_sha = self.console.sh( + f'docker manifest inspect {base_docker} | grep digest | head -n 1 | cut -d \\" -f 4' + ) + print(f"BASE DOCKER SHA is {docker_sha}") + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Could not get docker SHA: {e}[/yellow]") + + # Infer GPU vendor from dockerfile path + gpu_vendor = self._infer_gpu_vendor_from_dockerfile(dockerfile) + + build_info = { + "model": model_info["name"], + "docker_image": docker_image, + "dockerfile": dockerfile, + "base_docker": base_docker, + "docker_sha": docker_sha, + "build_duration": build_duration, + "build_command": build_command, + "log_file": log_file_path, + "gpu_vendor": gpu_vendor, # Add GPU vendor for filtering + } + + # Store built image info + self.built_images[docker_image] = build_info + + # Store model info linked to the built image + self.built_models[docker_image] = model_info + + self.rich_console.print(f"[bold green]Successfully built image:[/bold green] [cyan]{docker_image}[/cyan]") + + return build_info + + def login_to_registry(self, registry: str, credentials: typing.Dict = None) -> None: + """Login to a Docker registry. + + Args: + registry: Registry URL (e.g., "localhost:5000", "docker.io", or empty for DockerHub) + credentials: Optional credentials dictionary containing username/password + """ + if not credentials: + print("No credentials provided for registry login") + return + + # Check if registry credentials are available + registry_key = registry if registry else "dockerhub" + + # Handle docker.io as dockerhub + if registry and registry.lower() == "docker.io": + registry_key = "dockerhub" + + if registry_key not in credentials: + error_msg = f"No credentials found for registry: {registry_key}" + if registry_key == "dockerhub": + error_msg += f"\nPlease add dockerhub credentials to credential.json:\n" + error_msg += "{\n" + error_msg += ' "dockerhub": {\n' + error_msg += ' "repository": "your-repository",\n' + error_msg += ' "username": "your-dockerhub-username",\n' + error_msg += ' "password": "your-dockerhub-password-or-token"\n' + error_msg += " }\n" + error_msg += "}" + else: + error_msg += ( + f"\nPlease add {registry_key} credentials to credential.json:\n" + ) + error_msg += "{\n" + error_msg += f' "{registry_key}": {{\n' + error_msg += f' "repository": "your-repository",\n' + error_msg += f' "username": "your-{registry_key}-username",\n' + error_msg += f' "password": "your-{registry_key}-password"\n' + error_msg += " }\n" + error_msg += "}" + self.rich_console.print(f"[red]{error_msg}[/red]") + raise RuntimeError(error_msg) + + creds = credentials[registry_key] + + if "username" not in creds or "password" not in creds: + error_msg = f"Invalid credentials format for registry: {registry_key}" + error_msg += f"\nCredentials must contain 'username' and 'password' fields" + self.rich_console.print(f"[red]{error_msg}[/red]") + raise RuntimeError(error_msg) + + # Ensure credential values are strings + username = str(creds["username"]) + password = str(creds["password"]) + + # Perform docker login + login_command = f"echo '{password}' | docker login" + + if registry and registry.lower() not in ["docker.io", "dockerhub"]: + login_command += f" {registry}" + + login_command += f" --username {username} --password-stdin" + + try: + self.console.sh(login_command, secret=True) + self.rich_console.print(f"[green]✅ Successfully logged in to registry: {registry or 'DockerHub'}[/green]") + except Exception as e: + self.rich_console.print(f"[red]❌ Failed to login to registry {registry}: {e}[/red]") + raise + + def push_image( + self, + docker_image: str, + registry: str = None, + credentials: typing.Dict = None, + explicit_registry_image: str = None, + ) -> str: + """Push the built image to a registry. + + Args: + docker_image: The local docker image name + registry: Optional registry URL (e.g., "localhost:5000", "docker.io", or empty for DockerHub) + credentials: Optional credentials dictionary for registry authentication + + Returns: + str: The full registry image name + """ + if not registry: + print(f"No registry specified, image remains local: {docker_image}") + return docker_image + + # Login to registry if credentials are provided + if credentials: + self.login_to_registry(registry, credentials) + + # Determine registry image name (this should match what was already determined) + if explicit_registry_image: + registry_image = explicit_registry_image + else: + registry_image = self._determine_registry_image_name( + docker_image, registry, credentials + ) + + try: + # Tag the image if different from local name + if registry_image != docker_image: + print(f"Tagging image: docker tag {docker_image} {registry_image}") + tag_command = f"docker tag {docker_image} {registry_image}" + self.console.sh(tag_command) + else: + print( + f"No tag needed, docker_image and registry_image are the same: {docker_image}" + ) + + # Push the image + push_command = f"docker push {registry_image}" + self.rich_console.print(f"\n[bold blue]🚀 Starting docker push to registry...[/bold blue]") + print(f"📤 Registry: {registry}") + print(f"🏷️ Image: {registry_image}") + self.console.sh(push_command) + + self.rich_console.print(f"[bold green]✅ Successfully pushed image to registry:[/bold green] [cyan]{registry_image}[/cyan]") + self.rich_console.print(f"[dim]{'='*80}[/dim]") + return registry_image + + except Exception as e: + self.rich_console.print(f"[red]❌ Failed to push image {docker_image} to registry {registry}: {e}[/red]") + raise + + def export_build_manifest( + self, + output_file: str = "build_manifest.json", + registry: str = None, + batch_build_metadata: typing.Optional[dict] = None, + ) -> None: + """Export enhanced build information to a manifest file. + + This creates a comprehensive build manifest that includes all necessary + information for deployment, reducing the need for separate execution configs. + + Args: + output_file: Path to output manifest file + registry: Registry used for building (added to each image entry) + batch_build_metadata: Optional metadata for batch builds + """ + # Extract credentials from models + credentials_required = list( + set( + [ + model.get("cred", "") + for model in self.built_models.values() + if model.get("cred", "") != "" + ] + ) + ) + + # Set registry for each built image + for image_name, build_info in self.built_images.items(): + # If registry is not set in build_info, set it from argument + if registry: + build_info["registry"] = registry + + # If registry is set in batch_build_metadata, override it + docker_file = build_info.get("dockerfile", "") + truncated_docker_file = docker_file.split("/")[-1].split(".Dockerfile")[0] + model_name = ( + image_name.split("ci-")[1].split(truncated_docker_file)[0].rstrip("_") + ) + if batch_build_metadata and model_name in batch_build_metadata: + self.rich_console.print( + f"[yellow]Overriding registry for {model_name} from batch_build_metadata[/yellow]" + ) + build_info["registry"] = batch_build_metadata[model_name].get( + "registry" + ) + + manifest = { + "built_images": self.built_images, + "built_models": self.built_models, + "context": { + "docker_env_vars": self.context.ctx.get("docker_env_vars", {}), + "docker_mounts": self.context.ctx.get("docker_mounts", {}), + "docker_build_arg": self.context.ctx.get("docker_build_arg", {}), + "gpu_vendor": self.context.ctx.get("gpu_vendor", ""), + "guest_os": self.context.ctx.get("guest_os", ""), + "docker_gpus": self.context.ctx.get("docker_gpus", ""), + }, + "credentials_required": credentials_required, + } + + # Preserve tools configuration if present in context + if "tools" in self.context.ctx: + manifest["context"]["tools"] = self.context.ctx["tools"] + + # Preserve pre/post scripts if present in context + if "pre_scripts" in self.context.ctx: + manifest["context"]["pre_scripts"] = self.context.ctx["pre_scripts"] + if "post_scripts" in self.context.ctx: + manifest["context"]["post_scripts"] = self.context.ctx["post_scripts"] + if "encapsulate_script" in self.context.ctx: + manifest["context"]["encapsulate_script"] = self.context.ctx["encapsulate_script"] + + # Add push failure summary if any pushes failed + push_failures = [] + for image_name, build_info in self.built_images.items(): + if "push_failed" in build_info and build_info["push_failed"]: + push_failures.append( + { + "image": image_name, + "intended_registry_image": build_info.get("registry_image"), + "error": build_info.get("push_error"), + } + ) + + if push_failures: + manifest["push_failures"] = push_failures + + with open(output_file, "w") as f: + json.dump(manifest, f, indent=2) + + self.rich_console.print(f"[green]Build manifest exported to:[/green] {output_file}") + if push_failures: + self.rich_console.print(f"[yellow]Warning: {len(push_failures)} image(s) failed to push to registry[/yellow]") + for failure in push_failures: + self.rich_console.print( + f"[red] - {failure['image']} -> {failure['intended_registry_image']}: {failure['error']}[/red]" + ) + + def build_all_models( + self, + models: typing.List[typing.Dict], + credentials: typing.Dict = None, + clean_cache: bool = False, + registry: str = None, + phase_suffix: str = "", + batch_build_metadata: typing.Optional[dict] = None, + target_archs: typing.List[str] = None, # New parameter + ) -> typing.Dict: + """Build images for all models, with optional multi-architecture support. + + Args: + models: List of model information dictionaries + credentials: Optional credentials dictionary + clean_cache: Whether to use --no-cache + registry: Optional registry to push images to + phase_suffix: Suffix for log file name (e.g., ".build" or "") + batch_build_metadata: Optional batch build metadata + target_archs: Optional list of target GPU architectures for multi-arch builds + + Returns: + dict: Summary of all built images + """ + self.rich_console.print(f"[bold blue]Building Docker images for {len(models)} models...[/bold blue]") + + if target_archs: + self.rich_console.print(f"[bold cyan]Multi-architecture build mode enabled for: {', '.join(target_archs)}[/bold cyan]") + else: + self.rich_console.print(f"[bold cyan]Single architecture build mode[/bold cyan]") + + build_summary = { + "successful_builds": [], + "failed_builds": [], + "total_build_time": 0, + "successful_pushes": [], + "failed_pushes": [], + } + + for model_info in models: + # Check if MAD_SYSTEM_GPU_ARCHITECTURE is provided in additional_context + # This overrides --target-archs and uses default flow + if ("docker_build_arg" in self.context.ctx and + "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_build_arg"]): + self.rich_console.print(f"[yellow]Info: MAD_SYSTEM_GPU_ARCHITECTURE provided in additional_context, " + f"disabling --target-archs and using default flow for model {model_info['name']}[/yellow]") + # Use single architecture build mode regardless of target_archs + try: + single_build_info = self._build_model_single_arch( + model_info, credentials, clean_cache, + registry, phase_suffix, batch_build_metadata + ) + build_summary["successful_builds"].extend(single_build_info) + build_summary["total_build_time"] += sum( + info.get("build_duration", 0) for info in single_build_info + ) + except Exception as e: + build_summary["failed_builds"].append({ + "model": model_info["name"], + "error": str(e) + }) + elif target_archs: + # Multi-architecture build mode - always use architecture suffix + for arch in target_archs: + try: + # Always build with architecture suffix when --target-archs is used + arch_build_info = self._build_model_for_arch( + model_info, arch, credentials, clean_cache, + registry, phase_suffix, batch_build_metadata + ) + + build_summary["successful_builds"].extend(arch_build_info) + build_summary["total_build_time"] += sum( + info.get("build_duration", 0) for info in arch_build_info + ) + except Exception as e: + build_summary["failed_builds"].append({ + "model": model_info["name"], + "architecture": arch, + "error": str(e) + }) + else: + # Single architecture build mode (existing behavior - no validation needed) + try: + single_build_info = self._build_model_single_arch( + model_info, credentials, clean_cache, + registry, phase_suffix, batch_build_metadata + ) + build_summary["successful_builds"].extend(single_build_info) + build_summary["total_build_time"] += sum( + info.get("build_duration", 0) for info in single_build_info + ) + except Exception as e: + build_summary["failed_builds"].append({ + "model": model_info["name"], + "error": str(e) + }) + + return build_summary + + def _check_dockerfile_has_gpu_variables(self, model_info: typing.Dict) -> typing.Tuple[bool, str]: + """ + Check if model's Dockerfile contains GPU architecture variables. + Returns (has_gpu_vars, dockerfile_path) + """ + try: + # Find dockerfiles for this model + dockerfiles = self._get_dockerfiles_for_model(model_info) + + for dockerfile_path in dockerfiles: + with open(dockerfile_path, 'r') as f: + dockerfile_content = f.read() + + # Parse GPU architecture variables from Dockerfile + dockerfile_gpu_vars = self._parse_dockerfile_gpu_variables(dockerfile_content) + + if dockerfile_gpu_vars: + return True, dockerfile_path + else: + return False, dockerfile_path + + # No dockerfiles found + return False, "No Dockerfile found" + + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Error checking GPU variables for model {model_info['name']}: {e}[/yellow]") + return False, "Error reading Dockerfile" + + def _get_dockerfiles_for_model(self, model_info: typing.Dict) -> typing.List[str]: + """Get dockerfiles for a model.""" + try: + all_dockerfiles = self.console.sh( + f"ls {model_info['dockerfile']}.*" + ).split("\n") + + dockerfiles = {} + for cur_docker_file in all_dockerfiles: + # Get context of dockerfile + dockerfiles[cur_docker_file] = self.console.sh( + f"head -n5 {cur_docker_file} | grep '# CONTEXT ' | sed 's/# CONTEXT //g'" + ) + + # Filter dockerfiles based on context + dockerfiles = self.context.filter(dockerfiles) + + return list(dockerfiles.keys()) + + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Error finding dockerfiles for model {model_info['name']}: {e}[/yellow]") + return [] + + def _validate_target_arch_against_dockerfile(self, model_info: typing.Dict, target_arch: str) -> bool: + """ + Validate that target architecture is compatible with model's Dockerfile GPU variables. + Called during build phase when --target-archs is provided. + """ + try: + # Find dockerfiles for this model + dockerfiles = self._get_dockerfiles_for_model(model_info) + + for dockerfile_path in dockerfiles: + with open(dockerfile_path, 'r') as f: + dockerfile_content = f.read() + + # Parse GPU architecture variables from Dockerfile + dockerfile_gpu_vars = self._parse_dockerfile_gpu_variables(dockerfile_content) + + if not dockerfile_gpu_vars: + # No GPU variables found - target arch is acceptable + self.rich_console.print(f"[cyan]Info: No GPU architecture variables found in {dockerfile_path}, " + f"target architecture '{target_arch}' is acceptable[/cyan]") + continue + + # Validate target architecture against each GPU variable + for var_name, var_values in dockerfile_gpu_vars.items(): + if not self._is_target_arch_compatible_with_variable( + var_name, var_values, target_arch + ): + self.rich_console.print(f"[red]Error: Target architecture '{target_arch}' is not compatible " + f"with {var_name}={var_values} in {dockerfile_path}[/red]") + return False + + self.rich_console.print(f"[cyan]Info: Target architecture '{target_arch}' validated successfully " + f"against {dockerfile_path}[/cyan]") + + return True + + except FileNotFoundError as e: + self.rich_console.print(f"[yellow]Warning: Dockerfile not found for model {model_info['name']}: {e}[/yellow]") + return True # Assume compatible if Dockerfile not found + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Error validating target architecture for model {model_info['name']}: {e}[/yellow]") + return True # Assume compatible on parsing errors + + def _parse_dockerfile_gpu_variables(self, dockerfile_content: str) -> typing.Dict[str, typing.List[str]]: + """Parse GPU architecture variables from Dockerfile content.""" + gpu_variables = {} + + for var_name in self.GPU_ARCH_VARIABLES: + # Look for ARG declarations + arg_pattern = rf"ARG\s+{var_name}=([^\s\n]+)" + arg_matches = re.findall(arg_pattern, dockerfile_content, re.IGNORECASE) + + # Look for ENV declarations + env_pattern = rf"ENV\s+{var_name}[=\s]+([^\s\n]+)" + env_matches = re.findall(env_pattern, dockerfile_content, re.IGNORECASE) + + # Process found values + all_matches = arg_matches + env_matches + if all_matches: + # Take the last defined value (in case of multiple definitions) + raw_value = all_matches[-1].strip('"\'') + parsed_values = self._parse_gpu_variable_value(var_name, raw_value) + if parsed_values: + gpu_variables[var_name] = parsed_values + + return gpu_variables + + def _parse_gpu_variable_value(self, var_name: str, raw_value: str) -> typing.List[str]: + """Parse GPU variable value based on variable type and format.""" + architectures = [] + + # Handle different variable formats + if var_name in ["GPU_TARGETS", "GPU_ARCHS", "PYTORCH_ROCM_ARCH"]: + # These often contain multiple architectures separated by semicolons or commas + if ";" in raw_value: + architectures = [arch.strip() for arch in raw_value.split(";") if arch.strip()] + elif "," in raw_value: + architectures = [arch.strip() for arch in raw_value.split(",") if arch.strip()] + else: + architectures = [raw_value.strip()] + else: + # Single architecture value (MAD_SYSTEM_GPU_ARCHITECTURE, GFX_COMPILATION_ARCH) + architectures = [raw_value.strip()] + + # Normalize architecture names + normalized_archs = [] + for arch in architectures: + normalized = self._normalize_architecture_name(arch) + if normalized: + normalized_archs.append(normalized) + + return normalized_archs + + def _normalize_architecture_name(self, arch: str) -> str: + """Normalize architecture name to standard format.""" + arch = arch.lower().strip() + + # Handle common variations and aliases + if arch.startswith("gfx"): + return arch + elif arch in ["mi100", "mi-100"]: + return "gfx908" + elif arch in ["mi200", "mi-200", "mi210", "mi250"]: + return "gfx90a" + elif arch in ["mi300", "mi-300", "mi300a"]: + return "gfx940" + elif arch in ["mi300x", "mi-300x"]: + return "gfx942" + elif arch.startswith("mi"): + # Unknown MI series - return as is for potential future support + return arch + + return arch if arch else None + + def _is_target_arch_compatible_with_variable( + self, + var_name: str, + var_values: typing.List[str], + target_arch: str + ) -> bool: + """ + Validate that target architecture is compatible with a specific GPU variable. + Used during build phase validation. + """ + if var_name == "MAD_SYSTEM_GPU_ARCHITECTURE": + # MAD_SYSTEM_GPU_ARCHITECTURE will be overridden by target_arch, so always compatible + return True + + elif var_name in ["PYTORCH_ROCM_ARCH", "GPU_TARGETS", "GPU_ARCHS"]: + # Multi-architecture variables - target arch must be in the list + return target_arch in var_values + + elif var_name == "GFX_COMPILATION_ARCH": + # Compilation architecture should be compatible with target arch + return len(var_values) == 1 and ( + var_values[0] == target_arch or + self._is_compilation_arch_compatible(var_values[0], target_arch) + ) + + # Unknown variable - assume compatible + return True + + def _is_compilation_arch_compatible(self, compile_arch: str, target_arch: str) -> bool: + """Check if compilation architecture is compatible with target architecture.""" + # Define compatibility rules for compilation + compatibility_matrix = { + "gfx908": ["gfx908"], # MI100 - exact match only + "gfx90a": ["gfx90a"], # MI200 - exact match only + "gfx940": ["gfx940"], # MI300A - exact match only + "gfx941": ["gfx941"], # MI300X - exact match only + "gfx942": ["gfx942"], # MI300X - exact match only + } + + compatible_archs = compatibility_matrix.get(compile_arch, [compile_arch]) + return target_arch in compatible_archs + + def _build_model_single_arch( + self, + model_info: typing.Dict, + credentials: typing.Dict, + clean_cache: bool, + registry: str, + phase_suffix: str, + batch_build_metadata: typing.Optional[dict] + ) -> typing.List[typing.Dict]: + """Build model using existing single architecture flow.""" + + # Use existing build logic - MAD_SYSTEM_GPU_ARCHITECTURE comes from additional_context + # or Dockerfile defaults + dockerfiles = self._get_dockerfiles_for_model(model_info) + + results = [] + for dockerfile in dockerfiles: + build_info = self.build_image( + model_info, + dockerfile, + credentials, + clean_cache, + phase_suffix + ) + + # Extract GPU architecture from build args or context for manifest + gpu_arch = self._get_effective_gpu_architecture(model_info, dockerfile) + if gpu_arch: + build_info["gpu_architecture"] = gpu_arch + + # Handle registry push (existing logic) + if registry: + try: + registry_image = self._create_registry_image_name( + build_info["docker_image"], registry, batch_build_metadata, model_info + ) + self.push_image(build_info["docker_image"], registry, credentials, registry_image) + build_info["registry_image"] = registry_image + except Exception as e: + build_info["push_error"] = str(e) + + results.append(build_info) + + return results + + def _get_effective_gpu_architecture(self, model_info: typing.Dict, dockerfile_path: str) -> str: + """Get effective GPU architecture for single arch builds.""" + # Check if MAD_SYSTEM_GPU_ARCHITECTURE is in build args from additional_context + if ("docker_build_arg" in self.context.ctx and + "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_build_arg"]): + return self.context.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] + + # Try to extract from Dockerfile defaults + try: + with open(dockerfile_path, 'r') as f: + content = f.read() + + # Look for ARG or ENV declarations + patterns = [ + r"ARG\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)", + r"ENV\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)" + ] + + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + return match.group(1).strip('"\'') + except Exception: + pass + + return None + + def _infer_gpu_vendor_from_dockerfile(self, dockerfile: str) -> str: + """Infer GPU vendor from dockerfile path. + + Args: + dockerfile: Path to dockerfile (e.g., "docker/dummy.ubuntu.amd.Dockerfile") + + Returns: + GPU vendor string: "AMD", "NVIDIA", or "" + """ + dockerfile_lower = dockerfile.lower() + + # Check for explicit vendor indicators in filename + if '.amd.' in dockerfile_lower or dockerfile_lower.endswith('.amd'): + return "AMD" + elif '.nvidia.' in dockerfile_lower or dockerfile_lower.endswith('.nvidia'): + return "NVIDIA" + + # Try to infer from base image in Dockerfile + try: + with open(dockerfile, 'r') as f: + content = f.read() + + # Look for base image indicators + if 'FROM' in content: + if 'rocm' in content.lower() or 'amd' in content.lower(): + return "AMD" + elif 'nvidia' in content.lower() or 'cuda' in content.lower(): + return "NVIDIA" + except Exception: + pass + + # Default to empty (legacy - will be treated as compatible with all) + return "" + + def _create_base_image_name(self, model_info: typing.Dict, dockerfile: str) -> str: + """Create base image name from model info and dockerfile.""" + # Extract dockerfile context suffix (e.g., "ubuntu.amd" from "dummy.ubuntu.amd.Dockerfile") + dockerfile_name = os.path.basename(dockerfile) + if '.' in dockerfile_name: + # Remove the .Dockerfile extension and get context + context_parts = dockerfile_name.replace('.Dockerfile', '').split('.')[1:] # Skip model name + context_suffix = '.'.join(context_parts) if context_parts else 'default' + else: + context_suffix = 'default' + + # Create base image name: ci-{model}_{model}.{context} + return f"ci-{model_info['name']}_{model_info['name']}.{context_suffix}" + + def _create_registry_image_name( + self, + image_name: str, + registry: str, + batch_build_metadata: typing.Optional[dict], + model_info: typing.Dict + ) -> str: + """Create registry image name.""" + if batch_build_metadata and model_info["name"] in batch_build_metadata: + meta = batch_build_metadata[model_info["name"]] + if meta.get("registry_image"): + return meta["registry_image"] + + # Default registry naming + return self._determine_registry_image_name(image_name, registry) + + def _create_arch_registry_image_name( + self, + image_name: str, + gpu_arch: str, + registry: str, + batch_build_metadata: typing.Optional[dict], + model_info: typing.Dict + ) -> str: + """Create architecture-specific registry image name.""" + # For multi-arch builds, add architecture to the tag + if batch_build_metadata and model_info["name"] in batch_build_metadata: + meta = batch_build_metadata[model_info["name"]] + if meta.get("registry_image"): + # Append architecture to existing registry image + return f"{meta['registry_image']}_{gpu_arch}" + + # Default arch-specific registry naming + base_registry_name = self._determine_registry_image_name(image_name, registry) + return f"{base_registry_name}" # Architecture already in image_name + + def _determine_registry_image_name( + self, docker_image: str, registry: str, credentials: typing.Dict = None + ) -> str: + """Determine the registry image name that would be used for pushing. + + Args: + docker_image: The local docker image name + registry: Registry URL (e.g., "localhost:5000", "docker.io", or empty for DockerHub) + credentials: Optional credentials dictionary for registry authentication + + Returns: + str: The full registry image name that would be used + """ + if not registry: + return docker_image + + # Determine registry image name based on registry type + if registry.lower() in ["docker.io", "dockerhub"]: + # For DockerHub, always use format: repository:tag + # Try to get repository from credentials, fallback to default if not available + if ( + credentials + and "dockerhub" in credentials + and "repository" in credentials["dockerhub"] + ): + registry_image = ( + f"{credentials['dockerhub']['repository']}:{docker_image}" + ) + else: + registry_image = docker_image + else: + # For other registries (local, AWS ECR, etc.), use format: registry/repository:tag + registry_key = registry + if ( + credentials + and registry_key in credentials + and "repository" in credentials[registry_key] + ): + registry_image = f"{registry}/{credentials[registry_key]['repository']}:{docker_image}" + else: + # Fallback to just registry/imagename if no repository specified + registry_image = f"{registry}/{docker_image}" + + return registry_image + + def _is_compilation_arch_compatible(self, compile_arch: str, target_arch: str) -> bool: + """Check if compilation architecture is compatible with target architecture.""" + # Define compatibility rules for compilation + compatibility_matrix = { + "gfx908": ["gfx908"], # MI100 - exact match only + "gfx90a": ["gfx90a"], # MI200 - exact match only + "gfx940": ["gfx940"], # MI300A - exact match only + "gfx941": ["gfx941"], # MI300X - exact match only + "gfx942": ["gfx942"], # MI300X - exact match only + } + + compatible_archs = compatibility_matrix.get(compile_arch, [compile_arch]) + return target_arch in compatible_archs + + def _build_model_for_arch( + self, + model_info: typing.Dict, + gpu_arch: str, + credentials: typing.Dict, + clean_cache: bool, + registry: str, + phase_suffix: str, + batch_build_metadata: typing.Optional[dict] + ) -> typing.List[typing.Dict]: + """Build model for specific GPU architecture with smart image naming.""" + + # Find dockerfiles + dockerfiles = self._get_dockerfiles_for_model(model_info) + + arch_results = [] + for dockerfile in dockerfiles: + # When using --target-archs, always add architecture suffix regardless of GPU variables + # This ensures consistent naming for multi-architecture builds + base_image_name = self._create_base_image_name(model_info, dockerfile) + arch_image_name = f"{base_image_name}_{gpu_arch}" + + # Set MAD_SYSTEM_GPU_ARCHITECTURE for this build + arch_build_args = {"MAD_SYSTEM_GPU_ARCHITECTURE": gpu_arch} + + # Build the image + build_info = self.build_image( + model_info, + dockerfile, + credentials, + clean_cache, + phase_suffix, + additional_build_args=arch_build_args, + override_image_name=arch_image_name + ) + + # Add architecture metadata + build_info["gpu_architecture"] = gpu_arch + + # Handle registry push with architecture-specific tagging + if registry: + registry_image = self._determine_registry_image_name( + arch_image_name, registry, credentials + ) + try: + self.push_image(arch_image_name, registry, credentials, registry_image) + build_info["registry_image"] = registry_image + except Exception as e: + build_info["push_error"] = str(e) + + arch_results.append(build_info) + + return arch_results + + def _build_model_single_arch( + self, + model_info: typing.Dict, + credentials: typing.Dict, + clean_cache: bool, + registry: str, + phase_suffix: str, + batch_build_metadata: typing.Optional[dict] + ) -> typing.List[typing.Dict]: + """Build model using existing single architecture flow.""" + + # Find dockerfiles for this model + dockerfiles = self._get_dockerfiles_for_model(model_info) + + results = [] + for dockerfile in dockerfiles: + build_info = self.build_image( + model_info, + dockerfile, + credentials, + clean_cache, + phase_suffix + ) + + # Extract GPU architecture from build args or context for manifest + gpu_arch = self._get_effective_gpu_architecture(model_info, dockerfile) + if gpu_arch: + build_info["gpu_architecture"] = gpu_arch + + # Handle registry push (existing logic) + if registry: + registry_image = self._determine_registry_image_name( + build_info["docker_image"], registry, credentials + ) + try: + self.push_image(build_info["docker_image"], registry, credentials, registry_image) + build_info["registry_image"] = registry_image + except Exception as e: + build_info["push_error"] = str(e) + + results.append(build_info) + + return results + + def _get_effective_gpu_architecture(self, model_info: typing.Dict, dockerfile_path: str) -> str: + """Get effective GPU architecture for single arch builds.""" + # Check if MAD_SYSTEM_GPU_ARCHITECTURE is in build args from additional_context + if ("docker_build_arg" in self.context.ctx and + "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_build_arg"]): + return self.context.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] + + # Try to extract from Dockerfile defaults + try: + with open(dockerfile_path, 'r') as f: + content = f.read() + + # Look for ARG or ENV declarations + patterns = [ + r"ARG\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)", + r"ENV\s+MAD_SYSTEM_GPU_ARCHITECTURE=([^\s\n]+)" + ] + + for pattern in patterns: + match = re.search(pattern, content, re.IGNORECASE) + if match: + return match.group(1).strip('"\'') + except Exception: + pass + + return None diff --git a/src/madengine/mad.py b/src/madengine/mad.py deleted file mode 100644 index 861571b7..00000000 --- a/src/madengine/mad.py +++ /dev/null @@ -1,291 +0,0 @@ -#!/usr/bin/env python -"""Mad Engine CLI tool. - -This script provides a command-line interface to run models, generate reports, and tools for profiling and tracing. -This tool is used to run LLMs and Deep Learning models locally. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in imports -import argparse -import sys -# MAD Engine imports -from madengine import __version__ -from madengine.tools.run_models import RunModels -from madengine.tools.discover_models import DiscoverModels -from madengine.tools.create_table_db import CreateTable -from madengine.tools.update_table_db import UpdateTable -from madengine.tools.upload_mongodb import MongoDBHandler -from madengine.tools.update_perf_csv import UpdatePerfCsv -from madengine.tools.csv_to_html import ConvertCsvToHtml -from madengine.tools.csv_to_email import ConvertCsvToEmail -from madengine.core.constants import MODEL_DIR # pylint: disable=unused-import -from madengine.utils.gpu_validator import validate_gpu_installation, GPUInstallationError, detect_gpu_vendor, GPUVendor - - -# ----------------------------------------------------------------------------- -# Sub-command functions -# ----------------------------------------------------------------------------- -# Router of the command-line arguments to the corresponding functions -def run_models(args: argparse.Namespace): - """Run models on container. - - Args: - args: The command-line arguments. - """ - print(f"Running models on container") - run_models = RunModels(args=args) - return run_models.run() - - -def discover_models(args: argparse.Namespace): - """Discover the models. - - Args: - args: The command-line arguments. - """ - print(f"Discovering all models in the project") - discover_models = DiscoverModels(args=args) - return discover_models.run() - - -def update_perf_csv(args): - """Update performance metrics of models perf.csv to database. - - Args: - args: The command-line arguments. - """ - print(f"Running update_perf_csv") - update_perf_csv = UpdatePerfCsv(args=args) - return update_perf_csv.run() - - -def csv_to_html(args): - """Convert CSV to HTML report of models. - - Args: - args: The command-line arguments. - """ - print(f"Running csv_to_html") - convert_csv_to_html = ConvertCsvToHtml(args=args) - return convert_csv_to_html.run() - - -def csv_to_email(args): - """Convert CSV to Email of models. - - Args: - args: The command-line arguments. - """ - print(f"Convert CSV to Email of models") - convert_csv_to_email = ConvertCsvToEmail(args=args) - return convert_csv_to_email.run() - - -def create_table(args): - """Create table in DB. - - Args: - args: The command-line arguments. - """ - print(f"Create table in DB") - create_table = CreateTable(args=args) - return create_table.run() - - -def update_table(args): - """Update table in DB. - - Args: - args: The command-line arguments. - """ - print(f"Update table in DB") - update_table = UpdateTable(args=args) - return update_table.run() - -def upload_mongodb(args): - """Upload to MongoDB. - - Args: - args: The command-line arguments. - """ - print(f"Uploading to MongoDB") - upload_mongodb = MongoDBHandler(args=args) - return upload_mongodb.run() - - -def validate_gpu(args): - """Validate GPU installation (ROCm for AMD, CUDA for NVIDIA). - - Args: - args: The command-line arguments. - - Returns: - int: Exit code (0 for success, 1 for failure) - """ - verbose = args.verbose if hasattr(args, 'verbose') else False - - try: - # Detect GPU vendor and run appropriate validation - result = validate_gpu_installation(vendor=None, verbose=verbose, raise_on_error=False) - - # Print summary based on validation result - if result.is_valid: - print() - print("=" * 70) - print(f"✓ {result.vendor.value} GPU Installation is VALID") - print("=" * 70) - if result.version: - version_label = "ROCm Version" if result.vendor == GPUVendor.AMD else "Driver/CUDA Version" - print(f"{version_label}: {result.version}") - print() - print("You can proceed with running madengine workloads:") - print(" madengine run --tags ") - print() - return 0 - else: - print() - print("=" * 70) - print(f"✗ {result.vendor.value} GPU Installation Validation FAILED") - print("=" * 70) - print() - - if result.issues: - print("Critical Issues:") - for issue in result.issues: - print(f" - {issue}") - print() - - if result.warnings: - print("Warnings:") - for warning in result.warnings: - print(f" - {warning}") - print() - - if result.suggestions: - print("Suggested Actions:") - for suggestion in result.suggestions: - print(f" • {suggestion}") - print() - - print("Please fix the issues above before running madengine workloads.") - print() - return 1 - - except GPUInstallationError as e: - print() - print("=" * 70) - print("GPU Installation Validation FAILED") - print("=" * 70) - print() - print(str(e)) - print() - return 1 - except Exception as e: - print(f"✗ Unexpected error during validation: {e}") - import traceback - if verbose: - traceback.print_exc() - return 1 - - -# ----------------------------------------------------------------------------- -# Main function -# ----------------------------------------------------------------------------- -def main(): - """Main function to parse the command-line arguments. - """ - parser = argparse.ArgumentParser(description="A Models automation and dashboarding command-line tool to run LLMs and Deep Learning models locally.") - - parser.add_argument('-v', '--version', action='version', version=__version__) - - subparsers = parser.add_subparsers(title="Commands", description="Available commands for running models, generating reports, and toolings.", dest="command") - - # Run models command - parser_run = subparsers.add_parser('run', description="Run LLMs and Deep Learning models on container", help='Run models on container') - parser_run.add_argument('--tags', nargs='+', default=[], help="tags to run (can be multiple).") - - # Deprecated Tag - parser_run.add_argument('--ignore-deprecated-flag', action='store_true', help="Force run deprecated models even if marked deprecated.") - - parser_run.add_argument('--timeout', type=int, default=-1, help="time out for model run in seconds; Overrides per-model timeout if specified or default timeout of 7200 (2 hrs).\ - Timeout of 0 will never timeout.") - parser_run.add_argument('--live-output', action='store_true', help="prints output in real-time directly on STDOUT") - parser_run.add_argument('--clean-docker-cache', action='store_true', help="rebuild docker image without using cache") - parser_run.add_argument('--additional-context-file', default=None, help="additonal context, as json file, to filter behavior of workloads. Overrides detected contexts.") - parser_run.add_argument('--additional-context', default='{}', help="additional context, as string representation of python dict, to filter behavior of workloads. " + - " Overrides detected contexts and additional-context-file.") - parser_run.add_argument('--data-config-file-name', default="data.json", help="custom data configuration file.") - parser_run.add_argument('--tools-json-file-name', default="./scripts/common/tools.json", help="custom tools json configuration file.") - parser_run.add_argument('--generate-sys-env-details', default=True, help='generate system config env details by default') - parser_run.add_argument('--force-mirror-local', default=None, help="Path to force all relevant dataproviders to mirror data locally on.") - parser_run.add_argument('--keep-alive', action='store_true', help="keep Docker container alive after run; will keep model directory after run") - parser_run.add_argument('--keep-model-dir', action='store_true', help="keep model directory after run") - parser_run.add_argument('--skip-model-run', action='store_true', help="skips running the model; will not keep model directory after run unless specified through keep-alive or keep-model-dir") - parser_run.add_argument('--disable-skip-gpu-arch', action='store_true', help="disables skipping model based on gpu architecture") - parser_run.add_argument('-o', '--output', default='perf.csv', help='output file') - parser_run.set_defaults(func=run_models) - - # Discover models command - parser_discover = subparsers.add_parser('discover', description="Discover all models in the project", help='Discover the models.') - parser_discover.add_argument('--tags', nargs='+', default=[], help="tags to discover models (can be multiple).") - parser_discover.set_defaults(func=discover_models) - - # Report command - parser_report = subparsers.add_parser('report', description="", help='Generate report of models') - subparsers_report = parser_report.add_subparsers(title="Report Commands", description="Available commands for generating reports.", dest="report_command") - # Report subcommand update-perf - parser_report_update_perf= subparsers_report.add_parser('update-perf', description="Update performance metrics of models perf.csv to database.", help='Update perf.csv to database') - parser_report_update_perf.add_argument("--single_result", help="path to the single result json") - parser_report_update_perf.add_argument("--exception-result", help="path to the single result json") - parser_report_update_perf.add_argument("--failed-result", help="path to the single result json") - parser_report_update_perf.add_argument("--multiple-results", help="path to the results csv") - parser_report_update_perf.add_argument("--perf-csv", default="perf.csv") - parser_report_update_perf.add_argument("--model-name") - parser_report_update_perf.add_argument("--common-info") - parser_report_update_perf.set_defaults(func=update_perf_csv) - # Report subcommand to-html - parser_report_html= subparsers_report.add_parser('to-html', description="Convert CSV to HTML report of models.", help='Convert CSV to HTML report of models') - parser_report_html.add_argument("--csv-file-path", type=str) - parser_report_html.set_defaults(func=csv_to_html) - # Report subcommand to-email - parser_report_email= subparsers_report.add_parser('to-email', description="Convert CSV to Email of models.", help='Convert CSV to Email of models') - parser_report_email.add_argument("--csv-file-path", type=str, default='.', help="Path to the directory containing the CSV files.") - parser_report_email.set_defaults(func=csv_to_email) - - # Database command - parser_database = subparsers.add_parser('database', help='CRUD for database') - subparsers_database = parser_database.add_subparsers(title="Database Commands", description="Available commands for database, such as creating and updating table in DB.", dest="database_command") - # Database subcommand creating tabe - parser_database_create_table = subparsers_database.add_parser('create-table', description="Create table in DB.", help='Create table in DB') - parser_database_create_table.add_argument('-v', '--verbose', action='store_true', help='verbose output') - parser_database_create_table.set_defaults(func=create_table) - # Database subcommand updating table - parser_database_update_table = subparsers_database.add_parser('update-table', description="Update table in DB.", help='Update table in DB') - parser_database_update_table.add_argument('--csv-file-path', type=str, help='Path to the csv file') - parser_database_update_table.add_argument('--model-json-path', type=str, help='Path to the model json file') - parser_database_update_table.set_defaults(func=update_table) - # Database subcommand uploading to MongoDB - parser_database_upload_mongodb = subparsers_database.add_parser('upload-mongodb', description="Update table in DB.", help='Update table in DB') - parser_database_upload_mongodb.add_argument('--csv-file-path', type=str, default='perf_entry.csv', help='Path to the csv file') - parser_database_upload_mongodb.add_argument("--database-name", type=str, required=True, help="Name of the MongoDB database") - parser_database_upload_mongodb.add_argument("--collection-name", type=str, required=True, help="Name of the MongoDB collection") - parser_database_upload_mongodb.set_defaults(func=upload_mongodb) - - # Validate GPU command - parser_validate = subparsers.add_parser('validate', description="Validate GPU installation (ROCm for AMD, CUDA for NVIDIA)", help='Validate GPU installation') - parser_validate.add_argument('-v', '--verbose', action='store_true', help='Show detailed validation output') - parser_validate.set_defaults(func=validate_gpu) - - args = parser.parse_args() - - if args.command: - result = args.func(args) - if args.command == 'validate' and result is not None: - sys.exit(result) - else: - parser.print_help() - - -if __name__ == "__main__": - main() diff --git a/src/madengine/orchestration/__init__.py b/src/madengine/orchestration/__init__.py new file mode 100644 index 00000000..e3dce29a --- /dev/null +++ b/src/madengine/orchestration/__init__.py @@ -0,0 +1,16 @@ +""" +Orchestration layer for madengine workflows. + +Provides high-level workflow coordination for build and run phases. +This layer sits between the CLI (presentation) and execution/deployment layers. + +Architecture: +- BuildOrchestrator: Manages Docker image building workflow +- RunOrchestrator: Manages model execution workflow (local or distributed) +""" + +from .build_orchestrator import BuildOrchestrator +from .run_orchestrator import RunOrchestrator + +__all__ = ["BuildOrchestrator", "RunOrchestrator"] + diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py new file mode 100644 index 00000000..49ee76c2 --- /dev/null +++ b/src/madengine/orchestration/build_orchestrator.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python3 +""" +Build Orchestrator - Coordinates Docker image building workflow. + +Extracted from distributed_orchestrator.py build_phase() method. +Manages the discovery, building, and manifest generation for Docker images. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import os +import shutil +from pathlib import Path +from typing import Dict, List, Optional + +from rich.console import Console as RichConsole + +from madengine.core.console import Console +from madengine.core.context import Context +from madengine.core.errors import ( + BuildError, + ConfigurationError, + DiscoveryError, + create_error_context, + handle_error, +) +from madengine.utils.discover_models import DiscoverModels +from madengine.execution.docker_builder import DockerBuilder + + +class BuildOrchestrator: + """ + Orchestrates the build workflow. + + Responsibilities: + - Discover models by tags + - Build Docker images + - Push to registry (optional) + - Generate build_manifest.json + - Save deployment_config from --additional-context + """ + + def __init__(self, args, additional_context: Optional[Dict] = None): + """ + Initialize build orchestrator. + + Args: + args: CLI arguments namespace + additional_context: Dict from --additional-context (merged with args if present) + """ + self.args = args + self.console = Console(live_output=getattr(args, "live_output", True)) + self.rich_console = RichConsole() + + # Merge additional_context from args and parameter + merged_context = {} + + # Load from file first if provided + if hasattr(args, "additional_context_file") and args.additional_context_file: + try: + with open(args.additional_context_file, "r") as f: + merged_context = json.load(f) + except (FileNotFoundError, json.JSONDecodeError) as e: + print(f"Warning: Could not load additional_context_file: {e}") + + # Then merge string additional_context (overrides file) + if hasattr(args, "additional_context") and args.additional_context: + try: + if isinstance(args.additional_context, str): + # Use ast.literal_eval for Python dict syntax (single quotes) + # This matches what Context class expects + import ast + context_from_string = ast.literal_eval(args.additional_context) + merged_context.update(context_from_string) + elif isinstance(args.additional_context, dict): + merged_context.update(args.additional_context) + except (ValueError, SyntaxError) as e: + print(f"Warning: Could not parse additional_context: {e}") + pass + + # Finally merge parameter additional_context (overrides all) + if additional_context: + merged_context.update(additional_context) + + self.additional_context = merged_context + + # Apply ConfigLoader to infer deploy type, validate, and apply defaults + if self.additional_context: + try: + from madengine.deployment.config_loader import ConfigLoader + # This will: + # 1. Infer deploy type from k8s/slurm presence + # 2. Validate for conflicts (e.g., both k8s and slurm) + # 3. Apply appropriate defaults + # 4. Add 'deploy' field for internal use + self.additional_context = ConfigLoader.load_config(self.additional_context) + except ValueError as e: + # Configuration validation error - fail fast + self.rich_console.print(f"[red]Configuration Error: {e}[/red]") + raise SystemExit(1) + except Exception as e: + # Other errors during config loading - warn but continue + self.rich_console.print(f"[yellow]Warning: Could not apply config defaults: {e}[/yellow]") + + # Initialize context in build-only mode (no GPU detection) + # Context expects additional_context as a string representation of Python dict + # Use repr() instead of json.dumps() because Context uses ast.literal_eval() + context_string = repr(merged_context) if merged_context else None + self.context = Context( + additional_context=context_string, + build_only_mode=True, + ) + + # Load credentials if available + self.credentials = self._load_credentials() + + def _load_credentials(self) -> Optional[Dict]: + """Load credentials from credential.json and environment variables.""" + credentials = None + + # Try loading from file + credential_file = "credential.json" + if os.path.exists(credential_file): + try: + with open(credential_file) as f: + credentials = json.load(f) + print(f"Loaded credentials from {credential_file}: {list(credentials.keys())}") + except Exception as e: + context = create_error_context( + operation="load_credentials", + component="BuildOrchestrator", + file_path=credential_file, + ) + handle_error( + ConfigurationError( + f"Could not load credentials: {e}", + context=context, + suggestions=[ + "Check if credential.json exists and has valid JSON format" + ], + ) + ) + + # Override with environment variables if present + docker_hub_user = os.environ.get("MAD_DOCKERHUB_USER") + docker_hub_password = os.environ.get("MAD_DOCKERHUB_PASSWORD") + docker_hub_repo = os.environ.get("MAD_DOCKERHUB_REPO") + + if docker_hub_user and docker_hub_password: + print("Found Docker Hub credentials in environment variables") + if credentials is None: + credentials = {} + + credentials["dockerhub"] = { + "username": docker_hub_user, + "password": docker_hub_password, + } + if docker_hub_repo: + credentials["dockerhub"]["repository"] = docker_hub_repo + + return credentials + + def _copy_scripts(self): + """[DEPRECATED] Copy common scripts to model directories. + + This method is no longer called during build phase as it's not needed. + Build phase only creates Docker images - script execution happens in run phase. + Scripts are copied by run_orchestrator._copy_scripts() for local execution. + K8s and Slurm deployments have their own script management mechanisms. + """ + # No-op: This method is deprecated and should not be called + pass + + def execute( + self, + registry: Optional[str] = None, + clean_cache: bool = False, + manifest_output: str = "build_manifest.json", + batch_build_metadata: Optional[Dict] = None, + ) -> str: + """ + Execute build workflow. + + Args: + registry: Optional registry to push images to + clean_cache: Whether to use --no-cache for Docker builds + manifest_output: Output file for build manifest + batch_build_metadata: Optional batch build metadata + + Returns: + Path to generated build_manifest.json + + Raises: + DiscoveryError: If model discovery fails + BuildError: If Docker build fails + """ + self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") + self.rich_console.print("[bold blue]🔨 BUILD PHASE[/bold blue]") + self.rich_console.print("[yellow](Build-only mode - no GPU detection)[/yellow]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + try: + # Step 1: Discover models + self.rich_console.print("[bold cyan]🔍 Discovering models...[/bold cyan]") + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + + if not models: + raise DiscoveryError( + "No models discovered", + context=create_error_context( + operation="discover_models", + component="BuildOrchestrator", + ), + suggestions=[ + "Check if models.json exists", + "Verify --tags parameter is correct", + "Ensure model definitions have matching tags", + ], + ) + + self.rich_console.print(f"[green]✓ Found {len(models)} models[/green]\n") + + # Step 2: Validate build context (scripts not needed for build phase) + # Build phase only creates Docker images - script execution happens in run phase + # Note: K8s and Slurm have their own script management mechanisms + if "MAD_SYSTEM_GPU_ARCHITECTURE" not in self.context.ctx["docker_build_arg"]: + self.rich_console.print( + "[yellow]⚠️ Warning: MAD_SYSTEM_GPU_ARCHITECTURE not provided[/yellow]" + ) + self.rich_console.print( + "[dim] Provide GPU architecture via --additional-context:[/dim]" + ) + self.rich_console.print( + '[dim] --additional-context \'{"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a"}}\'[/dim]\n' + ) + + # Step 3: Build Docker images + self.rich_console.print("[bold cyan]🏗️ Building Docker images...[/bold cyan]") + builder = DockerBuilder( + self.context, + self.console, + live_output=getattr(self.args, "live_output", False), + ) + + # Determine phase suffix for log files + # Build phase always uses .build suffix to avoid conflicts with run logs + phase_suffix = ".build" + + # Get target architectures from args if provided + target_archs = getattr(self.args, "target_archs", []) + if target_archs: + processed_archs = [] + for arch_arg in target_archs: + # Split comma-separated values + processed_archs.extend( + [arch.strip() for arch in arch_arg.split(",") if arch.strip()] + ) + target_archs = processed_archs + + # Build all models (resilient to individual failures) + build_summary = builder.build_all_models( + models, + self.credentials, + clean_cache, + registry, + phase_suffix, + batch_build_metadata=batch_build_metadata, + target_archs=target_archs, + ) + + # Extract results + failed_builds = build_summary.get("failed_builds", []) + successful_builds = build_summary.get("successful_builds", []) + + # Report build results + if len(successful_builds) > 0: + self.rich_console.print( + f"\n[green]✓ Built {len(successful_builds)} images[/green]" + ) + + if len(failed_builds) > 0: + self.rich_console.print( + f"[yellow]⚠️ {len(failed_builds)} model(s) failed to build:[/yellow]" + ) + for failed in failed_builds: + model_name = failed.get("model", "unknown") + error_msg = failed.get("error", "unknown error") + self.rich_console.print(f" [red]• {model_name}: {error_msg}[/red]") + + # Step 4: ALWAYS generate manifest (even with partial failures) + self.rich_console.print("\n[bold cyan]📄 Generating build manifest...[/bold cyan]") + builder.export_build_manifest(manifest_output, registry, batch_build_metadata) + + # Step 5: Save build summary to manifest + self._save_build_summary(manifest_output, build_summary) + + # Step 6: Save deployment_config to manifest + self._save_deployment_config(manifest_output) + + self.rich_console.print(f"[green]✓ Build complete: {manifest_output}[/green]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + # Step 7: Check if we should fail (only if ALL builds failed) + if len(failed_builds) > 0: + if len(successful_builds) == 0: + # All builds failed - this is critical + raise BuildError( + "All builds failed - no images available", + context=create_error_context( + operation="build_images", + component="BuildOrchestrator", + ), + suggestions=[ + "Check Docker build logs in *.build.live.log files", + "Verify Dockerfile syntax", + "Ensure base images are accessible", + ], + ) + else: + # Partial success - log warning but don't raise + self.rich_console.print( + f"[yellow]⚠️ Warning: Partial build - " + f"{len(successful_builds)} succeeded, {len(failed_builds)} failed[/yellow]" + ) + + return manifest_output + + except (DiscoveryError, BuildError): + raise + except Exception as e: + context = create_error_context( + operation="build_phase", + component="BuildOrchestrator", + ) + raise BuildError( + f"Build phase failed: {e}", + context=context, + suggestions=[ + "Check Docker daemon is running", + "Verify network connectivity for image pulls", + "Check disk space for Docker builds", + ], + ) from e + + def _save_build_summary(self, manifest_file: str, build_summary: Dict): + """Save build summary to manifest for display purposes.""" + try: + with open(manifest_file, "r") as f: + manifest = json.load(f) + + # Add summary to manifest + manifest["summary"] = build_summary + + with open(manifest_file, "w") as f: + json.dump(manifest, f, indent=2) + + except Exception as e: + self.rich_console.print(f"[yellow]Warning: Could not save build summary: {e}[/yellow]") + + def _save_deployment_config(self, manifest_file: str): + """Save deployment_config from --additional-context to manifest.""" + if not self.additional_context: + self.rich_console.print("[dim]No additional_context provided, skipping deployment config[/dim]") + return + + try: + with open(manifest_file, "r") as f: + manifest = json.load(f) + + # Extract deployment configuration + # Auto-detect target from config presence if not explicitly set + target = self.additional_context.get("deploy") + if not target: + # Auto-detect based on config presence + if self.additional_context.get("slurm"): + target = "slurm" + elif self.additional_context.get("k8s") or self.additional_context.get("kubernetes"): + target = "k8s" + else: + target = "local" + + # Get env_vars and filter out MIOPEN_USER_DB_PATH + # This variable must be set per-process in multi-GPU training to avoid database conflicts + env_vars = self.additional_context.get("env_vars", {}).copy() + if "MIOPEN_USER_DB_PATH" in env_vars: + del env_vars["MIOPEN_USER_DB_PATH"] + print("ℹ️ Filtered MIOPEN_USER_DB_PATH from env_vars (will be set per-process in training)") + + deployment_config = { + "target": target, + "slurm": self.additional_context.get("slurm"), + "k8s": self.additional_context.get("k8s"), + "kubernetes": self.additional_context.get("kubernetes"), + "distributed": self.additional_context.get("distributed"), + "vllm": self.additional_context.get("vllm"), + "env_vars": env_vars, + "debug": self.additional_context.get("debug", False), + } + + # Remove None values + deployment_config = { + k: v for k, v in deployment_config.items() if v is not None + } + + if deployment_config and deployment_config != {"target": "local", "env_vars": {}}: + manifest["deployment_config"] = deployment_config + + with open(manifest_file, "w") as f: + json.dump(manifest, f, indent=2) + + self.rich_console.print(f"[green]✓ Saved deployment config to {manifest_file}[/green]") + else: + self.rich_console.print("[dim]No deployment config to save (local execution)[/dim]") + + except Exception as e: + # Non-fatal - just warn + self.rich_console.print(f"[yellow]Warning: Could not save deployment config: {e}[/yellow]") + diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py new file mode 100644 index 00000000..42032fb1 --- /dev/null +++ b/src/madengine/orchestration/run_orchestrator.py @@ -0,0 +1,1196 @@ +#!/usr/bin/env python3 +""" +Run Orchestrator - Coordinates model execution workflow. + +Supports: +1. Run-only (with manifest): Run pre-built images +2. Full workflow (with tags): Build + Run +3. Local execution: Direct container execution +4. Distributed deployment: SLURM or Kubernetes + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import os +import subprocess +from pathlib import Path +from typing import Dict, Optional + +from rich.console import Console as RichConsole + +from madengine.core.console import Console +from madengine.core.context import Context +from madengine.core.dataprovider import Data +from madengine.core.errors import ( + ConfigurationError, + RuntimeError as MADRuntimeError, + create_error_context, + handle_error, +) +from madengine.utils.session_tracker import SessionTracker + + +class RunOrchestrator: + """ + Orchestrates the run workflow. + + Responsibilities: + - Load manifest or trigger build + - Determine execution target (local vs distributed) + - Delegate to appropriate executor (container_runner or deployment) + - Collect and aggregate results + """ + + def __init__(self, args, additional_context: Optional[Dict] = None): + """ + Initialize run orchestrator. + + Args: + args: CLI arguments namespace + additional_context: Dict from --additional-context + """ + self.args = args + self.console = Console(live_output=getattr(args, "live_output", True)) + self.rich_console = RichConsole() + + # Merge additional_context from args and parameter + merged_context = {} + if hasattr(args, "additional_context") and args.additional_context: + try: + if isinstance(args.additional_context, str): + # Use ast.literal_eval for Python dict syntax (single quotes) + # This matches what Context class expects + import ast + parsed = ast.literal_eval(args.additional_context) + print(f"📝 RunOrchestrator: Parsed additional_context keys: {list(parsed.keys()) if isinstance(parsed, dict) else 'not a dict'}") + merged_context = parsed + elif isinstance(args.additional_context, dict): + merged_context = args.additional_context + print(f"📝 RunOrchestrator: Got dict additional_context keys: {list(merged_context.keys())}") + except (ValueError, SyntaxError) as e: + print(f"Warning: Could not parse additional_context: {e}") + print(f"Raw additional_context: {args.additional_context[:200] if args.additional_context else 'None'}") + pass + + if additional_context: + merged_context.update(additional_context) + + self.additional_context = merged_context + print(f"📝 RunOrchestrator: Final additional_context keys: {list(self.additional_context.keys()) if self.additional_context else 'None'}") + + # Track if we copied MODEL_DIR contents (for cleanup) + self._copied_from_model_dir = False + + # Track if we ran build phase in this workflow (for log combination) + self._did_build_phase = False + + # Initialize session tracker for filtering current run results + perf_csv_path = getattr(args, "output", "perf.csv") + self.session_tracker = SessionTracker(perf_csv_path) + + # Initialize context in runtime mode (with GPU detection for local) + # This will be lazy-initialized only when needed + self.context = None + self.data = None + + def _init_runtime_context(self): + """Initialize runtime context (with GPU detection).""" + # Always reinitialize context in runtime mode for run phase + # This ensures GPU detection and proper runtime context even after build phase + + # Context expects additional_context as a string representation of Python dict + # Use repr() instead of json.dumps() because Context uses ast.literal_eval() + if self.additional_context: + context_string = repr(self.additional_context) + else: + context_string = None + + self.context = Context( + additional_context=context_string, + build_only_mode=False, + ) + + # Initialize data provider if data config exists + data_json_file = getattr(self.args, "data_config_file_name", "data.json") + if os.path.exists(data_json_file): + self.data = Data( + self.context, + filename=data_json_file, + force_mirrorlocal=getattr(self.args, "force_mirror_local", False), + ) + + def execute( + self, + manifest_file: Optional[str] = None, + tags: Optional[list] = None, + registry: Optional[str] = None, + timeout: int = 3600, + ) -> Dict: + """ + Execute run workflow. + + Supports two modes: + 1. Run-only: If manifest_file provided + 2. Full workflow: If tags provided (build + run) + + Args: + manifest_file: Path to build_manifest.json + tags: Model tags to build (triggers build phase if no manifest) + registry: Optional registry override + timeout: Execution timeout in seconds + + Returns: + Execution results dict + + Raises: + ConfigurationError: If neither manifest nor tags provided + MADRuntimeError: If execution fails + """ + self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") + self.rich_console.print("[bold blue]🚀 RUN PHASE[/bold blue]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + # Track session start for filtering current run results + # The marker file is automatically saved in same directory as perf.csv + session_start_row = self.session_tracker.start_session() + + try: + # Check for MAD_CONTAINER_IMAGE (local image mode) + # This must be checked before normal build/manifest flow + mad_container_image = None + if self.additional_context: + mad_container_image = self.additional_context.get("MAD_CONTAINER_IMAGE") + + if mad_container_image: + # Local image mode: Skip build, create synthetic manifest + if not tags: + raise ConfigurationError( + "Tags required for MAD_CONTAINER_IMAGE mode", + context=create_error_context( + operation="local_image_mode", + component="RunOrchestrator", + ), + suggestions=[ + "Provide --tags to specify which models to run", + "Example: --tags model_name --additional-context \"{'MAD_CONTAINER_IMAGE': 'rocm/tensorflow:latest'}\"", + ], + ) + + # Generate synthetic manifest using the provided image + manifest_file = self._create_manifest_from_local_image( + image_name=mad_container_image, + tags=tags, + manifest_output=getattr(self.args, "manifest_output", "build_manifest.json"), + ) + + # Step 1: Ensure we have a manifest (build if needed) + elif not manifest_file or not os.path.exists(manifest_file): + if not tags: + raise ConfigurationError( + "Either --manifest-file or --tags required", + context=create_error_context( + operation="run_phase", + component="RunOrchestrator", + ), + suggestions=[ + "Provide --manifest-file path to run pre-built images", + "Provide --tags to build and run models", + ], + ) + + self.rich_console.print("[cyan]No manifest found, building first...[/cyan]\n") + manifest_file = self._build_phase(tags, registry) + self._did_build_phase = True # Mark that we built in this workflow + + # Step 2: Load manifest and merge with runtime context + manifest_file = self._load_and_merge_manifest(manifest_file) + + # Step 3: Determine execution target from manifest's deployment_config + # (with optional runtime override) + with open(manifest_file) as f: + manifest = json.load(f) + + deployment_config = manifest.get("deployment_config", {}) + + # Update additional_context with deployment_config for deployment layer + if not self.additional_context: + self.additional_context = {} + + # Merge deployment_config into additional_context (for deployment layer to use) + for key in ["slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]: + if key in deployment_config and key not in self.additional_context: + self.additional_context[key] = deployment_config[key] + + # Infer deployment target from config structure (Convention over Configuration) + # No explicit "deploy" field needed - presence of k8s/slurm indicates deployment type + target = self._infer_deployment_target(self.additional_context) + + # Legacy support: check manifest for explicit target + if not target or target == "local": + target = deployment_config.get("target", "local") + + self.rich_console.print(f"[bold cyan]Deployment target: {target}[/bold cyan]\n") + + # Step 4: Execute based on target + try: + if target == "local" or target == "docker": + results = self._execute_local(manifest_file, timeout) + else: + results = self._execute_distributed(target, manifest_file) + + # Combine build and run logs for full workflow + if self._did_build_phase and (target == "local" or target == "docker"): + self._combine_build_and_run_logs(manifest_file) + + # Add session information to results for filtering + results["session_start_row"] = session_start_row + results["session_row_count"] = self.session_tracker.get_session_row_count() + + # Always cleanup madengine package files after execution + self.rich_console.print("\n[dim]🧹 Cleaning up madengine package files...[/dim]") + self._cleanup_model_dir_copies() + + # NOTE: Do NOT cleanup session marker here! + # It's needed by display functions in CLI layer + # Cleanup happens in CLI after display (via perf_csv_path) + + return results + + except Exception as e: + # Always cleanup madengine package files even on error + self.rich_console.print("\n[dim]🧹 Cleaning up madengine package files...[/dim]") + self._cleanup_model_dir_copies() + raise + + except (ConfigurationError, MADRuntimeError): + raise + except Exception as e: + context = create_error_context( + operation="run_phase", + component="RunOrchestrator", + ) + raise MADRuntimeError( + f"Run phase failed: {e}", + context=context, + suggestions=[ + "Check manifest file exists and is valid", + "Verify Docker daemon is running", + "Check network connectivity", + ], + ) from e + + def _build_phase(self, tags: list, registry: Optional[str] = None) -> str: + """Trigger build phase if needed.""" + from .build_orchestrator import BuildOrchestrator + + # Update args with tags + self.args.tags = tags + + build_orch = BuildOrchestrator(self.args, self.additional_context) + manifest_file = build_orch.execute( + registry=registry, + clean_cache=getattr(self.args, "clean_docker_cache", False), + ) + + return manifest_file + + def _create_manifest_from_local_image( + self, + image_name: str, + tags: list, + manifest_output: str = "build_manifest.json" + ) -> str: + """ + Create a synthetic manifest for a user-provided local image. + + This enables MAD_CONTAINER_IMAGE functionality where users can skip + the build phase and directly run models using a pre-existing Docker image. + + Args: + image_name: Docker image name/tag (e.g., 'rocm/tensorflow:latest') + tags: Model tags to discover + manifest_output: Output path for the manifest file + + Returns: + Path to the generated manifest file + + Raises: + DiscoveryError: If no models are found + RuntimeError: If image validation fails + """ + from madengine.utils.discover_models import DiscoverModels + from madengine.core.errors import DiscoveryError + + self.rich_console.print(f"[yellow]🏠 Local Image Mode: Using {image_name}[/yellow]") + self.rich_console.print(f"[dim]Skipping build phase, creating synthetic manifest...[/dim]\n") + + # Validate that the image exists locally or can be pulled + try: + self.console.sh(f"docker image inspect {image_name} > /dev/null 2>&1") + self.rich_console.print(f"[green]✓ Image {image_name} found locally[/green]") + except (subprocess.CalledProcessError, RuntimeError) as e: + self.rich_console.print(f"[yellow]⚠️ Image {image_name} not found locally, attempting to pull...[/yellow]") + try: + self.console.sh(f"docker pull {image_name}") + self.rich_console.print(f"[green]✓ Successfully pulled {image_name}[/green]") + except Exception as e: + raise RuntimeError( + f"Failed to find or pull image {image_name}. " + f"Ensure the image exists locally or can be pulled from a registry. " + f"Error: {e}" + ) + + # Discover models by tags (without building) + self.args.tags = tags + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + + if not models: + raise DiscoveryError( + "No models discovered for local image mode", + context=create_error_context( + operation="create_local_image_manifest", + component="RunOrchestrator", + ), + suggestions=[ + "Check if models.json exists", + "Verify --tags parameter is correct", + "Ensure model definitions have matching tags", + ], + ) + + self.rich_console.print(f"[green]✓ Discovered {len(models)} model(s) for tags: {tags}[/green]\n") + + # Initialize build-only context for manifest generation + # (we need context structure, but skip GPU detection since we're not building) + context_string = repr(self.additional_context) if self.additional_context else None + build_context = Context( + additional_context=context_string, + build_only_mode=True, + ) + + # Create manifest structure + manifest = { + "built_images": {}, + "built_models": {}, + "context": build_context.ctx, + "local_image_mode": True, + "local_image_name": image_name, + "deployment_config": self.additional_context.get("deployment_config", {}), + } + + # For each model, create a synthetic entry using the provided image + for model in models: + model_name = model["name"] + # Create a synthetic image identifier (not an actual built image) + synthetic_image_id = f"local-{model_name.replace('/', '_')}" + + manifest["built_images"][synthetic_image_id] = { + "docker_image": image_name, # Use user-provided image + "dockerfile": "N/A (local image mode)", + "build_status": "SKIPPED", + "build_time": 0, + "local_image": True, + "registry_image": None, + } + + # Convert data list to comma-separated string (required by dataprovider) + data_field = model.get("data", []) + if isinstance(data_field, list): + data_str = ",".join(data_field) if data_field else "" + else: + data_str = data_field if data_field else "" + + # Build model info dict with all fields that ContainerRunner expects + # Use exact field names from models.json format + manifest["built_models"][synthetic_image_id] = { + "name": model_name, + "tags": model.get("tags", []), + "dockerfile": "N/A (local image mode)", + "scripts": model.get("scripts", ""), # models.json uses "scripts" (plural) + "n_gpus": model.get("n_gpus", "1"), # models.json uses "n_gpus" (string format) + "owner": model.get("owner", ""), + "training_precision": model.get("training_precision", ""), + "args": model.get("args", ""), # Required field for docker run + "timeout": model.get("timeout", None), # Optional timeout override + "data": data_str, + "cred": model.get("cred", ""), + "deprecated": model.get("deprecated", False), + "skip_gpu_arch": model.get("skip_gpu_arch", []), + "additional_docker_run_options": model.get("additional_docker_run_options", ""), + } + + # Write manifest to file + with open(manifest_output, "w") as f: + json.dump(manifest, f, indent=2) + + self.rich_console.print(f"[green]✓ Generated synthetic manifest: {manifest_output}[/green]") + self.rich_console.print(f"[yellow]⚠️ Warning: User-provided image {image_name}. Model support not guaranteed.[/yellow]\n") + + return manifest_output + + def _load_and_merge_manifest(self, manifest_file: str) -> str: + """Load manifest and merge with runtime --additional-context.""" + if not os.path.exists(manifest_file): + raise FileNotFoundError(f"Build manifest not found: {manifest_file}") + + with open(manifest_file, "r") as f: + manifest = json.load(f) + + print(f"Loaded manifest with {len(manifest.get('built_images', {}))} images") + + # Merge deployment configs and context (runtime overrides build-time) + if self.additional_context: + # Merge deployment_config + if "deployment_config" in manifest: + stored_config = manifest["deployment_config"] + # Runtime --additional-context overrides stored config + for key in ["deploy", "slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]: + if key in self.additional_context: + stored_config[key] = self.additional_context[key] + manifest["deployment_config"] = stored_config + + # Merge context (tools, pre_scripts, post_scripts, encapsulate_script) + if "context" not in manifest: + manifest["context"] = {} + + merge_keys = ["tools", "pre_scripts", "post_scripts", "encapsulate_script"] + context_updated = False + for key in merge_keys: + if key in self.additional_context: + manifest["context"][key] = self.additional_context[key] + context_updated = True + + if context_updated or "deployment_config" in manifest: + # Write back merged config + with open(manifest_file, "w") as f: + json.dump(manifest, f, indent=2) + print("Merged runtime context and deployment config with manifest") + + return manifest_file + + def _execute_local(self, manifest_file: str, timeout: int) -> Dict: + """Execute locally using container_runner.""" + self.rich_console.print("[cyan]Executing locally...[/cyan]\n") + + # Load manifest first to check if we have Docker images + with open(manifest_file, "r") as f: + manifest = json.load(f) + + has_docker_images = bool(manifest.get("built_images", {})) + + if has_docker_images: + # Using Docker containers - containers have GPU support built-in + self.rich_console.print("[dim cyan]Using Docker containers with built-in GPU support[/dim cyan]\n") + + # Initialize runtime context (runs full GPU detection on compute nodes) + self._init_runtime_context() + + # Show node info + self._show_node_info() + + # Import from execution layer + from madengine.execution.container_runner import ContainerRunner + + # Load credentials + credentials = self._load_credentials() + + # Restore context from manifest if present + if "context" in manifest: + manifest_context = manifest["context"] + if "tools" in manifest_context: + self.context.ctx["tools"] = manifest_context["tools"] + if "pre_scripts" in manifest_context: + self.context.ctx["pre_scripts"] = manifest_context["pre_scripts"] + if "post_scripts" in manifest_context: + self.context.ctx["post_scripts"] = manifest_context["post_scripts"] + if "encapsulate_script" in manifest_context: + self.context.ctx["encapsulate_script"] = manifest_context["encapsulate_script"] + + # Merge runtime additional_context (takes precedence over manifest) + # This allows users to override tools/scripts at runtime + if self.additional_context: + if "tools" in self.additional_context: + self.context.ctx["tools"] = self.additional_context["tools"] + self.rich_console.print( + f"[dim] Using tools from runtime --additional-context[/dim]" + ) + if "pre_scripts" in self.additional_context: + self.context.ctx["pre_scripts"] = self.additional_context["pre_scripts"] + if "post_scripts" in self.additional_context: + self.context.ctx["post_scripts"] = self.additional_context["post_scripts"] + if "encapsulate_script" in self.additional_context: + self.context.ctx["encapsulate_script"] = self.additional_context["encapsulate_script"] + + # Filter images by GPU vendor and architecture + # Filter images by GPU compatibility + try: + # Always filter by runtime GPU compatibility (both Docker and bare-metal) + runtime_gpu_vendor = self.context.get_gpu_vendor() + runtime_gpu_arch = self.context.get_system_gpu_architecture() + print(f"Runtime GPU vendor: {runtime_gpu_vendor}") + print(f"Runtime GPU architecture detected: {runtime_gpu_arch}") + + if has_docker_images: + # Docker images: filter by GPU vendor at runtime to avoid cross-vendor execution + self.rich_console.print("[dim cyan]Filtering Docker images by runtime GPU compatibility...[/dim cyan]") + else: + # Bare-metal execution: filter by runtime GPU + self.rich_console.print("[dim cyan]Filtering bare-metal images by runtime GPU compatibility...[/dim cyan]") + + compatible_images = self._filter_images_by_gpu_compatibility( + manifest["built_images"], runtime_gpu_vendor, runtime_gpu_arch + ) + + if not compatible_images: + raise MADRuntimeError( + f"No compatible images for GPU vendor '{runtime_gpu_vendor}' and architecture '{runtime_gpu_arch}'", + context=create_error_context( + operation="filter_images", + component="RunOrchestrator", + ), + suggestions=[ + f"Build images for {runtime_gpu_vendor} GPU", + f"Build images for {runtime_gpu_arch} using --target-archs", + "Check manifest contains images for your GPU", + ], + ) + + manifest["built_images"] = compatible_images + print(f"Filtered to {len(compatible_images)} compatible images\n") + + # Filter by skip_gpu_arch from model definitions (applies to both Docker and bare-metal) + runtime_gpu_arch = self.context.get_system_gpu_architecture() + if "built_models" in manifest and compatible_images: + self.rich_console.print("[cyan]Checking skip_gpu_arch model restrictions...[/cyan]") + compatible_images = self._filter_images_by_skip_gpu_arch( + compatible_images, manifest["built_models"], runtime_gpu_arch + ) + manifest["built_images"] = compatible_images + print(f"After skip_gpu_arch filtering: {len(compatible_images)} images to run\n") + + # NOTE: Dockerfile context filtering is already done during build phase + # Re-filtering during run phase causes issues because: + # 1. The build phase already filtered dockerfiles based on build-time context + # 2. All built images should be runnable on the runtime node + # 3. Legacy behavior: filtering happens once (either build or run, not both) + + # Write filtered manifest back to file so runner sees the filtered list + with open(manifest_file, "w") as f: + json.dump(manifest, f, indent=2) + + except Exception as e: + import traceback + self.rich_console.print(f"[yellow]Warning: GPU/Context filtering failed: {e}[/yellow]") + self.rich_console.print(f"[red]Traceback: {traceback.format_exc()}[/red]") + self.rich_console.print("[yellow]Proceeding with all images[/yellow]\n") + + # Copy scripts + self._copy_scripts() + + # Initialize runner + runner = ContainerRunner( + self.context, + self.data, + self.console, + live_output=getattr(self.args, "live_output", False), + additional_context=self.additional_context, + ) + runner.set_credentials(credentials) + + if hasattr(self.args, "output") and self.args.output: + runner.set_perf_csv_path(self.args.output) + + # Run phase always uses .run suffix + # For full workflow, logs are combined later by _combine_build_and_run_logs() + phase_suffix = ".run" + + # Run models + results = runner.run_models_from_manifest( + manifest_file=manifest_file, + registry=getattr(self.args, "registry", None), + timeout=timeout, + keep_alive=getattr(self.args, "keep_alive", False), + keep_model_dir=getattr(self.args, "keep_model_dir", False), + phase_suffix=phase_suffix, + ) + + self.rich_console.print(f"\n[green]✓ Local execution complete[/green]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + return results + + def _execute_distributed(self, target: str, manifest_file: str) -> Dict: + """Execute on distributed infrastructure.""" + self.rich_console.print(f"[cyan]Deploying to {target}...[/cyan]\n") + + # Import from deployment layer + from madengine.deployment.factory import DeploymentFactory + from madengine.deployment.base import DeploymentConfig + + # Add runtime flags to additional_context for deployment layer + if "live_output" not in self.additional_context: + self.additional_context["live_output"] = getattr(self.args, "live_output", False) + + # Pass session_start_row for result filtering in collect_results + session_start_row = self.session_tracker.session_start_row + if "session_start_row" not in self.additional_context: + self.additional_context["session_start_row"] = session_start_row + + # Create deployment configuration + deployment_config = DeploymentConfig( + target=target, + manifest_file=manifest_file, + additional_context=self.additional_context, + timeout=getattr(self.args, "timeout", 3600), + monitor=self.additional_context.get("monitor", True), + cleanup_on_failure=self.additional_context.get("cleanup_on_failure", True), + ) + + # Create and execute deployment + deployment = DeploymentFactory.create(deployment_config) + result = deployment.execute() + + if result.is_success: + self.rich_console.print(f"[green]✓ Deployment to {target} complete[/green]") + self.rich_console.print(f" Deployment ID: {result.deployment_id}") + if result.logs_path: + self.rich_console.print(f" Logs: {result.logs_path}") + else: + self.rich_console.print(f"[red]✗ Deployment to {target} failed[/red]") + self.rich_console.print(f" Error: {result.message}") + + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + # Return metrics in the format expected by display_results_table + # Extract successful_runs and failed_runs from metrics if available + if result.metrics: + return { + "successful_runs": result.metrics.get("successful_runs", []), + "failed_runs": result.metrics.get("failed_runs", []), + } + else: + return {"successful_runs": [], "failed_runs": []} + + def _show_node_info(self): + """Show node ROCm information.""" + self.console.sh("echo 'MAD Run Models'") + + host_os = self.context.ctx.get("host_os", "") + if "HOST_UBUNTU" in host_os: + print(self.console.sh("apt show rocm-libs -a", canFail=True)) + elif "HOST_CENTOS" in host_os: + print(self.console.sh("yum info rocm-libs", canFail=True)) + elif "HOST_SLES" in host_os: + print(self.console.sh("zypper info rocm-libs", canFail=True)) + elif "HOST_AZURE" in host_os: + print(self.console.sh("tdnf info rocm-libs", canFail=True)) + else: + self.rich_console.print("[yellow]Warning: Unable to detect host OS[/yellow]") + + def _cleanup_model_dir_copies(self): + """Clean up only madengine package files from scripts/common directory. + + This cleanup removes ONLY the files that were copied from madengine package: + - scripts/common/tools.json + - scripts/common/test_echo.sh + - scripts/common/pre_scripts/ + - scripts/common/post_scripts/ + - scripts/common/tools/ + + This preserves the user's actual scripts/ and docker/ directories in MAD project. + """ + import shutil + import subprocess + + # Only clean up scripts/common/ subdirectories that came from madengine package + common_dir = Path("scripts/common") + if not common_dir.exists(): + return + + # List of items to clean up (from madengine package) + items_to_cleanup = [ + "tools.json", + "test_echo.sh", + "pre_scripts", + "post_scripts", + "tools" + ] + + for item_name in items_to_cleanup: + item_path = common_dir / item_name + if item_path.exists(): + try: + if item_path.is_dir(): + # Fix permissions first for directories + try: + subprocess.run( + ["chmod", "-R", "+w", str(item_path)], + capture_output=True, + timeout=10 + ) + except (subprocess.TimeoutExpired, subprocess.CalledProcessError, OSError) as e: + print(f"Warning: chmod failed for {item_path}: {e}") + shutil.rmtree(item_path) + else: + item_path.unlink() + self.rich_console.print(f"[dim] Cleaned up: scripts/common/{item_name}[/dim]") + except Exception as e: + # Try with sudo for permission issues + try: + subprocess.run( + ["sudo", "rm", "-rf", str(item_path)], + check=True, + capture_output=True, + timeout=10 + ) + self.rich_console.print(f"[dim] Cleaned up: scripts/common/{item_name} (elevated)[/dim]") + except Exception as e2: + self.rich_console.print( + f"[yellow]⚠️ Warning: Could not clean up {item_path}: {e2}[/yellow]" + ) + + def _combine_build_and_run_logs(self, manifest_file: str): + """Combine build.live.log and run.live.log into live.log for full workflow. + + For full workflow (build + run), this creates a unified log file by: + 1. Reading the manifest to find models that were actually executed in this session + 2. Finding corresponding *.build.live.log and *.run.live.log files for those models + 3. Concatenating them into *.live.log + 4. Keeping the original build and run logs for reference + + Args: + manifest_file: Path to the manifest file containing executed models + """ + import json + + # Load manifest to get list of build log files + try: + with open(manifest_file, "r") as f: + manifest = json.load(f) + + built_images = manifest.get("built_images", {}) + if not built_images: + return # No models to process + except Exception as e: + self.rich_console.print(f"[yellow]⚠️ Warning: Could not load manifest for log combining: {e}[/yellow]") + return + + self.rich_console.print("\n[dim]📝 Combining build and run logs...[/dim]") + combined_count = 0 + + # Process each built image + for image_name, image_info in built_images.items(): + # Get build log file name from manifest + build_log = image_info.get("log_file") + if not build_log or not os.path.exists(build_log): + continue # Skip if build log doesn't exist + + # Derive the base name and corresponding run log + base_name = build_log.replace(".build.live.log", "") + run_log = f"{base_name}.run.live.log" + combined_log = f"{base_name}.live.log" + + # Check if run log exists + if not os.path.exists(run_log): + continue # Skip if run log doesn't exist + + try: + # Combine build and run logs + with open(combined_log, 'w') as outfile: + # Add build log + with open(build_log, 'r') as infile: + outfile.write(infile.read()) + + # Add separator + outfile.write("\n" + "=" * 80 + "\n") + outfile.write("RUN PHASE LOG\n") + outfile.write("=" * 80 + "\n\n") + + # Add run log + with open(run_log, 'r') as infile: + outfile.write(infile.read()) + + combined_count += 1 + self.rich_console.print(f"[dim] Combined: {combined_log}[/dim]") + + except Exception as e: + self.rich_console.print( + f"[yellow]⚠️ Warning: Could not combine logs for {base_name}: {e}[/yellow]" + ) + + if combined_count > 0: + self.rich_console.print(f"[dim]✓ Combined {combined_count} log file(s)[/dim]") + + def _copy_scripts(self): + """Copy common scripts to model directories. + + Handles scenarios: + 1. MAD Project: scripts/ already exists in current directory - just add madengine common files + 2. External MODEL_DIR: Copy from external path to current directory + 3. madengine Testing: Copy from src/madengine/scripts/common + + NOTE: Does NOT delete existing scripts/ or docker/ directories in current working directory. + """ + import shutil + + # Define ignore function for cache files (used for all copy operations) + def ignore_cache_files(directory, files): + """Ignore Python cache files and directories.""" + return [f for f in files if f.endswith('.pyc') or f == '__pycache__' or f.endswith('.pyo')] + + # Step 1: Check if MODEL_DIR points to external directory and copy if needed + # MODEL_DIR default is "." (current directory), so only copy if it's different + model_dir_env = os.environ.get("MODEL_DIR", ".") + model_dir_abs = os.path.abspath(model_dir_env) + current_dir_abs = os.path.abspath(".") + + # Only copy if MODEL_DIR points to a different directory (not current dir) + if model_dir_abs != current_dir_abs and os.path.exists(model_dir_env): + self.rich_console.print(f"[yellow]📁 External MODEL_DIR detected: {model_dir_env}[/yellow]") + self.rich_console.print("[yellow]Copying MODEL_DIR contents for run phase...[/yellow]") + + # Copy docker/ and scripts/ from MODEL_DIR (without deleting existing ones first) + for subdir in ["docker", "scripts"]: + src_path = Path(model_dir_env) / subdir + if src_path.exists(): + dest_path = Path(subdir) + # Use copytree with dirs_exist_ok=True to merge instead of replace + if dest_path.exists(): + # Only warn, don't delete existing directories + self.rich_console.print(f"[dim] Note: Merging {subdir}/ from MODEL_DIR with existing directory[/dim]") + shutil.copytree(src_path, dest_path, dirs_exist_ok=True, ignore=ignore_cache_files) + + self.rich_console.print("[green]✓ MODEL_DIR structure copied (docker/, scripts/)[/green]") + elif not os.path.exists(model_dir_env): + self.rich_console.print(f"[yellow]⚠️ Warning: MODEL_DIR '{model_dir_env}' does not exist, using current directory[/yellow]") + + # Step 2: Copy madengine's common scripts (pre_scripts, post_scripts, tools) + # This provides the execution framework scripts + # Find madengine installation path (works for both development and installed package) + madengine_common = None + + # Option 1: Development mode - check if running from source + dev_path = Path("src/madengine/scripts/common") + if dev_path.exists(): + madengine_common = dev_path + print(f"Found madengine scripts in development mode: {madengine_common}") + else: + # Option 2: Installed package - find via module location + try: + import madengine + madengine_module_path = Path(madengine.__file__).parent + installed_path = madengine_module_path / "scripts" / "common" + if installed_path.exists(): + madengine_common = installed_path + print(f"Found madengine scripts in installed package: {madengine_common}") + except Exception as e: + print(f"Could not locate madengine scripts: {e}") + + if madengine_common and madengine_common.exists(): + print(f"Copying madengine common scripts from {madengine_common} to scripts/common") + + dest_common = Path("scripts/common") + # Ensure the destination directory exists before copying + dest_common.mkdir(parents=True, exist_ok=True) + + # Copy pre_scripts, post_scripts, tools if they exist + for item in ["pre_scripts", "post_scripts", "tools", "tools.json", "test_echo.sh"]: + src_item = madengine_common / item + if src_item.exists(): + dest_item = dest_common / item + if dest_item.exists(): + if dest_item.is_dir(): + shutil.rmtree(dest_item) + else: + dest_item.unlink() + + if src_item.is_dir(): + shutil.copytree(src_item, dest_item, ignore=ignore_cache_files) + else: + shutil.copy2(src_item, dest_item) + print(f" Copied {item}") + else: + self.rich_console.print("[yellow]⚠️ Could not find madengine scripts directory[/yellow]") + + # Step 3: REMOVED - Distribution to model directories is incorrect + # scripts/common should remain at /scripts/common/ for proper relative path access + # Model scripts reference it via ../scripts/common/ from their directory (e.g., scripts/dummy/) + # + # This ensures compatibility with legacy workflow where: + # - scripts/common/ stays at working directory root + # - Model scripts use ../scripts/common/ relative paths + # - ContainerRunner mounts the entire working directory preserving structure + # + # Note: K8s and Slurm deployments have their own script handling mechanisms + # and do not rely on this local filesystem operation + + def _load_credentials(self) -> Optional[Dict]: + """Load credentials from credential.json and environment.""" + credentials = None + + credential_file = "credential.json" + if os.path.exists(credential_file): + try: + with open(credential_file) as f: + credentials = json.load(f) + except Exception as e: + print(f"Warning: Could not load credentials: {e}") + + # Override with environment variables + docker_hub_user = os.environ.get("MAD_DOCKERHUB_USER") + docker_hub_password = os.environ.get("MAD_DOCKERHUB_PASSWORD") + docker_hub_repo = os.environ.get("MAD_DOCKERHUB_REPO") + + if docker_hub_user and docker_hub_password: + if credentials is None: + credentials = {} + credentials["dockerhub"] = { + "username": docker_hub_user, + "password": docker_hub_password, + } + if docker_hub_repo: + credentials["dockerhub"]["repository"] = docker_hub_repo + + return credentials + + def _filter_images_by_gpu_compatibility( + self, built_images: Dict, runtime_gpu_vendor: str, runtime_gpu_arch: str + ) -> Dict: + """Filter images compatible with runtime GPU vendor and architecture. + + Args: + built_images: Dictionary of built images from manifest + runtime_gpu_vendor: Runtime GPU vendor (AMD, NVIDIA, NONE) + runtime_gpu_arch: Runtime GPU architecture (gfx90a, sm_90, etc.) + + Returns: + Dictionary of compatible images + """ + compatible_images = {} + + for model_name, image_info in built_images.items(): + image_gpu_vendor = image_info.get("gpu_vendor", "") + image_arch = image_info.get("gpu_architecture", "") + + # Legacy images without vendor info - treat as compatible for backward compatibility + if not image_gpu_vendor: + self.rich_console.print( + f"[yellow] Warning: {model_name} has no gpu_vendor, treating as compatible (legacy)[/yellow]" + ) + compatible_images[model_name] = image_info + continue + + # Check GPU vendor compatibility first (most important) + if runtime_gpu_vendor == "NONE" or image_gpu_vendor == runtime_gpu_vendor: + # Vendor matches or CPU-only, check architecture if specified + if image_arch: + # Architecture specified, must match + if image_arch == runtime_gpu_arch: + compatible_images[model_name] = image_info + else: + self.rich_console.print( + f"[dim] Skipping {model_name}: architecture mismatch " + f"({image_arch} != {runtime_gpu_arch})[/dim]" + ) + else: + # No architecture specified, vendor match is enough + compatible_images[model_name] = image_info + else: + # Vendor mismatch + self.rich_console.print( + f"[dim] Skipping {model_name}: GPU vendor mismatch " + f"({image_gpu_vendor} != {runtime_gpu_vendor})[/dim]" + ) + + return compatible_images + + def _filter_images_by_gpu_architecture( + self, built_images: Dict, runtime_gpu_arch: str + ) -> Dict: + """Legacy method for backward compatibility.""" + # Get runtime GPU vendor + runtime_gpu_vendor = self.context.get_gpu_vendor() if self.context else "NONE" + return self._filter_images_by_gpu_compatibility( + built_images, runtime_gpu_vendor, runtime_gpu_arch + ) + + def _filter_images_by_skip_gpu_arch( + self, built_images: Dict, built_models: Dict, runtime_gpu_arch: str + ) -> Dict: + """Filter out models that should skip the current GPU architecture. + + This implements the skip_gpu_arch logic from model definitions, + where models can specify GPU architectures they don't support. + + Args: + built_images: Dictionary of built images from manifest + built_models: Dictionary of model metadata from manifest + runtime_gpu_arch: Runtime GPU architecture (gfx90a, A100, etc.) + + Returns: + Dictionary of images that should run (not skipped) + """ + if getattr(self.args, 'disable_skip_gpu_arch', False): + # User disabled skip logic, run all models + self.rich_console.print("[dim] --disable-skip-gpu-arch flag set, skipping GPU architecture checks[/dim]") + return built_images + + compatible_images = {} + + for model_name, image_info in built_images.items(): + # Get model metadata to check skip_gpu_arch field + model_info = built_models.get(model_name, {}) + skip_gpu_arch_str = model_info.get("skip_gpu_arch", "") + + if skip_gpu_arch_str: + # Parse comma-separated list of architectures to skip + skip_list = [arch.strip() for arch in skip_gpu_arch_str.split(",")] + + # Normalize architecture comparison (handle "NVIDIA A100" -> "A100") + sys_gpu_arch = runtime_gpu_arch + if sys_gpu_arch and "NVIDIA" in sys_gpu_arch: + sys_gpu_arch = sys_gpu_arch.split()[1] + + if sys_gpu_arch in skip_list: + self.rich_console.print( + f"[yellow] Skipping model {model_name} as it is not supported on {runtime_gpu_arch} architecture.[/yellow]" + ) + + # Write SKIPPED status to perf CSV + self._write_skipped_status(model_name, image_info, runtime_gpu_arch) + continue + + compatible_images[model_name] = image_info + + return compatible_images + + def _write_skipped_status(self, model_name: str, image_info: Dict, gpu_arch: str) -> None: + """Write SKIPPED status to perf CSV for models that were skipped. + + Args: + model_name: Name of the model that was skipped + image_info: Image information dictionary + gpu_arch: GPU architecture that caused the skip + """ + try: + from madengine.reporting.update_perf_csv import update_perf_csv + import json + import tempfile + + # Create a perf entry for the skipped model + perf_entry = { + "model": model_name, + "status": "SKIPPED", + "reason": f"Model not supported on {gpu_arch} architecture", + "gpu_architecture": gpu_arch, + } + + # Write to temporary JSON file + with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + json.dump(perf_entry, f) + temp_file = f.name + + # Get output CSV path from args + output_csv = getattr(self.args, 'output', 'perf.csv') + + # Update perf CSV with skipped entry + update_perf_csv(exception_result=temp_file, perf_csv=output_csv) + + # Clean up temp file + import os + os.unlink(temp_file) + + except Exception as e: + self.rich_console.print(f"[dim] Warning: Could not write SKIPPED status to CSV: {e}[/dim]") + + def _infer_deployment_target(self, config: Dict) -> str: + """ + Infer deployment target from configuration structure. + + Convention over Configuration: + - Presence of "k8s" or "kubernetes" field → k8s deployment + - Presence of "slurm" field → slurm deployment + - Neither present → local execution + + Args: + config: Configuration dictionary + + Returns: + Deployment target: "k8s", "slurm", or "local" + """ + if "k8s" in config or "kubernetes" in config: + return "k8s" + elif "slurm" in config: + return "slurm" + else: + return "local" + + def _filter_images_by_dockerfile_context(self, built_images: Dict) -> Dict: + """Filter images by dockerfile context matching runtime context. + + This implements the legacy behavior where dockerfiles are filtered + at runtime based on their CONTEXT header matching the current runtime context. + + Args: + built_images: Dictionary of built images from manifest + + Returns: + Dictionary of images that match the runtime context + """ + if not self.context: + return built_images + + compatible_images = {} + + for image_name, image_info in built_images.items(): + dockerfile = image_info.get("dockerfile", "") + + if not dockerfile: + # No dockerfile info, include by default (legacy compatibility) + compatible_images[image_name] = image_info + continue + + # Check if dockerfile exists + if not os.path.exists(dockerfile): + self.rich_console.print( + f"[dim] Warning: Dockerfile {dockerfile} not found. Including by default.[/dim]" + ) + compatible_images[image_name] = image_info + continue + + # Read dockerfile context header + try: + dockerfile_context_str = self.console.sh( + f"head -n5 {dockerfile} | grep '# CONTEXT ' | sed 's/# CONTEXT //g'" + ).strip() + + if not dockerfile_context_str: + # No context header, include by default + compatible_images[image_name] = image_info + continue + + # Create a dict with this dockerfile and its context + dockerfile_dict = {dockerfile: dockerfile_context_str} + + # Use context.filter() to check if this dockerfile matches runtime context + filtered = self.context.filter(dockerfile_dict) + + if filtered: + # Dockerfile matches runtime context + compatible_images[image_name] = image_info + else: + self.rich_console.print( + f"[dim] Skipping {image_name}: dockerfile context doesn't match runtime context[/dim]" + ) + + except Exception as e: + # If we can't read the dockerfile, include it by default + self.rich_console.print( + f"[dim] Warning: Could not read context for {dockerfile}: {e}. Including by default.[/dim]" + ) + compatible_images[image_name] = image_info + + return compatible_images + diff --git a/src/madengine/reporting/README.md b/src/madengine/reporting/README.md new file mode 100644 index 00000000..33d8e5a4 --- /dev/null +++ b/src/madengine/reporting/README.md @@ -0,0 +1,137 @@ +# Performance Reporting Layer + +**Status**: Active +**Used by**: Modern `madengine` CLI + +--- + +## 🎯 Purpose + +Handles performance metrics collection, processing, and CSV output generation for model execution results. + +--- + +## 📦 Components + +### **`update_perf_csv.py`** + +Updates performance CSV files with run results from both legacy and new CLI. + +**Used by:** +- ✅ `execution/container_runner.py` (modern madengine CLI) + +**Key Functions:** +```python +from madengine.reporting.update_perf_csv import update_perf_csv, flatten_tags + +# Update CSV with new results +update_perf_csv( + perf_json_path="results.json", + output_csv="performance.csv" +) + +# Flatten nested tags for CSV export +flattened = flatten_tags(perf_entry) +``` + +--- + +## 🗂️ Legacy Reporting Tools + +The following legacy-only reporting tools remain in `tools/`: + +| File | Purpose | Used By | Status | +|------|---------|---------|--------| +| `tools/csv_to_html.py` | Convert CSV to HTML | `mad.py`, `run_models.py` | Legacy only | +| `tools/csv_to_email.py` | Email CSV reports | `mad.py` | Legacy only | + +These tools are **NOT** used by the modern `madengine` CLI. + +--- + +## 📋 Architecture Decision + +**Why is `update_perf_csv.py` in `reporting/` instead of `tools/`?** + +1. ✅ **Shared across architectures**: Used by both legacy and new CLI +2. ✅ **Active development**: Not deprecated, actively maintained +3. ✅ **Clear responsibility**: Performance data processing +4. ✅ **Semantic clarity**: Reporting is a distinct concern + +**Why are other CSV tools still in `tools/`?** + +- They are **not used** by the modern `madengine` CLI +- Kept for backward compatibility only +- Will be deprecated when legacy CLI is retired + +--- + +## 🔄 Usage Examples + +### **New madengine** (via `container_runner.py`) + +```python +from madengine.reporting.update_perf_csv import update_perf_csv + +# After model execution completes +results_json = "/path/to/results.json" +output_csv = "/path/to/performance.csv" + +update_perf_csv(results_json, output_csv) +``` + +### **Legacy madengine** (via `run_models.py` or `mad.py`) + +```python +from madengine.reporting.update_perf_csv import UpdatePerfCsv + +# Class-based interface (legacy) +updater = UpdatePerfCsv(args) +updater.run() +``` + +--- + +## 📊 Data Flow + +``` +Model Execution + ↓ + Results JSON + ↓ +update_perf_csv() + ↓ +Performance CSV + ↓ +(Optional) CSV → HTML (legacy only) +(Optional) CSV → Email (legacy only) +``` + +--- + +## 🧪 Testing + +```bash +# Test the reporting module +pytest tests/test_update_perf_csv.py -v + +# Test integration with container runner +pytest tests/test_container_runner.py -v -k "perf" +``` + +--- + +## 🚀 Future Enhancements + +Potential improvements (not currently planned): + +- JSON output format (in addition to CSV) +- Parquet output for large datasets +- Real-time metrics streaming +- Integration with `database/` layer for direct ingestion + +--- + +**Last Updated**: November 30, 2025 +**Maintainer**: madengine Team + diff --git a/src/madengine/reporting/__init__.py b/src/madengine/reporting/__init__.py new file mode 100644 index 00000000..26a312dc --- /dev/null +++ b/src/madengine/reporting/__init__.py @@ -0,0 +1,21 @@ +""" +madengine Reporting + +Reporting modules for madengine including performance CSV and superset generation. +""" + +from .update_perf_csv import update_perf_csv, flatten_tags +from .update_perf_super import ( + update_perf_super_json, + update_perf_super_csv, + convert_super_json_to_csv, +) + +__all__ = [ + "update_perf_csv", + "flatten_tags", + "update_perf_super_json", + "update_perf_super_csv", + "convert_super_json_to_csv", +] + diff --git a/src/madengine/reporting/csv_to_email.py b/src/madengine/reporting/csv_to_email.py new file mode 100644 index 00000000..0902ef00 --- /dev/null +++ b/src/madengine/reporting/csv_to_email.py @@ -0,0 +1,168 @@ +"""Module for converting CSV files to email-ready HTML reports. + +This module provides functionality to convert multiple CSV files in a directory +to a consolidated HTML report suitable for email distribution. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import os +import argparse +import logging +from typing import List, Tuple + +import pandas as pd + +logger = logging.getLogger(__name__) + + +def find_csv_files(directory: str) -> List[str]: + """Find all CSV files in the specified directory. + + Args: + directory: Path to the directory to search. + + Returns: + List of CSV file paths found in the directory. + """ + csv_files = [] + for filename in os.listdir(directory): + if filename.endswith('.csv'): + csv_files.append(os.path.join(directory, filename)) + return sorted(csv_files) + + +def csv_to_html_section(file_path: str) -> Tuple[str, str]: + """Convert a CSV file to an HTML section with header. + + Args: + file_path: Path to the CSV file. + + Returns: + Tuple of (section_name, html_content). + """ + # Read the CSV file + df = pd.read_csv(file_path) + + # Get section name from file path + base_name = os.path.basename(file_path) + section_name = os.path.splitext(base_name)[0] + + # Convert DataFrame to HTML + html_table = df.to_html(index=False) + + # Create HTML section with header + html_section = f"

{section_name}

\n{html_table}\n" + + return section_name, html_section + + +def convert_directory_csvs_to_html( + directory_path: str, + output_file: str = "run_results.html" +) -> str: + """Convert all CSV files in a directory to a single HTML file. + + Args: + directory_path: Path to the directory containing CSV files. + output_file: Name of the output HTML file. + + Returns: + Path to the generated HTML file. + + Raises: + NotADirectoryError: If the path is not a directory. + FileNotFoundError: If the directory does not exist. + """ + # Validate input + if not os.path.exists(directory_path): + raise FileNotFoundError(f"Directory not found: {directory_path}") + + if not os.path.isdir(directory_path): + raise NotADirectoryError(f"Path is not a directory: {directory_path}") + + # Find all CSV files + csv_files = find_csv_files(directory_path) + + if not csv_files: + logger.warning(f"No CSV files found in directory: {directory_path}") + print(f"⚠️ No CSV files found in {directory_path}") + return None + + print(f"📊 Found {len(csv_files)} CSV file(s) to process") + + # Process each CSV file and combine HTML + full_html_content = "" + for csv_file in csv_files: + try: + section_name, html_section = csv_to_html_section(csv_file) + full_html_content += html_section + logger.info(f"Processed: {section_name}") + print(f" ✓ Converted {os.path.basename(csv_file)}") + except Exception as e: + logger.error(f"Failed to process {csv_file}: {e}") + print(f" ✗ Failed to convert {os.path.basename(csv_file)}: {e}") + + # Write combined HTML to output file + output_path = os.path.join(directory_path, output_file) if directory_path != "." else output_file + + with open(output_path, 'w', encoding='utf-8') as html_file: + html_file.write(full_html_content) + + logger.info(f"Generated HTML report: {output_path}") + return output_path + + +class ConvertCsvToEmail: + """Handler class for CSV to email-ready HTML conversion command. + + This class provides a command-line interface wrapper for converting + multiple CSV files in a directory to a consolidated HTML report. + """ + + def __init__(self, args: argparse.Namespace): + """Initialize the ConvertCsvToEmail handler. + + Args: + args: Command-line arguments containing path to CSV directory. + """ + self.args = args + self.return_status = False + + def run(self) -> bool: + """Execute the CSV to email HTML conversion. + + Returns: + True if conversion was successful, False otherwise. + """ + directory_path = getattr(self.args, 'csv_file_path', '.') or '.' + output_file = getattr(self.args, 'output_file', 'run_results.html') + + print("\n" + "=" * 80) + print("📧 CONVERTING CSV FILES TO EMAIL REPORT") + print("=" * 80) + print(f"📂 Input directory: {directory_path}") + + try: + output_path = convert_directory_csvs_to_html(directory_path, output_file) + + if output_path: + print(f"📄 Output file: {output_path}") + print("✅ Email report generated successfully") + else: + print("ℹ️ No files to process") + + print("=" * 80 + "\n") + self.return_status = True + except (FileNotFoundError, NotADirectoryError) as e: + print(f"❌ Error: {e}") + print("=" * 80 + "\n") + self.return_status = False + except Exception as e: + print(f"❌ Unexpected error during conversion: {e}") + logger.exception("Email report generation failed") + print("=" * 80 + "\n") + self.return_status = False + + return self.return_status + diff --git a/src/madengine/reporting/csv_to_html.py b/src/madengine/reporting/csv_to_html.py new file mode 100644 index 00000000..baf7a027 --- /dev/null +++ b/src/madengine/reporting/csv_to_html.py @@ -0,0 +1,136 @@ +"""Module for converting CSV files to HTML reports. + +This module provides functionality to convert CSV files to HTML format +for generating performance reports and visualizations. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import os +import argparse +import logging +from typing import Optional + +import pandas as pd + +logger = logging.getLogger(__name__) + + +def convert_csv_to_html( + file_path: str, + output_path: Optional[str] = None, + include_index: bool = False +) -> str: + """Convert a CSV file to an HTML file. + + Args: + file_path: The path to the CSV file. + output_path: Optional custom output path. If None, creates HTML in same directory. + include_index: Whether to include DataFrame index in HTML output. + + Returns: + The path to the generated HTML file. + + Raises: + FileNotFoundError: If the CSV file does not exist. + ValueError: If the file is not a CSV file. + pd.errors.EmptyDataError: If the CSV file is empty. + """ + # Validate input + if not os.path.exists(file_path): + raise FileNotFoundError(f"CSV file not found: {file_path}") + + if not file_path.endswith('.csv'): + raise ValueError(f"File must be a CSV file: {file_path}") + + # Determine output path + if output_path is None: + base_path = os.path.dirname(file_path) + base_name = os.path.basename(file_path) + file_name = os.path.splitext(base_name)[0] + + output_path = os.path.join(base_path, f"{file_name}.html") if base_path else f"{file_name}.html" + + # Read CSV file + logger.info(f"Reading CSV file: {file_path}") + try: + df = pd.read_csv(file_path) + except pd.errors.EmptyDataError: + logger.error(f"CSV file is empty: {file_path}") + raise + + # Display DataFrame (with beautiful formatting if available) + file_name = os.path.splitext(os.path.basename(file_path))[0] + try: + from madengine.utils.log_formatting import print_dataframe_beautiful + print_dataframe_beautiful(df, f"Converting CSV: {file_name}") + except ImportError: + # Fallback to basic formatting if utils not available + print(f"\n📊 Converting CSV: {file_name}") + print("=" * 80) + print(df.to_string(max_rows=20, max_cols=10)) + print("=" * 80) + + # Convert DataFrame to HTML + logger.info(f"Converting to HTML: {output_path}") + df_html = df.to_html(index=include_index) + + # Write HTML file + with open(output_path, 'w', encoding='utf-8') as html_file: + html_file.write(df_html) + + logger.info(f"✅ Successfully converted {file_path} to {output_path}") + return output_path + + +class ConvertCsvToHtml: + """Handler class for CSV to HTML conversion command. + + This class provides a command-line interface wrapper for converting + CSV files to HTML format. + """ + + def __init__(self, args: argparse.Namespace): + """Initialize the ConvertCsvToHtml handler. + + Args: + args: Command-line arguments containing csv_file_path. + """ + self.args = args + self.return_status = False + + def run(self) -> bool: + """Execute the CSV to HTML conversion. + + Returns: + True if conversion was successful, False otherwise. + """ + file_path = self.args.csv_file_path + + print("\n" + "=" * 80) + print("🔄 CONVERTING CSV TO HTML REPORT") + print("=" * 80) + print(f"📂 Input file: {file_path}") + + try: + output_path = convert_csv_to_html(file_path) + print(f"📄 Output file: {output_path}") + print("✅ Conversion completed successfully") + print("=" * 80 + "\n") + self.return_status = True + except FileNotFoundError as e: + print(f"❌ Error: {e}") + print("=" * 80 + "\n") + self.return_status = False + except ValueError as e: + print(f"❌ Error: {e}") + print("=" * 80 + "\n") + self.return_status = False + except Exception as e: + print(f"❌ Unexpected error during conversion: {e}") + logger.exception("Conversion failed") + print("=" * 80 + "\n") + self.return_status = False + + return self.return_status + diff --git a/src/madengine/tools/update_perf_csv.py b/src/madengine/reporting/update_perf_csv.py similarity index 67% rename from src/madengine/tools/update_perf_csv.py rename to src/madengine/reporting/update_perf_csv.py index 5e32e3e2..c3666486 100644 --- a/src/madengine/tools/update_perf_csv.py +++ b/src/madengine/reporting/update_perf_csv.py @@ -9,16 +9,17 @@ import json import argparse import typing + # third-party imports import pandas as pd def df_strip_columns(df: pd.DataFrame) -> pd.DataFrame: """Strip the column names of a DataFrame. - + Args: df: The DataFrame to strip the column names of. - + Returns: The DataFrame with stripped column names. """ @@ -28,10 +29,10 @@ def df_strip_columns(df: pd.DataFrame) -> pd.DataFrame: def read_json(js: str) -> dict: """Read a JSON file. - + Args: js: The path to the JSON file. - + Returns: The JSON dictionary. """ @@ -42,7 +43,7 @@ def read_json(js: str) -> dict: def flatten_tags(perf_entry: dict): """Flatten the tags of a performance entry. - + Args: perf_entry: The performance entry. @@ -56,7 +57,7 @@ def flatten_tags(perf_entry: dict): def perf_entry_df_to_csv(perf_entry: pd.DataFrame) -> None: """Write the performance entry DataFrame to a CSV file. - + Args: perf_entry: The performance entry DataFrame. @@ -68,7 +69,7 @@ def perf_entry_df_to_csv(perf_entry: pd.DataFrame) -> None: def perf_entry_dict_to_csv(perf_entry: typing.Dict) -> None: """Write the performance entry dictionary to a CSV file. - + Args: perf_entry: The performance entry dictionary. """ @@ -78,22 +79,19 @@ def perf_entry_dict_to_csv(perf_entry: typing.Dict) -> None: def handle_multiple_results( - perf_csv_df: pd.DataFrame, - multiple_results: str, - common_info: str, - model_name: str - ) -> pd.DataFrame: + perf_csv_df: pd.DataFrame, multiple_results: str, common_info: str, model_name: str +) -> pd.DataFrame: """Handle multiple results. - + Args: perf_csv_df: The performance csv DataFrame. multiple_results: The path to the multiple results CSV file. common_info: The path to the common info JSON file. model_name: The model name. - + Returns: The updated performance csv DataFrame. - + Raises: AssertionError: If the number of columns in the performance csv DataFrame is not equal to the length of the row. """ @@ -123,16 +121,40 @@ def handle_multiple_results( else: row["status"] = "FAILURE" + # Ensure all values are scalars (convert lists to strings) + for key, value in row.items(): + if isinstance(value, (list, tuple)): + row[key] = ",".join(str(v) for v in value) + + # Create a single-row DataFrame from the row dict + row_df = pd.DataFrame([row]) final_multiple_results_df = pd.concat( - [final_multiple_results_df, pd.DataFrame(row, index=[0])], ignore_index=True + [final_multiple_results_df, row_df], ignore_index=True ) - # Reorder columns according to existing perf csv + + # Reorder columns according to existing perf csv (do this once after loop) + if not perf_csv_df.empty: columns = perf_csv_df.columns.tolist() - # Add any additional columns to the end + # Add any additional columns from final_multiple_results_df to the end columns = columns + [col for col in final_multiple_results_df.columns if col not in columns] + # Reorder final_multiple_results_df to match + for col in columns: + if col not in final_multiple_results_df.columns: + final_multiple_results_df[col] = "" final_multiple_results_df = final_multiple_results_df[columns] perf_entry_df_to_csv(final_multiple_results_df) + + # Also save as JSON for consistency with single result workflow + # This ensures perf_entry.json is always up-to-date regardless of result type + perf_entry_list = final_multiple_results_df.to_dict(orient='records') + with open("perf_entry.json", "w") as f: + # If multiple entries, save as array; if single, save as object for consistency + if len(perf_entry_list) == 1: + json.dump(perf_entry_list[0], f, indent=2) + else: + json.dump(perf_entry_list, f, indent=2) + if perf_csv_df.empty: perf_csv_df = final_multiple_results_df else: @@ -141,16 +163,13 @@ def handle_multiple_results( return perf_csv_df -def handle_single_result( - perf_csv_df: pd.DataFrame, - single_result: str - ) -> pd.DataFrame: +def handle_single_result(perf_csv_df: pd.DataFrame, single_result: str) -> pd.DataFrame: """Handle a single result. - + Args: perf_csv_df: The performance csv DataFrame. single_result: The path to the single result JSON file. - + Returns: The updated performance csv DataFrame. @@ -161,23 +180,30 @@ def handle_single_result( perf_entry_dict_to_csv(single_result_json) single_result_df = pd.DataFrame(single_result_json, index=[0]) if perf_csv_df.empty: + # If perf_csv_df is empty but has columns, fill missing columns with empty strings + for col in perf_csv_df.columns: + if col not in single_result_df.columns: + single_result_df[col] = "" perf_csv_df = single_result_df[perf_csv_df.columns] else: + # Add missing columns to single_result_df before concatenation + for col in perf_csv_df.columns: + if col not in single_result_df.columns: + single_result_df[col] = "" perf_csv_df = pd.concat([perf_csv_df, single_result_df], ignore_index=True) return perf_csv_df def handle_exception_result( - perf_csv_df: pd.DataFrame, - exception_result: str - ) -> pd.DataFrame: + perf_csv_df: pd.DataFrame, exception_result: str +) -> pd.DataFrame: """Handle an exception result. - + Args: perf_csv_df: The performance csv DataFrame. exception_result: The path to the exception result JSON file. - + Returns: The updated performance csv DataFrame. @@ -188,28 +214,41 @@ def handle_exception_result( perf_entry_dict_to_csv(exception_result_json) exception_result_df = pd.DataFrame(exception_result_json, index=[0]) if perf_csv_df.empty: + # If perf_csv_df is empty but has columns, fill missing columns with empty strings + for col in perf_csv_df.columns: + if col not in exception_result_df.columns: + exception_result_df[col] = "" perf_csv_df = exception_result_df[perf_csv_df.columns] else: + # Add missing columns to exception_result_df before concatenation + for col in perf_csv_df.columns: + if col not in exception_result_df.columns: + exception_result_df[col] = "" perf_csv_df = pd.concat([perf_csv_df, exception_result_df], ignore_index=True) return perf_csv_df def update_perf_csv( - perf_csv: str, - multiple_results: typing.Optional[str] = None, - single_result: typing.Optional[str] = None, - exception_result: typing.Optional[str] = None, - common_info: typing.Optional[str] = None, - model_name: typing.Optional[str] = None, - ): + perf_csv: str, + multiple_results: typing.Optional[str] = None, + single_result: typing.Optional[str] = None, + exception_result: typing.Optional[str] = None, + common_info: typing.Optional[str] = None, + model_name: typing.Optional[str] = None, +): """Update the performance csv file with the latest performance data.""" - print(f"Attaching performance metrics of models to perf.csv") + print("\n" + "=" * 80) + print("📈 ATTACHING PERFORMANCE METRICS TO DATABASE") + print("=" * 80) + print(f"📂 Target file: {perf_csv}") + # read perf.csv perf_csv_df = df_strip_columns(pd.read_csv(perf_csv)) # handle multiple_results, single_result, and exception_result if multiple_results: + print("🔄 Processing multiple results...") perf_csv_df = handle_multiple_results( perf_csv_df, multiple_results, @@ -217,17 +256,19 @@ def update_perf_csv( model_name, ) elif single_result: + print("🔄 Processing single result...") perf_csv_df = handle_single_result(perf_csv_df, single_result) elif exception_result: - perf_csv_df = handle_exception_result( - perf_csv_df, exception_result - ) + print("⚠️ Processing exception result...") + perf_csv_df = handle_exception_result(perf_csv_df, exception_result) else: - print("No results to update in perf.csv") + print("ℹ️ No results to update in perf.csv") # write new perf.csv # Note that this file will also generate a perf_entry.csv regardless of the output file args. perf_csv_df.to_csv(perf_csv, index=False) + print(f"✅ Successfully updated: {perf_csv}") + print("=" * 80 + "\n") class UpdatePerfCsv: @@ -247,12 +288,17 @@ def __init__(self, args: argparse.Namespace): def run(self): """Update the performance csv file with the latest performance data.""" - print(f"Updating performance metrics of models perf.csv to database") + print("\n" + "=" * 80) + print("📊 UPDATING PERFORMANCE METRICS DATABASE") + print("=" * 80) + print(f"📂 Processing: {self.args.perf_csv}") + # read perf.csv perf_csv_df = df_strip_columns(pd.read_csv(self.args.perf_csv)) # handle multiple_results, single_result, and exception_result if self.args.multiple_results: + print("🔄 Processing multiple results...") perf_csv_df = handle_multiple_results( perf_csv_df, self.args.multiple_results, @@ -260,17 +306,22 @@ def run(self): self.args.model_name, ) elif self.args.single_result: + print("🔄 Processing single result...") perf_csv_df = handle_single_result(perf_csv_df, self.args.single_result) elif self.args.exception_result: + print("⚠️ Processing exception result...") perf_csv_df = handle_exception_result( perf_csv_df, self.args.exception_result ) else: - print("No results to update in perf.csv") + print("ℹ️ No results to update in perf.csv") # write new perf.csv # Note that this file will also generate a perf_entry.csv regardless of the output file args. perf_csv_df.to_csv(self.args.perf_csv, index=False) + print(f"✅ Successfully updated: {self.args.perf_csv}") + print("=" * 80 + "\n") + self.return_status = True return self.return_status diff --git a/src/madengine/reporting/update_perf_super.py b/src/madengine/reporting/update_perf_super.py new file mode 100644 index 00000000..f0d5cda5 --- /dev/null +++ b/src/madengine/reporting/update_perf_super.py @@ -0,0 +1,398 @@ +"""Module to update the perf_super.json file with enhanced performance data. + +This module is used to update the perf_super.json file (cumulative) with performance data +that includes configuration information from config files, and provides CSV/JSON export. +It also generates perf_entry_super.json (latest run only) for consistency with perf_entry.json. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in imports +import json +import os +import typing +# third-party imports +import pandas as pd +# MAD Engine imports +from madengine.utils.config_parser import ConfigParser + + +def read_json(js: str) -> typing.Union[dict, list]: + """Read a JSON file. + + Args: + js: The path to the JSON file. + + Returns: + The JSON dictionary or list. + """ + with open(js, 'r') as f: + return json.load(f) + + +def write_json(data: typing.Union[dict, list], output_path: str) -> None: + """Write data to a JSON file. + + Args: + data: The data to write (dict or list). + output_path: The path to the output JSON file. + """ + with open(output_path, 'w') as f: + json.dump(data, f, indent=2) + + +def load_perf_super_json(perf_super_json: str) -> list: + """Load existing perf_super.json file (cumulative). + + Args: + perf_super_json: Path to perf_super.json file. + + Returns: + List of performance records, or empty list if file doesn't exist. + """ + if not os.path.exists(perf_super_json): + return [] + + try: + data = read_json(perf_super_json) + # Ensure it's a list + if isinstance(data, list): + return data + else: + return [data] + except Exception as e: + print(f"Warning: Could not load existing {perf_super_json}: {e}") + return [] + + +def handle_multiple_results_super( + perf_super_list: list, + multiple_results: str, + common_info: str, + model_name: str, + config_parser: ConfigParser + ) -> list: + """Handle multiple results with config matching. + + Args: + perf_super_list: List of existing performance records. + multiple_results: The path to the multiple results CSV file. + common_info: The path to the common info JSON file. + model_name: The model name. + config_parser: ConfigParser instance for loading configs. + + Returns: + Updated list of performance records with configs. + """ + # Load multiple results CSV + multiple_results_df = pd.read_csv(multiple_results) + multiple_results_df.columns = multiple_results_df.columns.str.strip() + + # Check required columns + required_cols = ['model', 'performance', 'metric'] + for col in required_cols: + if col not in multiple_results_df.columns: + raise RuntimeError(f"{multiple_results} file is missing the {col} column") + + # Load common info + common_info_json = read_json(common_info) + + # Parse config file from args if present + configs_data = None + if 'args' in common_info_json and common_info_json['args']: + # Try to extract config path from args + scripts_path = common_info_json.get('pipeline', '') + configs_data = config_parser.parse_and_load( + common_info_json['args'], + scripts_path + ) + + # Process each result row + for result_row in multiple_results_df.to_dict(orient="records"): + record = common_info_json.copy() + + # Update model name + result_model = result_row.pop("model") + record["model"] = f"{model_name}_{result_model}" + + # Extract standard performance/metric columns + record["performance"] = result_row.pop("performance") + record["metric"] = result_row.pop("metric") + + # Put remaining metrics into multi_results + # Exclude internal fields that shouldn't be in multi_results + extra_metrics = {k: v for k, v in result_row.items() + if k not in ["status"] and pd.notna(v)} + if extra_metrics: + record["multi_results"] = extra_metrics + else: + record["multi_results"] = None + + # Set status based on performance + if record.get("performance") is not None and pd.notna(record.get("performance")): + record["status"] = "SUCCESS" + else: + record["status"] = "FAILURE" + + # Match config to this specific result + if configs_data: + if isinstance(configs_data, list): + # For CSV configs with multiple rows, try to match + matched_config = config_parser.match_config_to_result( + configs_data, + result_row, + result_model + ) + record["configs"] = matched_config + else: + # For JSON/YAML configs, use as-is + record["configs"] = configs_data + else: + record["configs"] = None + + perf_super_list.append(record) + + return perf_super_list + + +def handle_single_result_super( + perf_super_list: list, + single_result: str + ) -> list: + """Handle a single result. + + Args: + perf_super_list: List of existing performance records. + single_result: The path to the single result JSON file. + + Returns: + Updated list of performance records. + """ + single_result_json = read_json(single_result) + + # Ensure configs field exists (may be None) + if "configs" not in single_result_json: + single_result_json["configs"] = None + + # Ensure multi_results field exists (may be None) + if "multi_results" not in single_result_json: + single_result_json["multi_results"] = None + + perf_super_list.append(single_result_json) + return perf_super_list + + +def handle_exception_result_super( + perf_super_list: list, + exception_result: str + ) -> list: + """Handle an exception result. + + Args: + perf_super_list: List of existing performance records. + exception_result: The path to the exception result JSON file. + + Returns: + Updated list of performance records. + """ + exception_result_json = read_json(exception_result) + + # Ensure configs field exists (may be None) + if "configs" not in exception_result_json: + exception_result_json["configs"] = None + + # Ensure multi_results field exists (may be None) + if "multi_results" not in exception_result_json: + exception_result_json["multi_results"] = None + + perf_super_list.append(exception_result_json) + return perf_super_list + + +def update_perf_super_json( + perf_super_json: str, + multiple_results: typing.Optional[str] = None, + single_result: typing.Optional[str] = None, + exception_result: typing.Optional[str] = None, + common_info: typing.Optional[str] = None, + model_name: typing.Optional[str] = None, + scripts_base_dir: typing.Optional[str] = None, + ) -> int: + """Update the perf_super.json file (cumulative) with the latest performance data. + + Args: + perf_super_json: Path to perf_super.json file (cumulative). + multiple_results: Path to multiple results CSV file. + single_result: Path to single result JSON file. + exception_result: Path to exception result JSON file. + common_info: Path to common info JSON file. + model_name: The model name. + scripts_base_dir: Base directory for scripts (for config file resolution). + + Returns: + Number of entries added in this update. + """ + print("\n" + "=" * 80) + print("📊 UPDATING PERFORMANCE SUPERSET DATABASE") + print("=" * 80) + print(f"📂 Target file: {perf_super_json}") + + # Load existing perf_super.json + perf_super_list = load_perf_super_json(perf_super_json) + initial_count = len(perf_super_list) + + # Create config parser + config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) + + # Handle different result types + if multiple_results: + print("🔄 Processing multiple results with configs...") + perf_super_list = handle_multiple_results_super( + perf_super_list, + multiple_results, + common_info, + model_name, + config_parser, + ) + elif single_result: + print("🔄 Processing single result with configs...") + perf_super_list = handle_single_result_super(perf_super_list, single_result) + elif exception_result: + print("⚠️ Processing exception result...") + perf_super_list = handle_exception_result_super( + perf_super_list, exception_result + ) + else: + print("ℹ️ No results to update in perf_super.json") + return 0 + + # Write updated perf_super.json + write_json(perf_super_list, perf_super_json) + entries_added = len(perf_super_list) - initial_count + print(f"✅ Successfully updated: {perf_super_json} (added {entries_added} entries)") + print("=" * 80 + "\n") + + return entries_added + + +def generate_perf_entry_super_json( + perf_super_json: str = "perf_super.json", + perf_entry_super_json: str = "perf_entry_super.json", + num_entries: int = 1 +) -> None: + """Generate perf_entry_super.json (latest entries) from perf_super.json (cumulative). + + Args: + perf_super_json: Path to cumulative JSON source + perf_entry_super_json: Path to entry JSON output (latest entries only) + num_entries: Number of latest entries to include + """ + if not os.path.exists(perf_super_json): + print(f"⚠️ {perf_super_json} not found, skipping entry JSON generation") + return + + data = read_json(perf_super_json) + if not isinstance(data, list): + data = [data] + + if not data: + print(f"⚠️ {perf_super_json} is empty, skipping entry JSON generation") + return + + # Take the latest num_entries entries + entry_data = data[-num_entries:] if num_entries > 0 else [data[-1]] + + # Write to perf_entry_super.json + write_json(entry_data, perf_entry_super_json) + print(f"✅ Generated entry JSON: {perf_entry_super_json} ({len(entry_data)} entries)") + + +def convert_super_json_to_csv( + perf_super_json: str, + output_csv: str, + entry_only: bool = False, + num_entries: int = 1 +) -> None: + """Convert JSON to CSV format. + + Args: + perf_super_json: Path to JSON source + output_csv: Output CSV path + entry_only: If True, only convert latest entries; if False, convert all + num_entries: Number of latest entries to include when entry_only=True + """ + # Load JSON list + if not os.path.exists(perf_super_json): + print(f"⚠️ {perf_super_json} not found, skipping CSV generation") + return + + data = read_json(perf_super_json) + if not isinstance(data, list): + data = [data] + + if not data: + print(f"⚠️ {perf_super_json} is empty, skipping CSV generation") + return + + if entry_only and data: + # Take the latest num_entries entries + data = data[-num_entries:] if num_entries > 0 else [data[-1]] + + # Convert to DataFrame + df = pd.DataFrame(data) + + # Serialize complex fields to JSON strings + if 'configs' in df.columns: + df['configs'] = df['configs'].apply( + lambda x: json.dumps(x) if x is not None else None + ) + + if 'multi_results' in df.columns: + df['multi_results'] = df['multi_results'].apply( + lambda x: json.dumps(x) if x is not None else None + ) + + # Write to CSV + df.to_csv(output_csv, index=False) + print(f"✅ Generated CSV: {output_csv} ({len(df)} entries)") + + +def update_perf_super_csv( + perf_super_json: str = "perf_super.json", + perf_super_csv: str = "perf_super.csv", + num_entries: int = 1 +) -> None: + """Generate perf_entry_super.json, perf_entry_super.csv and perf_super.csv from perf_super.json. + + Args: + perf_super_json: Path to cumulative JSON source (perf_super.json) + perf_super_csv: Path to cumulative CSV (perf_super.csv) + num_entries: Number of latest entries to include in perf_entry_super.* + """ + print("\n" + "=" * 80) + print("📄 GENERATING FILES FROM PERFORMANCE SUPERSET") + print("=" * 80) + + # Generate perf_entry_super.json (latest entries from current run) + generate_perf_entry_super_json( + perf_super_json=perf_super_json, + perf_entry_super_json="perf_entry_super.json", + num_entries=num_entries + ) + + # Generate perf_entry_super.csv (latest entries from current run) + convert_super_json_to_csv( + "perf_entry_super.json", # Use the entry JSON as source + "perf_entry_super.csv", + entry_only=False # Read all from entry JSON (already filtered) + ) + + # Generate perf_super.csv (all entries) + convert_super_json_to_csv( + perf_super_json, + perf_super_csv, + entry_only=False + ) + + print("=" * 80 + "\n") + diff --git a/src/madengine/scripts/common/post_scripts/gpu_info_post.sh b/src/madengine/scripts/common/post_scripts/gpu_info_post.sh index 5582b986..337a9550 100644 --- a/src/madengine/scripts/common/post_scripts/gpu_info_post.sh +++ b/src/madengine/scripts/common/post_scripts/gpu_info_post.sh @@ -4,19 +4,60 @@ # All rights reserved. # -set -e set -x tool=$1 +# Output filename is tool_output.csv (e.g., gpu_info_power_profiler_output.csv) OUTPUT=${tool}_output.csv -SAVESPACE=/myworkspace/ -cd $SAVESPACE -if [ -d "$OUTPUT" ]; then - mkdir "$OUTPUT" +# In Docker local execution, prof.csv is in current directory (run_directory) +# In K8s execution, prof.csv is also in current directory (/workspace) +echo "Current directory: $(pwd)" +echo "Looking for profiler output for tool: $tool..." + +# Check if the profiler already wrote to the final output file +# (This happens when OUTPUT_FILE env var is set in tools.json) +if [ -f "$OUTPUT" ]; then + echo "✓ Profiler output already exists: $OUTPUT" + chmod a+rw "${OUTPUT}" + echo "Profiler output saved to: $(pwd)/${OUTPUT}" + exit 0 fi -mv prof.csv "$OUTPUT" +# When multiple gpu_info tools are stacked together, they may create their outputs +# with different filenames. Look for the specific output file by checking common locations. + +# Check if any profiler output files exist +echo "Looking for any *_profiler_output.csv files..." +ls -la *_profiler_output.csv 2>/dev/null || echo "No *_profiler_output.csv files found" + +# When tools are stacked, one tool might have created its output file while another didn't +# This is expected behavior - don't fail the entire run +if [ ! -f "$OUTPUT" ]; then + echo "⚠️ Warning: $OUTPUT not found in $(pwd)" + echo "⚠️ This may be expected if multiple gpu_info tools are stacked together" + echo "⚠️ and only one ran successfully. Checking for any profiler outputs..." + + # Check if prof.csv exists (default output name) + if [ -f "prof.csv" ]; then + echo "Found prof.csv - renaming to $OUTPUT" + mv prof.csv "$OUTPUT" + chmod a+rw "${OUTPUT}" + echo "Profiler output saved to: $(pwd)/${OUTPUT}" + exit 0 + fi + + # List all CSV files for debugging + echo "Available CSV files in directory:" + ls -la *.csv 2>/dev/null || echo "No CSV files found" + + # Don't fail - just warn and exit successfully + # This allows other stacked tools to complete their post-scripts + echo "⚠️ Profiler output $OUTPUT not found - skipping (non-fatal)" + exit 0 +fi -chmod -R a+rw "${SAVESPACE}/${OUTPUT}" +# If we get here, OUTPUT exists but wasn't caught by the first check +chmod a+rw "${OUTPUT}" +echo "Profiler output saved to: $(pwd)/${OUTPUT}" diff --git a/src/madengine/scripts/common/post_scripts/gpu_info_power_stop.sh b/src/madengine/scripts/common/post_scripts/gpu_info_power_stop.sh new file mode 100755 index 00000000..051eb9a7 --- /dev/null +++ b/src/madengine/scripts/common/post_scripts/gpu_info_power_stop.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# +# Copyright (c) Advanced Micro Devices, Inc. +# All rights reserved. +# +# Stop gpu_info_power_profiler and collect output + +set -x + +echo "Stopping GPU power profiler..." + +PROFILER_PID_FILE="/tmp/gpu_info_power_profiler.pid" +PROFILER_START_FILE="/tmp/gpu_info_power_profiler.started" + +# Check if profiler was started +if [ ! -f "$PROFILER_START_FILE" ]; then + echo "⚠️ Warning: Power profiler was not started - skipping" + exit 0 +fi + +# Check if PID file exists +if [ ! -f "$PROFILER_PID_FILE" ]; then + echo "⚠️ Warning: Power profiler PID file not found - profiler may not be running" + exit 0 +fi + +# Read PID +PROFILER_PID=$(cat "$PROFILER_PID_FILE") + +# Check if process is still running +if ! kill -0 "$PROFILER_PID" 2>/dev/null; then + echo "⚠️ Warning: Power profiler process (PID: $PROFILER_PID) is not running" +else + echo "Sending termination signal to power profiler (PID: $PROFILER_PID)..." + + # Send SIGTERM to gracefully stop the profiler + kill -TERM "$PROFILER_PID" 2>/dev/null || true + + # Wait for profiler to finish writing output (max 10 seconds) + WAIT_COUNT=0 + while kill -0 "$PROFILER_PID" 2>/dev/null && [ $WAIT_COUNT -lt 20 ]; do + sleep 0.5 + WAIT_COUNT=$((WAIT_COUNT + 1)) + done + + # Force kill if still running + if kill -0 "$PROFILER_PID" 2>/dev/null; then + echo "⚠️ Profiler did not stop gracefully, force killing..." + kill -9 "$PROFILER_PID" 2>/dev/null || true + fi + + echo "✓ GPU power profiler stopped" +fi + +# Clean up temporary files +rm -f "$PROFILER_PID_FILE" "$PROFILER_START_FILE" + +echo "✓ Power profiler cleanup complete" + +# Show profiler log if it exists +if [ -f "/tmp/gpu_info_power_profiler.log" ]; then + echo "=== Power Profiler Log ===" + tail -20 /tmp/gpu_info_power_profiler.log || true + echo "==========================" +fi + diff --git a/src/madengine/scripts/common/post_scripts/gpu_info_vram_stop.sh b/src/madengine/scripts/common/post_scripts/gpu_info_vram_stop.sh new file mode 100755 index 00000000..221a283a --- /dev/null +++ b/src/madengine/scripts/common/post_scripts/gpu_info_vram_stop.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +# +# Copyright (c) Advanced Micro Devices, Inc. +# All rights reserved. +# +# Stop gpu_info_vram_profiler and collect output + +set -x + +echo "Stopping GPU VRAM profiler..." + +PROFILER_PID_FILE="/tmp/gpu_info_vram_profiler.pid" +PROFILER_START_FILE="/tmp/gpu_info_vram_profiler.started" + +# Check if profiler was started +if [ ! -f "$PROFILER_START_FILE" ]; then + echo "⚠️ Warning: VRAM profiler was not started - skipping" + exit 0 +fi + +# Check if PID file exists +if [ ! -f "$PROFILER_PID_FILE" ]; then + echo "⚠️ Warning: VRAM profiler PID file not found - profiler may not be running" + exit 0 +fi + +# Read PID +PROFILER_PID=$(cat "$PROFILER_PID_FILE") + +# Check if process is still running +if ! kill -0 "$PROFILER_PID" 2>/dev/null; then + echo "⚠️ Warning: VRAM profiler process (PID: $PROFILER_PID) is not running" +else + echo "Sending termination signal to VRAM profiler (PID: $PROFILER_PID)..." + + # Send SIGTERM to gracefully stop the profiler + kill -TERM "$PROFILER_PID" 2>/dev/null || true + + # Wait for profiler to finish writing output (max 10 seconds) + WAIT_COUNT=0 + while kill -0 "$PROFILER_PID" 2>/dev/null && [ $WAIT_COUNT -lt 20 ]; do + sleep 0.5 + WAIT_COUNT=$((WAIT_COUNT + 1)) + done + + # Force kill if still running + if kill -0 "$PROFILER_PID" 2>/dev/null; then + echo "⚠️ Profiler did not stop gracefully, force killing..." + kill -9 "$PROFILER_PID" 2>/dev/null || true + fi + + echo "✓ GPU VRAM profiler stopped" +fi + +# Clean up temporary files +rm -f "$PROFILER_PID_FILE" "$PROFILER_START_FILE" + +echo "✓ VRAM profiler cleanup complete" + +# Show profiler log if it exists +if [ -f "/tmp/gpu_info_vram_profiler.log" ]; then + echo "=== VRAM Profiler Log ===" + tail -20 /tmp/gpu_info_vram_profiler.log || true + echo "==========================" +fi + diff --git a/src/madengine/scripts/common/post_scripts/trace.sh b/src/madengine/scripts/common/post_scripts/trace.sh index 950e51cf..26484321 100644 --- a/src/madengine/scripts/common/post_scripts/trace.sh +++ b/src/madengine/scripts/common/post_scripts/trace.sh @@ -12,7 +12,7 @@ tool=$1 OUTPUT=${tool}_output SAVESPACE=/myworkspace/ -mkdir "$OUTPUT" +mkdir -p "$OUTPUT" case "$tool" in @@ -54,8 +54,65 @@ rpd) ;; rocprof) - mv results* "$OUTPUT" - cp -vLR --preserve=all "$OUTPUT" "$SAVESPACE" + # Handle both legacy rocprof (results*) and rocprofv3 (different output format) + echo "ROCprof post-script: Collecting profiling output..." + + # Check for legacy rocprof results files + if ls results* 1> /dev/null 2>&1; then + echo "Found rocprof results files" + mv results* "$OUTPUT" 2>/dev/null || true + else + echo "No rocprof results* files found (may be using rocprofv3)" + fi + + # Check for rocprofv3 output directories (UUID pattern like 1e4d92661463/) + # rocprofv3 creates directories with hex UUIDs containing .db files + found_rocprofv3_output=false + for dir in */; do + # Check if directory exists and contains .db files + if [ -d "$dir" ]; then + # Use proper glob expansion to check for any .db file + if compgen -G "${dir}*_results.db" > /dev/null; then + echo "Found rocprofv3 output directory: $dir" + mv "$dir" "$OUTPUT/" 2>/dev/null || true + found_rocprofv3_output=true + fi + fi + done + + # Also check for other rocprofv3 output patterns + if ls rocprofv3-* 1> /dev/null 2>&1; then + echo "Found rocprofv3-* files" + mv rocprofv3-* "$OUTPUT" 2>/dev/null || true + found_rocprofv3_output=true + fi + + if [ "$found_rocprofv3_output" = true ]; then + echo "Collected rocprofv3 profiling data" + fi + + # Check for CSV trace files in subdirectories (rocprof can create hostname subdirectories) + # Look for patterns like: hostname/pid_kernel_trace.csv, hostname/pid_hip_api_trace.csv, etc. + csv_found=false + for dir in */; do + if [ -d "$dir" ]; then + # Check for CSV files matching rocprof patterns + if compgen -G "${dir}*_trace.csv" > /dev/null || compgen -G "${dir}*_api_trace.csv" > /dev/null; then + echo "Found rocprof CSV files in directory: $dir" + # Copy CSV files to output directory, preserving subdirectory structure + mkdir -p "$OUTPUT/$dir" + cp -v "${dir}"*.csv "$OUTPUT/$dir/" 2>/dev/null || true + csv_found=true + fi + fi + done + + if [ "$csv_found" = true ]; then + echo "Collected rocprof CSV trace files from subdirectories" + fi + + # Copy output directory (even if empty - non-critical) + cp -vLR --preserve=all "$OUTPUT" "$SAVESPACE" || echo "Note: Output directory may be empty (profiling was passive)" ;; esac diff --git a/src/madengine/scripts/common/pre_scripts/gpu_info_power_start.sh b/src/madengine/scripts/common/pre_scripts/gpu_info_power_start.sh new file mode 100755 index 00000000..d28c5763 --- /dev/null +++ b/src/madengine/scripts/common/pre_scripts/gpu_info_power_start.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# +# Copyright (c) Advanced Micro Devices, Inc. +# All rights reserved. +# +# Start gpu_info_power_profiler in background mode + +set -x + +echo "Starting GPU power profiler in background..." + +# Get environment variables from tools.json (with POWER_ prefix to avoid conflicts) +DEVICE=${POWER_DEVICE:-"all"} +SAMPLING_RATE=${POWER_SAMPLING_RATE:-"0.1"} +MODE=${POWER_MODE:-"power"} +OUTPUT_FILE=${POWER_OUTPUT_FILE:-"gpu_info_power_profiler_output.csv"} +DUAL_GCD=${POWER_DUAL_GCD:-"false"} + +# Export environment variables for the profiler (without prefix for the profiler script) +export DEVICE +export SAMPLING_RATE +export MODE +export OUTPUT_FILE +export DUAL_GCD + +# Create a marker file to track profiler status +PROFILER_PID_FILE="/tmp/gpu_info_power_profiler.pid" +PROFILER_START_FILE="/tmp/gpu_info_power_profiler.started" + +# Start profiler in background using a wrapper approach +# The profiler will run "tail -f /dev/null" as a dummy command that runs forever +# We'll kill it in the post-script after the actual workload completes +echo "Launching power profiler..." + +# Determine the correct path to gpu_info_profiler.py based on environment +# K8s: scripts are in /workspace/scripts/ +# Local: scripts are in ../scripts/ relative to working directory +if [ -f "scripts/common/tools/gpu_info_profiler.py" ]; then + # K8s or working from root directory + PROFILER_SCRIPT="scripts/common/tools/gpu_info_profiler.py" +elif [ -f "../scripts/common/tools/gpu_info_profiler.py" ]; then + # Local execution from subdirectory + PROFILER_SCRIPT="../scripts/common/tools/gpu_info_profiler.py" +else + echo "Error: Cannot find gpu_info_profiler.py" + exit 1 +fi + +nohup python3 "$PROFILER_SCRIPT" tail -f /dev/null > /tmp/gpu_info_power_profiler.log 2>&1 & +PROFILER_PID=$! + +# Save PID for later termination +echo "$PROFILER_PID" > "$PROFILER_PID_FILE" +echo "✓ GPU power profiler started (PID: $PROFILER_PID)" + +# Give profiler time to initialize +sleep 2 + +# Touch start marker +touch "$PROFILER_START_FILE" + +echo "✓ GPU power profiler initialization complete" + diff --git a/src/madengine/scripts/common/pre_scripts/gpu_info_vram_start.sh b/src/madengine/scripts/common/pre_scripts/gpu_info_vram_start.sh new file mode 100755 index 00000000..2ae8e83d --- /dev/null +++ b/src/madengine/scripts/common/pre_scripts/gpu_info_vram_start.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# +# Copyright (c) Advanced Micro Devices, Inc. +# All rights reserved. +# +# Start gpu_info_vram_profiler in background mode + +set -x + +echo "Starting GPU VRAM profiler in background..." + +# Get environment variables from tools.json (with VRAM_ prefix to avoid conflicts) +DEVICE=${VRAM_DEVICE:-"all"} +SAMPLING_RATE=${VRAM_SAMPLING_RATE:-"0.1"} +MODE=${VRAM_MODE:-"vram"} +OUTPUT_FILE=${VRAM_OUTPUT_FILE:-"gpu_info_vram_profiler_output.csv"} +DUAL_GCD=${VRAM_DUAL_GCD:-"false"} + +# Export environment variables for the profiler (without prefix for the profiler script) +export DEVICE +export SAMPLING_RATE +export MODE +export OUTPUT_FILE +export DUAL_GCD + +# Create a marker file to track profiler status +PROFILER_PID_FILE="/tmp/gpu_info_vram_profiler.pid" +PROFILER_START_FILE="/tmp/gpu_info_vram_profiler.started" + +# Start profiler in background using a wrapper approach +# The profiler will run "tail -f /dev/null" as a dummy command that runs forever +# We'll kill it in the post-script after the actual workload completes +echo "Launching VRAM profiler..." + +# Determine the correct path to gpu_info_profiler.py based on environment +# K8s: scripts are in /workspace/scripts/ +# Local: scripts are in ../scripts/ relative to working directory +if [ -f "scripts/common/tools/gpu_info_profiler.py" ]; then + # K8s or working from root directory + PROFILER_SCRIPT="scripts/common/tools/gpu_info_profiler.py" +elif [ -f "../scripts/common/tools/gpu_info_profiler.py" ]; then + # Local execution from subdirectory + PROFILER_SCRIPT="../scripts/common/tools/gpu_info_profiler.py" +else + echo "Error: Cannot find gpu_info_profiler.py" + exit 1 +fi + +nohup python3 "$PROFILER_SCRIPT" tail -f /dev/null > /tmp/gpu_info_vram_profiler.log 2>&1 & +PROFILER_PID=$! + +# Save PID for later termination +echo "$PROFILER_PID" > "$PROFILER_PID_FILE" +echo "✓ GPU VRAM profiler started (PID: $PROFILER_PID)" + +# Give profiler time to initialize +sleep 2 + +# Touch start marker +touch "$PROFILER_START_FILE" + +echo "✓ GPU VRAM profiler initialization complete" + diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py index 66fb84ac..db504803 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py @@ -284,11 +284,23 @@ def dump_csv_output(self): fs.write(sys_config_info[j]) fs.write("\n") fs.close() - print ("OK: Dumped into {} file.".format(self.filename)) + print("\n" + "="*60) + print(f"✅ SUCCESS: System config data dumped to {self.filename}") + print("="*60 + "\n") def print_csv_output(self): - print ("Printing the sys config info env variables...") + print("\n" + "="*80) + print("📋 SYSTEM CONFIG INFO - ENVIRONMENT VARIABLES") + print("="*80) if self.sys_config_info_list: for j in range(len(self.sys_config_info_list)): line = self.sys_config_info_list[j] - print (line) + # Add some formatting for key-value pairs + if "|" in line and not line.startswith("Tag"): + key, value = line.split("|", 1) + print(f"🔹 {key:<30}: {value}") + else: + print(f"📌 {line}") + else: + print("❌ No system config information available") + print("="*80 + "\n") diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py index 8aca62d7..8fcaebec 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py @@ -63,6 +63,7 @@ def print_gpu_hardware_information(gpu_device_type): cmd = "nvidia-smi -L" else: print ("WARNING: Unknown GPU device detected") + cmd = "echo 'Unknown GPU device'" cmd_info = CommandInfo("GPU Information", [cmd]) return cmd_info diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/test_rocenv.sh b/src/madengine/scripts/common/pre_scripts/rocEnvTool/test_rocenv.sh new file mode 100644 index 00000000..a817001e --- /dev/null +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/test_rocenv.sh @@ -0,0 +1,234 @@ +#!/bin/bash +# Test script for rocenv_tool_v2.py +# Validates functionality on both TheRock and traditional ROCm systems + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo "==========================================" +echo "rocenv_tool_v2.py Test Suite" +echo "==========================================" +echo + +# Function to print test results +pass() { + echo -e "${GREEN}✓ PASS${NC}: $1" +} + +fail() { + echo -e "${RED}✗ FAIL${NC}: $1" + exit 1 +} + +info() { + echo -e "${YELLOW}ℹ INFO${NC}: $1" +} + +# Test 1: Check file exists +echo "Test 1: File existence" +if [ -f "rocenv_tool_v2.py" ]; then + pass "rocenv_tool_v2.py exists" +else + fail "rocenv_tool_v2.py not found" +fi + +# Test 2: Check dependencies +echo +echo "Test 2: Dependency checks" +if [ -f "console.py" ]; then + pass "console.py found" +else + fail "console.py not found" +fi + +if [ -f "csv_parser.py" ]; then + pass "csv_parser.py found" +else + fail "csv_parser.py not found" +fi + +# Test 3: Python syntax check +echo +echo "Test 3: Python syntax validation" +if python3 -m py_compile rocenv_tool_v2.py 2>/dev/null; then + pass "Python syntax valid" +else + fail "Python syntax errors detected" +fi + +# Test 4: Help text +echo +echo "Test 4: Command-line interface" +if python3 rocenv_tool_v2.py --help > /dev/null 2>&1; then + pass "Help text accessible" +else + fail "Help text failed" +fi + +# Test 5: Verbose mode detection +echo +echo "Test 5: Installation detection (verbose mode)" +info "Running detection..." +OUTPUT=$(python3 rocenv_tool_v2.py --verbose --output-name test_verbose 2>&1 || true) +echo "$OUTPUT" | head -20 +echo + +if echo "$OUTPUT" | grep -q "Installation Type:"; then + INSTALL_TYPE=$(echo "$OUTPUT" | grep "Installation Type:" | head -1) + pass "Detection completed: $INSTALL_TYPE" +else + fail "Detection failed to identify installation type" +fi + +# Test 6: Basic execution +echo +echo "Test 6: Basic execution (non-verbose)" +if python3 rocenv_tool_v2.py --output-name test_basic > /dev/null 2>&1; then + pass "Basic execution successful" +else + fail "Basic execution failed" +fi + +# Test 7: Output directory creation +echo +echo "Test 7: Output directory validation" +if [ -d ".test_basic" ]; then + pass "Output directory created" + + # Count subdirectories + NUM_SECTIONS=$(find .test_basic -mindepth 1 -maxdepth 1 -type d | wc -l) + info "Generated $NUM_SECTIONS information sections" + + if [ "$NUM_SECTIONS" -gt 5 ]; then + pass "Sufficient sections generated ($NUM_SECTIONS)" + else + fail "Too few sections generated ($NUM_SECTIONS)" + fi +else + fail "Output directory not created" +fi + +# Test 8: Check key sections +echo +echo "Test 8: Key section validation" +REQUIRED_SECTIONS=("os_information" "cpu_information" "gpu_information") +for section in "${REQUIRED_SECTIONS[@]}"; do + if [ -d ".test_basic/$section" ]; then + if [ -f ".test_basic/$section/$section.txt" ]; then + pass "Section '$section' generated" + else + fail "Section '$section' file missing" + fi + else + info "Section '$section' not generated (may be optional)" + fi +done + +# Test 9: ROCm-specific sections +echo +echo "Test 9: ROCm-specific sections" +if [ -d ".test_basic/rocm_information" ]; then + pass "ROCm information section generated" + + # Check content + if [ -f ".test_basic/rocm_information/rocm_information.txt" ]; then + CONTENT=$(cat .test_basic/rocm_information/rocm_information.txt) + + if echo "$CONTENT" | grep -q "Installation Type:"; then + DETECTED_TYPE=$(echo "$CONTENT" | grep "Installation Type:" | head -1) + pass "ROCm installation type detected: $DETECTED_TYPE" + fi + + if echo "$CONTENT" | grep -q "ROCm Root:"; then + DETECTED_ROOT=$(echo "$CONTENT" | grep "ROCm Root:" | head -1) + pass "ROCm root identified: $DETECTED_ROOT" + fi + fi +else + info "ROCm information not generated (GPU may not be AMD)" +fi + +# Test 10: CSV generation +echo +echo "Test 10: CSV generation" +if python3 rocenv_tool_v2.py --output-name test_csv --dump-csv > /dev/null 2>&1; then + if [ -f "test_csv.csv" ]; then + pass "CSV file generated" + + LINE_COUNT=$(wc -l < test_csv.csv) + info "CSV contains $LINE_COUNT lines" + + if [ "$LINE_COUNT" -gt 10 ]; then + pass "CSV contains data" + fi + else + fail "CSV file not created" + fi +else + fail "CSV generation failed" +fi + +# Test 11: Lite mode +echo +echo "Test 11: Lite mode" +if [ -f "env_tags.json" ]; then + if python3 rocenv_tool_v2.py --lite --output-name test_lite > /dev/null 2>&1; then + pass "Lite mode execution successful" + else + fail "Lite mode execution failed" + fi +else + info "env_tags.json not found, skipping lite mode test" +fi + +# Test 12: Error handling (invalid path) +echo +echo "Test 12: Error handling" +# This should not crash even with missing tools +if timeout 30 python3 rocenv_tool_v2.py --output-name test_robust > /dev/null 2>&1; then + pass "Robust error handling (script completed)" +else + EXITCODE=$? + if [ $EXITCODE -eq 124 ]; then + fail "Script timed out (possible hang)" + else + fail "Script crashed unexpectedly" + fi +fi + +# Cleanup +echo +echo "==========================================" +echo "Cleanup" +echo "==========================================" +echo "Removing test output directories..." +rm -rf .test_basic .test_verbose .test_csv .test_lite .test_robust +rm -f test_csv.csv + +echo +echo "==========================================" +echo "Test Summary" +echo "==========================================" +echo -e "${GREEN}All tests passed!${NC}" +echo +echo "Next steps:" +echo "1. Review the implementation in rocenv_tool_v2.py" +echo "2. Test on a TheRock container:" +echo " docker run -it python3 rocenv_tool_v2.py --verbose" +echo "3. Test on a traditional ROCm system:" +echo " python3 rocenv_tool_v2.py --verbose" +echo "4. Compare outputs with original rocenv_tool.py" +echo +echo "Documentation:" +echo "- README_v2.md - Usage guide" +echo "- THEROCK_COMPATIBILITY.md - Compatibility details" +echo "- IMPLEMENTATION_SUMMARY.md - Implementation overview" +echo + diff --git a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh index 7e3a7e2b..84879d05 100644 --- a/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh +++ b/src/madengine/scripts/common/pre_scripts/run_rocenv_tool.sh @@ -5,11 +5,32 @@ # OUTPUT_FILE_NAME=${1:-"sys_config_info"} -cp -r ../scripts/common/pre_scripts/rocEnvTool . -cd rocEnvTool -python3 rocenv_tool.py --lite --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME -out_dir="."$OUTPUT_FILE_NAME -out_csv=$OUTPUT_FILE_NAME".csv" -cp -r $out_dir ../../ -cp $out_csv ../../ -cd .. + +# Determine the script's directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Check if rocEnvTool is in the same directory (K8s execution) +if [ -d "$SCRIPT_DIR/rocEnvTool" ]; then + # K8s execution: rocEnvTool is already in place + cd "$SCRIPT_DIR/rocEnvTool" + python3 rocenv_tool.py --lite --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME + out_dir="."$OUTPUT_FILE_NAME + out_csv=$OUTPUT_FILE_NAME".csv" + # Copy results back to workspace root + if [ -d "$out_dir" ]; then + cp -r "$out_dir" /workspace/ + fi + if [ -f "$out_csv" ]; then + cp "$out_csv" /workspace/ + fi +else + # Local execution: copy rocEnvTool from relative path + cp -r ../scripts/common/pre_scripts/rocEnvTool . + cd rocEnvTool + python3 rocenv_tool.py --lite --dump-csv --print-csv --output-name $OUTPUT_FILE_NAME + out_dir="."$OUTPUT_FILE_NAME + out_csv=$OUTPUT_FILE_NAME".csv" + cp -r $out_dir ../../ + cp $out_csv ../../ + cd .. +fi diff --git a/src/madengine/scripts/common/tools.json b/src/madengine/scripts/common/tools.json index 0b6a4907..c0792ab1 100644 --- a/src/madengine/scripts/common/tools.json +++ b/src/madengine/scripts/common/tools.json @@ -20,7 +20,133 @@ }, "rocprof": { "pre_scripts": [], - "cmd": "rocprof", + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --runtime-trace --", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprof_hip_only": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprof_sys": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --sys-trace --", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprofv3": { + "pre_scripts": [], + "cmd": "rocprofv3 --runtime-trace --", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprofv3_compute": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --kernel-trace -i ../scripts/common/tools/counters/compute_bound.txt --output-format pftrace -d ./rocprof_output --", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprofv3_memory": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --kernel-trace --memory-copy-trace --memory-allocation-trace -i ../scripts/common/tools/counters/memory_bound.txt --output-format pftrace -d ./rocprof_output --", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprofv3_communication": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --hsa-trace --kernel-trace --memory-copy-trace --rccl-trace --scratch-memory-trace --output-format pftrace -d ./rocprof_output --", + "env_vars": { + "RCCL_DEBUG": "INFO" + }, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprofv3_full": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --hsa-trace --kernel-trace --memory-copy-trace --memory-allocation-trace --rccl-trace --scratch-memory-trace --marker-trace --runtime-trace -i ../scripts/common/tools/counters/full_profile.txt --output-format pftrace --stats -d ./rocprof_output --", + "env_vars": { + "RCCL_DEBUG": "INFO", + "HSA_ENABLE_SDMA": "0" + }, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprofv3_lightweight": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --kernel-trace --output-format json -d ./rocprof_output --", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprofv3_perfetto": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --kernel-trace --memory-copy-trace --output-format pftrace --perfetto-trace-filename model_trace.pftrace -d ./rocprof_output --", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprofv3_api_overhead": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --hsa-trace --marker-trace --runtime-trace --stats --output-format json -d ./rocprof_output --", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocprofv3_pc_sampling": { + "pre_scripts": [], + "cmd": "bash ../scripts/common/tools/rocprof_wrapper.sh --hip-trace --kernel-trace --pc-sampling-beta-enabled --pc-sampling-unit time --pc-sampling-method stochastic --pc-sampling-interval 1000 --output-format pftrace -d ./rocprof_output --", "env_vars": {}, "post_scripts": [ { @@ -85,11 +211,17 @@ "pre_scripts": [ { "path": "scripts/common/pre_scripts/gpu_info_pre.sh" + }, + { + "path": "scripts/common/pre_scripts/gpu_info_power_start.sh" } ], - "cmd": "python3 ../scripts/common/tools/gpu_info_profiler.py", - "env_vars": {"DEVICE":"all", "SAMPLING_RATE":"0.1", "MODE":"power", "DUAL-GCD":"false"}, + "cmd": "", + "env_vars": {"POWER_DEVICE":"all", "POWER_SAMPLING_RATE":"0.1", "POWER_MODE":"power", "POWER_DUAL_GCD":"false", "POWER_OUTPUT_FILE":"gpu_info_power_profiler_output.csv"}, "post_scripts": [ + { + "path": "scripts/common/post_scripts/gpu_info_power_stop.sh" + }, { "path": "scripts/common/post_scripts/gpu_info_post.sh", "args": "gpu_info_power_profiler" @@ -100,16 +232,32 @@ "pre_scripts": [ { "path": "scripts/common/pre_scripts/gpu_info_pre.sh" + }, + { + "path": "scripts/common/pre_scripts/gpu_info_vram_start.sh" } ], - "cmd": "python3 ../scripts/common/tools/gpu_info_profiler.py", - "env_vars": {"DEVICE":"all", "SAMPLING_RATE":"0.1", "MODE":"vram", "DUAL-GCD":"false"}, + "cmd": "", + "env_vars": {"VRAM_DEVICE":"all", "VRAM_SAMPLING_RATE":"0.1", "VRAM_MODE":"vram", "VRAM_DUAL_GCD":"false", "VRAM_OUTPUT_FILE":"gpu_info_vram_profiler_output.csv"}, "post_scripts": [ + { + "path": "scripts/common/post_scripts/gpu_info_vram_stop.sh" + }, { "path": "scripts/common/post_scripts/gpu_info_post.sh", "args": "gpu_info_vram_profiler" } ] + }, + "therock_check": { + "pre_scripts": [ + { + "path": "scripts/common/tools/detect_therock.sh" + } + ], + "cmd": "", + "env_vars": {}, + "post_scripts": [] } } } diff --git a/src/madengine/scripts/common/tools/counters/communication_bound.txt b/src/madengine/scripts/common/tools/counters/communication_bound.txt new file mode 100644 index 00000000..2f6c0228 --- /dev/null +++ b/src/madengine/scripts/common/tools/counters/communication_bound.txt @@ -0,0 +1,18 @@ +# Communication-bound profiling counters +# For multi-GPU distributed training + +# PCIe traffic +pmc: TCC_EA_WRREQ_64B_sum +pmc: TCC_EA_RDREQ_32B_sum + +# Atomic operations (used in synchronization) +pmc: TCC_ATOMIC_sum +pmc: TCP_TCC_ATOMIC_REQ_sum + +# Wave synchronization +pmc: SQ_WAIT_INST_ANY +pmc: SQ_WAVES + +# Memory transfers +pmc: TCP_TCC_READ_REQ_sum +pmc: TCP_TCC_WRITE_REQ_sum diff --git a/src/madengine/scripts/common/tools/counters/compute_bound.txt b/src/madengine/scripts/common/tools/counters/compute_bound.txt new file mode 100644 index 00000000..9c67aa4b --- /dev/null +++ b/src/madengine/scripts/common/tools/counters/compute_bound.txt @@ -0,0 +1,25 @@ +# Compute-bound profiling counters +# For models bottlenecked by ALU operations + +# Wave execution +pmc: SQ_WAVES +pmc: SQ_WAVE_CYCLES + +# VALU instructions (vector ALU) +pmc: SQ_INSTS_VALU +pmc: SQ_ACTIVE_INST_VALU +pmc: SQ_INSTS_VALU_ADD_F32 +pmc: SQ_INSTS_VALU_MUL_F32 +pmc: SQ_INSTS_VALU_FMA_F32 +pmc: SQ_INSTS_VALU_TRANS_F32 + +# SALU instructions (scalar ALU) +pmc: SQ_INSTS_SALU + +# Memory instructions +pmc: SQ_INSTS_VMEM_RD +pmc: SQ_INSTS_VMEM_WR + +# Wait states +pmc: SQ_WAIT_INST_ANY +# Note: SQ_WAIT_INST_VALU not available on gfx942 (MI300X) diff --git a/src/madengine/scripts/common/tools/counters/full_profile.txt b/src/madengine/scripts/common/tools/counters/full_profile.txt new file mode 100644 index 00000000..cdc7d768 --- /dev/null +++ b/src/madengine/scripts/common/tools/counters/full_profile.txt @@ -0,0 +1,27 @@ +# Comprehensive profiling counters +# Collect all major metrics + +# Execution +pmc: SQ_WAVES +pmc: SQ_WAVE_CYCLES +pmc: SQ_INSTS_VALU +pmc: SQ_INSTS_SALU +pmc: SQ_ACTIVE_INST_VALU + +# Memory +pmc: L2CacheHit +pmc: TCC_HIT_sum +pmc: TCC_MISS_sum +pmc: TCP_TCC_READ_REQ_sum +pmc: TCP_TCC_WRITE_REQ_sum + +# Cache efficiency +pmc: TCC_EA_RDREQ_32B_sum +pmc: TCC_EA_WRREQ_64B_sum + +# Waits +pmc: SQ_WAIT_INST_ANY +pmc: SQ_WAIT_INST_VALU + +# LDS +pmc: SQ_LDS_BANK_CONFLICT diff --git a/src/madengine/scripts/common/tools/counters/memory_bound.txt b/src/madengine/scripts/common/tools/counters/memory_bound.txt new file mode 100644 index 00000000..059b87bc --- /dev/null +++ b/src/madengine/scripts/common/tools/counters/memory_bound.txt @@ -0,0 +1,28 @@ +# Memory-bound profiling counters +# For models bottlenecked by memory bandwidth + +# L2 Cache metrics +pmc: L2CacheHit +pmc: L2CacheMiss +pmc: L2CacheHitRate + +# TCP (L1 cache) to TCC (L2 cache) traffic +pmc: TCP_TCC_READ_REQ_sum +pmc: TCP_TCC_WRITE_REQ_sum +pmc: TCP_TCC_ATOMIC_REQ_sum + +# TCC (L2) hits and misses +pmc: TCC_HIT_sum +pmc: TCC_MISS_sum +pmc: TCC_EA_RDREQ_32B_sum +pmc: TCC_EA_WRREQ_64B_sum + +# Memory controller traffic +pmc: TCC_EA_RDREQ_LEVEL_sum +pmc: TCC_EA_WRREQ_LEVEL_sum + +# Scalar cache +pmc: SQ_INSTS_SMEM + +# LDS (Local Data Share) usage +pmc: SQ_LDS_BANK_CONFLICT diff --git a/src/madengine/scripts/common/tools/detect_therock.sh b/src/madengine/scripts/common/tools/detect_therock.sh new file mode 100755 index 00000000..2e04d2d1 --- /dev/null +++ b/src/madengine/scripts/common/tools/detect_therock.sh @@ -0,0 +1,176 @@ +#!/bin/sh +# +# Quick TheRock ROCm Detection Script +# +# This script checks if TheRock is installed on the system. +# TheRock does NOT use apt - it uses Python pip or tarballs. +# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +FOUND=0 + +echo "==================================================" +echo "TheRock ROCm Distribution Detection" +echo "==================================================" +echo "" + +# Function to check if path contains TheRock installation +check_therock_path() { + path="$1" + label="$2" + + if [ ! -d "$path" ]; then + return 1 + fi + + manifest="$path/share/therock/therock_manifest.json" + dist_info="$path/share/therock/dist_info.json" + + if [ -f "$manifest" ]; then + printf "${GREEN}✓ Found TheRock installation${NC}\n" + echo " Type: $label" + echo " Path: $path" + + if [ -f "$dist_info" ]; then + targets=$(grep -oP '(?<="dist_amdgpu_targets": ")[^"]*' "$dist_info" 2>/dev/null || echo "unknown") + echo " GPU Targets: $targets" + fi + + if command -v jq > /dev/null 2>&1; then + commit=$(jq -r '.the_rock_commit' "$manifest" 2>/dev/null || echo "unknown") + echo " Commit: $commit" + fi + + echo "" + FOUND=$((FOUND + 1)) + return 0 + fi + + return 1 +} + +# Check 1: rocm-sdk command (Python installation) +printf "${BLUE}[1] Checking for rocm-sdk command...${NC}\n" +if command -v rocm-sdk > /dev/null 2>&1; then + printf "${GREEN}✓ Found rocm-sdk command${NC}\n" + + # Get version + version=$(rocm-sdk version 2>/dev/null || echo "unknown") + echo " Version: $version" + + # Get root path + if root_path=$(rocm-sdk path --root 2>/dev/null); then + echo " Root: $root_path" + check_therock_path "$root_path" "Python Package" + fi +else + echo " ✗ rocm-sdk command not found" +fi +echo "" + +# Check 2: Python site-packages +printf "${BLUE}[2] Checking Python site-packages...${NC}\n" +if python3 -c "import rocm_sdk" 2>/dev/null; then + version=$(python3 -c "import rocm_sdk; print(rocm_sdk.__version__)" 2>/dev/null || echo "unknown") + printf "${GREEN}✓ Found rocm_sdk Python package${NC}\n" + echo " Version: $version" + + # Try to find the package path + pkg_path=$(python3 -c " +import importlib.util +import pathlib +spec = importlib.util.find_spec('_rocm_sdk_core') +if spec and spec.origin: + print(pathlib.Path(spec.origin).parent) +" 2>/dev/null || echo "") + + if [ -n "$pkg_path" ]; then + check_therock_path "$pkg_path" "Python Package" + fi +else + echo " ✗ rocm_sdk Python package not found" +fi +echo "" + +# Check 3: Common installation paths +printf "${BLUE}[3] Checking common installation paths...${NC}\n" +for path in "$HOME/rocm" "$HOME/therock" "/opt/rocm" "/usr/local/rocm" "$HOME/.local/rocm"; do + if check_therock_path "$path" "Tarball Installation"; then + : # Found, already printed + fi +done + +# Check 4: Environment variables +printf "${BLUE}[4] Checking environment variables...${NC}\n" +env_found=0 +for var in ROCM_PATH ROCM_HOME HIP_PATH; do + eval "var_value=\$$var" + if [ -n "$var_value" ]; then + echo " Checking \$$var = $var_value" + if check_therock_path "$var_value" "Environment Variable (\$$var)"; then + env_found=1 + fi + fi +done + +if [ $env_found -eq 0 ]; then + echo " ✗ No TheRock installations found via environment variables" +fi +echo "" + +# Check 5: Local build directory +printf "${BLUE}[5] Checking for local build directory...${NC}\n" +if [ -f "version.json" ] && [ -f "CMakeLists.txt" ]; then + if grep -q "rocm-version" version.json 2>/dev/null; then + printf "${YELLOW}✓ Found TheRock source directory${NC}\n" + echo " Path: $(pwd)" + + if [ -d "build/dist" ]; then + for dist_dir in build/dist/*; do + if [ -d "$dist_dir" ]; then + check_therock_path "$dist_dir" "Local Build" + fi + done + else + echo " (No build/dist directory found - not yet built)" + fi + fi +else + echo " ✗ Not in a TheRock source directory" +fi +echo "" + +# Summary +echo "==================================================" +echo "Summary" +echo "==================================================" + +if [ $FOUND -gt 0 ]; then + printf "${GREEN}Found $FOUND TheRock installation(s)${NC}\n" + echo "" + echo "TheRock is installed on this system!" + exit 0 +else + printf "${RED}No TheRock installations detected${NC}\n" + echo "" + echo "TheRock does NOT use apt/system packages." + echo "It installs via:" + echo " 1. Python pip (recommended)" + echo " 2. Standalone tarballs" + echo " 3. Build from source" + echo "" + echo "To install TheRock:" + echo " pip install --index-url https://rocm.nightlies.amd.com/v2/gfx110X-all/ 'rocm[libraries,devel]'" + echo "" + echo "More info: https://github.com/ROCm/TheRock/blob/main/RELEASES.md" + exit 1 +fi + diff --git a/src/madengine/scripts/common/tools/get_library_trace.py b/src/madengine/scripts/common/tools/get_library_trace.py index 63df8a28..d011e643 100644 --- a/src/madengine/scripts/common/tools/get_library_trace.py +++ b/src/madengine/scripts/common/tools/get_library_trace.py @@ -241,8 +241,12 @@ def write( m_match = process_miopen_trace(data.splitlines()) matched |= m_match - if self.stdio and (self.printConfigs or (not matched)): - self.stdio.write(data) + if self.stdio: + # Always print non-matching lines (like performance output) + # Only suppress matching trace lines if printConfigs is False + if self.printConfigs or (not matched): + self.stdio.write(data) + self.stdio.flush() # Ensure output is immediately available, not buffered # else: #debug # self.stdio.write( "$(%s,%s,%s) " % (r_match, t_match, m_match) + data ) @@ -271,11 +275,25 @@ def run_command( modified_env = os.environ.copy() modified_env.update(request_env) - with redirect_stdout(outlog), redirect_stderr(outlog): - process = subprocess.Popen(commandstring, shell=True, env=modified_env, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = process.communicate() - outlog.write(stdout.decode()) - outlog.write(stderr.decode()) + # Run subprocess with STDOUT (not PIPE) so output goes directly to our stdout + # This avoids buffering issues with nested processes + process = subprocess.Popen( + commandstring, + shell=True, + env=modified_env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, # Merge stderr into stdout + universal_newlines=True, + bufsize=1 # Line buffered + ) + + # Stream output line by line + for line in process.stdout: + outlog.write(line) + outlog.flush() + + # Wait for process to complete + process.wait() def main(): @@ -289,7 +307,7 @@ def main(): # WORKAROUND: This command does not stack # calling multiple get_library_trace calls in a chain is equivalent to calling it once - commandstring = re.sub("([~ ]|^).*get_library_trace ", "", commandstring) + commandstring = re.sub("([~ ]|^).*get_library_trace\\.py ", "", commandstring) request_env = {} if "rocblas_trace" in mode: @@ -318,7 +336,7 @@ def main(): date = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # Write the library trace information to the CSV file - filename = "/myworkspace/library_trace.csv" + filename = os.environ.get("OUTPUT_FILE", "library_trace.csv") fields = ["jobid", "created_date", "model", "library", "config", "calls"] with open(filename, "w") as csvfile: csvwriter = csv.writer(csvfile) diff --git a/src/madengine/scripts/common/tools/gpu_info_profiler.py b/src/madengine/scripts/common/tools/gpu_info_profiler.py index d249ff0a..111f655d 100644 --- a/src/madengine/scripts/common/tools/gpu_info_profiler.py +++ b/src/madengine/scripts/common/tools/gpu_info_profiler.py @@ -16,6 +16,7 @@ import os import logging import typing +import signal from typing import Optional, List, Dict, Any @@ -425,7 +426,7 @@ def main() -> None: MODE: "power" or "vram" DEVICE: Comma-separated device IDs or "all" SAMPLING_RATE: Sampling interval in seconds - DUAL-GCD: "true" to enable dual GCD mode (AMD-specific) + DUAL_GCD: "true" to enable dual GCD mode (AMD-specific) Raises: ValueError: If MODE is invalid or required env vars are missing. @@ -445,7 +446,7 @@ def main() -> None: mode = os.environ.get("MODE") device = os.environ.get("DEVICE") sampling_rate_str = os.environ.get("SAMPLING_RATE") - dual_gcd = os.environ.get("DUAL-GCD", "false") + dual_gcd = os.environ.get("DUAL_GCD", "false") # Validate environment variables if not mode: @@ -520,18 +521,45 @@ def main() -> None: profiler=profiler ) + # Global flag for signal handling + shutdown_requested = threading.Event() + + def signal_handler(signum, frame): + """Handle SIGTERM/SIGINT to gracefully shutdown.""" + logging.info(f"Received signal {signum}, initiating graceful shutdown...") + shutdown_requested.set() + # Stop the profiler event to signal threads to stop + event.clear() + + # Register signal handlers + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGINT, signal_handler) + # Execute profiling workload_thread.start() profiler_thread.start() - workload_thread.join() - profiler_thread.join() - + + # Wait for either workload completion or shutdown signal + workload_thread.join(timeout=1) + while workload_thread.is_alive() and not shutdown_requested.is_set(): + time.sleep(0.1) + + # If shutdown was requested, clear event to stop profiler + if shutdown_requested.is_set(): + event.clear() + logging.info("Shutdown requested, stopping profiler thread...") + + # Wait for profiler thread to finish + profiler_thread.join(timeout=5) + # Write results to CSV - output_file = os.environ.get("OUTPUT_FILE", "/myworkspace/prof.csv") + output_file = os.environ.get("OUTPUT_FILE", "prof.csv") if not profiler_thread.data: - logging.error("No profiling data collected") - sys.exit(1) + logging.warning("No profiling data collected") + # Don't exit with error if we got a shutdown signal - this is expected + if not shutdown_requested.is_set(): + sys.exit(1) else: try: with open(output_file, "w", newline='') as csvfile: diff --git a/src/madengine/scripts/common/tools/rocprof_wrapper.sh b/src/madengine/scripts/common/tools/rocprof_wrapper.sh new file mode 100755 index 00000000..e4fca783 --- /dev/null +++ b/src/madengine/scripts/common/tools/rocprof_wrapper.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# +# Copyright (c) Advanced Micro Devices, Inc. +# All rights reserved. +# +# ROCm Profiler Wrapper - Intelligently select between rocprof (legacy) and rocprofv3 (new) +# +# This wrapper handles the transition from rocprof to rocprofv3 across ROCm versions. +# It automatically detects the available profiler and uses the appropriate one. +# +# ROCm Version Support: +# - ROCm < 7.0: Uses rocprof (legacy) +# - ROCm >= 7.0: Prefers rocprofv3, falls back to rocprof if not available +# +# Usage: +# bash rocprof_wrapper.sh [profiler-options] -- [app-args] +# +# Important: +# - Always include the '--' separator between profiler options and the application command +# - This is required for rocprofv3 (ROCm >= 7.0) to correctly parse arguments +# - The separator works with both rocprof (legacy) and rocprofv3 for compatibility +# +# Examples: +# # Basic HIP trace +# bash rocprof_wrapper.sh --hip-trace -- python my_model.py +# +# # System trace with custom options +# bash rocprof_wrapper.sh --sys-trace --stats -- ./my_app --batch-size 32 +# +# # Counter collection with output directory +# bash rocprof_wrapper.sh --counter-collection -i counters.txt -d ./output -- python train.py +# + +# Function to detect ROCm version +get_rocm_version() { + # Try multiple methods to detect ROCm version + local version="" + + # Method 1: Check rocm-smi output + if command -v rocm-smi &> /dev/null; then + version=$(rocm-smi --version 2>/dev/null | grep -oP 'ROCm version: \K[0-9]+\.[0-9]+\.[0-9]+' | head -1) + fi + + # Method 2: Check /opt/rocm/.info/version file + if [ -z "$version" ] && [ -f /opt/rocm/.info/version ]; then + version=$(cat /opt/rocm/.info/version) + fi + + # Method 3: Check ROCM_PATH or default ROCm installation + if [ -z "$version" ]; then + local rocm_path="${ROCM_PATH:-/opt/rocm}" + if [ -f "$rocm_path/.info/version" ]; then + version=$(cat "$rocm_path/.info/version") + fi + fi + + echo "$version" +} + +# Function to compare version strings (returns 0 if v1 >= v2) +version_gte() { + # Convert version strings to comparable numbers + local v1=$(echo "$1" | awk -F. '{ printf("%d%03d%03d\n", $1,$2,$3); }') + local v2=$(echo "$2" | awk -F. '{ printf("%d%03d%03d\n", $1,$2,$3); }') + [ "$v1" -ge "$v2" ] +} + +# Function to detect available profiler +detect_profiler() { + local rocm_version=$(get_rocm_version) + + # Check if rocprofv3 is available + if command -v rocprofv3 &> /dev/null; then + echo "rocprofv3" + return 0 + fi + + # Check if rocprof (legacy) is available + if command -v rocprof &> /dev/null; then + # For ROCm >= 7.0, warn that rocprofv3 should be available + if [ -n "$rocm_version" ] && version_gte "$rocm_version" "7.0.0"; then + echo "Warning: ROCm $rocm_version detected but rocprofv3 not found, using legacy rocprof" >&2 + fi + echo "rocprof" + return 0 + fi + + # No profiler found + echo "Error: Neither rocprofv3 nor rocprof found in PATH" >&2 + echo "Please ensure ROCm profiler tools are installed" >&2 + return 1 +} + +# Main execution +main() { + local profiler=$(detect_profiler) + local exit_code=$? + + if [ $exit_code -ne 0 ]; then + return 1 + fi + + # Execute the detected profiler with all passed arguments + if [ "$profiler" = "rocprof" ]; then + # Legacy rocprof syntax: rocprof [options] [args] + # All arguments can be passed directly + # Filter deprecation warnings while preserving stdout and exit code + { rocprof "$@" 2>&1 1>&3 | grep -v "WARNING: We are phasing out" | grep -v "roctracer/rocprofiler" | grep -v "rocprofv2 in favor" >&2; } 3>&1 + return ${PIPESTATUS[0]} + else + # New rocprofv3 syntax: rocprofv3 [options] -- [args] + # Need to separate profiler options from application command + local profiler_opts=() + local app_cmd=() + local found_separator=false + + for arg in "$@"; do + if [ "$arg" = "--" ]; then + # Found the separator, everything after this is the application command + found_separator=true + continue + fi + + if [ "$found_separator" = true ]; then + app_cmd+=("$arg") + else + profiler_opts+=("$arg") + fi + done + + # Build command with proper argument placement + if [ "${#profiler_opts[@]}" -gt 0 ]; then + # Has profiler options: rocprofv3 -- + rocprofv3 "${profiler_opts[@]}" -- "${app_cmd[@]}" + else + # No profiler options: rocprofv3 -- + rocprofv3 -- "${app_cmd[@]}" + fi + return $? + fi +} + +# Run main function +main "$@" + diff --git a/src/madengine/scripts/common/tools/therock_detector.py b/src/madengine/scripts/common/tools/therock_detector.py new file mode 100755 index 00000000..441a0204 --- /dev/null +++ b/src/madengine/scripts/common/tools/therock_detector.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python3 +""" +TheRock ROCm Distribution Detection Script + +This script detects if TheRock (The HIP Environment and ROCm Kit) is installed +on the system. TheRock uses Python pip packages or standalone tarballs instead +of traditional apt/system package managers. + +Detection methods: +1. Python package installation (via pip in venvs or site-packages) +2. Tarball installation (custom directories) +3. Local build directories +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +from pathlib import Path +from typing import Dict, List, Optional + + +class TherockDetector: + """Detects TheRock ROCm installations on the system.""" + + def __init__(self, verbose: bool = False): + self.verbose = verbose + self.installations: List[Dict] = [] + + def log(self, message: str): + """Print verbose log messages.""" + if self.verbose: + print(f"[DEBUG] {message}") + + def detect_all(self) -> List[Dict]: + """Run all detection methods and return list of found installations.""" + self.log("Starting TheRock detection...") + + # Method 1: Check for rocm-sdk command in PATH + self._detect_rocm_sdk_command() + + # Method 2: Check Python site-packages + self._detect_python_packages() + + # Method 3: Check common installation paths + self._detect_tarball_installations() + + # Method 4: Check environment variables + self._detect_from_env_vars() + + # Method 5: Check for local build directories + self._detect_build_directories() + + return self.installations + + def _add_installation(self, install_type: str, path: Path, details: Dict): + """Add a detected installation to the list.""" + installation = { + "type": install_type, + "path": str(path.resolve()), + "details": details, + } + + # Avoid duplicates + if not any(inst["path"] == installation["path"] for inst in self.installations): + self.installations.append(installation) + self.log(f"Found {install_type} installation at: {path}") + + def _is_therock_installation(self, path: Path) -> Optional[Dict]: + """ + Check if a path contains TheRock installation markers. + + Returns dict with installation details if TheRock is detected, None otherwise. + """ + if not path.exists(): + return None + + details = {} + + # Marker 1: therock_manifest.json + manifest_path = path / "share" / "therock" / "therock_manifest.json" + if manifest_path.exists(): + self.log(f"Found therock_manifest.json at {manifest_path}") + try: + with open(manifest_path, "r") as f: + manifest = json.load(f) + details["manifest"] = { + "commit": manifest.get("the_rock_commit", "unknown"), + "submodules_count": len(manifest.get("submodules", [])), + } + except Exception as e: + self.log(f"Error reading manifest: {e}") + + # Marker 2: dist_info.json + dist_info_path = path / "share" / "therock" / "dist_info.json" + if dist_info_path.exists(): + self.log(f"Found dist_info.json at {dist_info_path}") + try: + with open(dist_info_path, "r") as f: + dist_info = json.load(f) + details["dist_info"] = { + "amdgpu_targets": dist_info.get("dist_amdgpu_targets", "unknown"), + } + except Exception as e: + self.log(f"Error reading dist_info: {e}") + + # Marker 3: Unique directory structure (lib/llvm symlink) + llvm_symlink = path / "llvm" + if llvm_symlink.exists() and llvm_symlink.is_symlink(): + target = os.readlink(llvm_symlink) + if target == "lib/llvm": + self.log(f"Found TheRock-specific llvm symlink at {llvm_symlink}") + details["llvm_symlink"] = True + + # Marker 4: Check for TheRock-specific binaries + bin_dir = path / "bin" + if bin_dir.exists(): + therock_binaries = [] + for binary in ["amdclang", "amdclang++", "amdflang", "hipcc"]: + if (bin_dir / binary).exists(): + therock_binaries.append(binary) + if therock_binaries: + details["binaries"] = therock_binaries + + # If we found any TheRock markers, return details + if details: + return details + + return None + + def _detect_rocm_sdk_command(self): + """Detect rocm-sdk command in PATH (indicates pip installation).""" + self.log("Checking for rocm-sdk command...") + + rocm_sdk_path = shutil.which("rocm-sdk") + if rocm_sdk_path: + self.log(f"Found rocm-sdk at: {rocm_sdk_path}") + + # Try to get installation details + details = {"command_path": rocm_sdk_path} + + # Get version + try: + result = subprocess.run( + ["rocm-sdk", "version"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0: + details["version"] = result.stdout.strip() + except Exception as e: + self.log(f"Error getting version: {e}") + + # Get root path + try: + result = subprocess.run( + ["rocm-sdk", "path", "--root"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0: + root_path = Path(result.stdout.strip()) + therock_details = self._is_therock_installation(root_path) + if therock_details: + details.update(therock_details) + self._add_installation("python_package", root_path, details) + return + except Exception as e: + self.log(f"Error getting root path: {e}") + + def _detect_python_packages(self): + """Detect TheRock Python packages in site-packages.""" + self.log("Checking Python site-packages...") + + try: + import site + import importlib.util + + # Check for rocm_sdk package + spec = importlib.util.find_spec("rocm_sdk") + if spec and spec.origin: + package_path = Path(spec.origin).parent + self.log(f"Found rocm_sdk package at: {package_path}") + + # Try to import and get details + try: + import rocm_sdk + details = { + "package_path": str(package_path), + "version": getattr(rocm_sdk, "__version__", "unknown"), + } + + # Try to get rocm_sdk_core path for TheRock markers + core_spec = importlib.util.find_spec("_rocm_sdk_core") + if core_spec and core_spec.origin: + core_path = Path(core_spec.origin).parent + therock_details = self._is_therock_installation(core_path) + if therock_details: + details.update(therock_details) + self._add_installation("python_package", core_path, details) + except Exception as e: + self.log(f"Error importing rocm_sdk: {e}") + + except Exception as e: + self.log(f"Error checking Python packages: {e}") + + def _detect_tarball_installations(self): + """Detect tarball installations in common paths.""" + self.log("Checking common installation paths...") + + # Common installation directories for tarballs + common_paths = [ + Path.home() / "rocm", + Path.home() / "therock", + Path("/opt/rocm"), + Path("/usr/local/rocm"), + Path.home() / ".local" / "rocm", + ] + + for path in common_paths: + if path.exists(): + details = self._is_therock_installation(path) + if details: + self._add_installation("tarball", path, details) + + def _detect_from_env_vars(self): + """Detect TheRock from environment variables.""" + self.log("Checking environment variables...") + + env_vars = [ + "ROCM_PATH", + "ROCM_HOME", + "HIP_PATH", + ] + + for var in env_vars: + value = os.environ.get(var) + if value: + path = Path(value) + if path.exists(): + self.log(f"Checking {var}={value}") + details = self._is_therock_installation(path) + if details: + details["detected_via"] = var + self._add_installation("environment_variable", path, details) + + def _detect_build_directories(self): + """Detect local TheRock build directories.""" + self.log("Checking for local build directories...") + + # Check current directory and parent directories + current = Path.cwd() + for _ in range(5): # Check up to 5 levels up + # Check for TheRock source indicators + if (current / "CMakeLists.txt").exists() and (current / "version.json").exists(): + try: + with open(current / "version.json", "r") as f: + version_data = json.load(f) + if "rocm-version" in version_data: + self.log(f"Found TheRock source at: {current}") + + # Check build directory + build_dir = current / "build" + if build_dir.exists(): + dist_dir = build_dir / "dist" + if dist_dir.exists(): + for dist_subdir in dist_dir.iterdir(): + if dist_subdir.is_dir(): + details = self._is_therock_installation(dist_subdir) + if details: + details["source_path"] = str(current) + details["rocm_version"] = version_data.get("rocm-version") + self._add_installation("local_build", dist_subdir, details) + except Exception as e: + self.log(f"Error checking build directory: {e}") + + parent = current.parent + if parent == current: + break + current = parent + + +def format_installation_info(installation: Dict) -> str: + """Format installation information for display.""" + lines = [] + lines.append(f"\nType: {installation['type']}") + lines.append(f"Path: {installation['path']}") + + details = installation.get("details", {}) + + if "version" in details: + lines.append(f"Version: {details['version']}") + + if "rocm_version" in details: + lines.append(f"ROCm Version: {details['rocm_version']}") + + if "manifest" in details: + manifest = details["manifest"] + lines.append(f"TheRock Commit: {manifest.get('commit', 'unknown')}") + lines.append(f"Submodules: {manifest.get('submodules_count', 0)}") + + if "dist_info" in details: + dist_info = details["dist_info"] + lines.append(f"GPU Targets: {dist_info.get('amdgpu_targets', 'unknown')}") + + if "binaries" in details: + lines.append(f"Compilers: {', '.join(details['binaries'])}") + + if "command_path" in details: + lines.append(f"Command: {details['command_path']}") + + if "detected_via" in details: + lines.append(f"Detected via: ${details['detected_via']}") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Detect TheRock ROCm installations on the system", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s # Detect all installations + %(prog)s -v # Verbose output + %(prog)s --json # Output as JSON + %(prog)s --path /opt/rocm # Check specific path + """, + ) + parser.add_argument( + "-v", "--verbose", + action="store_true", + help="Enable verbose output", + ) + parser.add_argument( + "--json", + action="store_true", + help="Output results as JSON", + ) + parser.add_argument( + "--path", + type=Path, + help="Check specific path for TheRock installation", + ) + + args = parser.parse_args() + + detector = TherockDetector(verbose=args.verbose) + + # If specific path provided, check only that + if args.path: + details = detector._is_therock_installation(args.path) + if details: + installation = { + "type": "manual_check", + "path": str(args.path.resolve()), + "details": details, + } + installations = [installation] + else: + print(f"No TheRock installation detected at: {args.path}") + sys.exit(1) + else: + # Run full detection + installations = detector.detect_all() + + # Output results + if not installations: + print("No TheRock installations detected.") + print("\nTheRock uses Python pip packages or tarballs, not apt.") + print("See: https://github.com/ROCm/TheRock/blob/main/RELEASES.md") + sys.exit(1) + + if args.json: + print(json.dumps(installations, indent=2)) + else: + print(f"Found {len(installations)} TheRock installation(s):") + for i, installation in enumerate(installations, 1): + print(f"\n{'=' * 60}") + print(f"Installation #{i}") + print('=' * 60) + print(format_installation_info(installation)) + + print(f"\n{'=' * 60}") + print("\nTheRock Installation Info:") + print("- TheRock does NOT use apt/system packages") + print("- It installs via Python pip OR standalone tarballs") + print("- Python packages install to venv site-packages") + print("- Tarballs extract to custom directories") + print("\nFor more info: https://github.com/ROCm/TheRock") + + sys.exit(0) + + +if __name__ == "__main__": + main() + diff --git a/src/madengine/scripts/k8s/data/download_aws.sh b/src/madengine/scripts/k8s/data/download_aws.sh new file mode 100755 index 00000000..79a705ff --- /dev/null +++ b/src/madengine/scripts/k8s/data/download_aws.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# madengine K8s Data Provider - AWS S3 +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# Usage: download_aws.sh + +set -e + +DATANAME=$1 +DATAPATH=$2 +DATAHOME=${3:-/data_dlm_0} +AWS_REGION=${AWS_REGION:-us-east-2} + +echo "=== AWS S3 Data Download ===" +echo "Data: $DATANAME" +echo "Source: $DATAPATH" +echo "Target: $DATAHOME" +echo "Region: $AWS_REGION" + +# Get credentials from environment +export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-${MAD_AWS_ACCESS_KEY}} +export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-${MAD_AWS_SECRET_KEY}} + +# Install AWS CLI if not present +if ! command -v aws &> /dev/null; then + echo "Installing AWS CLI..." + pip3 --no-cache-dir install --upgrade awscli +fi + +# Create target directory +mkdir -p $DATAHOME + +# Download data +START_TIME=$(date +%s) +echo "Downloading..." + +if aws --region=$AWS_REGION s3 ls $DATAPATH 2>/dev/null | grep "PRE"; then + # Directory download + aws --region=$AWS_REGION s3 sync $DATAPATH $DATAHOME +else + # Single file download + aws --region=$AWS_REGION s3 sync \ + $(dirname $DATAPATH) $DATAHOME \ + --exclude="*" --include="$(basename $DATAPATH)" +fi + +END_TIME=$(date +%s) +DURATION=$((END_TIME - START_TIME)) + +# Calculate size +SIZE=$(du -sh $DATAHOME 2>/dev/null | cut -f1 || echo "0") + +echo "✓ Download complete" +echo "Duration: ${DURATION}s" +echo "Size: $SIZE" + +# Export metrics +mkdir -p /tmp +echo "MAD_DATA_DOWNLOAD_DURATION=$DURATION" >> /tmp/mad_metrics.env +echo "MAD_DATA_SIZE=$SIZE" >> /tmp/mad_metrics.env +echo "MAD_DATA_PROVIDER_TYPE=aws" >> /tmp/mad_metrics.env +echo "MAD_DATANAME=$DATANAME" >> /tmp/mad_metrics.env + diff --git a/src/madengine/scripts/k8s/data/download_local.sh b/src/madengine/scripts/k8s/data/download_local.sh new file mode 100755 index 00000000..901af88c --- /dev/null +++ b/src/madengine/scripts/k8s/data/download_local.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# madengine K8s Data Provider - Local +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# Usage: download_local.sh + +set -e + +DATANAME=$1 +DATAPATH=$2 +DATAHOME=${3:-/data_dlm_0} + +echo "=== Local Data Provider ===" +echo "Data: $DATANAME" +echo "Path: $DATAPATH" +echo "Target: $DATAHOME" + +# For local data, the path should already be mounted as a volume +# Just verify it exists and calculate size + +if [ ! -e "$DATAPATH" ]; then + echo "Error: Local data path does not exist: $DATAPATH" + exit 1 +fi + +# If DATAHOME is different from DATAPATH, we might need to symlink or the data is already mounted +if [ "$DATAPATH" != "$DATAHOME" ]; then + echo "Note: Data is at $DATAPATH, expected at $DATAHOME" + echo "Assuming data is pre-mounted by K8s volume" +fi + +# Calculate size +SIZE=$(du -sh $DATAHOME 2>/dev/null | cut -f1 || du -sh $DATAPATH 2>/dev/null | cut -f1 || echo "0") + +echo "✓ Local data verified" +echo "Size: $SIZE" + +# Export metrics +mkdir -p /tmp +echo "MAD_DATA_DOWNLOAD_DURATION=0" >> /tmp/mad_metrics.env +echo "MAD_DATA_SIZE=$SIZE" >> /tmp/mad_metrics.env +echo "MAD_DATA_PROVIDER_TYPE=local" >> /tmp/mad_metrics.env +echo "MAD_DATANAME=$DATANAME" >> /tmp/mad_metrics.env + diff --git a/src/madengine/scripts/k8s/data/download_minio.sh b/src/madengine/scripts/k8s/data/download_minio.sh new file mode 100755 index 00000000..f0da3932 --- /dev/null +++ b/src/madengine/scripts/k8s/data/download_minio.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# madengine K8s Data Provider - MinIO +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# Usage: download_minio.sh + +set -e + +DATANAME=$1 +DATAPATH=$2 +DATAHOME=${3:-/data_dlm_0} + +echo "=== MinIO Data Download ===" +echo "Data: $DATANAME" +echo "Source: $DATAPATH" +echo "Target: $DATAHOME" + +# Get credentials from environment or credential.json +MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY:-${MAD_MINIO_USERNAME}} +MINIO_SECRET_KEY=${MINIO_SECRET_KEY:-${MAD_MINIO_PASSWORD}} +MINIO_ENDPOINT=${MINIO_ENDPOINT:-https://minio-frameworks.amd.com} + +# If credentials not in environment, try to read from credential.json +if [ -z "$MINIO_ACCESS_KEY" ] && [ -f "/workspace/credential.json" ]; then + echo "Reading MinIO credentials from credential.json..." + MINIO_ACCESS_KEY=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); print(d.get('MAD_MINIO', {}).get('USERNAME', ''))" 2>/dev/null || echo "") + MINIO_SECRET_KEY=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); print(d.get('MAD_MINIO', {}).get('PASSWORD', ''))" 2>/dev/null || echo "") + MINIO_ENDPOINT=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); print(d.get('MAD_MINIO', {}).get('ENDPOINT_URL', 'https://minio-frameworks.amd.com'))" 2>/dev/null || echo "https://minio-frameworks.amd.com") +fi + +# Verify credentials are available +if [ -z "$MINIO_ACCESS_KEY" ] || [ -z "$MINIO_SECRET_KEY" ]; then + echo "Error: MinIO credentials not found in environment or credential.json" + echo "Required: MINIO_ACCESS_KEY, MINIO_SECRET_KEY" + exit 1 +fi + +# Install AWS CLI if not present +if ! command -v aws &> /dev/null; then + echo "Installing AWS CLI..." + pip3 --no-cache-dir install --upgrade awscli +fi + +# Configure AWS CLI for MinIO +export AWS_ACCESS_KEY_ID=$MINIO_ACCESS_KEY +export AWS_SECRET_ACCESS_KEY=$MINIO_SECRET_KEY +export AWS_ENDPOINT_URL_S3=$MINIO_ENDPOINT + +# Create target directory +mkdir -p $DATAHOME + +# Download data +START_TIME=$(date +%s) +echo "Downloading..." + +if aws --endpoint-url $MINIO_ENDPOINT s3 ls $DATAPATH 2>/dev/null | grep PRE; then + # Directory download + aws --endpoint-url $MINIO_ENDPOINT s3 sync $DATAPATH $DATAHOME +else + # Single file download + aws --endpoint-url $MINIO_ENDPOINT s3 sync \ + $(dirname $DATAPATH) $DATAHOME \ + --exclude="*" --include="$(basename $DATAPATH)" +fi + +END_TIME=$(date +%s) +DURATION=$((END_TIME - START_TIME)) + +# Calculate size +SIZE=$(du -sh $DATAHOME 2>/dev/null | cut -f1 || echo "0") + +echo "✓ Download complete" +echo "Duration: ${DURATION}s" +echo "Size: $SIZE" + +# Export metrics for collection +mkdir -p /tmp +echo "MAD_DATA_DOWNLOAD_DURATION=$DURATION" >> /tmp/mad_metrics.env +echo "MAD_DATA_SIZE=$SIZE" >> /tmp/mad_metrics.env +echo "MAD_DATA_PROVIDER_TYPE=minio" >> /tmp/mad_metrics.env +echo "MAD_DATANAME=$DATANAME" >> /tmp/mad_metrics.env + diff --git a/src/madengine/scripts/k8s/data/download_nas.sh b/src/madengine/scripts/k8s/data/download_nas.sh new file mode 100755 index 00000000..45e062d8 --- /dev/null +++ b/src/madengine/scripts/k8s/data/download_nas.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# madengine K8s Data Provider - NAS (SSH/rsync) +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# Usage: download_nas.sh + +set -e + +DATANAME=$1 +DATAPATH=$2 +DATAHOME=${3:-/data_dlm_0} + +echo "=== NAS Data Download ===" +echo "Data: $DATANAME" +echo "Source: $DATAPATH" +echo "Target: $DATAHOME" + +# Get NAS credentials from environment or credential.json +NAS_HOST=${NAS_HOST:-mlse-nas.amd.com} +NAS_PORT=${NAS_PORT:-22} +NAS_USER=${NAS_USERNAME:-datum} +NAS_PASS=${NAS_PASSWORD} + +# If credentials not in environment, try to read from credential.json +if [ -z "$NAS_PASS" ] && [ -f "/workspace/credential.json" ]; then + echo "Reading NAS credentials from credential.json..." + + # Extract NAS node info (try first node or find by hostname) + NAS_HOST=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); nodes=d.get('NAS_NODES', []); print(nodes[0].get('HOST', 'mlse-nas.amd.com') if nodes else 'mlse-nas.amd.com')" 2>/dev/null || echo "mlse-nas.amd.com") + + NAS_PORT=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); nodes=d.get('NAS_NODES', []); print(nodes[0].get('PORT', '22') if nodes else '22')" 2>/dev/null || echo "22") + + NAS_USER=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); nodes=d.get('NAS_NODES', []); print(nodes[0].get('USERNAME', 'datum') if nodes else 'datum')" 2>/dev/null || echo "datum") + + NAS_PASS=$(python3 -c "import json; f=open('/workspace/credential.json'); d=json.load(f); nodes=d.get('NAS_NODES', []); print(nodes[0].get('PASSWORD', '') if nodes else '')" 2>/dev/null || echo "") +fi + +# Verify credentials are available +if [ -z "$NAS_PASS" ]; then + echo "Error: NAS_PASSWORD not found in environment or credential.json" + echo "Required: NAS_PASSWORD environment variable or credential.json with NAS_NODES" + exit 1 +fi + +echo "Using NAS: $NAS_USER@$NAS_HOST:$NAS_PORT" + +# Install required tools +echo "Installing dependencies..." +if [ -f "$(which apt)" ]; then + apt update && apt install -y sshpass rsync +elif [ -f "$(which yum)" ]; then + yum install -y sshpass rsync +else + echo "Error: Unable to detect package manager" + exit 1 +fi + +# Create target directory +mkdir -p $DATAHOME + +# Download data +START_TIME=$(date +%s) +echo "Downloading from NAS..." + +# Use sshpass directly (no wrapper script needed) +export SSHPASS="$NAS_PASS" +sshpass -e rsync --progress -avz -e "ssh -p $NAS_PORT -o StrictHostKeyChecking=no" \ + ${NAS_USER}@${NAS_HOST}:${DATAPATH}/ $DATAHOME/ || { + echo "Warning: rsync failed, checking if partial data was transferred" + # Even if rsync fails, continue - might be partial transfer +} + +END_TIME=$(date +%s) +DURATION=$((END_TIME - START_TIME)) + +# Calculate size +SIZE=$(du -sh $DATAHOME 2>/dev/null | cut -f1 || echo "0") + +echo "✓ Download complete" +echo "Duration: ${DURATION}s" +echo "Size: $SIZE" + +# Export metrics +mkdir -p /tmp +echo "MAD_DATA_DOWNLOAD_DURATION=$DURATION" >> /tmp/mad_metrics.env +echo "MAD_DATA_SIZE=$SIZE" >> /tmp/mad_metrics.env +echo "MAD_DATA_PROVIDER_TYPE=nas" >> /tmp/mad_metrics.env +echo "MAD_DATANAME=$DATANAME" >> /tmp/mad_metrics.env + diff --git a/src/madengine/scripts/k8s/tools.json b/src/madengine/scripts/k8s/tools.json new file mode 100644 index 00000000..c7a3398e --- /dev/null +++ b/src/madengine/scripts/k8s/tools.json @@ -0,0 +1,100 @@ +{ + "_comment": "madengine K8s Tools Configuration", + "_description": "Configuration for K8s-specific tools and data providers", + + "data_providers": { + "minio": { + "script": "scripts/k8s/data/download_minio.sh", + "description": "MinIO S3-compatible object storage", + "env_vars": { + "MINIO_ENDPOINT": "https://minio-frameworks.amd.com" + } + }, + "aws": { + "script": "scripts/k8s/data/download_aws.sh", + "description": "AWS S3 object storage", + "env_vars": { + "AWS_REGION": "us-east-2" + } + }, + "nas": { + "script": "scripts/k8s/data/download_nas.sh", + "description": "NAS via SSH/rsync", + "env_vars": { + "NAS_HOST": "mlse-nas.amd.com", + "NAS_PORT": "22" + } + }, + "local": { + "script": "scripts/k8s/data/download_local.sh", + "description": "Local filesystem (pre-mounted volume)", + "env_vars": {} + } + }, + + "wrappers": { + "gpu_profiler": { + "script": "scripts/k8s/wrappers/run_profiler.sh", + "description": "GPU profiling (power, VRAM)", + "args": ["power", "vram"], + "env_vars": { + "DEVICE": "all", + "SAMPLING_RATE": "0.1" + } + }, + "rocenv": { + "script": "scripts/k8s/wrappers/run_rocenv.sh", + "description": "ROCm environment collection", + "env_vars": {} + } + }, + + "shared_tools": { + "_note": "These tools from scripts/common/ work directly in K8s without wrappers", + "tools": [ + { + "name": "gpu_info_profiler", + "path": "scripts/common/tools/gpu_info_profiler.py", + "type": "python", + "description": "GPU utilization profiler" + }, + { + "name": "rocenv_tool", + "path": "scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py", + "type": "python", + "description": "ROCm environment analyzer" + }, + { + "name": "trace_tools", + "path": "scripts/common/post_scripts/trace.sh", + "type": "bash", + "description": "RPD and rocprof trace collection" + }, + { + "name": "get_library_trace", + "path": "scripts/common/tools/get_library_trace.py", + "type": "python", + "description": "Library call tracing (ROCBLAS, MIOpen, etc.)" + } + ] + }, + + "pre_scripts": [ + { + "name": "gpu_info_pre", + "path": "scripts/common/pre_scripts/gpu_info_pre.sh", + "enabled": true, + "description": "Pre-execution GPU status check" + } + ], + + "post_scripts": [ + { + "name": "gpu_info_post", + "path": "scripts/common/post_scripts/gpu_info_post.sh", + "enabled": true, + "description": "Post-execution GPU status and metrics collection" + } + ] +} + diff --git a/src/madengine/scripts/k8s/wrappers/run_profiler.sh b/src/madengine/scripts/k8s/wrappers/run_profiler.sh new file mode 100755 index 00000000..17bd125c --- /dev/null +++ b/src/madengine/scripts/k8s/wrappers/run_profiler.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# madengine K8s Wrapper - GPU Info Profiler +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# Wrapper for gpu_info_profiler.py to work in K8s environment +# Usage: run_profiler.sh [power|vram] + +set -e + +MODE=${1:-power} +OUTPUT_DIR=${OUTPUT_DIR:-/workspace/profiler_results} + +echo "=== GPU Info Profiler (K8s) ===" +echo "Mode: $MODE" +echo "Output: $OUTPUT_DIR" + +# Verify the Python tool exists +PROFILER_SCRIPT="/workspace/scripts/common/tools/gpu_info_profiler.py" +if [ ! -f "$PROFILER_SCRIPT" ]; then + echo "Error: gpu_info_profiler.py not found at $PROFILER_SCRIPT" + echo "Available scripts:" + ls -la /workspace/scripts/common/tools/ 2>/dev/null || echo " scripts/common/tools/ not found" + exit 1 +fi + +# Set environment variables for the profiler +export DEVICE=${DEVICE:-all} +export SAMPLING_RATE=${SAMPLING_RATE:-0.1} +export MODE=$MODE +export DUAL_GCD=${DUAL_GCD:-false} + +# Create output directory +mkdir -p $OUTPUT_DIR + +# Change to workspace to match expected paths +cd /workspace + +# Run the profiler (reusing the same Python script as local execution!) +echo "Starting profiler..." +python3 $PROFILER_SCRIPT + +echo "✓ GPU profiler completed" +echo "Results saved to: $OUTPUT_DIR" + +# List output files +if [ -d "$OUTPUT_DIR" ]; then + echo "Output files:" + ls -lh $OUTPUT_DIR +fi + diff --git a/src/madengine/scripts/k8s/wrappers/run_rocenv.sh b/src/madengine/scripts/k8s/wrappers/run_rocenv.sh new file mode 100755 index 00000000..c26ad9d5 --- /dev/null +++ b/src/madengine/scripts/k8s/wrappers/run_rocenv.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# madengine K8s Wrapper - rocEnvTool +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +# +# Wrapper for rocEnvTool to work in K8s environment +# Usage: run_rocenv.sh [output_name] + +set -e + +OUTPUT_NAME=${1:-sys_config_info} + +echo "=== rocEnvTool (K8s) ===" +echo "Output: $OUTPUT_NAME" + +# Verify rocEnvTool exists +ROCENV_DIR="/workspace/scripts/common/pre_scripts/rocEnvTool" +if [ ! -d "$ROCENV_DIR" ]; then + echo "Error: rocEnvTool not found at $ROCENV_DIR" + echo "Available pre_scripts:" + ls -la /workspace/scripts/common/pre_scripts/ 2>/dev/null || echo " pre_scripts/ not found" + exit 1 +fi + +# Change to workspace +cd /workspace + +# Copy rocEnvTool to working directory (same as local execution) +echo "Copying rocEnvTool..." +cp -r scripts/common/pre_scripts/rocEnvTool . + +# Run rocEnvTool (same command as local!) +echo "Running rocEnvTool..." +cd rocEnvTool +python3 rocenv_tool.py --lite --dump-csv --print-csv --output-name $OUTPUT_NAME + +# Copy results back to workspace +echo "Copying results..." +OUT_DIR=".$OUTPUT_NAME" +OUT_CSV="$OUTPUT_NAME.csv" + +if [ -d "$OUT_DIR" ]; then + cp -r $OUT_DIR /workspace/ + echo "✓ Copied directory: /workspace/$OUT_DIR" +fi + +if [ -f "$OUT_CSV" ]; then + cp $OUT_CSV /workspace/ + echo "✓ Copied CSV: /workspace/$OUT_CSV" +fi + +cd /workspace + +echo "✓ rocEnvTool completed" +echo "Results saved to: /workspace/$OUTPUT_NAME.csv" + +# List output files +if [ -f "/workspace/$OUT_CSV" ]; then + echo "CSV file size: $(du -h /workspace/$OUT_CSV | cut -f1)" +fi + diff --git a/src/madengine/scripts/slurm/SLURM_EPILOG_SETUP.md b/src/madengine/scripts/slurm/SLURM_EPILOG_SETUP.md new file mode 100644 index 00000000..23661ff5 --- /dev/null +++ b/src/madengine/scripts/slurm/SLURM_EPILOG_SETUP.md @@ -0,0 +1,283 @@ +# SLURM Epilog Script Setup Guide + +This guide explains how to install and configure the SLURM epilog script to automatically clean up GPU processes after each job completes or fails. + +## Problem Statement + +In multi-node GPU jobs, when a job fails or is cancelled: +- Ray worker processes may continue running in Docker containers on compute nodes +- These "zombie" processes hold GPU memory (100-180 GB per GPU) +- Subsequent jobs fail with "insufficient GPU memory" errors +- Manual cleanup is required on each node + +## Solution: SLURM Epilog Script + +The epilog script runs **automatically after every job** (success or failure) on each compute node to: +1. Kill Ray worker processes +2. Kill vLLM processes +3. Clean up Docker containers +4. Kill any remaining GPU processes +5. Optionally reset GPU state + +--- + +## Installation + +### 1. Copy Script to Compute Nodes + +On **each SLURM compute node**, copy the epilog script: + +```bash +sudo cp src/madengine/scripts/slurm/epilog.sh /etc/slurm/epilog.sh +sudo chmod +x /etc/slurm/epilog.sh +sudo chown root:root /etc/slurm/epilog.sh +``` + +### 2. Create Log Directory + +```bash +sudo mkdir -p /var/log/slurm +sudo chmod 755 /var/log/slurm +``` + +### 3. Configure SLURM + +Edit `/etc/slurm/slurm.conf` on the **SLURM controller** and add: + +```conf +# Epilog script to clean up GPU processes after each job +Epilog=/etc/slurm/epilog.sh + +# Optional: Set timeout for epilog script (default: 60 seconds) +EpilogMsgTime=30 +``` + +### 4. Restart SLURM Services + +On **compute nodes**: +```bash +sudo systemctl restart slurmd +``` + +On **controller**: +```bash +sudo systemctl restart slurmctld +``` + +--- + +## Verification + +### 1. Submit a Test Job + +```bash +sbatch --nodes=1 --gpus-per-node=1 --time=00:01:00 --wrap="python3 -c 'import time; time.sleep(30)'" +``` + +### 2. Check Epilog Logs + +On the compute node where the job ran: + +```bash +sudo tail -f /var/log/slurm/epilog.log +``` + +You should see entries like: +``` +[2025-12-17 12:34:56] [Job 12345] === Epilog script starting === +[2025-12-17 12:34:56] [Job 12345] Checking for GPU processes... +[2025-12-17 12:34:56] [Job 12345] No GPU processes found +[2025-12-17 12:34:56] [Job 12345] === Epilog script completed === +``` + +### 3. Test GPU Cleanup After Failed Job + +Submit a job that will fail: +```bash +sbatch --nodes=2 --gpus-per-node=4 <> "$LOG_FILE" + fi +} +``` + +### Exclude Specific Jobs + +To skip cleanup for certain jobs (e.g., debugging), check the job name: + +```bash +# At the start of epilog.sh +if [ "$SLURM_JOB_NAME" = "debug_session" ]; then + log_message "Skipping cleanup for debug session" + exit 0 +fi +``` + +--- + +## Troubleshooting + +### Epilog Script Not Running + +**Symptom**: No entries in `/var/log/slurm/epilog.log` after jobs complete + +**Solutions**: +1. Verify script permissions: + ```bash + ls -la /etc/slurm/epilog.sh + # Should be: -rwxr-xr-x root root + ``` + +2. Check SLURM configuration: + ```bash + grep Epilog /etc/slurm/slurm.conf + # Should show: Epilog=/etc/slurm/epilog.sh + ``` + +3. Check SLURM logs: + ```bash + sudo tail -f /var/log/slurm/slurmd.log + ``` + +### Epilog Script Times Out + +**Symptom**: SLURM logs show "Epilog timed out" + +**Solution**: Increase timeout in `slurm.conf`: +```conf +EpilogMsgTime=60 +``` + +### GPU Processes Still Present + +**Symptom**: After epilog runs, GPU processes still exist + +**Solution**: +1. Check if processes are in Docker containers: + ```bash + docker ps -a | grep container_rocm + ``` + +2. Add more aggressive Docker cleanup to epilog script: + ```bash + # In cleanup_docker_containers() + docker ps -q | xargs -r docker kill + docker ps -aq | xargs -r docker rm -f + ``` + +### Permissions Errors + +**Symptom**: Epilog log shows "Permission denied" errors + +**Solution**: Epilog runs as root by default. If issues persist: +1. Check SELinux status: `getenforce` +2. Add SELinux policy or disable: `sudo setenforce 0` + +--- + +## Best Practices + +### 1. Monitor Epilog Logs + +Set up log rotation for epilog logs: + +```bash +sudo cat > /etc/logrotate.d/slurm-epilog </dev/null | grep -q '[0-9]'; then + echo "ERROR: GPUs not clean before job start" + exit 1 +fi +``` + +--- + +## Integration with madengine + +The epilog script is designed to work seamlessly with madengine's `run.sh` cleanup: + +1. **During Job**: `run.sh` trap handler cleans up on script exit +2. **After Job**: SLURM epilog catches any missed processes +3. **Defense in Depth**: Two layers of cleanup ensure robustness + +This dual-layer approach ensures GPU resources are always released, even if: +- The job is killed with SIGKILL +- Docker containers fail to stop +- Ray workers don't respond to shutdown signals + +--- + +## References + +- [SLURM Prolog/Epilog Documentation](https://slurm.schedmd.com/prolog_epilog.html) +- [Ray Cluster Cleanup Best Practices](https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html) + diff --git a/src/madengine/scripts/slurm/epilog.sh b/src/madengine/scripts/slurm/epilog.sh new file mode 100644 index 00000000..6f7b68e2 --- /dev/null +++ b/src/madengine/scripts/slurm/epilog.sh @@ -0,0 +1,178 @@ +#!/bin/bash +# +# SLURM Epilog Script for GPU Cleanup +# +# This script should be installed on SLURM compute nodes to ensure +# GPU processes are properly cleaned up after each job. +# +# Installation: +# 1. Copy this script to /etc/slurm/epilog.sh on all compute nodes +# 2. Make it executable: chmod +x /etc/slurm/epilog.sh +# 3. Add to /etc/slurm/slurm.conf: +# Epilog=/etc/slurm/epilog.sh +# 4. Restart SLURM: sudo systemctl restart slurmd +# +# This script runs as root after each job completes/fails +# + +LOG_FILE="/var/log/slurm/epilog.log" +mkdir -p "$(dirname "$LOG_FILE")" + +log_message() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] [Job ${SLURM_JOB_ID:-unknown}] $1" >> "$LOG_FILE" +} + +log_message "=== Epilog script starting ===" + +# Function to kill GPU processes +cleanup_gpu_processes() { + log_message "Checking for GPU processes..." + + # Try AMD GPUs first + if [ -x /opt/rocm/bin/amd-smi ]; then + log_message "Detected AMD ROCm installation, checking for processes..." + + # Get PIDs using amd-smi + PIDS=$(amd-smi process 2>/dev/null | grep -v PID | awk '{print $1}' | grep -E '^[0-9]+$' | sort -u) + + if [ ! -z "$PIDS" ]; then + log_message "Found GPU processes to clean: $PIDS" + for pid in $PIDS; do + if ps -p $pid > /dev/null 2>&1; then + log_message "Killing GPU process: $pid" + kill -9 $pid 2>/dev/null || true + sleep 0.5 + fi + done + else + log_message "No GPU processes found via amd-smi" + fi + + # Try fuser on GPU devices as backup + for device in /dev/kfd /dev/dri/renderD*; do + if [ -e "$device" ]; then + DEVICE_PIDS=$(fuser "$device" 2>/dev/null | tr -s ' ' '\n' | grep -E '^[0-9]+$') + if [ ! -z "$DEVICE_PIDS" ]; then + log_message "Found processes using $device: $DEVICE_PIDS" + for pid in $DEVICE_PIDS; do + if ps -p $pid > /dev/null 2>&1; then + log_message "Killing process using $device: $pid" + kill -9 $pid 2>/dev/null || true + fi + done + fi + fi + done + fi + + # Try NVIDIA GPUs + if [ -x /usr/bin/nvidia-smi ]; then + log_message "Detected NVIDIA GPU installation, checking for processes..." + + PIDS=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null | grep -E '^[0-9]+$') + + if [ ! -z "$PIDS" ]; then + log_message "Found NVIDIA GPU processes to clean: $PIDS" + for pid in $PIDS; do + if ps -p $pid > /dev/null 2>&1; then + log_message "Killing NVIDIA GPU process: $pid" + kill -9 $pid 2>/dev/null || true + sleep 0.5 + fi + done + else + log_message "No NVIDIA GPU processes found" + fi + fi +} + +# Function to kill Ray processes +cleanup_ray_processes() { + log_message "Cleaning up Ray processes..." + + # Kill Ray worker processes + RAY_PIDS=$(pgrep -f "ray::" 2>/dev/null || true) + if [ ! -z "$RAY_PIDS" ]; then + log_message "Found Ray processes: $RAY_PIDS" + pkill -9 -f "ray::" 2>/dev/null || true + sleep 1 + else + log_message "No Ray processes found" + fi + + # Kill vLLM worker processes + VLLM_PIDS=$(pgrep -f "RayWorkerWrapper" 2>/dev/null || true) + if [ ! -z "$VLLM_PIDS" ]; then + log_message "Found vLLM worker processes: $VLLM_PIDS" + pkill -9 -f "RayWorkerWrapper" 2>/dev/null || true + sleep 1 + else + log_message "No vLLM worker processes found" + fi + + # Kill any vllm processes + VLLM_MAIN_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) + if [ ! -z "$VLLM_MAIN_PIDS" ]; then + log_message "Found vLLM main processes: $VLLM_MAIN_PIDS" + pkill -9 -f "vllm" 2>/dev/null || true + sleep 1 + fi +} + +# Function to clean Docker containers (if any are still running) +cleanup_docker_containers() { + if command -v docker &> /dev/null; then + log_message "Checking for stale Docker containers..." + + # Find containers that might be from madengine + CONTAINERS=$(docker ps -q --filter "name=container_rocm" 2>/dev/null || true) + if [ ! -z "$CONTAINERS" ]; then + log_message "Found stale containers: $CONTAINERS" + for container in $CONTAINERS; do + log_message "Stopping container: $container" + docker stop --time=5 "$container" 2>/dev/null || true + docker rm -f "$container" 2>/dev/null || true + done + else + log_message "No stale Docker containers found" + fi + fi +} + +# Function to reset GPU state +reset_gpu_state() { + log_message "Resetting GPU state..." + + # AMD GPU reset + if [ -x /opt/rocm/bin/rocm-smi ]; then + log_message "Resetting AMD GPUs..." + /opt/rocm/bin/rocm-smi --gpureset 2>/dev/null || log_message "GPU reset failed (may require reboot)" + fi + + # NVIDIA GPU reset (requires nvidia-smi) + if [ -x /usr/bin/nvidia-smi ]; then + log_message "Resetting NVIDIA GPUs..." + nvidia-smi --gpu-reset -i 0 2>/dev/null || log_message "GPU reset failed (may require reboot)" + fi +} + +# Main cleanup sequence +log_message "Starting cleanup sequence for job ${SLURM_JOB_ID:-unknown}" + +# Step 1: Kill Ray and vLLM processes first +cleanup_ray_processes + +# Step 2: Clean Docker containers +cleanup_docker_containers + +# Step 3: Kill any remaining GPU processes +cleanup_gpu_processes + +# Step 4: Reset GPU state (optional, may cause brief GPU unavailability) +# Uncomment if needed: +# reset_gpu_state + +log_message "=== Epilog script completed ===" + +exit 0 + diff --git a/src/madengine/tools/create_table_db.py b/src/madengine/tools/create_table_db.py deleted file mode 100644 index 68aec9e2..00000000 --- a/src/madengine/tools/create_table_db.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python -"""Module to create tables in the database. - -This module provides the functions to create tables in the database. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import argparse -import subprocess -import typing -# third-party modules -import paramiko -import socket -# mad-engine modules -from madengine.utils.ssh_to_db import SFTPClient, print_ssh_out -from madengine.db.logger import setup_logger -from madengine.db.utils import get_env_vars - -# Create the logger -LOGGER = setup_logger() -# Get the environment variables -ENV_VARS = get_env_vars() - - -class CreateTable: - """Class to create tables in the database. - - This class provides the functions to create tables in the database. - """ - def __init__(self, args: argparse.Namespace): - """Initialize the CreateTable class. - - Args: - args (argparse.Namespace): The arguments passed to the script. - """ - self.args = args - self.db_name = ENV_VARS["db_name"] - self.db_hostname = ENV_VARS["db_hostname"] - self.db_port = ENV_VARS["db_port"] - self.user_name = ENV_VARS["user_name"] - self.user_password = ENV_VARS["user_password"] - self.ssh_user = ENV_VARS["ssh_user"] - self.ssh_password = ENV_VARS["ssh_password"] - self.ssh_hostname = ENV_VARS["ssh_hostname"] - self.ssh_port = ENV_VARS["ssh_port"] - - # get the db folder - self.db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../db") - LOGGER.info(f"DB path: {self.db_path}") - self.status = False - - def run(self, table_name: str='dlm_table') -> None: - """Create an empty table in the database. - - Args: - table_name (str): The name of the table to create. - - Returns: - None - - Raises: - Exception: An error occurred creating the table. - """ - print(f"Creating table {table_name} in the database") - - if 'localhost' in self.ssh_hostname or '127.0.0.1' in self.ssh_hostname: - try: - self.local_db() - self.status = True - return self.status - except Exception as error: - LOGGER.error(f"Error creating table in local database: {error}") - return self.status - else: - try: - self.remote_db() - self.status = True - return self.status - except Exception as error: - LOGGER.error(f"Error creating table in remote database: {error}") - return self.status - - def local_db(self) -> None: - """Create a table in the local database. - - Returns: - None - - Raises: - Exception: An error occurred creating the table in the local database. - """ - print("Creating table in local database") - - # copy the db folder from the db_path to the current working directory - cmd_list = ["cp", "-r", self.db_path, "."] - - try: - ret = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = ret.communicate() - if ret.returncode == 0: - if out: - LOGGER.info(out.decode('utf-8')) - print("Copied scripts to current work path") - else: - if err: - LOGGER.error(err.decode('utf-8')) - except Exception as e: - LOGGER.error(f"An error occurred: {e}") - - # run upload_csv_to_db.py in the db folder with environment variables using subprocess Popen - cmd_list = ["python3", "./db/upload_csv_to_db.py"] - - # Ensure ENV_VARS is a dictionary - env_vars = dict(ENV_VARS) - print(f"ENV_VARS: {env_vars}") - - try: - ret = subprocess.Popen(cmd_list, env=env_vars, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = ret.communicate() - - if ret.returncode == 0: - if out: - LOGGER.info(out.decode('utf-8')) - else: - if err: - LOGGER.error(err.decode('utf-8')) - raise Exception(f"Error updating table in the local database: {err.decode('utf-8')}") - except Exception as e: - LOGGER.error(f"An error occurred: {e}") - - print("Script execution completed") - - def remote_db(self) -> None: - """Create a table in the remote database. - - Returns: - None - - Raises: - socket.error: An error occurred connecting to the database. - """ - print("Creating table in remote database") - - # create an ssh client - ssh_client = paramiko.SSHClient() - ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - ssh_client.load_system_host_keys() - - # connect to the host of database - try: - ssh_client.connect( - hostname=self.ssh_hostname, - port=self.ssh_port, - username=self.ssh_user, - password=self.ssh_password, - timeout=10, - ) - except paramiko.ssh_exception.AuthenticationException as error: - print(f"Authentication failed: {error}") - return - except paramiko.ssh_exception.SSHException as error: - print(f"SSH error: {error}") - return - except socket.error as error: - print(f"Socket error: {error}") - return - - print("SSH client created, connected to the host of database") - - # print remote dir layout - print_ssh_out(ssh_client.exec_command("pwd")) - print_ssh_out(ssh_client.exec_command("ls -l")) - - # get remote path for files - upload_script_path_remote = os.path.basename(self.db_path) - print(upload_script_path_remote) - - # clean up previous uploads - print_ssh_out(ssh_client.exec_command("rm -rf {}".format(upload_script_path_remote))) - print_ssh_out(ssh_client.exec_command("ls -l")) - - # upload file - sftp_client = SFTPClient.from_transport(ssh_client.get_transport()) - sftp_client.mkdir(upload_script_path_remote, ignore_existing=True) - sftp_client.put_dir(self.db_path, upload_script_path_remote) - - # close the sftp client - sftp_client.close() - - # run script on remote node - main_script = os.path.join(upload_script_path_remote, "upload_csv_to_db.py") - print_ssh_out( - ssh_client.exec_command( - "TUNA_DB_USER_NAME={} TUNA_DB_USER_PASSWORD={} TUNA_DB_NAME={} TUNA_DB_HOSTNAME={} python3 {}".format( - self.user_name, - self.user_password, - self.db_name, - self.db_hostname, - main_script, - ) - ) - ) - - # print remote dir after upload - print_ssh_out(ssh_client.exec_command("ls -l")) - - # close the ssh client - ssh_client.close() diff --git a/src/madengine/tools/csv_to_email.py b/src/madengine/tools/csv_to_email.py deleted file mode 100644 index e9c51611..00000000 --- a/src/madengine/tools/csv_to_email.py +++ /dev/null @@ -1,74 +0,0 @@ -"""Module to send emails. - -This module provides the functions to send emails. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" - -# built-in modules -import os - -# third-party modules -import pandas as pd - - -def convert_csv_to_html(path: str): - """Convert CSV files to HTML files. - - Args: - path: The path to the directory containing the CSV files. - """ - if not os.path.exists(path) or not os.path.isdir(path): - print("The specified path does not exist or is not a directory.") - return - - full_html_source = "" - html_file_path = "./run_results.html" - for filename in os.listdir(path): - # Check if the file is a CSV file - if filename.endswith(".csv"): - file_path = os.path.join(path, filename) - - # Read the CSV file using pandas - df = pd.read_csv(file_path) - - ## Convert DataFrame to HTML and save it - # html_file_path = file_path.rsplit('.', 1)[0] + '.html' - # df.to_html(html_file_path) - html_source = df.to_html() - - # Add H2 header to html_source - html_source = ( - "

" - + file_path.rsplit(".", 1)[0].split("/")[1] - + "

" - + html_source - ) - - # Now add html_source to single file - full_html_source += html_source - - print(f"Converted {filename} to HTML and saved as {html_file_path}") - - func = open(html_file_path, "w") - func.write(full_html_source) - func.close() - - -class ConvertCsvToEmail: - def __init__(self, args): - """Initialize the ConvertCsvToEmail object. - - Args: - args: The command-line arguments. - """ - self.args = args - self.return_status = False - - def run(self): - """Convert the CSV files to HTML files.""" - path = self.args.path - convert_csv_to_html(path) - - self.return_status = True - return self.return_status diff --git a/src/madengine/tools/csv_to_html.py b/src/madengine/tools/csv_to_html.py deleted file mode 100644 index 5a27952a..00000000 --- a/src/madengine/tools/csv_to_html.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Module for converting a CSV file to an HTML file. - -This module is responsible for converting a CSV file to an HTML file. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" - -# built-in imports -import os -import argparse - -# third-party imports -import pandas as pd - - -def convert_csv_to_html(file_path: str): - """Convert the CSV file to an HTML file. - - Args: - file_path: The path to the CSV file. - """ - # get file names - base_path = os.path.dirname(file_path) - base_name = os.path.basename(file_path) - file_name = os.path.splitext(base_name)[0] - - output_name = "" - if base_path: - output_name = base_path + "/" - output_name += file_name + ".html" - # read csv - df = pd.read_csv(file_path) - print(df) - - # Use the .to_html() to get your table in html - df_html = df.to_html(index=False) - perf_html = open(output_name, "w") - n = perf_html.write(df_html) - perf_html.close() - - -class ConvertCsvToHtml: - def __init__(self, args: argparse.Namespace): - """Initialize the ConvertCsvToHtml object. - - Args: - args: The command-line arguments. - """ - self.args = args - self.return_status = False - - def run(self): - """Convert the CSV file to an HTML file.""" - file_path = self.args.csv_file_path - print(f"Converting CSV file to HTML file: {file_path}") - - # get file names - base_path = os.path.dirname(file_path) - base_name = os.path.basename(file_path) - file_name = os.path.splitext(base_name)[0] - - output_name = "" - if base_path: - output_name = base_path + "/" - - output_name += file_name + ".html" - - # read csv - df = pd.read_csv(file_path) - print(df) - - # Use the .to_html() to get your table in html - df_html = df.to_html(index=False) - perf_html = open(output_name, "w") - n = perf_html.write(df_html) - perf_html.close() - - self.return_status = True - return self.return_status diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py deleted file mode 100644 index a620d96f..00000000 --- a/src/madengine/tools/run_models.py +++ /dev/null @@ -1,1169 +0,0 @@ -# lint as: python3 -############################################################################### -# -# MIT License -# -# Copyright (c) Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################# -"""Module of running models on container. - -This module contains the RunModels class, which is responsible for running models on container. -It also contains the RunDetails class, which is responsible for storing the performance results of a model. -""" -# built-in modules -import sys -import os -import json -import time -import re -import traceback -from contextlib import redirect_stdout, redirect_stderr -import warnings -import typing - -# MADEngine modules -from madengine.core.console import Console -from madengine.core.context import Context -from madengine.core.dataprovider import Data -from madengine.core.docker import Docker -from madengine.utils.ops import PythonicTee, file_print, substring_found, find_and_replace_pattern -from madengine.core.constants import MAD_MINIO, MAD_AWS_S3 -from madengine.core.constants import MODEL_DIR, PUBLIC_GITHUB_ROCM_KEY -from madengine.core.timeout import Timeout -from madengine.tools.update_perf_csv import update_perf_csv -from madengine.tools.csv_to_html import convert_csv_to_html -from madengine.tools.discover_models import DiscoverModels - - -class RunDetails: - """Class to store the performance results of a model. - - Attributes: - model (str): The model name. - pipeline (str): The pipeline used. - n_gpus (str): The number of GPUs used. - training_precision (str): The training precision used. - args (str): The arguments used. - tags (str): The tags used. - docker_file (str): The docker file used. - base_docker (str): The base docker used. - docker_sha (str): The docker SHA used. - docker_image (str): The docker image used. - git_commit (str): The git commit used. - machine_name (str): The machine name used. - gpu_architecture (str): The GPU architecture used. - performance (str): The performance of the model. - metric (str): The metric used. - relative_change (str): The relative change in performance. - status (str): The status of the model. - build_duration (str): The build duration. - test_duration (str): The test duration. - dataname (str): The data name used. - data_provider_type (str): The data provider type used. - data_size (str): The size of the data. - data_download_duration (str): The duration of data download. - build_number (str): The CI build number. - additional_docker_run_options (str): The additional options used for docker run. - """ - - # Avoiding @property for ease of code, add if needed. - def __init__(self): - self.model = "" - self.pipeline = "" - self.n_gpus = "" - self.training_precision = "" - self.args = "" - self.tags = "" - self.docker_file = "" - self.base_docker = "" - self.docker_sha = "" - self.docker_image = "" - self.git_commit = "" - self.machine_name = "" - self.gpu_architecture = "" - self.performance = "" - self.metric = "" - self.relative_change = "" - self.status = "FAILURE" - self.build_duration = "" - self.test_duration = "" - self.dataname = "" - self.data_provider_type = "" - self.data_size = "" - self.data_download_duration = "" - self.build_number = "" - self.additional_docker_run_options = "" - - def print_perf(self): - """Print the performance results of a model. - - Method to print stage perf results of a model. - """ - print(f"{self.model} performance is {self.performance} {self.metric}") - - # Exports all info in json format to json_name - # multiple_results excludes the "model,performance,metric,status" keys - # to handle results more generically regardless of the multiple_results csv being passed in - def generate_json(self, json_name: str, multiple_results: bool = False) -> None: - """Generate JSON file for performance results of a model. - - Args: - json_name (str): The name of the JSON file. - multiple_results (bool): The status of multiple results. Default is False. - - Raises: - Exception: An error occurred while generating JSON file for performance results of a model. - """ - keys_to_exclude = ( - {"model", "performance", "metric", "status"} if multiple_results else {} - ) - attributes = vars(self) - output_dict = {x: attributes[x] for x in attributes if x not in keys_to_exclude} - with open(json_name, "w") as outfile: - json.dump(output_dict, outfile) - - -class RunModels: - """Class to run models on container.""" - - def __init__(self, args): - """Constructor of the RunModels class. - - Args: - args: The command-line arguments. - """ - self.return_status = True - self.args = args - self.console = Console(live_output=True) - self.context = Context( - additional_context=args.additional_context, - additional_context_file=args.additional_context_file, - ) - # check the data.json file exists - data_json_file = args.data_config_file_name - - if not os.path.exists(data_json_file): - self.data = None - else: - self.data = Data( - self.context, - filename=args.data_config_file_name, - force_mirrorlocal=args.force_mirror_local, - ) - self.creds = None - print(f"Context is {self.context.ctx}") - - def get_base_prefix_compat(self): - """Get base/real prefix, or sys.prefix if there is none. - - Returns: - str: The base/real prefix or sys.prefix if there is none. - """ - return ( - getattr(sys, "base_prefix", None) - or getattr(sys, "real_prefix", None) - or sys.prefix - ) - - def in_virtualenv(self) -> bool: - """Check if the current environment is a virtual environment. - - Returns: - bool: The status of the current environment. - """ - return self.get_base_prefix_compat() != sys.prefix - - def clean_up_docker_container(self, is_cleaned: bool = False) -> None: - """Clean up docker container.""" - if is_cleaned: - self.console.sh("docker ps -a || true") - self.console.sh("docker kill $(docker ps -q) || true") - - # get gpu vendor - gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] - # show gpu info - if gpu_vendor.find("AMD") != -1: - self.console.sh("/opt/rocm/bin/amd-smi || true") - elif gpu_vendor.find("NVIDIA") != -1: - self.console.sh("nvidia-smi -L || true") - - # Either return the dockercontext path from the model info - # or use the default of the ./docker directory if it doesn't exist - def get_context_path(self, info: typing.Dict) -> str: - """Get the context path. - - Args: - info: The model info dict. - - Returns: - str: The context path. - - Raises: - Exception: An error occurred while getting the context path. - """ - if "dockercontext" in info and info["dockercontext"] != "": - return info["dockercontext"] - else: - return "./docker" - - def get_build_arg(self, run_build_arg: typing.Dict = {}) -> str: - """Get the build arguments. - - Args: - run_build_arg: The run build arguments. - - Returns: - str: The build arguments. - - Raises: - RuntimeError: An error occurred while getting the build arguments. - """ - # check if docker_build_arg is provided in context, if not return empty string. - if not run_build_arg and "docker_build_arg" not in self.context.ctx: - return "" - - build_args = "" - for build_arg in self.context.ctx["docker_build_arg"].keys(): - build_args += ( - "--build-arg " - + build_arg - + "='" - + self.context.ctx["docker_build_arg"][build_arg] - + "' " - ) - - # add model cred - if run_build_arg: - for key, value in run_build_arg.items(): - build_args += "--build-arg " + key + "='" + value + "' " - - return build_args - - def apply_tools( - self, - pre_encapsulate_post_scripts: typing.Dict, - run_env: typing.Dict - ) -> None: - """Apply tools to the model. - - Args: - pre_encapsulate_post_scripts: The pre, encapsulate and post scripts. - run_env: The run environment. - - Raises: - Exception: An error occurred while applying tools to the model. - """ - if "tools" not in self.context.ctx: - return - - # read tool setting from tools.json - tool_file = None - with open(self.args.tools_json_file_name) as f: - tool_file = json.load(f) - - # iterate over tools in context, apply tool settings. - for ctx_tool_config in self.context.ctx["tools"]: - tool_name = ctx_tool_config["name"] - tool_config = tool_file["tools"][tool_name] - - if "cmd" in ctx_tool_config: - tool_config.update({"cmd": ctx_tool_config["cmd"]}) - - if "env_vars" in ctx_tool_config: - for env_var in ctx_tool_config["env_vars"]: - tool_config["env_vars"].update({env_var: ctx_tool_config["env_vars"][env_var]}) - - print(f"Selected Tool, {tool_name}. Configuration : {str(tool_config)}.") - - # setup tool before other existing scripts - if "pre_scripts" in tool_config: - pre_encapsulate_post_scripts["pre_scripts"] = ( - tool_config["pre_scripts"] + pre_encapsulate_post_scripts["pre_scripts"] - ) - # cleanup tool after other existing scripts - if "post_scripts" in tool_config: - pre_encapsulate_post_scripts["post_scripts"] += tool_config["post_scripts"] - # warning: this will update existing keys from env or other tools - if "env_vars" in tool_config: - run_env.update(tool_config["env_vars"]) - if "cmd" in tool_config: - # prepend encapsulate cmd - pre_encapsulate_post_scripts["encapsulate_script"] = ( - tool_config["cmd"] + " " + pre_encapsulate_post_scripts["encapsulate_script"] - ) - - def gather_system_env_details( - self, - pre_encapsulate_post_scripts: typing.Dict, - model_name: str - ) -> None: - """Gather system environment details. - - Args: - pre_encapsulate_post_scripts: The pre, encapsulate and post scripts. - model_name: The model name. - - Returns: - None - - Raises: - Exception: An error occurred while gathering system environment details. - - Note: - This function is used to gather system environment details. - """ - # initialize pre_env_details - pre_env_details = {} - pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh" - pre_env_details["args"] = model_name.replace("/", "_") + "_env" - pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details) - print(f"pre encap post scripts: {pre_encapsulate_post_scripts}") - - def copy_scripts(self) -> None: - """Copy scripts to the model directory.""" - scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts") - print(f"Package path: {scripts_path}") - # copy the scripts to the model directory - self.console.sh(f"cp -vLR --preserve=all {scripts_path} .") - print(f"Scripts copied to {os.getcwd()}/scripts") - - def cleanup(self) -> None: - """Cleanup the scripts/common directory.""" - # check the directory exists - if os.path.exists("scripts/common"): - # check tools.json exists in scripts/common directory - if os.path.exists("scripts/common/tools.json"): - # remove the scripts/common/tools.json file - self.console.sh("rm -rf scripts/common/tools.json") - # check test_echo.sh exists in scripts/common directory - if os.path.exists("scripts/common/test_echo.sh"): - # remove the scripts/common/test_echo.sh file - self.console.sh("rm -rf scripts/common/test_echo.sh") - # check folder pre_scripts exists in scripts/common directory - if os.path.exists("scripts/common/pre_scripts"): - # remove the scripts/common/pre_scripts directory - self.console.sh("chmod -R +w scripts/common/pre_scripts 2>/dev/null || true") - self.console.sh("rm -rf scripts/common/pre_scripts") - # check folder post_scripts exists in scripts/common directory - if os.path.exists("scripts/common/post_scripts"): - # remove the scripts/common/post_scripts directory - self.console.sh("chmod -R +w scripts/common/post_scripts 2>/dev/null || true") - self.console.sh("rm -rf scripts/common/post_scripts") - if os.path.exists("scripts/common/tools"): - # remove the scripts/common/tools directory with robust permission fixes - self.console.sh("find scripts/common/tools -type f -exec chmod +w {} \\; 2>/dev/null || true") - self.console.sh("find scripts/common/tools -type d -exec chmod +wx {} \\; 2>/dev/null || true") - self.console.sh("rm -rf scripts/common/tools 2>/dev/null || sudo rm -rf scripts/common/tools", canFail=True) - print(f"scripts/common directory has been cleaned up.") - - def get_gpu_arg(self, requested_gpus: str) -> str: - """Get the GPU arguments. - - Args: - requested_gpus: The requested GPUs. - - Returns: - str: The GPU arguments. - - Raises: - RuntimeError: An error occurred while getting the GPU arguments. - """ - # initialize gpu arg to empty string. - gpu_arg = "" - # get gpu vendor from context, if not raise exception. - gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] - n_system_gpus = self.context.ctx['docker_env_vars']['MAD_SYSTEM_NGPUS'] - gpu_strings = self.context.ctx["docker_gpus"].split(",") - - # parsing gpu string, example: '{0-4}' -> [0,1,2,3,4] - docker_gpus = [] - # iterate over the list of gpu strings, split range and append to docker_gpus. - for gpu_string in gpu_strings: - # check if gpu string has range, if so split and append to docker_gpus. - if '-' in gpu_string: - gpu_range = gpu_string.split('-') - docker_gpus += [item for item in range(int(gpu_range[0]),int(gpu_range[1])+1)] - else: - docker_gpus.append(int(gpu_string)) - # sort docker_gpus - docker_gpus.sort() - - # Check GPU range is valid for system - if requested_gpus == "-1": - print("NGPUS requested is ALL (" + ','.join(map(str, docker_gpus)) + ")." ) - requested_gpus = len(docker_gpus) - - print("NGPUS requested is " + str(requested_gpus) + " out of " + str(n_system_gpus) ) - - if int(requested_gpus) > int(n_system_gpus) or int(requested_gpus) > len(docker_gpus): - raise RuntimeError("Too many gpus requested(" + str(requested_gpus) + "). System has " + str(n_system_gpus) + " gpus. Context has " + str(len(docker_gpus)) + " gpus." ) - - # Exposing number of requested gpus - self.context.ctx['docker_env_vars']['MAD_RUNTIME_NGPUS'] = str(requested_gpus) - - # Create docker arg to assign requested GPUs - if gpu_vendor.find("AMD") != -1: - gpu_arg = '--device=/dev/kfd ' - - gpu_renderDs = self.context.ctx['gpu_renderDs'] - if gpu_renderDs is not None: - for idx in range(0, int(requested_gpus)): - gpu_arg += "--device=/dev/dri/renderD" + str(gpu_renderDs[docker_gpus[idx]]) + " " - - elif gpu_vendor.find("NVIDIA") != -1: - gpu_str = "" - for idx in range(0, int(requested_gpus)): - gpu_str += str( docker_gpus[idx] ) + "," - gpu_arg += "--gpus '\"device=" + gpu_str + "\"' " - else: - raise RuntimeError("Unable to determine gpu vendor.") - - print(f"GPU arguments: {gpu_arg}") - - return gpu_arg - - def get_cpu_arg(self) -> str: - """Get the CPU arguments. - - Returns: - str: The CPU arguments. - - Raises: - RuntimeError: An error occurred while getting the CPU arguments. - """ - # get docker_cpus from context, if not return empty string. - if "docker_cpus" not in self.context.ctx: - return "" - # get docker_cpus from context, remove spaces and return cpu arguments. - cpus = self.context.ctx["docker_cpus"] - cpus = cpus.replace(" ","") - return "--cpuset-cpus " + cpus + " " - - def get_env_arg(self, run_env: typing.Dict) -> str: - """Get the environment arguments. - - Args: - run_env: The run environment. - - Returns: - str: The environment arguments. - - Raises: - RuntimeError: An error occurred while getting the environment arguments. - """ - # initialize env_args to empty string. - env_args = "" - - # aggregate environment variables - if run_env: - for env_arg in run_env: - env_args += "--env " + env_arg + "='" + str(run_env[env_arg]) + "' " - - # get docker_env_vars from context, if not return env_args. - if "docker_env_vars" in self.context.ctx: - for env_arg in self.context.ctx["docker_env_vars"].keys(): - env_args += "--env " + env_arg + "='" + str(self.context.ctx["docker_env_vars"][env_arg]) + "' " - - print(f"Env arguments: {env_args}") - return env_args - - def get_mount_arg(self, mount_datapaths: typing.List) -> str: - """Get the mount arguments. - - Args: - mount_datapaths: The mount data paths. - - Returns: - str: The mount arguments. - - Raises: - RuntimeError: An error occurred while getting the mount arguments. - """ - # initialize mount_args to empty string. - mount_args = "" - # get mount_datapaths from context, if not return mount_args. - if mount_datapaths: - # iterate over mount_datapaths, if mount_datapath is not empty, mount data. - for mount_datapath in mount_datapaths: - if mount_datapath: - # uses --mount to enforce existence of parent directory; data is mounted readonly by default - mount_args += "-v " + mount_datapath["path"] + ":" + mount_datapath["home"] - if "readwrite" in mount_datapath and mount_datapath["readwrite"] == 'true': - mount_args += " " - else: - mount_args += ":ro " - - if "docker_mounts" not in self.context.ctx: - return mount_args - - # get docker_mounts from context, if not return mount_args. - for mount_arg in self.context.ctx["docker_mounts"].keys(): - mount_args += "-v " + self.context.ctx["docker_mounts"][mount_arg] + ":" + mount_arg + " " - - return mount_args - - def run_pre_post_script(self, model_docker, model_dir, pre_post): - for script in pre_post: - script_path = script["path"].strip() - model_docker.sh("cp -vLR --preserve=all " + script_path + " " + model_dir, timeout=600) - script_name = os.path.basename(script_path) - script_args = "" - if "args" in script: - script_args = script["args"] - script_args.strip() - model_docker.sh("cd " + model_dir + " && bash " + script_name + " " + script_args , timeout=600) - - def run_model_impl( - self, info: typing.Dict, dockerfile: str, run_details: RunDetails - ) -> None: - """Handler of running model - - Args: - info: The model information. - dockerfile: The docker file. - run_details: The run details. - """ - print("") - print(f"Running model {info['name']} on container built from {dockerfile}") - - if "MAD_CONTAINER_IMAGE" not in self.context.ctx: - # build docker image - image_docker_name = ( - info["name"].replace("/", "_").lower() # replace / with _ for models in scripts/somedir/ from madengine discover - + "_" - + os.path.basename(dockerfile).replace(".Dockerfile", "") - ) - run_details.docker_file = dockerfile - - # get docker context from dockerfile - docker_context = self.get_context_path(info) - - run_build_arg = {} - if "cred" in info and info["cred"] != "": - if info["cred"] not in self.creds: - raise RuntimeError( - "Credentials(" - + info["cred"] - + ") to run model not found in credential.json; Please contact the model owner, " - + info["owner"] - + "." - ) - # add cred to build args - for key_cred, value_cred in self.creds[info["cred"]].items(): - run_build_arg[info["cred"] + "_" + key_cred.upper()] = value_cred - - # get build args from context - build_args = self.get_build_arg(run_build_arg) - - use_cache_str = "" - if self.args.clean_docker_cache: - use_cache_str = "--no-cache" - - # build docker container - print(f"Building Docker image...") - build_start_time = time.time() - # get docker image name - run_details.docker_image = "ci-" + image_docker_name - # get container name - container_name = "container_" + re.sub('.*:','', image_docker_name) # remove docker container hub details - - ## Note: --network=host added to fix issue on CentOS+FBK kernel, where iptables is not available - self.console.sh( - "docker build " - + use_cache_str - + " --network=host " - + " -t " - + run_details.docker_image - + " --pull -f " - + dockerfile - + " " - + build_args - + " " - + docker_context, - timeout=None, - ) - run_details.build_duration = time.time() - build_start_time - print(f"Build Duration: {run_details.build_duration} seconds") - - print(f"MAD_CONTAINER_IMAGE is {run_details.docker_image}") - - # print base docker image info - if ( - "docker_build_arg" in self.context.ctx - and "BASE_DOCKER" in self.context.ctx["docker_build_arg"] - ): - run_details.base_docker = self.context.ctx["docker_build_arg"]["BASE_DOCKER"] - else: - run_details.base_docker = self.console.sh( - "grep '^ARG BASE_DOCKER=' " - + dockerfile - + " | sed -E 's/ARG BASE_DOCKER=//g'" - ) - print(f"BASE DOCKER is {run_details.base_docker}") - - # print base docker image digest - run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " | grep digest | head -n 1 | cut -d \\\" -f 4") - print(f"BASE DOCKER SHA is {run_details.docker_sha}") - - else: - container_name = "container_" + self.context.ctx["MAD_CONTAINER_IMAGE"].replace("/", "_").replace(":", "_") - run_details.docker_image = self.context.ctx["MAD_CONTAINER_IMAGE"] - - print(f"MAD_CONTAINER_IMAGE is {run_details.docker_image}") - print(f"Warning: User override MAD_CONTAINER_IMAGE. Model support on image not guaranteed.") - - # prepare docker run options - gpu_vendor = self.context.ctx["gpu_vendor"] - docker_options = "" - - if gpu_vendor.find("AMD") != -1: - docker_options = "--network host -u root --group-add video \ - --cap-add=SYS_PTRACE --cap-add SYS_ADMIN --device /dev/fuse --security-opt seccomp=unconfined --security-opt apparmor=unconfined --ipc=host " - elif gpu_vendor.find("NVIDIA") != -1: - docker_options = "--cap-add=SYS_PTRACE --cap-add SYS_ADMIN --cap-add SYS_NICE --device /dev/fuse --security-opt seccomp=unconfined --security-opt apparmor=unconfined --network host -u root --ipc=host " - else: - raise RuntimeError("Unable to determine gpu vendor.") - - # initialize pre, encapsulate and post scripts - pre_encapsulate_post_scripts = {"pre_scripts": [], "encapsulate_script": "", "post_scripts": []} - - if "pre_scripts" in self.context.ctx: - pre_encapsulate_post_scripts["pre_scripts"] = self.context.ctx["pre_scripts"] - - if "post_scripts" in self.context.ctx: - pre_encapsulate_post_scripts["post_scripts"] = self.context.ctx["post_scripts"] - - if "encapsulate_script" in self.context.ctx: - pre_encapsulate_post_scripts["encapsulate_script"] = self.context.ctx["encapsulate_script"] - - # get docker run options - docker_options += "--env MAD_MODEL_NAME='" + info["name"] + "' " - # Since we are doing Jenkins level environment collection in the docker container, pass in the jenkins build number. - docker_options += f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' " - - # gather data - # TODO: probably can use context.ctx instead of another dictionary like run_env here - run_env = {} - mount_datapaths = None - - if "data" in info and info["data"] != "": - mount_datapaths = self.data.get_mountpaths(info["data"]) - model_dataenv = self.data.get_env(info["data"]) - - if model_dataenv is not None: - run_env.update(model_dataenv) - - run_env["MAD_DATANAME"] = info["data"] - - if "cred" in info and info["cred"] != "": - if info["cred"] not in self.creds: - raise RuntimeError( - "Credentials(" - + info["cred"] - + ") to run model not found in credential.json; Please contact the model owner, " - + info["owner"] - + "." - ) - # add cred to run_env - for key_cred, value_cred in self.creds[info["cred"]].items(): - run_env[info["cred"] + "_" + key_cred.upper()] = value_cred - - self.apply_tools(pre_encapsulate_post_scripts, run_env) - - docker_options += self.get_gpu_arg(info["n_gpus"]) - docker_options += self.get_cpu_arg() - - # Must set env vars and mounts at the end - docker_options += self.get_env_arg(run_env) - docker_options += self.get_mount_arg(mount_datapaths) - docker_options += f" {run_details.additional_docker_run_options}" - - # if --shm-size is set, remove --ipc=host - if "SHM_SIZE" in self.context.ctx: - docker_options = docker_options.replace("--ipc=host", "") - - print(docker_options) - - # get machine name - run_details.machine_name = self.console.sh("hostname") - print(f"MACHINE NAME is {run_details.machine_name}") - - # set timeout - timeout = 7200 # default 2 hrs - if "timeout" in info: - timeout = info["timeout"] - - if self.args.timeout >= 0: - timeout = self.args.timeout - - print(f"Setting timeout to {str(timeout)} seconds.") - - with Timeout(timeout): - print(f"") - model_docker = Docker(run_details.docker_image, container_name, docker_options, keep_alive=self.args.keep_alive, console=self.console) - # check that user is root - whoami = model_docker.sh("whoami") - print( "USER is " + whoami ) - - # echo gpu smi info - if gpu_vendor.find("AMD") != -1: - smi = model_docker.sh("/opt/rocm/bin/amd-smi || true") - elif gpu_vendor.find("NVIDIA") != -1: - smi = model_docker.sh("/usr/bin/nvidia-smi || true") - else: - raise RuntimeError("Unable to determine gpu vendor.") - - # clean up previous model run - model_dir = "run_directory" - if "url" in info and info["url"] != "": - # model_dir is set to string after the last forwardslash in url field - # adding for url field with and without trailing forwardslash (/) - model_dir = info['url'].rstrip('/').split('/')[-1] - - # Validate model_dir to make sure there are no special characters - special_char = r'[^a-zA-Z0-9\-\_]' # allow hyphen and underscore - if re.search(special_char, model_dir) is not None: - warnings.warn("Model url contains special character. Fix url.") - - model_docker.sh("rm -rf " + model_dir, timeout=240) - - # set safe.directory for workspace - model_docker.sh("git config --global --add safe.directory /myworkspace") - - # clone model repo - if "url" in info and info["url"] != "": - if "cred" in info and info["cred"] != "": - print(f"Using cred for {info['cred']}") - - if info["cred"] not in self.creds: - raise RuntimeError("Credentials(" + info["cred"] + ") to run model not found in credential.json; Please contact the model owner, " + info["owner"] + ".") - - if info['url'].startswith('ssh://'): - model_docker.sh("git -c core.sshCommand='ssh -l " + self.creds[ info["cred"] ]["username"] + - " -i " + self.creds[ info["cred"] ]["ssh_key_file"] + " -o IdentitiesOnly=yes " + - " -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no' " + - " clone " + info['url'], timeout=240 ) - else: # http or https - model_docker.sh("git clone -c credential.helper='!f() { echo username=" + self.creds[ info["cred"] ]["username"] + \ - "; echo password=" + self.creds[ info["cred"] ]["password"] + "; };f' " + \ - info['url'], timeout=240, secret="git clone " + info['url'] ) - else: - model_docker.sh("git clone " + info["url"], timeout=240) - - # set safe.directory for model directory - model_docker.sh("git config --global --add safe.directory /myworkspace/" + model_dir ) - - # echo git commit - run_details.git_commit = model_docker.sh("cd "+ model_dir + " && git rev-parse HEAD") - print(f"MODEL GIT COMMIT is {run_details.git_commit}") - - # update submodule - model_docker.sh("cd "+ model_dir + "; git submodule update --init --recursive") - else: - model_docker.sh("mkdir -p " + model_dir) - - # add system environment collection script to pre_scripts - if self.args.generate_sys_env_details or self.context.ctx.get("gen_sys_env_details"): - self.gather_system_env_details(pre_encapsulate_post_scripts, info['name']) - # run pre_scripts - if pre_encapsulate_post_scripts["pre_scripts"]: - self.run_pre_post_script(model_docker, model_dir, pre_encapsulate_post_scripts["pre_scripts"]) - - scripts_arg = info['scripts'] - dir_path = None - script_name = None - if scripts_arg.endswith(".sh"): - dir_path = os.path.dirname(scripts_arg) - script_name = "bash " + os.path.basename(scripts_arg) - else: - dir_path = info['scripts'] - script_name = "bash run.sh" - - # add script_prepend_cmd - script_name = pre_encapsulate_post_scripts["encapsulate_script"] + " " + script_name - - # print repo hash - commit = model_docker.sh("cd "+ dir_path +"; git rev-parse HEAD || true ") - print("======================================================") - print("MODEL REPO COMMIT: ", commit ) - print("======================================================") - - # copy scripts to model directory - model_docker.sh("cp -vLR --preserve=all "+ dir_path +"/. "+ model_dir +"/") - - # prepare data inside container - if 'data' in info and info['data'] != "": - self.data.prepare_data( info['data'], model_docker ) - # Capture data provider information from selected_data_provider - if hasattr(self.data, 'selected_data_provider') and self.data.selected_data_provider: - if 'dataname' in self.data.selected_data_provider: - run_details.dataname = self.data.selected_data_provider['dataname'] - if 'data_provider_type' in self.data.selected_data_provider: - run_details.data_provider_type = self.data.selected_data_provider['data_provider_type'] - if 'duration' in self.data.selected_data_provider: - run_details.data_download_duration = self.data.selected_data_provider['duration'] - if 'size' in self.data.selected_data_provider: - run_details.data_size = self.data.selected_data_provider['size'] - print(f"Data Provider Details: {run_details.dataname}, {run_details.data_provider_type}, {run_details.data_size}, {run_details.data_download_duration}s") - - selected_data_provider = { - "node_name": run_details.machine_name, - "build_number": os.environ.get('BUILD_NUMBER','0'), - "model_name": info["name"] if "name" in info else "" - } - - # Set build number in run_details - run_details.build_number = os.environ.get('BUILD_NUMBER','0') - - print(f"Build Info::{selected_data_provider}") - - # keep model_dir as universally rw - model_docker.sh("chmod -R a+rw " + model_dir) - - # run model - test_start_time = time.time() - if not self.args.skip_model_run: - print("Running model...") - if "model_args" in self.context.ctx: - model_docker.sh( - "cd " - + model_dir - + " && " - + script_name - + " " - + self.context.ctx["model_args"], - timeout=None, - ) - else: - model_docker.sh( - "cd " + model_dir + " && " + script_name + " " + info["args"], - timeout=None, - ) - else: - print("Skipping model run") - print( - "To run model: " - + "cd " - + model_dir - + " && " - + script_name - + " " - + info["args"] - ) - - run_details.test_duration = time.time() - test_start_time - print("Test Duration: {} seconds".format(run_details.test_duration)) - - # run post_scripts - if pre_encapsulate_post_scripts["post_scripts"]: - self.run_pre_post_script(model_docker, model_dir, pre_encapsulate_post_scripts["post_scripts"]) - - # remove model directory - if not self.args.keep_alive and not self.args.keep_model_dir: - model_docker.sh("rm -rf " + model_dir, timeout=240) - else: - model_docker.sh("chmod -R a+rw " + model_dir) - print("keep_alive is specified; model_dir(" + model_dir + ") is not removed") - - # explicitly delete model docker to stop the container, without waiting for the in-built garbage collector - del model_docker - - def run_model(self, model_info: typing.Dict) -> bool: - """Run model on container. - - Args: - model_info: The model information. - - Returns: - bool: The status of running model on container. - - Raises: - Exception: An error occurred while running model on container. - """ - print(f"Running model {model_info['name']} with {model_info}") - - # set default values if model run fails - run_details = RunDetails() - - run_details.model = model_info["name"] - run_details.n_gpus = model_info["n_gpus"] - run_details.training_precision = model_info["training_precision"] - run_details.args = model_info["args"] - run_details.tags = model_info["tags"] - run_details.additional_docker_run_options = model_info.get("additional_docker_run_options", "") - # gets pipeline variable from jenkinsfile, default value is none - run_details.pipeline = os.environ.get("pipeline") - # Taking gpu arch from context assumes the host image and container have the same gpu arch. - # Environment variable updates for MAD Public CI - run_details.gpu_architecture = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] - - # Check the setting of shared memory size - if "SHM_SIZE" in self.context.ctx: - shm_size = self.context.ctx["SHM_SIZE"] - if shm_size: - run_details.additional_docker_run_options += f" --shm-size={shm_size}" - print(f"Using SHM_SIZE from context: {shm_size}") - - # Check if model is deprecated - if model_info.get("is_deprecated", False): - print(f"WARNING: Model {model_info['name']} has been deprecated.") - if self.args.ignore_deprecated_flag: - print(f"WARNING: Running deprecated model {model_info['name']} due to --ignore-deprecated-flag.") - else: - print(f"WARNING: Skipping execution. No bypass flags mentioned.") - return True # exit early - - # check if model is supported on current gpu architecture, if not skip. - list_skip_gpu_arch = [] - if ( - "skip_gpu_arch" in model_info - and model_info["skip_gpu_arch"] - and not self.args.disable_skip_gpu_arch - ): - list_skip_gpu_arch = model_info["skip_gpu_arch"].replace(" ", "").split(",") - - sys_gpu_arch = run_details.gpu_architecture - if sys_gpu_arch and "NVIDIA" in sys_gpu_arch: - sys_gpu_arch = sys_gpu_arch.split()[1] - - if list_skip_gpu_arch and sys_gpu_arch and sys_gpu_arch in list_skip_gpu_arch: - print( - f"Skipping model {run_details.model} as it is not supported on {run_details.gpu_architecture} architecture." - ) - # add result to output - self.return_status = True - run_details.status = "SKIPPED" - # generate exception for testing - run_details.generate_json("perf_entry.json") - update_perf_csv(exception_result="perf_entry.json", perf_csv=self.args.output) - else: - print( - f"Running model {run_details.model} on {run_details.gpu_architecture} architecture." - ) - - try: - # clean up docker - self.clean_up_docker_container() - - # find dockerfiles, read their context and filter based on current context - all_dockerfiles = self.console.sh( - "ls " + model_info["dockerfile"] + ".*" - ).split("\n") - - dockerfiles = {} - for cur_docker_file in all_dockerfiles: - # get context of dockerfile - dockerfiles[cur_docker_file] = self.console.sh( - "head -n5 " - + cur_docker_file - + " | grep '# CONTEXT ' | sed 's/# CONTEXT //g'" - ) - - # filter dockerfiles based on context - dockerfiles = self.context.filter(dockerfiles) - print(f"FILTERED dockerfiles are {dockerfiles}") - - # check if dockerfiles are found, if not raise exception. - if not dockerfiles: - raise Exception("No dockerfiles matching context found for model " + run_details.model) - - # run dockerfiles - for cur_docker_file in dockerfiles.keys(): - # reset build-specific run details for each dockerfile - run_details.docker_file = "" - run_details.base_docker = "" - run_details.docker_sha = "" - run_details.docker_image = "" - run_details.performance = "" - run_details.metric = "" - run_details.status = "FAILURE" - run_details.build_duration = "" - run_details.test_duration = "" - - try: - # generate exception for testing - if model_info['args'] == "--exception": - raise Exception("Exception test!") - - print(f"Processing Dockerfile: {cur_docker_file}") - # get base docker image - cur_docker_file_basename = os.path.basename(cur_docker_file) - # set log file path - log_file_path = ( - run_details.model - + "_" - + cur_docker_file_basename.replace(".Dockerfile", "") - + ".live.log" - ) - # Replace / with _ in log file path for models from discovery which use '/' as a separator - log_file_path = log_file_path.replace("/", "_") - - with open(log_file_path, mode="w", buffering=1) as outlog: - with redirect_stdout(PythonicTee(outlog, self.args.live_output)), redirect_stderr(PythonicTee(outlog, self.args.live_output)): - self.run_model_impl(model_info, cur_docker_file, run_details) - - if self.args.skip_model_run: - # move to next dockerfile - continue - - # Check if we are looking for a single result or multiple. - multiple_results = (None if "multiple_results" not in model_info else model_info["multiple_results"]) - - # get performance metric from log - if multiple_results: - run_details.performance = multiple_results - - else: - perf_regex = ".*performance:\\s*\\([+|-]\?[0-9]*[.]\\?[0-9]*\(e[+|-]\?[0-9]\+\)\?\\)\\s*.*\\s*" - run_details.performance = self.console.sh("cat " + log_file_path + - " | sed -n 's/" + perf_regex + "/\\1/p'") - - metric_regex = ".*performance:\\s*[+|-]\?[0-9]*[.]\\?[0-9]*\(e[+|-]\?[0-9]\+\)\?\\s*\\(\\w*\\)\\s*" - run_details.metric = self.console.sh("cat " + log_file_path + - " | sed -n 's/" + metric_regex + "/\\2/p'") - - # check if model passed or failed - run_details.status = 'SUCCESS' if run_details.performance else 'FAILURE' - - # print stage perf results - run_details.print_perf() - - # add result to output - if multiple_results: - run_details.generate_json("common_info.json", multiple_results=True) - update_perf_csv( - multiple_results=model_info['multiple_results'], - perf_csv=self.args.output, - model_name=run_details.model, - common_info="common_info.json", - ) - else: - run_details.generate_json("perf_entry.json") - update_perf_csv( - single_result="perf_entry.json", - perf_csv=self.args.output, - ) - - self.return_status &= (run_details.status == 'SUCCESS') - - except Exception as e: - self.return_status = False - - print( "===== EXCEPTION =====") - print( "Exception: ", e ) - traceback.print_exc() - print( "=============== =====") - run_details.status = "FAILURE" - run_details.generate_json("perf_entry.json") - update_perf_csv( - exception_result="perf_entry.json", - perf_csv=self.args.output, - ) - - except Exception as e: - self.return_status = False - - print( "===== EXCEPTION =====") - print( "Exception: ", e ) - traceback.print_exc() - print( "=============== =====") - run_details.status = "FAILURE" - run_details.generate_json("perf_entry.json") - update_perf_csv( - exception_result="perf_entry.json", - perf_csv=self.args.output, - ) - - return self.return_status - - def run(self) -> bool: - """Main flow of running model. - - Returns: - bool: The status of running models on container. - - Raises: - Exception: An error occurred while running models on container. - """ - print(f"Running models with args {self.args}") - - self.console.sh("echo 'MAD Run Models'") - # show node rocm info - host_os = self.context.ctx["host_os"] - - if host_os.find("HOST_UBUNTU") != -1: - print(self.console.sh("apt show rocm-libs -a", canFail=True)) - elif host_os.find("HOST_CENTOS") != -1: - print(self.console.sh("yum info rocm-libs")) - elif host_os.find("HOST_SLES") != -1: - print(self.console.sh("zypper info rocm-libs")) - elif host_os.find("HOST_AZURE") != -1: - print(self.console.sh("tdnf info rocm-libs")) - else: - print("ERROR: Unable to detect host OS.") - self.return_status = False - return self.return_status - - # get credentials - try: - # MADEngine update - credential_file = "credential.json" - # read credentials - with open(credential_file) as f: - self.creds = json.load(f) - - print(f"Credentials: {self.creds}") - - except Exception as e: - print(f"Exception encountered reading credential.json. {e}, ignoring ...") - - # copy scripts to model directory - self.copy_scripts() - - discover_models = DiscoverModels(args=self.args) - models = discover_models.run() - - # create performance csv - if not os.path.exists(self.args.output): - file_print( - "model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number, additional_docker_run_options", - filename=self.args.output, - mode="w", - ) - - for model_info in models: - # Run model - self.return_status &= self.run_model(model_info) - - # cleanup the model directory - self.cleanup() - # convert output csv to html - print("Converting output csv to html...") - convert_csv_to_html(file_path=self.args.output) - - if self.return_status: - print("All models ran successfully.") - else: - print( "===== EXCEPTION =====") - print("Some models failed to run.") - - return self.return_status diff --git a/src/madengine/tools/update_table_db.py b/src/madengine/tools/update_table_db.py deleted file mode 100644 index a71bde87..00000000 --- a/src/madengine/tools/update_table_db.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python -"""Module to update tables in the database. - -This module provides the functions to update tables in the database. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import argparse -import subprocess -import typing -# third-party modules -import paramiko -import socket -# MAD Engine modules -from madengine.utils.ssh_to_db import SFTPClient, print_ssh_out -from madengine.db.logger import setup_logger -from madengine.db.utils import get_env_vars - -# Create the logger -LOGGER = setup_logger() -# Get the environment variables -ENV_VARS = get_env_vars() - - -class UpdateTable: - """Class to update tables in the database. - - This class provides the functions to update tables in the database. - """ - def __init__(self, args: argparse.Namespace): - """Initialize the UpdateTable class. - - Args: - args (argparse.Namespace): The arguments passed to the script. - """ - self.args = args - self.db_name = ENV_VARS["db_name"] - self.db_hostname = ENV_VARS["db_hostname"] - self.db_port = ENV_VARS["db_port"] - self.user_name = ENV_VARS["user_name"] - self.user_password = ENV_VARS["user_password"] - self.ssh_user = ENV_VARS["ssh_user"] - self.ssh_password = ENV_VARS["ssh_password"] - self.ssh_hostname = ENV_VARS["ssh_hostname"] - self.ssh_port = ENV_VARS["ssh_port"] - - # get the db folder - self.db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../db") - LOGGER.info(f"DB path: {self.db_path}") - self.status = False - - def run(self, table_name: str='dlm_table') -> None: - """Update a table in the database. - - Args: - table_name (str): The name of the table to update. - - Returns: - None - - Raises: - Exception: An error occurred updating the table. - """ - print(f"Updating table {table_name} in the database") - - if 'localhost' in self.ssh_hostname or '127.0.0.1' in self.ssh_hostname: - try: - self.local_db() - self.status = True - return self.status - except Exception as error: - LOGGER.error(f"Error updating table in the local database: {error}") - return self.status - else: - try: - self.remote_db() - self.status = True - return self.status - except Exception as error: - LOGGER.error(f"Error updating table in the remote database: {error}") - return self.status - - def local_db(self) -> None: - """Update a table in the local database. - - This function updates a table in the local database. - - Returns: - None - - Raises: - Exception: An error occurred updating the table. - """ - print("Updating table in the local database") - - # copy the db folder from the db_path to the current working directory - cmd_list = ["cp", "-r", self.db_path, "."] - - try: - ret = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = ret.communicate() - if ret.returncode == 0: - if out: - LOGGER.info(out.decode('utf-8')) - print("Copied scripts to current work path") - else: - if err: - LOGGER.error(err.decode('utf-8')) - except Exception as e: - LOGGER.error(f"An error occurred: {e}") - - # run upload_csv_to_db.py in the db folder with environment variables using subprocess Popen - cmd_list = ["python3", "./db/upload_csv_to_db.py", "--csv-file-path", self.args.csv_file_path] - # Ensure ENV_VARS is a dictionary - env_vars = dict(ENV_VARS) - - try: - ret = subprocess.Popen(cmd_list, env=env_vars, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = ret.communicate() - - if ret.returncode == 0: - if out: - LOGGER.info(out.decode('utf-8')) - else: - if err: - LOGGER.error(err.decode('utf-8')) - raise Exception(f"Error updating table in the local database: {err.decode('utf-8')}") - except Exception as e: - LOGGER.error(f"An error occurred: {e}") - - print("Script execution completed") - - def remote_db(self) -> None: - """Update a table in the remote database. - - This function updates a table in the remote database. - - Returns: - None - - Raises: - socket.error: An error occurred connecting to the database. - """ - print("Updating table in the remote database") - - # create an ssh client - ssh_client = paramiko.SSHClient() - ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - ssh_client.load_system_host_keys() - - # connect to the host of database - try: - ssh_client.connect( - hostname=self.ssh_hostname, - port=self.ssh_port, - username=self.ssh_user, - password=self.ssh_password, - timeout=10, - ) - except paramiko.ssh_exception.AuthenticationException as error: - print(f"Authentication failed: {error}") - return - except paramiko.ssh_exception.SSHException as error: - print(f"SSH error: {error}") - return - except socket.error as error: - print(f"Socket error: {error}") - return - - print("SSH client created, connected to the host of database") - - # print remote dir layout - print_ssh_out(ssh_client.exec_command("pwd")) - print_ssh_out(ssh_client.exec_command("ls -l")) - - # get remote path for files - upload_script_path_remote = os.path.basename(self.db_path) - csv_file_path_remote = os.path.basename(self.args.csv_file_path) - model_json_path_remote = os.path.basename(self.args.model_json_path) - print(upload_script_path_remote, csv_file_path_remote, model_json_path_remote) - - # clean up previous uploads - print_ssh_out(ssh_client.exec_command("rm -rf {}".format(upload_script_path_remote))) - print_ssh_out(ssh_client.exec_command("rm -rf {}".format(csv_file_path_remote))) - - # upload file - sftp_client = SFTPClient.from_transport(ssh_client.get_transport()) - sftp_client.mkdir(upload_script_path_remote, ignore_existing=True) - sftp_client.put_dir(self.db_path, upload_script_path_remote) - # check if the file exists - if not os.path.exists(self.args.csv_file_path): - print(f"File {self.args.csv_file_path} does not exist") - return - sftp_client.put(self.args.csv_file_path, csv_file_path_remote) - # check if the file exists - if os.path.exists(self.args.model_json_path): - sftp_client.put(self.args.model_json_path, model_json_path_remote) - - # close the sftp client - sftp_client.close() - - # run script on remote node - main_script = os.path.join(upload_script_path_remote, "upload_csv_to_db.py") - print_ssh_out( - ssh_client.exec_command( - "TUNA_DB_USER_NAME={} TUNA_DB_USER_PASSWORD={} TUNA_DB_NAME={} TUNA_DB_HOSTNAME={} python3 {} --csv-file-path {}".format( - self.user_name, - self.user_password, - self.db_name, - self.db_hostname, - main_script, - csv_file_path_remote, - ) - ) - ) - - # close the ssh client - ssh_client.close() diff --git a/src/madengine/tools/upload_mongodb.py b/src/madengine/tools/upload_mongodb.py deleted file mode 100644 index 6766e3e2..00000000 --- a/src/madengine/tools/upload_mongodb.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python -"""Module to update MongoDB collections with data from a CSV file. - -This module provides functions to handle MongoDB operations, including -checking for collection existence, creating collections, and updating datasets. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import argparse - -# third-party modules -import pandas as pd -import pymongo -from pymongo.errors import ConnectionFailure -from typing import Optional - -# MAD Engine modules -from madengine.db.logger import setup_logger - -# Create the logger -LOGGER = setup_logger() - -class MongoDBHandler: - """Class to handle MongoDB operations.""" - - def __init__(self, args: argparse.Namespace) -> None: - """Initialize the MongoDBHandler. - - Args: - args (argparse.Namespace): The arguments passed to the script. - """ - # MongoDB connection details from environment variables - mongo_user = os.getenv("MONGO_USER", "username") - mongo_password = os.getenv("MONGO_PASSWORD", "password") - mongo_host = os.getenv("MONGO_HOST", "localhost") - mongo_port = os.getenv("MONGO_PORT", "27017") - mongo_uri = f"mongodb://{mongo_user}:{mongo_password}@{mongo_host}:{mongo_port}" - self.uri = mongo_uri - self.database_name = args.database_name - self.collection_name = args.collection_name - self.csv_file_path = args.csv_file_path - self.client = None - self.db = None - - def connect(self) -> None: - """Connect to the MongoDB server.""" - try: - self.client = pymongo.MongoClient(self.uri) - self.db = self.client[self.database_name] - LOGGER.info("Connected to MongoDB.") - except ConnectionFailure as e: - LOGGER.error(f"Failed to connect to MongoDB: {e}") - raise - - def collection_exists(self) -> bool: - """Check if a collection exists in the database. - - Returns: - bool: True if the collection exists, False otherwise. - """ - return self.collection_name in self.db.list_collection_names() - - def update_collection(self, data: pd.DataFrame) -> None: - """Update a MongoDB collection with data from a DataFrame. - - Args: - data (pd.DataFrame): DataFrame containing the data to update. - """ - if not self.collection_exists(): - LOGGER.info(f"Collection '{self.collection_name}' does not exist. Creating it.") - self.db.create_collection(self.collection_name) - - collection = self.db[self.collection_name] - records = data.to_dict(orient="records") - for record in records: - # Use an appropriate unique identifier for upsert (e.g., "_id" or another field) - collection.update_one(record, {"$set": record}, upsert=True) - LOGGER.info(f"Updated collection '{self.collection_name}' with {len(records)} records.") - - def run(self) -> None: - """Run the process of updating a MongoDB collection with data from a CSV file. - """ - self.connect() - data = load_csv_to_dataframe(self.csv_file_path) - - # if the value is NaN, replace it with empty string - data = data.where(pd.notnull(data), "") - # Convert all columns to string type except boolean columns - for col in data.columns: - if data[col].dtype != "bool": - data[col] = data[col].astype(str) - - # Added created_date column and set it to now - data["created_date"] = pd.to_datetime("now").strftime("%Y-%m-%d %H:%M:%S") - - # Remove any leading or trailing whitespace from column names - data.columns = data.columns.str.strip() - - self.update_collection(data) - - -def load_csv_to_dataframe(csv_path: str) -> pd.DataFrame: - """Load a CSV file into a pandas DataFrame. - - Args: - csv_path (str): Path to the CSV file. - - Returns: - pd.DataFrame: DataFrame containing the CSV data. - """ - if not os.path.exists(csv_path): - raise FileNotFoundError(f"CSV file '{csv_path}' not found.") - return pd.read_csv(csv_path) diff --git a/src/madengine/utils/__init__.py b/src/madengine/utils/__init__.py index e69de29b..8281537a 100644 --- a/src/madengine/utils/__init__.py +++ b/src/madengine/utils/__init__.py @@ -0,0 +1,11 @@ +""" +madengine Utilities + +Utility modules for madengine including GPU configuration resolution and config parsing. +""" + +from .gpu_config import GPUConfigResolver, resolve_runtime_gpus +from .config_parser import ConfigParser, get_config_parser + +__all__ = ["GPUConfigResolver", "resolve_runtime_gpus", "ConfigParser", "get_config_parser"] + diff --git a/src/madengine/utils/config_parser.py b/src/madengine/utils/config_parser.py new file mode 100644 index 00000000..7d3e31e7 --- /dev/null +++ b/src/madengine/utils/config_parser.py @@ -0,0 +1,237 @@ +"""Config Parser Module for MAD Engine. + +This module provides utilities to parse configuration files from model arguments +and load them in various formats (CSV, JSON, YAML). + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import os +import re +import json +import logging +import typing +from pathlib import Path + +import pandas as pd + +try: + import yaml + YAML_AVAILABLE = True +except ImportError: + YAML_AVAILABLE = False + +LOGGER = logging.getLogger(__name__) + + +class ConfigParser: + """Parser for model configuration files. + + This class handles parsing configuration files in various formats + (CSV, JSON, YAML) that are referenced in model arguments. + """ + + def __init__(self, scripts_base_dir: typing.Optional[str] = None): + """Initialize ConfigParser. + + Args: + scripts_base_dir: Base directory for scripts (e.g., ~/amd/MAD-private/scripts) + """ + self.scripts_base_dir = scripts_base_dir + + def parse_config_from_args(self, args_string: str, model_scripts_path: str = None) -> typing.Optional[str]: + """Extract config file path from model arguments. + + Args: + args_string: The args field from models.json + model_scripts_path: Path to the model's script directory + + Returns: + Full path to config file, or None if no config found + """ + if not args_string: + return None + + # Look for --config argument + config_match = re.search(r'--config\s+([^\s]+)', args_string) + if not config_match: + return None + + config_path = config_match.group(1) + + # If it's already an absolute path, return it + if os.path.isabs(config_path): + return config_path if os.path.exists(config_path) else None + + # Try to resolve relative path + # First, try relative to model scripts directory + if model_scripts_path: + scripts_dir = os.path.dirname(model_scripts_path) + full_path = os.path.join(scripts_dir, config_path) + if os.path.exists(full_path): + return full_path + + # Try relative to scripts_base_dir + if self.scripts_base_dir: + full_path = os.path.join(self.scripts_base_dir, config_path) + if os.path.exists(full_path): + return full_path + + LOGGER.warning(f"Config file not found: {config_path}") + return None + + def load_config_file(self, config_path: str) -> typing.Optional[typing.Union[typing.List[dict], dict]]: + """Load and parse a configuration file. + + Args: + config_path: Full path to the config file + + Returns: + For CSV: List of dicts (one per row) + For JSON/YAML: Dict or list as-is from file + None if file cannot be loaded + """ + if not config_path or not os.path.exists(config_path): + return None + + file_ext = Path(config_path).suffix.lower() + + try: + if file_ext == '.csv': + return self._load_csv(config_path) + elif file_ext == '.json': + return self._load_json(config_path) + elif file_ext in ['.yaml', '.yml']: + return self._load_yaml(config_path) + else: + LOGGER.warning(f"Unsupported config file format: {file_ext}") + return None + except Exception as e: + LOGGER.error(f"Error loading config file {config_path}: {e}") + return None + + def _load_csv(self, config_path: str) -> typing.List[dict]: + """Load CSV config file. + + Args: + config_path: Path to CSV file + + Returns: + List of dicts, one per row + """ + df = pd.read_csv(config_path) + # Convert NaN to None for JSON serialization + df = df.where(pd.notnull(df), None) + # Convert to list of dicts + return df.to_dict(orient='records') + + def _load_json(self, config_path: str) -> typing.Union[dict, list]: + """Load JSON config file. + + Args: + config_path: Path to JSON file + + Returns: + Dict or list from JSON file + """ + with open(config_path, 'r') as f: + return json.load(f) + + def _load_yaml(self, config_path: str) -> typing.Union[dict, list]: + """Load YAML config file. + + Args: + config_path: Path to YAML file + + Returns: + Dict or list from YAML file + """ + if not YAML_AVAILABLE: + raise ImportError("PyYAML is not installed. Cannot load YAML config files.") + + with open(config_path, 'r') as f: + return yaml.safe_load(f) + + def match_config_to_result( + self, + configs_list: typing.List[dict], + result_data: dict, + model_name: str + ) -> typing.Optional[dict]: + """Match a specific result to its corresponding config. + + For CSV configs with multiple rows (like vllm), match based on + model name and other identifiable fields. + + Args: + configs_list: List of config dicts (from CSV rows) + result_data: Single result row data + model_name: The model name from result + + Returns: + Matching config dict, or None if no match found + """ + if not configs_list: + return None + + # For single config, return it + if len(configs_list) == 1: + return configs_list[0] + + # For multiple configs, try to match based on common fields + # Extract model identifier from result model name + # e.g., "pyt_vllm_llama-3.1-8b_perf_meta-llama_Llama-3.1-8B-Instruct" + # should match config with model="meta-llama/Llama-3.1-8B-Instruct" + + for config in configs_list: + # Try to match on 'model' field if it exists in both + if 'model' in config and 'model' in result_data: + # Compare normalized versions + config_model = str(config['model']).replace('/', '_').replace('-', '_').lower() + result_model = str(result_data['model']).replace('/', '_').replace('-', '_').lower() + if config_model in result_model or result_model in config_model: + # Additional checks for benchmark type if available + if 'benchmark' in config and 'benchmark' in result_data: + if config['benchmark'] == result_data['benchmark']: + return config + else: + return config + + # If no match found, return first config as fallback + LOGGER.warning(f"Could not match config for result: {model_name}. Using first config.") + return configs_list[0] + + def parse_and_load( + self, + args_string: str, + model_scripts_path: str = None + ) -> typing.Optional[typing.Union[typing.List[dict], dict]]: + """Parse config path from args and load the config file. + + Convenience method that combines parse_config_from_args and load_config_file. + + Args: + args_string: The args field from models.json + model_scripts_path: Path to the model's script directory + + Returns: + Config data (list of dicts for CSV, dict for JSON/YAML), or None + """ + config_path = self.parse_config_from_args(args_string, model_scripts_path) + if not config_path: + return None + + return self.load_config_file(config_path) + + +def get_config_parser(scripts_base_dir: typing.Optional[str] = None) -> ConfigParser: + """Factory function to create a ConfigParser instance. + + Args: + scripts_base_dir: Base directory for scripts + + Returns: + ConfigParser instance + """ + return ConfigParser(scripts_base_dir=scripts_base_dir) + diff --git a/src/madengine/tools/discover_models.py b/src/madengine/utils/discover_models.py similarity index 51% rename from src/madengine/tools/discover_models.py rename to src/madengine/utils/discover_models.py index d6776740..8acb960d 100644 --- a/src/madengine/tools/discover_models.py +++ b/src/madengine/utils/discover_models.py @@ -2,6 +2,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ + # built-in modules import argparse import os @@ -9,6 +10,8 @@ import importlib.util import typing from dataclasses import dataclass, field, asdict +from rich.console import Console as RichConsole + @dataclass class CustomModel: @@ -46,11 +49,12 @@ class DiscoverModels: def __init__(self, args: argparse.Namespace): """Initialize the DiscoverModels class. - + Args: args (argparse.Namespace): Arguments passed to the script. """ self.args = args + self.rich_console = RichConsole() # list of models from models.json and scripts/model_dir/models.json self.models: typing.List[dict] = [] # list of custom models from scripts/model_dir/get_models_json.py @@ -60,9 +64,96 @@ def __init__(self, args: argparse.Namespace): # list of selected models parsed using --tags argument self.selected_models: typing.List[dict] = [] + # Setup MODEL_DIR if environment variable is set + self._setup_model_dir_if_needed() + + def _setup_model_dir_if_needed(self) -> None: + """Setup model directory if MODEL_DIR environment variable is set. + + This copies docker/, scripts/, and config files (models.json, credential.json, data.json) + from MODEL_DIR to the current working directory to support the model discovery process. + This operation is safe for build-only (CPU) nodes as it only involves file operations. + + MODEL_DIR defaults to "." (current directory) if not set. + Only copies if MODEL_DIR points to a different directory than current working directory. + """ + model_dir_env = os.environ.get("MODEL_DIR", ".") + + # Get absolute paths to compare + model_dir_abs = os.path.abspath(model_dir_env) + cwd_abs = os.path.abspath(".") + + # Only copy if MODEL_DIR points to a different directory (not current dir) + if model_dir_abs != cwd_abs: + import subprocess + from pathlib import Path + + self.rich_console.print(f"[bold cyan]📁 MODEL_DIR environment variable detected:[/bold cyan] [yellow]{model_dir_env}[/yellow]") + print(f"Copying required files to current working directory: {cwd_abs}") + + try: + # Check if source directory exists + if not os.path.exists(model_dir_env): + self.rich_console.print(f"[yellow]⚠️ Warning: MODEL_DIR path does not exist: {model_dir_env}[/yellow]") + return + + # Copy specific directories and files only (not everything with /*) + # This prevents copying unwanted subdirectories from MODEL_DIR + items_to_copy = [] + + # Directories to copy + for subdir in ["docker", "scripts"]: + src_path = Path(model_dir_env) / subdir + if src_path.exists(): + items_to_copy.append((src_path, subdir, "directory")) + + # Files to copy + for file in ["models.json", "credential.json", "data.json"]: + src_file = Path(model_dir_env) / file + if src_file.exists(): + items_to_copy.append((src_file, file, "file")) + + if not items_to_copy: + self.rich_console.print(f"[yellow]⚠️ No required files/directories found in MODEL_DIR[/yellow]") + return + + # Copy each item + copied_count = 0 + for src_path, item_name, item_type in items_to_copy: + try: + cmd = f"cp -vLR --preserve=all {src_path} {cwd_abs}/" + result = subprocess.run( + cmd, shell=True, capture_output=True, text=True, check=True + ) + copied_count += 1 + + if result.stdout: + # Show summary for directories, full output for files + if item_type == "directory": + lines = result.stdout.splitlines() + if len(lines) < 10: + print(result.stdout) + else: + print(f" ✓ Copied {item_name}/ ({len(lines)} files)") + else: + print(f" ✓ Copied {item_name}") + except subprocess.CalledProcessError as e: + self.rich_console.print(f"[yellow]⚠️ Warning: Failed to copy {item_name}: {e}[/yellow]") + if e.stderr: + print(f" Error details: {e.stderr}") + # Continue with other items even if one fails + + if copied_count > 0: + self.rich_console.print(f"[green]✅ Successfully copied {copied_count} item(s) from MODEL_DIR[/green]") + + print(f"Model dir: {model_dir_env} → current dir: {cwd_abs}") + except Exception as e: + self.rich_console.print(f"[yellow]⚠️ Warning: Unexpected error copying MODEL_DIR: {e}[/yellow]") + # Continue execution even if copy fails + def discover_models(self) -> None: """Discover models in models.json and models.json in model_dir under scripts directory. - + Raises: FileNotFoundError: models.json file not found. """ @@ -77,33 +168,45 @@ def discover_models(self) -> None: self.models = model_dict_list self.model_list = [model_dict["name"] for model_dict in model_dict_list] else: + self.rich_console.print("[red]❌ models.json file not found.[/red]") raise FileNotFoundError("models.json file not found.") - + # walk through the subdirs in model_dir/scripts directory to find the models.json file for dirname in os.listdir(os.path.join(model_dir, "scripts")): root = os.path.join(model_dir, "scripts", dirname) if os.path.isdir(root): files = os.listdir(root) - if 'models.json' in files and 'get_models_json.py' in files: - raise ValueError(f"Both models.json and get_models_json.py found in {root}.") + if "models.json" in files and "get_models_json.py" in files: + self.rich_console.print(f"[red]❌ Both models.json and get_models_json.py found in {root}.[/red]") + raise ValueError( + f"Both models.json and get_models_json.py found in {root}." + ) - if 'models.json' in files: + if "models.json" in files: with open(f"{root}/models.json") as f: model_dict_list: typing.List[dict] = json.load(f) for model_dict in model_dict_list: # Update model name using backslash-separated path - model_dict["name"] = dirname + '/' + model_dict["name"] + model_dict["name"] = dirname + "/" + model_dict["name"] # Update relative path for dockerfile and scripts - model_dict["dockerfile"] = os.path.normpath(os.path.join("scripts", dirname, model_dict["dockerfile"])) - model_dict["scripts"] = os.path.normpath(os.path.join("scripts", dirname, model_dict["scripts"])) + model_dict["dockerfile"] = os.path.normpath( + os.path.join( + "scripts", dirname, model_dict["dockerfile"] + ) + ) + model_dict["scripts"] = os.path.normpath( + os.path.join("scripts", dirname, model_dict["scripts"]) + ) self.models.append(model_dict) self.model_list.append(model_dict["name"]) - if 'get_models_json.py' in files: + if "get_models_json.py" in files: try: # load the module get_models_json.py - spec = importlib.util.spec_from_file_location("get_models_json", f"{root}/get_models_json.py") + spec = importlib.util.spec_from_file_location( + "get_models_json", f"{root}/get_models_json.py" + ) get_models_json = importlib.util.module_from_spec(spec) spec.loader.exec_module(get_models_json) assert hasattr( @@ -116,12 +219,14 @@ def discover_models(self) -> None: custom_model, CustomModel ), "Please use or subclass madengine.tools.discover_models.CustomModel to define your custom model." # Update model name using backslash-separated path - custom_model.name = dirname + '/' + custom_model.name + custom_model.name = dirname + "/" + custom_model.name # Defer updating script and dockerfile paths until update_model is called self.custom_models.append(custom_model) self.model_list.append(custom_model.name) except AssertionError: - print("See madengine/tests/fixtures/dummy/scripts/dummy3/get_models_json.py for an example.") + self.rich_console.print( + "[yellow]💡 See madengine/tests/fixtures/dummy/scripts/dummy3/get_models_json.py for an example.[/yellow]" + ) raise def select_models(self) -> None: @@ -136,11 +241,11 @@ def select_models(self) -> None: # models corresponding to the given tag tag_models = [] # split the tags by ':', strip the tags and remove empty tags. - tag_list = [tag_.strip() for tag_ in tag.split(':') if tag_.strip()] + tag_list = [tag_.strip() for tag_ in tag.split(":") if tag_.strip()] model_name = tag_list[0] - # if the length of tag_list is greater than 1, then the rest + # if the length of tag_list is greater than 1, then the rest # of the tags are extra args to be passed into the model script. if len(tag_list) > 1: extra_args = [tag_ for tag_ in tag_list[1:]] @@ -149,38 +254,54 @@ def select_models(self) -> None: extra_args = " --" + extra_args else: extra_args = "" - + for model in self.models: - if model["name"] == model_name or tag in model["tags"] or tag == "all": + if ( + model["name"] == model_name + or tag in model["tags"] + or tag == "all" + ): model_dict = model.copy() model_dict["args"] = model_dict["args"] + extra_args tag_models.append(model_dict) for custom_model in self.custom_models: - if custom_model.name == model_name or tag in custom_model.tags or tag == "all": + if ( + custom_model.name == model_name + or tag in custom_model.tags + or tag == "all" + ): custom_model.update_model() # Update relative path for dockerfile and scripts dirname = custom_model.name.split("/")[0] - custom_model.dockerfile = os.path.normpath(os.path.join("scripts", dirname, custom_model.dockerfile)) - custom_model.scripts = os.path.normpath(os.path.join("scripts", dirname, custom_model.scripts)) + custom_model.dockerfile = os.path.normpath( + os.path.join("scripts", dirname, custom_model.dockerfile) + ) + custom_model.scripts = os.path.normpath( + os.path.join("scripts", dirname, custom_model.scripts) + ) model_dict = custom_model.to_dict() model_dict["args"] = model_dict["args"] + extra_args tag_models.append(model_dict) if not tag_models: - raise ValueError(f"No models found corresponding to the given tag: {tag}") - + self.rich_console.print(f"[red]❌ No models found corresponding to the given tag: {tag}[/red]") + raise ValueError( + f"No models found corresponding to the given tag: {tag}" + ) + self.selected_models.extend(tag_models) def print_models(self) -> None: if self.selected_models: # print selected models using parsed tags and adding backslash-separated extra args + self.rich_console.print(f"[bold green]📋 Selected Models ({len(self.selected_models)} models):[/bold green]") print(json.dumps(self.selected_models, indent=4)) else: # print list of all model names - print(f"Number of models in total: {len(self.model_list)}") + self.rich_console.print(f"[bold cyan]📊 Available Models ({len(self.model_list)} total):[/bold cyan]") for model_name in self.model_list: - print(f"{model_name}") + print(f" {model_name}") def run(self, live_output: bool = True): @@ -188,7 +309,5 @@ def run(self, live_output: bool = True): self.select_models() if live_output: self.print_models() - - return self.selected_models - + return self.selected_models diff --git a/src/madengine/utils/gpu_config.py b/src/madengine/utils/gpu_config.py new file mode 100644 index 00000000..ff6aabc8 --- /dev/null +++ b/src/madengine/utils/gpu_config.py @@ -0,0 +1,308 @@ +""" +GPU Configuration Resolution Utility + +Provides hierarchical GPU count resolution with clear precedence rules +to handle inconsistencies between model definitions, deployment configs, +and runtime overrides. + +Priority (highest to lowest): +1. Runtime config (--additional-context at run time) +2. Deployment config (k8s.gpu_count / slurm.gpus_per_node) +3. Model definition (n_gpus in models.json) +4. System default (1) + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import warnings +from typing import Dict, Any, Optional, Tuple + + +class GPUConfigResolver: + """ + Resolves GPU count from multiple configuration sources with clear precedence. + + Handles various field names (n_gpus, gpu_count, gpus_per_node) and provides + validation to catch configuration mismatches early. + """ + + # All recognized field names for GPU count + GPU_FIELD_ALIASES = [ + "gpus_per_node", # SLURM, preferred standard + "gpu_count", # Kubernetes + "n_gpus", # Legacy model.json + "num_gpus", # Alternative + "ngpus", # Alternative + ] + + @classmethod + def resolve_gpu_count( + cls, + model_info: Optional[Dict[str, Any]] = None, + deployment_config: Optional[Dict[str, Any]] = None, + runtime_override: Optional[Dict[str, Any]] = None, + validate: bool = True, + ) -> Tuple[int, str]: + """ + Resolve GPU count from multiple sources with clear precedence. + + Args: + model_info: Model configuration from models.json + deployment_config: Deployment configuration (slurm/k8s section) + runtime_override: Runtime override from --additional-context + validate: Whether to validate and warn about mismatches + + Returns: + Tuple of (gpu_count, source) where source indicates which config was used + + Examples: + >>> # Priority 1: Runtime override + >>> count, source = GPUConfigResolver.resolve_gpu_count( + ... model_info={"n_gpus": "1"}, + ... deployment_config={"slurm": {"gpus_per_node": 8}}, + ... runtime_override={"gpus_per_node": 4} + ... ) + >>> count, source + (4, 'runtime_override') + + >>> # Priority 2: Deployment config + >>> count, source = GPUConfigResolver.resolve_gpu_count( + ... model_info={"n_gpus": "1"}, + ... deployment_config={"slurm": {"gpus_per_node": 8}} + ... ) + >>> count, source + (8, 'deployment_config.slurm.gpus_per_node') + + >>> # Priority 3: Model definition + >>> count, source = GPUConfigResolver.resolve_gpu_count( + ... model_info={"n_gpus": "2"} + ... ) + >>> count, source + (2, 'model_info.n_gpus') + """ + sources = [] # Track all sources for validation + + # Priority 1: Runtime override + if runtime_override: + gpu_count = cls._extract_gpu_count(runtime_override, "runtime_override") + if gpu_count is not None: + sources.append(("runtime_override", gpu_count)) + if validate: + cls._validate_consistency(sources, model_info, deployment_config) + return gpu_count, "runtime_override" + + # Priority 2: Deployment-specific config + if deployment_config: + # Check for SLURM config + if "slurm" in deployment_config: + gpu_count = cls._extract_gpu_count( + deployment_config["slurm"], + "deployment_config.slurm" + ) + if gpu_count is not None: + sources.append(("deployment_config.slurm.gpus_per_node", gpu_count)) + if validate: + cls._validate_consistency(sources, model_info, deployment_config) + return gpu_count, "deployment_config.slurm.gpus_per_node" + + # Check for K8s config + if "k8s" in deployment_config or "kubernetes" in deployment_config: + k8s_config = deployment_config.get("k8s") or deployment_config.get("kubernetes") + gpu_count = cls._extract_gpu_count(k8s_config, "deployment_config.k8s") + if gpu_count is not None: + sources.append(("deployment_config.k8s.gpu_count", gpu_count)) + if validate: + cls._validate_consistency(sources, model_info, deployment_config) + return gpu_count, "deployment_config.k8s.gpu_count" + + # Priority 3: Model definition + if model_info: + gpu_count = cls._extract_gpu_count(model_info, "model_info") + if gpu_count is not None: + sources.append(("model_info.n_gpus", gpu_count)) + if validate: + cls._validate_consistency(sources, model_info, deployment_config) + return gpu_count, "model_info.n_gpus" + + # Priority 4: Default + return 1, "default" + + @classmethod + def _extract_gpu_count( + cls, + config: Dict[str, Any], + context: str + ) -> Optional[int]: + """ + Extract GPU count from config dict, trying all known field names. + + Args: + config: Configuration dictionary + context: Context string for warning messages + + Returns: + GPU count as integer, or None if not found + """ + if not config: + return None + + found_fields = [] + for field_name in cls.GPU_FIELD_ALIASES: + if field_name in config: + found_fields.append((field_name, config[field_name])) + + if not found_fields: + return None + + # Warn if multiple GPU fields found + if len(found_fields) > 1: + field_list = ", ".join([f"{name}={val}" for name, val in found_fields]) + print( + f"⚠️ Multiple GPU fields in {context}: {field_list}. " + f"Using {found_fields[0][0]}={found_fields[0][1]}" + ) + + # Convert to int (handle string values like "8") + try: + return int(found_fields[0][1]) + except (ValueError, TypeError): + print( + f"⚠️ Invalid GPU count in {context}: {found_fields[0][1]}. Using default." + ) + return None + + @classmethod + def _validate_consistency( + cls, + sources: list, + model_info: Optional[Dict[str, Any]], + deployment_config: Optional[Dict[str, Any]], + ) -> None: + """ + Validate consistency between different GPU count sources. + + Warns if there are mismatches that might indicate configuration errors. + + Args: + sources: List of (source_name, gpu_count) tuples found so far + model_info: Model configuration for additional checks + deployment_config: Deployment configuration for additional checks + """ + if not sources: + return + + # Collect all GPU counts from all sources + all_counts = {} + + # Add already resolved source + for source_name, count in sources: + all_counts[source_name] = count + + # Check model_info + if model_info: + model_gpu = cls._extract_gpu_count(model_info, "model_info") + if model_gpu is not None: + all_counts["model_info.n_gpus"] = model_gpu + + # Check deployment config + if deployment_config: + if "slurm" in deployment_config: + slurm_gpu = cls._extract_gpu_count( + deployment_config["slurm"], "slurm" + ) + if slurm_gpu is not None: + all_counts["deployment_config.slurm.gpus_per_node"] = slurm_gpu + + if "k8s" in deployment_config or "kubernetes" in deployment_config: + k8s_config = deployment_config.get("k8s") or deployment_config.get("kubernetes") + k8s_gpu = cls._extract_gpu_count(k8s_config, "k8s") + if k8s_gpu is not None: + all_counts["deployment_config.k8s.gpu_count"] = k8s_gpu + + # Check for mismatches + unique_counts = set(all_counts.values()) + if len(unique_counts) > 1: + mismatch_details = ", ".join([f"{k}={v}" for k, v in all_counts.items()]) + # Determine if this is likely intentional (deployment override) or an error + is_deployment_override = ( + sources[0][0].startswith("runtime_override") or + sources[0][0].startswith("deployment_config") + ) + + if is_deployment_override: + # This is normal - deployment config overriding model default + # Use print instead of warnings.warn for cleaner output + print( + f"ℹ️ GPU configuration override: {sources[0][0]}={sources[0][1]} " + f"(overriding model default: {mismatch_details.split(',')[-1].strip()})" + ) + else: + # Potentially unexpected mismatch - use warning for actual errors + warnings.warn( + f"\n⚠️ GPU count mismatch detected: {mismatch_details}\n" + f" Using: {sources[0][0]}={sources[0][1]}\n" + f" Precedence: runtime_override > deployment_config > model_info > default", + UserWarning, + stacklevel=4 + ) + + +def resolve_runtime_gpus( + model_info: Dict[str, Any], + additional_context: Dict[str, Any], +) -> int: + """ + Convenience function for resolving GPU count at runtime. + + This is the main entry point for runtime GPU resolution. + + Args: + model_info: Model configuration from manifest + additional_context: Additional context from CLI or config files + + Returns: + Resolved GPU count as integer + + Example: + >>> model_info = {"name": "my_model", "n_gpus": "1"} + >>> additional_context = {"slurm": {"gpus_per_node": 8}} + >>> gpu_count = resolve_runtime_gpus(model_info, additional_context) + >>> gpu_count + 8 + """ + # Extract deployment config from additional_context + deployment_config = additional_context.get("deployment_config", {}) + + # Also check for direct slurm/k8s keys in additional_context + if "slurm" in additional_context: + if not deployment_config: + deployment_config = {} + deployment_config["slurm"] = additional_context["slurm"] + + if "k8s" in additional_context or "kubernetes" in additional_context: + if not deployment_config: + deployment_config = {} + deployment_config["k8s"] = additional_context.get("k8s") or additional_context.get("kubernetes") + + # Check for direct runtime GPU override (in additional_context or deployment_config) + runtime_override = None + for field in GPUConfigResolver.GPU_FIELD_ALIASES: + if field in additional_context: + runtime_override = {field: additional_context[field]} + break + # Also check in deployment_config top-level (for SLURM local manifest) + if deployment_config and field in deployment_config: + runtime_override = {field: deployment_config[field]} + break + + gpu_count, source = GPUConfigResolver.resolve_gpu_count( + model_info=model_info, + deployment_config=deployment_config, + runtime_override=runtime_override, + validate=True, + ) + + print(f"ℹ️ Resolved GPU count: {gpu_count} (from {source})") + + return gpu_count + diff --git a/src/madengine/utils/gpu_tool_factory.py b/src/madengine/utils/gpu_tool_factory.py new file mode 100644 index 00000000..4f8fa60c --- /dev/null +++ b/src/madengine/utils/gpu_tool_factory.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +""" +GPU Tool Manager Factory + +Provides factory pattern for creating vendor-specific GPU tool managers with +singleton management. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import logging +from typing import Dict, Optional + +from madengine.utils.gpu_tool_manager import BaseGPUToolManager +from madengine.utils.gpu_validator import GPUVendor, detect_gpu_vendor + +logger = logging.getLogger(__name__) + +# Singleton instances per vendor +_manager_instances: Dict[GPUVendor, BaseGPUToolManager] = {} + + +def get_gpu_tool_manager(vendor: Optional[GPUVendor] = None) -> BaseGPUToolManager: + """Get GPU tool manager for the specified vendor. + + This function implements the singleton pattern - only one manager instance + is created per vendor type and reused across all calls. + + Args: + vendor: GPU vendor (AMD, NVIDIA, etc.). If None, auto-detects. + + Returns: + GPU tool manager instance for the specified vendor + + Raises: + ValueError: If vendor is unknown or unsupported + ImportError: If vendor-specific manager module cannot be imported + + Example: + >>> from madengine.utils.gpu_tool_factory import get_gpu_tool_manager + >>> from madengine.utils.gpu_validator import GPUVendor + >>> + >>> # Auto-detect vendor + >>> manager = get_gpu_tool_manager() + >>> + >>> # Explicit vendor + >>> amd_manager = get_gpu_tool_manager(GPUVendor.AMD) + >>> nvidia_manager = get_gpu_tool_manager(GPUVendor.NVIDIA) + """ + # Auto-detect vendor if not specified + if vendor is None: + vendor = detect_gpu_vendor() + logger.debug(f"Auto-detected GPU vendor: {vendor.value}") + + # Check if we already have a singleton instance + if vendor in _manager_instances: + logger.debug(f"Returning cached {vendor.value} tool manager") + return _manager_instances[vendor] + + # Create new manager instance based on vendor + if vendor == GPUVendor.AMD: + try: + from madengine.utils.rocm_tool_manager import ROCmToolManager + manager = ROCmToolManager() + logger.info(f"Created new ROCm tool manager") + except ImportError as e: + raise ImportError(f"Failed to import ROCm tool manager: {e}") + + elif vendor == GPUVendor.NVIDIA: + try: + from madengine.utils.nvidia_tool_manager import NvidiaToolManager + manager = NvidiaToolManager() + logger.info(f"Created new NVIDIA tool manager") + except ImportError as e: + raise ImportError(f"Failed to import NVIDIA tool manager: {e}") + + elif vendor == GPUVendor.UNKNOWN: + raise ValueError( + "Unable to detect GPU vendor. Ensure GPU drivers and tools are installed.\n" + "For AMD: Install ROCm (https://github.com/ROCm/ROCm)\n" + "For NVIDIA: Install CUDA toolkit" + ) + + else: + raise ValueError(f"Unsupported GPU vendor: {vendor.value}") + + # Cache the manager instance + _manager_instances[vendor] = manager + + return manager + + +def clear_manager_cache() -> None: + """Clear all cached manager instances. + + Useful for testing or when GPU configuration changes during runtime. + This will force recreation of managers on next call to get_gpu_tool_manager(). + + Also clears internal caches within each manager before removing them. + """ + global _manager_instances + + # Clear caches within managers before removing them + for manager in _manager_instances.values(): + manager.clear_cache() + + _manager_instances.clear() + logger.debug("Cleared all GPU tool manager instances") + + +def get_cached_managers() -> Dict[GPUVendor, BaseGPUToolManager]: + """Get dictionary of currently cached manager instances. + + Primarily for debugging and testing purposes. + + Returns: + Dictionary mapping GPUVendor to manager instances + """ + return _manager_instances.copy() + diff --git a/src/madengine/utils/gpu_tool_manager.py b/src/madengine/utils/gpu_tool_manager.py new file mode 100644 index 00000000..701e1db7 --- /dev/null +++ b/src/madengine/utils/gpu_tool_manager.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +""" +Base GPU Tool Manager Architecture + +Provides abstract base class and common infrastructure for GPU vendor-specific +tool managers (AMD ROCm, NVIDIA CUDA, etc.). + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import logging +import os +import subprocess +import threading +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional, Tuple + +logger = logging.getLogger(__name__) + + +class BaseGPUToolManager(ABC): + """Abstract base class for GPU vendor-specific tool managers. + + Provides common infrastructure for: + - Tool availability checking + - Command execution with timeout + - Result caching + - Consistent logging + + Subclasses implement vendor-specific logic for: + - Version detection + - Tool selection + - Command execution with fallback + """ + + def __init__(self): + """Initialize base GPU tool manager.""" + self._cache: Dict[str, Any] = {} + self._cache_lock = threading.Lock() + + @abstractmethod + def get_version(self) -> Optional[str]: + """Get GPU vendor tool version (e.g., ROCm version, CUDA version). + + Returns: + Version string or None if unable to detect + """ + pass + + @abstractmethod + def execute_command( + self, + command: str, + fallback_command: Optional[str] = None, + timeout: int = 30 + ) -> str: + """Execute command with optional fallback. + + Args: + command: Primary command to execute + fallback_command: Optional fallback command if primary fails + timeout: Command timeout in seconds + + Returns: + Command output as string + + Raises: + RuntimeError: If both primary and fallback commands fail + """ + pass + + def is_tool_available(self, tool_path: str) -> bool: + """Check if a tool exists and is executable. + + Args: + tool_path: Path to the tool (e.g., /opt/rocm/bin/amd-smi) + + Returns: + True if tool exists and is executable, False otherwise + """ + cache_key = f"tool_available:{tool_path}" + + # Check cache first + with self._cache_lock: + if cache_key in self._cache: + return self._cache[cache_key] + + # Check if file exists and is executable + result = os.path.isfile(tool_path) and os.access(tool_path, os.X_OK) + + # Cache the result + with self._cache_lock: + self._cache[cache_key] = result + + return result + + def _execute_shell_command( + self, + command: str, + timeout: int = 30, + check_returncode: bool = True + ) -> Tuple[bool, str, str]: + """Execute a shell command and return result. + + Args: + command: Shell command to execute + timeout: Timeout in seconds + check_returncode: If True, only succeed on returncode 0 + + Returns: + Tuple of (success, stdout, stderr) + """ + try: + result = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=timeout + ) + + success = (result.returncode == 0) if check_returncode else True + return success, result.stdout.strip(), result.stderr.strip() + + except subprocess.TimeoutExpired: + return False, "", f"Command timed out after {timeout} seconds" + except FileNotFoundError: + return False, "", f"Command not found: {command.split()[0]}" + except Exception as e: + return False, "", f"Command execution error: {str(e)}" + + def _cache_result(self, key: str, value: Any) -> None: + """Cache a result for future use. + + Args: + key: Cache key + value: Value to cache + """ + with self._cache_lock: + self._cache[key] = value + + def _get_cached_result(self, key: str) -> Optional[Any]: + """Get a cached result. + + Args: + key: Cache key + + Returns: + Cached value or None if not found + """ + with self._cache_lock: + return self._cache.get(key) + + def _log_debug(self, message: str) -> None: + """Log a debug message. + + Args: + message: Debug message + """ + logger.debug(f"[{self.__class__.__name__}] {message}") + + def _log_info(self, message: str) -> None: + """Log an info message. + + Args: + message: Info message + """ + logger.info(f"[{self.__class__.__name__}] {message}") + + def _log_warning(self, message: str) -> None: + """Log a warning message. + + Args: + message: Warning message + """ + logger.warning(f"[{self.__class__.__name__}] {message}") + + def _log_error(self, message: str) -> None: + """Log an error message. + + Args: + message: Error message + """ + logger.error(f"[{self.__class__.__name__}] {message}") + + def clear_cache(self) -> None: + """Clear all cached results. + + Useful for testing or when tools are installed/updated during runtime. + """ + with self._cache_lock: + self._cache.clear() + self._log_debug("Cache cleared") + diff --git a/src/madengine/utils/gpu_validator.py b/src/madengine/utils/gpu_validator.py index 5715db67..c542e8c3 100644 --- a/src/madengine/utils/gpu_validator.py +++ b/src/madengine/utils/gpu_validator.py @@ -42,7 +42,7 @@ def __post_init__(self): class ROCmValidator: - """Validator for AMD ROCm installation""" + """Validator for AMD ROCm installation with tool manager integration""" # Essential ROCm components to check ESSENTIAL_PATHS = { @@ -70,6 +70,7 @@ def __init__(self, verbose: bool = False): verbose: If True, print detailed validation progress """ self.verbose = verbose + self._tool_manager = None # Lazy initialization def _run_command(self, cmd: List[str], timeout: int = 10) -> Tuple[bool, str, str]: """Run a command and return success status and output @@ -100,13 +101,40 @@ def _check_path_exists(self, path: str) -> bool: """Check if a path exists""" return os.path.exists(path) + def _get_tool_manager(self): + """Get or create ROCm tool manager instance + + Returns: + ROCmToolManager instance + """ + if self._tool_manager is None: + try: + from madengine.utils.rocm_tool_manager import ROCmToolManager + self._tool_manager = ROCmToolManager() + except ImportError as e: + if self.verbose: + print(f"Warning: Could not import ROCmToolManager: {e}") + return None + return self._tool_manager + def _get_rocm_version(self) -> Optional[str]: - """Get ROCm version from system + """Get ROCm version from system using tool manager Returns: ROCm version string or None if not found + + Enhancement: + Uses ROCmToolManager for robust multi-method version detection. """ - # Try hipconfig first + # Try tool manager first (most robust) + tool_manager = self._get_tool_manager() + if tool_manager: + try: + return tool_manager.get_version() + except Exception: + pass # Fallback to direct methods + + # Fallback: Try hipconfig first success, stdout, _ = self._run_command(['hipconfig', '--version']) if success and stdout: return stdout.split('-')[0] # Remove build suffix @@ -124,12 +152,27 @@ def _get_rocm_version(self) -> Optional[str]: return None def _check_gpu_accessible(self) -> Tuple[bool, str]: - """Check if GPUs are accessible + """Check if GPUs are accessible using version-aware tool selection Returns: Tuple of (accessible, message) + + Enhancement: + Uses tool manager to prefer correct tool based on ROCm version (PR #54). """ - # Try rocminfo first + # Try using tool manager first (version-aware) + tool_manager = self._get_tool_manager() + if tool_manager: + try: + count = tool_manager.get_gpu_count() + if count > 0: + version = tool_manager.get_rocm_version() + preferred_tool = tool_manager.get_preferred_smi_tool() + return True, f"GPUs accessible via tool manager ({preferred_tool}, ROCm {version})" + except Exception: + pass # Fall back to direct checks + + # Fallback: Try rocminfo first (most reliable for detection) success, stdout, stderr = self._run_command(['rocminfo']) if success: # Check if any GPU agents are listed diff --git a/src/madengine/utils/log_formatting.py b/src/madengine/utils/log_formatting.py new file mode 100644 index 00000000..31673c93 --- /dev/null +++ b/src/madengine/utils/log_formatting.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +Utility functions for formatting and displaying data in logs. + +This module provides enhanced formatting utilities for better log readability, +including dataframe formatting and other display utilities. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import pandas as pd +import typing +from rich.table import Table +from rich.console import Console as RichConsole +from rich.text import Text + + +def format_dataframe_for_log( + df: pd.DataFrame, title: str = "DataFrame", max_rows: int = 20, max_cols: int = 10 +) -> str: + """ + Format a pandas DataFrame for beautiful log output. + + Args: + df: The pandas DataFrame to format + title: Title for the dataframe display + max_rows: Maximum number of rows to display (if None, use all rows) + max_cols: Maximum number of columns to display + + Returns: + str: Beautifully formatted string representation of the DataFrame + """ + if df.empty: + return f"\n📊 {title}\n{'='*60}\n❌ DataFrame is empty\n{'='*60}\n" + + # Define key columns to display for performance results + key_columns = [ + "model", + "n_gpus", + "docker_file", + "machine_name", + "gpu_architecture", + "performance", + "metric", + "status", + "dataname", + ] + + # Filter DataFrame to show only key columns that exist + available_columns = [col for col in key_columns if col in df.columns] + if available_columns: + display_df = df[available_columns].copy() + total_columns_note = ( + f"(showing {len(available_columns)} of {len(df.columns)} columns)" + ) + else: + # If no key columns found, show all columns as fallback with truncation + display_df = df.copy() + total_columns_note = f"(showing all {len(df.columns)} columns)" + if len(df.columns) > max_cols: + display_df = display_df.iloc[:, :max_cols] + total_columns_note = ( + f"(showing first {max_cols} of {len(df.columns)} columns)" + ) + + # Use all rows if max_rows is None + if max_rows is None: + max_rows = len(display_df) + + # Truncate rows if necessary (show latest rows) + truncated_rows = False + if len(display_df) > max_rows: + display_df = display_df.tail(max_rows) + truncated_rows = True + + # Create header + header = f"\n📊 {title} {total_columns_note}\n" + header += f"{'='*80}\n" + if available_columns: + header += f"📏 Shape: {df.shape[0]} rows × {len(available_columns)} key columns (total: {df.shape[1]} columns)\n" + else: + header += f"📏 Shape: {df.shape[0]} rows × {df.shape[1]} columns\n" + + if truncated_rows: + header += f"⚠️ Display truncated: showing first {max_rows} rows\n" + + header += f"{'='*80}\n" + + # Format the DataFrame with nice styling + formatted_df = display_df.to_string( + index=True, max_rows=max_rows, width=None, float_format="{:.4f}".format + ) + + # Add some visual separators + footer = f"\n{'='*80}\n" + + return header + formatted_df + footer + + +def format_dataframe_rich( + df: pd.DataFrame, title: str = "DataFrame", max_rows: int = 20 +) -> None: + """ + Display a pandas DataFrame using Rich formatting for enhanced readability. + + Args: + df: The pandas DataFrame to display + title: Title for the table + max_rows: Maximum number of rows to display + """ + console = RichConsole() + + if df.empty: + console.print( + f"📊 [bold cyan]{title}[/bold cyan]: [red]DataFrame is empty[/red]" + ) + return + + # Define key columns to display for performance results + key_columns = [ + "model", + "n_gpus", + "machine_name", + "gpu_architecture", + "performance", + "metric", + "status", + "dataname", + ] + + # Filter DataFrame to show only key columns that exist + available_columns = [col for col in key_columns if col in df.columns] + if available_columns: + display_df = df[available_columns] + total_columns_note = ( + f"(showing {len(available_columns)} of {len(df.columns)} columns)" + ) + else: + # If no key columns found, show all columns as fallback + display_df = df + total_columns_note = f"(showing all {len(df.columns)} columns)" + + # Create Rich table + table = Table( + title=f"📊 {title} {total_columns_note}", + show_header=True, + header_style="bold magenta", + ) + + # Add index column + table.add_column("Index", style="dim", width=8) + + # Add data columns + for col in display_df.columns: + table.add_column(str(col), style="cyan") + + # Add rows (truncate if necessary, show latest rows) + if len(display_df) > max_rows: + truncated_df = display_df.tail(max_rows) + truncated_indices = truncated_df.index + display_rows = max_rows + else: + truncated_df = display_df + truncated_indices = truncated_df.index + display_rows = len(truncated_df) + + for i in range(display_rows): + row_data = [str(truncated_indices[i])] + for col in truncated_df.columns: + value = truncated_df.iloc[i][col] + if pd.isna(value): + row_data.append("[dim]NaN[/dim]") + elif isinstance(value, float): + row_data.append(f"{value:.4f}") + else: + row_data.append(str(value)) + table.add_row(*row_data) + + # Show truncation info + if len(display_df) > max_rows: + table.add_row(*["..." for _ in range(len(truncated_df.columns) + 1)]) + console.print( + f"[yellow]⚠️ Showing latest {max_rows} of {len(display_df)} rows[/yellow]" + ) + + console.print(table) + console.print( + f"[green]✨ DataFrame shape: {df.shape[0]} rows × {len(available_columns)} key columns (total: {df.shape[1]} columns)[/green]" + ) + + +def print_dataframe_beautiful( + df: pd.DataFrame, title: str = "Data", use_rich: bool = True +) -> None: + """ + Print a pandas DataFrame with beautiful formatting. + + Args: + df: The pandas DataFrame to print + title: Title for the display + use_rich: Whether to use Rich formatting (if available) or fall back to simple formatting + """ + try: + if use_rich: + format_dataframe_rich(df, title) + else: + raise ImportError("Fallback to simple formatting") + except (ImportError, Exception): + # Fallback to simple but nice formatting + formatted_output = format_dataframe_for_log(df, title) + print(formatted_output) + + +def highlight_log_section(title: str, content: str, style: str = "info") -> str: + """ + Create a highlighted log section with borders and styling. + + Args: + title: Section title + content: Section content + style: Style type ('info', 'success', 'warning', 'error') + + Returns: + str: Formatted log section + """ + styles = { + "info": {"emoji": "ℹ️", "border": "-"}, + "success": {"emoji": "✅", "border": "="}, + "warning": {"emoji": "⚠️", "border": "!"}, + "error": {"emoji": "❌", "border": "#"}, + } + + style_config = styles.get(style, styles["info"]) + emoji = style_config["emoji"] + border_char = style_config["border"] + + border = border_char * 80 + header = f"\n{border}\n{emoji} {title.upper()}\n{border}" + footer = f"{border}\n" + + return f"{header}\n{content}\n{footer}" diff --git a/src/madengine/utils/nvidia_tool_manager.py b/src/madengine/utils/nvidia_tool_manager.py new file mode 100644 index 00000000..73259b38 --- /dev/null +++ b/src/madengine/utils/nvidia_tool_manager.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +""" +NVIDIA Tool Manager + +Basic NVIDIA CUDA tool manager wrapping nvidia-smi and nvcc. +Maintains current behavior without sophisticated version-aware logic. + +This is a placeholder for future enhancement. Current implementation provides: +- Simple wrappers around nvidia-smi and nvcc +- Basic error handling +- Consistent interface with BaseGPUToolManager + +Future enhancements could include: +- CUDA version-aware tool selection +- Fallback between different CUDA tool versions +- More sophisticated error handling + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +from typing import Optional + +from madengine.utils.gpu_tool_manager import BaseGPUToolManager + + +class NvidiaToolManager(BaseGPUToolManager): + """NVIDIA CUDA tool manager with basic functionality. + + Provides simple wrappers around NVIDIA tools while maintaining + compatibility with BaseGPUToolManager interface. + + Current implementation: + - nvidia-smi for GPU queries + - nvcc for CUDA version + - Basic error handling + + No version-aware tool selection yet (deferred for future work). + """ + + # Tool paths + NVIDIA_SMI_PATH = "/usr/bin/nvidia-smi" + NVCC_PATH = "/usr/local/cuda/bin/nvcc" + + def __init__(self): + """Initialize NVIDIA tool manager.""" + super().__init__() + self._log_debug("Initialized NVIDIA tool manager") + + def get_version(self) -> Optional[str]: + """Get CUDA version as string. + + Returns: + CUDA version string or None if unable to detect + """ + return self.get_cuda_version() + + def get_cuda_version(self) -> Optional[str]: + """Get CUDA version from nvcc. + + Returns: + CUDA version string (e.g., "12.0") or None if unable to detect + """ + # Check cache first + cached = self._get_cached_result("cuda_version") + if cached is not None: + return cached + + try: + # Try nvcc --version + if self.is_tool_available(self.NVCC_PATH): + command = f"{self.NVCC_PATH} --version | sed -n 's/^.*release \\([0-9]\\+\\.[0-9]\\+\\).*$/\\1/p'" + success, stdout, stderr = self._execute_shell_command(command) + + if success and stdout: + version = stdout.strip() + self._cache_result("cuda_version", version) + self._log_info(f"CUDA version: {version}") + return version + + # Fallback: Try nvidia-smi to get driver version + if self.is_tool_available(self.NVIDIA_SMI_PATH): + command = f"{self.NVIDIA_SMI_PATH} --query | grep 'CUDA Version' | awk '{{print $4}}'" + success, stdout, stderr = self._execute_shell_command(command) + + if success and stdout: + version = stdout.strip() + self._cache_result("cuda_version", version) + self._log_info(f"CUDA version (from nvidia-smi): {version}") + return version + + self._log_warning("Unable to detect CUDA version") + return None + + except Exception as e: + self._log_error(f"Error detecting CUDA version: {e}") + return None + + def get_driver_version(self) -> Optional[str]: + """Get NVIDIA driver version. + + Returns: + Driver version string or None if unable to detect + """ + # Check cache + cached = self._get_cached_result("driver_version") + if cached is not None: + return cached + + try: + if self.is_tool_available(self.NVIDIA_SMI_PATH): + command = f"{self.NVIDIA_SMI_PATH} --query-gpu=driver_version --format=csv,noheader | head -n1" + success, stdout, stderr = self._execute_shell_command(command) + + if success and stdout: + version = stdout.strip() + self._cache_result("driver_version", version) + self._log_info(f"NVIDIA driver version: {version}") + return version + + self._log_warning("Unable to detect NVIDIA driver version") + return None + + except Exception as e: + self._log_error(f"Error detecting driver version: {e}") + return None + + def execute_command( + self, + command: str, + fallback_command: Optional[str] = None, + timeout: int = 30 + ) -> str: + """Execute command with optional fallback. + + Args: + command: Primary command to execute + fallback_command: Optional fallback command (currently not used for NVIDIA) + timeout: Command timeout in seconds + + Returns: + Command output as string + + Raises: + RuntimeError: If command fails + """ + success, stdout, stderr = self._execute_shell_command(command, timeout) + + if success: + return stdout + + # Try fallback if provided + if fallback_command: + self._log_warning(f"Primary command failed, trying fallback: {fallback_command[:50]}...") + success, stdout, stderr = self._execute_shell_command(fallback_command, timeout) + + if success: + return stdout + else: + raise RuntimeError( + f"Both primary and fallback commands failed.\n" + f"Primary: {command}\n" + f"Fallback: {fallback_command}\n" + f"Error: {stderr}" + ) + else: + raise RuntimeError(f"Command failed: {command}\nError: {stderr}") + + def execute_nvidia_smi(self, args: str, timeout: int = 30) -> str: + """Execute nvidia-smi with specified arguments. + + Args: + args: Arguments to pass to nvidia-smi + timeout: Command timeout in seconds + + Returns: + Command output as string + + Raises: + RuntimeError: If nvidia-smi is not available or command fails + """ + if not self.is_tool_available(self.NVIDIA_SMI_PATH): + raise RuntimeError( + f"nvidia-smi not found at {self.NVIDIA_SMI_PATH}\n" + f"Ensure NVIDIA drivers are installed." + ) + + command = f"{self.NVIDIA_SMI_PATH} {args}" + return self.execute_command(command, timeout=timeout) + + def execute_nvcc(self, args: str, timeout: int = 30) -> str: + """Execute nvcc with specified arguments. + + Args: + args: Arguments to pass to nvcc + timeout: Command timeout in seconds + + Returns: + Command output as string + + Raises: + RuntimeError: If nvcc is not available or command fails + """ + if not self.is_tool_available(self.NVCC_PATH): + raise RuntimeError( + f"nvcc not found at {self.NVCC_PATH}\n" + f"Ensure CUDA toolkit is installed." + ) + + command = f"{self.NVCC_PATH} {args}" + return self.execute_command(command, timeout=timeout) + + def get_gpu_count(self) -> int: + """Get number of NVIDIA GPUs in the system. + + Returns: + Number of GPUs detected + + Raises: + RuntimeError: If unable to detect GPUs + """ + # Check cache + cached = self._get_cached_result("gpu_count") + if cached is not None: + return cached + + try: + output = self.execute_nvidia_smi("-L | wc -l") + count = int(output.strip()) + + self._cache_result("gpu_count", count) + self._log_info(f"Detected {count} NVIDIA GPU(s)") + + return count + + except Exception as e: + raise RuntimeError( + f"Unable to determine number of NVIDIA GPUs.\n" + f"Error: {e}\n" + f"Suggestions:\n" + f"- Verify NVIDIA drivers: nvidia-smi\n" + f"- Check GPU accessibility: ls -la /dev/nvidia*" + ) + + def get_gpu_product_name(self, gpu_id: int = 0) -> str: + """Get GPU product name. + + Args: + gpu_id: GPU index (0-based) + + Returns: + GPU product name (e.g., "NVIDIA H100 80GB HBM3") + + Raises: + RuntimeError: If unable to get product name + """ + cache_key = f"gpu_product_name:{gpu_id}" + cached = self._get_cached_result(cache_key) + if cached: + return cached + + try: + output = self.execute_nvidia_smi( + f"--query-gpu=name --format=csv,noheader,nounits -i {gpu_id}" + ) + product_name = output.strip() + + self._cache_result(cache_key, product_name) + self._log_debug(f"GPU {gpu_id} product name: {product_name}") + + return product_name + + except Exception as e: + raise RuntimeError( + f"Unable to get GPU product name for GPU {gpu_id}.\n" + f"Error: {e}\n" + f"Ensure GPU {gpu_id} exists: nvidia-smi -L" + ) + + def get_gpu_architecture(self, gpu_id: int = 0) -> str: + """Get GPU architecture/compute capability. + + Args: + gpu_id: GPU index (0-based) + + Returns: + GPU architecture string + + Raises: + RuntimeError: If unable to detect GPU architecture + """ + cache_key = f"gpu_architecture:{gpu_id}" + cached = self._get_cached_result(cache_key) + if cached: + return cached + + try: + # Get full GPU name which includes architecture info + output = self.execute_nvidia_smi( + f"-L | head -n{gpu_id + 1} | tail -n1 | sed 's/(UUID: .*)//g' | sed 's/GPU {gpu_id}: //g'" + ) + arch = output.strip() + + self._cache_result(cache_key, arch) + self._log_debug(f"GPU {gpu_id} architecture: {arch}") + + return arch + + except Exception as e: + raise RuntimeError( + f"Unable to determine GPU architecture for GPU {gpu_id}.\n" + f"Error: {e}" + ) + diff --git a/src/madengine/utils/ops.py b/src/madengine/utils/ops.py index 4a0f6a45..0b8ab077 100644 --- a/src/madengine/utils/ops.py +++ b/src/madengine/utils/ops.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -"""Utility functions for MADEngine +"""Utility functions for madengine -This module contains utility functions for MADEngine. +This module contains utility functions for madengine. functions: PythonicTee: Class to both write and display stream, in "live" mode @@ -54,17 +54,15 @@ def flush(self) -> None: def find_and_replace_pattern( - dictionary: typing.Dict, - substring: str, - replacement: str - ) -> typing.Dict: + dictionary: typing.Dict, substring: str, replacement: str +) -> typing.Dict: """Find and replace a substring in a dictionary. - + Args: dictionary: The dictionary. substring: The substring to find. replacement: The replacement string. - + Returns: The updated dictionary. """ @@ -78,16 +76,13 @@ def find_and_replace_pattern( return updated_dict -def substring_found( - dictionary: typing.Dict, - substring: str - ) -> bool: +def substring_found(dictionary: typing.Dict, substring: str) -> bool: """Check if a substring is found in the dictionary. - + Args: dictionary: The dictionary. substring: The substring to find. - + Returns: True if the substring is found, False otherwise. """ diff --git a/src/madengine/utils/rocm_tool_manager.py b/src/madengine/utils/rocm_tool_manager.py new file mode 100644 index 00000000..0324d231 --- /dev/null +++ b/src/madengine/utils/rocm_tool_manager.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +""" +ROCm Tool Manager + +Version-aware AMD ROCm tool manager with automatic fallback between amd-smi and +rocm-smi based on ROCm version and tool availability. + +Based on PR #54: https://github.com/ROCm/madengine/pull/54 +- ROCm version threshold: 6.4.1 (use amd-smi for >= 6.4.1, rocm-smi for < 6.4.1) +- Automatic fallback to rocm-smi when amd-smi is unavailable +- Robust error handling with actionable suggestions + +References: +- ROCm best practices: https://github.com/ROCm/TheRock +- ROCm systems: https://github.com/ROCm/rocm-systems + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import os +import re +from typing import Dict, List, Optional, Tuple + +from madengine.utils.gpu_tool_manager import BaseGPUToolManager + + +# ROCm version threshold for amd-smi vs rocm-smi (from PR #54) +ROCM_VERSION_THRESHOLD = (6, 4, 1) + + +class ROCmToolManager(BaseGPUToolManager): + """AMD ROCm tool manager with version-aware tool selection. + + Features: + - Automatic ROCm version detection from multiple sources + - Version-aware tool selection (amd-smi >= 6.4.1, rocm-smi < 6.4.1) + - Automatic fallback with warnings when preferred tool unavailable + - Comprehensive error messages with troubleshooting suggestions + + Tool Selection Logic: + - ROCm >= 6.4.1: Prefer amd-smi, fallback to rocm-smi with warning + - ROCm < 6.4.1: Use rocm-smi + - If both tools fail: Raise error with debugging information + """ + + # Tool paths + AMD_SMI_PATH = "/opt/rocm/bin/amd-smi" + ROCM_SMI_PATH = "/opt/rocm/bin/rocm-smi" + HIPCONFIG_PATH = "/opt/rocm/bin/hipconfig" + ROCMINFO_PATH = "/opt/rocm/bin/rocminfo" + ROCM_VERSION_FILE = "/opt/rocm/.info/version" + + def __init__(self): + """Initialize ROCm tool manager.""" + super().__init__() + self._log_debug("Initialized ROCm tool manager") + + def get_version(self) -> Optional[str]: + """Get ROCm version as string. + + Returns: + ROCm version string (e.g., "6.4.1") or None if unable to detect + """ + version_tuple = self.get_rocm_version() + if version_tuple: + return ".".join(map(str, version_tuple)) + return None + + def get_rocm_version(self) -> Optional[Tuple[int, int, int]]: + """Get ROCm version as tuple. + + Tries multiple detection methods in order: + 1. hipconfig --version + 2. /opt/rocm/.info/version file + 3. rocminfo parsing + + Results are cached for performance. + + Returns: + ROCm version as tuple (major, minor, patch) or None if unable to detect + + Example: + >>> manager = ROCmToolManager() + >>> manager.get_rocm_version() + (6, 4, 1) + """ + # Check cache first + cached = self._get_cached_result("rocm_version") + if cached is not None: + return cached + + version = None + + # Method 1: Try hipconfig --version + if self.is_tool_available(self.HIPCONFIG_PATH): + success, stdout, stderr = self._execute_shell_command( + f"{self.HIPCONFIG_PATH} --version", + timeout=10 + ) + if success and stdout: + # Parse version like "6.4.1-12345" -> (6, 4, 1) + try: + version_str = stdout.split('-')[0].strip() + parts = version_str.split('.') + if len(parts) >= 3: + version = (int(parts[0]), int(parts[1]), int(parts[2])) + self._log_debug(f"Detected ROCm version from hipconfig: {version}") + except (ValueError, IndexError) as e: + self._log_warning(f"Failed to parse hipconfig version '{stdout}': {e}") + + # Method 2: Try version file + if version is None and os.path.exists(self.ROCM_VERSION_FILE): + try: + with open(self.ROCM_VERSION_FILE, 'r') as f: + version_str = f.read().strip().split('-')[0] + parts = version_str.split('.') + if len(parts) >= 3: + version = (int(parts[0]), int(parts[1]), int(parts[2])) + self._log_debug(f"Detected ROCm version from file: {version}") + except (IOError, ValueError, IndexError) as e: + self._log_warning(f"Failed to read version file: {e}") + + # Method 3: Try rocminfo (less reliable, last resort) + if version is None and self.is_tool_available(self.ROCMINFO_PATH): + success, stdout, stderr = self._execute_shell_command( + f"{self.ROCMINFO_PATH} | grep -i 'ROCm Version' | head -n1", + timeout=10 + ) + if success and stdout: + try: + # Parse output like "ROCm Version: 6.4.1" + match = re.search(r'(\d+)\.(\d+)\.(\d+)', stdout) + if match: + version = (int(match.group(1)), int(match.group(2)), int(match.group(3))) + self._log_debug(f"Detected ROCm version from rocminfo: {version}") + except (ValueError, AttributeError) as e: + self._log_warning(f"Failed to parse rocminfo output: {e}") + + # Cache the result (even if None) + self._cache_result("rocm_version", version) + + if version: + self._log_info(f"ROCm version: {'.'.join(map(str, version))}") + else: + self._log_warning("Unable to detect ROCm version from any source") + + return version + + def get_preferred_smi_tool(self) -> str: + """Get the preferred SMI tool based on ROCm version. + + Returns: + Tool name: 'amd-smi' or 'rocm-smi' + + Logic: + - ROCm >= 6.4.1: Prefer amd-smi + - ROCm < 6.4.1: Use rocm-smi + - Unknown version: Try amd-smi first (conservative choice) + """ + version = self.get_rocm_version() + + if version is None: + self._log_warning("ROCm version unknown, defaulting to amd-smi") + return "amd-smi" + + if version >= ROCM_VERSION_THRESHOLD: + return "amd-smi" + else: + return "rocm-smi" + + def execute_command( + self, + command: str, + fallback_command: Optional[str] = None, + timeout: int = 30 + ) -> str: + """Execute command with optional fallback. + + Args: + command: Primary command to execute + fallback_command: Optional fallback command if primary fails + timeout: Command timeout in seconds + + Returns: + Command output as string + + Raises: + RuntimeError: If both primary and fallback commands fail + """ + # Try primary command + success, stdout, stderr = self._execute_shell_command(command, timeout) + + if success: + self._log_debug(f"Command succeeded: {command[:50]}...") + return stdout + + # Log primary failure + self._log_warning(f"Primary command failed: {command[:50]}... Error: {stderr}") + + # Try fallback if provided + if fallback_command: + self._log_info(f"Trying fallback command: {fallback_command[:50]}...") + success, stdout, stderr = self._execute_shell_command(fallback_command, timeout) + + if success: + self._log_warning("Fallback command succeeded (primary tool may be missing or misconfigured)") + return stdout + else: + # Both failed + raise RuntimeError( + f"Both primary and fallback commands failed.\n" + f"Primary: {command}\n" + f"Primary error: {stderr}\n" + f"Fallback: {fallback_command}\n" + f"Fallback error: {stderr}" + ) + else: + # No fallback, raise error + raise RuntimeError(f"Command failed: {command}\nError: {stderr}") + + def execute_smi_command(self, command_template: str, use_amd_smi: bool = True, **kwargs) -> str: + """Execute SMI command with automatic tool selection and fallback. + + Args: + command_template: Command template with {tool} placeholder + use_amd_smi: If True, use amd-smi syntax; if False, use rocm-smi syntax + **kwargs: Additional format parameters for command template + + Returns: + Command output as string + + Example: + >>> manager = ROCmToolManager() + >>> # Will try amd-smi, fallback to rocm-smi if needed + >>> output = manager.execute_smi_command("{tool} list --csv") + """ + preferred_tool = self.get_preferred_smi_tool() + + # Format command with preferred tool + if preferred_tool == "amd-smi": + tool_path = self.AMD_SMI_PATH + fallback_path = self.ROCM_SMI_PATH + else: + tool_path = self.ROCM_SMI_PATH + fallback_path = self.AMD_SMI_PATH + + command = command_template.format(tool=tool_path, **kwargs) + + # Create fallback command if fallback tool is available + fallback_command = None + if self.is_tool_available(fallback_path): + fallback_command = command_template.format(tool=fallback_path, **kwargs) + + return self.execute_command(command, fallback_command) + + def get_gpu_count(self) -> int: + """Get number of AMD GPUs in the system. + + Returns: + Number of GPUs detected + + Raises: + RuntimeError: If unable to detect GPUs with any tool + """ + # Check cache + cached = self._get_cached_result("gpu_count") + if cached is not None: + return cached + + preferred_tool = self.get_preferred_smi_tool() + + try: + if preferred_tool == "amd-smi": + # Try amd-smi list --csv + command = f"{self.AMD_SMI_PATH} list --csv | tail -n +3 | wc -l" + fallback = f"{self.ROCM_SMI_PATH} --showid --csv | tail -n +2 | wc -l" + else: + # Use rocm-smi + command = f"{self.ROCM_SMI_PATH} --showid --csv | tail -n +2 | wc -l" + fallback = f"{self.AMD_SMI_PATH} list --csv | tail -n +3 | wc -l" if self.is_tool_available(self.AMD_SMI_PATH) else None + + output = self.execute_command(command, fallback) + count = int(output.strip()) + + # Cache result + self._cache_result("gpu_count", count) + self._log_info(f"Detected {count} AMD GPU(s)") + + return count + + except Exception as e: + raise RuntimeError( + f"Unable to determine number of AMD GPUs.\n" + f"Error: {e}\n" + f"Suggestions:\n" + f"- Verify ROCm installation: ls -la /opt/rocm/bin/\n" + f"- Check GPU accessibility: ls -la /dev/kfd /dev/dri\n" + f"- Ensure user is in 'video' and 'render' groups\n" + f"- See: https://github.com/ROCm/TheRock" + ) + + def get_gpu_product_name(self, gpu_id: int = 0) -> str: + """Get GPU product name with fallback (from PR #54). + + Args: + gpu_id: GPU index (0-based) + + Returns: + GPU product name (e.g., "AMD Instinct MI300X") + + Raises: + RuntimeError: If unable to get product name with any tool + """ + cache_key = f"gpu_product_name:{gpu_id}" + cached = self._get_cached_result(cache_key) + if cached: + return cached + + preferred_tool = self.get_preferred_smi_tool() + + try: + if preferred_tool == "amd-smi": + # Try amd-smi static command + command = f"{self.AMD_SMI_PATH} static -g {gpu_id} | grep MARKET_NAME: | cut -d ':' -f 2" + # Fallback to rocm-smi with different syntax (PR #54) + fallback = f"{self.ROCM_SMI_PATH} --showproductname | grep 'GPU\\[{gpu_id}\\]' | awk '{{print $NF}}'" + else: + # Use rocm-smi + command = f"{self.ROCM_SMI_PATH} --showproductname | grep 'GPU\\[{gpu_id}\\]' | awk '{{print $NF}}'" + # Fallback to amd-smi if available + fallback = f"{self.AMD_SMI_PATH} static -g {gpu_id} | grep MARKET_NAME: | cut -d ':' -f 2" if self.is_tool_available(self.AMD_SMI_PATH) else None + + output = self.execute_command(command, fallback) + product_name = output.strip() + + # Cache result + self._cache_result(cache_key, product_name) + self._log_debug(f"GPU {gpu_id} product name: {product_name}") + + return product_name + + except Exception as e: + raise RuntimeError( + f"Unable to get GPU product name for GPU {gpu_id}.\n" + f"Error: {e}\n" + f"Suggestions:\n" + f"- Verify GPU {gpu_id} exists: {self.ROCM_SMI_PATH} --showid\n" + f"- Check ROCm version: cat /opt/rocm/.info/version\n" + f"- For ROCm >= 6.4.1, ensure amd-smi is installed" + ) + + def get_gpu_architecture(self) -> str: + """Get GPU architecture (e.g., gfx908, gfx90a, gfx942). + + Returns: + GPU architecture string + + Raises: + RuntimeError: If unable to detect GPU architecture + """ + # Check cache + cached = self._get_cached_result("gpu_architecture") + if cached: + return cached + + try: + # Use rocminfo to get architecture (most reliable) + command = f"{self.ROCMINFO_PATH} | grep -o -m 1 'gfx.*'" + success, stdout, stderr = self._execute_shell_command(command) + + if success and stdout: + arch = stdout.strip() + self._cache_result("gpu_architecture", arch) + self._log_info(f"GPU architecture: {arch}") + return arch + else: + raise RuntimeError(f"rocminfo failed or returned empty: {stderr}") + + except Exception as e: + raise RuntimeError( + f"Unable to determine GPU architecture.\n" + f"Error: {e}\n" + f"Suggestions:\n" + f"- Verify rocminfo is accessible: {self.ROCMINFO_PATH} --version\n" + f"- Check GPU is visible: {self.ROCM_SMI_PATH} --showid\n" + f"- Ensure ROCm is properly installed" + ) + + def get_gpu_vendor_check(self) -> str: + """Check GPU vendor with fallback (from PR #54). + + Returns: + "AMD" if AMD GPU detected, error message otherwise + + Note: + This checks if AMD SMI tools can detect GPUs, confirming AMD vendor. + """ + try: + # Try to get GPU count - if successful, AMD GPUs are present + count = self.get_gpu_count() + if count > 0: + return "AMD" + else: + return "No AMD GPUs detected" + except Exception as e: + return f"Unable to detect AMD GPU vendor: {e}" + + def list_gpus_json(self) -> List[Dict]: + """List all GPUs with detailed information in JSON format. + + Returns: + List of GPU information dictionaries + + Raises: + RuntimeError: If unable to list GPUs + """ + preferred_tool = self.get_preferred_smi_tool() + + try: + if preferred_tool == "amd-smi" and self.is_tool_available(self.AMD_SMI_PATH): + # Try amd-smi list with JSON output + command = f"{self.AMD_SMI_PATH} list --json" + success, stdout, stderr = self._execute_shell_command(command) + + if success and stdout: + try: + return json.loads(stdout) + except json.JSONDecodeError as e: + self._log_warning(f"Failed to parse amd-smi JSON: {e}") + + # Fallback: parse rocm-smi output + command = f"{self.ROCM_SMI_PATH} --showid" + output = self.execute_command(command) + + # Parse rocm-smi output to JSON-like structure + gpus = [] + for line in output.split('\n'): + if 'GPU[' in line: + try: + gpu_id = int(line.split('[')[1].split(']')[0]) + gpus.append({"gpu": gpu_id, "node_id": gpu_id}) + except (IndexError, ValueError): + continue + + return gpus + + except Exception as e: + raise RuntimeError(f"Unable to list GPUs: {e}") + diff --git a/src/madengine/utils/session_tracker.py b/src/madengine/utils/session_tracker.py new file mode 100644 index 00000000..6ddd1d92 --- /dev/null +++ b/src/madengine/utils/session_tracker.py @@ -0,0 +1,154 @@ +""" +Session Tracking Utility + +Tracks execution sessions to filter current run results from historical data in perf.csv. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import os +from pathlib import Path +from typing import Optional + + +class SessionTracker: + """ + Tracks execution session boundaries for filtering performance results. + + When an execution starts, it records the current row count in perf.csv. + After execution, results can be filtered to show only rows added during this session. + + Best Practice: Session marker file is stored in the SAME directory as perf.csv + to ensure consistent access regardless of working directory changes. + """ + + def __init__(self, perf_csv_path: str = "perf.csv"): + """ + Initialize session tracker. + + Args: + perf_csv_path: Path to the performance CSV file + """ + self.perf_csv_path = Path(perf_csv_path).resolve() # Use absolute path + self.session_start_row: Optional[int] = None + # Marker file in same directory as perf.csv + self.marker_file = self.perf_csv_path.parent / ".madengine_session_start" + + def start_session(self) -> int: + """ + Mark the start of an execution session. + + Records the current number of rows in perf.csv so we can later + identify which rows were added during this session. + + Also saves the marker file for use by child processes. + + Returns: + The starting row number (number of rows in CSV before this session) + """ + if self.perf_csv_path.exists(): + # Count existing rows (excluding header) + with open(self.perf_csv_path, 'r') as f: + lines = f.readlines() + # Subtract 1 for header row + self.session_start_row = max(0, len(lines) - 1) + else: + # No existing file, start at 0 + self.session_start_row = 0 + + # Automatically save marker for child processes + self._save_marker(self.session_start_row) + + return self.session_start_row + + def get_session_start(self) -> Optional[int]: + """ + Get the session start row. + + Returns: + Session start row number, or None if session not started + """ + return self.session_start_row + + def get_session_row_count(self) -> int: + """ + Get the number of rows added during this session. + + Returns: + Number of rows added since session start + """ + if self.session_start_row is None: + return 0 + + if not self.perf_csv_path.exists(): + return 0 + + with open(self.perf_csv_path, 'r') as f: + lines = f.readlines() + current_row_count = max(0, len(lines) - 1) # Exclude header + + return current_row_count - self.session_start_row + + def _save_marker(self, start_row: int): + """ + Save session start marker to file (private method). + + Args: + start_row: The starting row number + """ + # Ensure parent directory exists + self.marker_file.parent.mkdir(parents=True, exist_ok=True) + with open(self.marker_file, 'w') as f: + f.write(str(start_row)) + + def load_marker(self) -> Optional[int]: + """ + Load session start marker from file. + + Uses the marker file path from this instance's perf_csv_path. + + Returns: + Session start row, or None if file doesn't exist + """ + if self.marker_file.exists(): + try: + with open(self.marker_file, 'r') as f: + return int(f.read().strip()) + except (ValueError, IOError): + return None + return None + + def cleanup_marker(self): + """ + Remove session marker file for this instance. + """ + if self.marker_file.exists(): + try: + os.remove(self.marker_file) + except OSError: + pass + + @staticmethod + def load_session_marker_for_csv(perf_csv_path: str = "perf.csv") -> Optional[int]: + """ + Static helper to load session marker for a given CSV path. + + This is useful when you don't have a SessionTracker instance but need to load the marker. + + Args: + perf_csv_path: Path to the performance CSV file + + Returns: + Session start row, or None if marker doesn't exist + """ + perf_path = Path(perf_csv_path).resolve() + marker_file = perf_path.parent / ".madengine_session_start" + + if marker_file.exists(): + try: + with open(marker_file, 'r') as f: + return int(f.read().strip()) + except (ValueError, IOError): + return None + return None + diff --git a/src/madengine/utils/ssh_to_db.py b/src/madengine/utils/ssh_to_db.py deleted file mode 100644 index c5f694fa..00000000 --- a/src/madengine/utils/ssh_to_db.py +++ /dev/null @@ -1,80 +0,0 @@ -"""Module to SSH into the database. - -This module provides the functions to SSH into the database. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import socket -# third-party modules -import paramiko - - -class SFTPClient(paramiko.SFTPClient): - """Class to create an SFTP client for the database.""" - - def __init__(self, *args, **kwargs): - """Initialize the SFTPClient class.""" - super().__init__(*args, **kwargs) - - def put_dir(self, source: str, target: str) -> None: - """Uploads the contents of the source directory to the target path. The - target directory needs to exists. All subdirectories in source are - created under target. - - Args: - source (str): The source directory to upload. - target (str): The target directory to upload to. - - Returns: - None - - Raises: - IOError: An error occurred uploading the directory. - """ - for item in os.listdir(source): - if os.path.isfile(os.path.join(source, item)): - self.put(os.path.join(source, item), "%s/%s" % (target, item)) - else: - self.mkdir("%s/%s" % (target, item), ignore_existing=True) - self.put_dir(os.path.join(source, item), "%s/%s" % (target, item)) - - def mkdir(self, path: str, mode: int = 511, ignore_existing: bool = False) -> None: - """Augments mkdir by adding an option to not fail if the folder exists - - Args: - path (str): The path to create. - mode (int): The mode to create the path with. - ignore_existing (bool): Whether to ignore if the path already exists. - - Returns: - None - - Raises: - IOError: An error occurred creating the directory. - """ - try: - super(SFTPClient, self).mkdir(path, mode) - except IOError: - if ignore_existing: - pass - else: - raise - - -def print_ssh_out(client_output: tuple) -> None: - """Print the output from the SSH client. - - Args: - client_output (tuple): The output from the SSH client. - - Returns: - None - """ - ssh_stdin, ssh_stdout, ssh_stderr = client_output - ssh_stdin.close() - for line in ssh_stdout.read().splitlines(): - print("{}".format(line)) - for line in ssh_stderr.read().splitlines(): - print("{}".format(line)) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..4a821426 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,433 @@ +""" +Pytest configuration and shared fixtures for madengine tests. + +Provides reusable fixtures for multi-platform testing (AMD GPU, NVIDIA GPU, CPU), +mock contexts, and integration test utilities. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import os +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch +import pytest + + +# ============================================================================ +# Platform Configuration Fixtures +# ============================================================================ + +@pytest.fixture +def amd_gpu_context(): + """Mock Context for AMD GPU platform (ROCm).""" + context = MagicMock() + context.ctx = { + "docker_build_arg": { + "MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a", + "MAD_GPU_VENDOR": "AMD", + } + } + context.get_gpu_vendor.return_value = "AMD" + context.get_system_ngpus.return_value = 8 + context.get_system_gpu_architecture.return_value = "gfx90a" + context.get_system_hip_version.return_value = "6.0" + context.get_gpu_renderD_nodes.return_value = ["renderD128", "renderD129"] + context.get_docker_gpus.return_value = "all" + context.get_system_gpu_product_name.return_value = "AMD Instinct MI300X" + return context + + +@pytest.fixture +def nvidia_gpu_context(): + """Mock Context for NVIDIA GPU platform (CUDA).""" + context = MagicMock() + context.ctx = { + "docker_build_arg": { + "MAD_SYSTEM_GPU_ARCHITECTURE": "sm_90", + "MAD_GPU_VENDOR": "NVIDIA", + } + } + context.get_gpu_vendor.return_value = "NVIDIA" + context.get_system_ngpus.return_value = 8 + context.get_system_gpu_architecture.return_value = "sm_90" + context.get_system_cuda_version.return_value = "12.1" + context.get_docker_gpus.return_value = "all" + context.get_system_gpu_product_name.return_value = "NVIDIA H100" + return context + + +@pytest.fixture +def cpu_context(): + """Mock Context for CPU-only platform.""" + context = MagicMock() + context.ctx = { + "docker_build_arg": { + "MAD_SYSTEM_GPU_ARCHITECTURE": "", + "MAD_GPU_VENDOR": "NONE", + } + } + context.get_gpu_vendor.return_value = "NONE" + context.get_system_ngpus.return_value = 0 + context.get_system_gpu_architecture.return_value = "" + context.get_docker_gpus.return_value = None + return context + + +@pytest.fixture(params=["amd", "nvidia", "cpu"]) +def multi_platform_context(request, amd_gpu_context, nvidia_gpu_context, cpu_context): + """Parametrized fixture that tests across all platforms.""" + contexts = { + "amd": amd_gpu_context, + "nvidia": nvidia_gpu_context, + "cpu": cpu_context, + } + return contexts[request.param] + + +# ============================================================================ +# Mock Args Fixtures +# ============================================================================ + +@pytest.fixture +def mock_build_args(): + """Mock args for build command.""" + args = MagicMock() + args.tags = [] + args.target_archs = [] + args.registry = None + args.additional_context = None + args.additional_context_file = None + args.clean_docker_cache = False + args.manifest_output = "build_manifest.json" + args.live_output = False + args.output = "perf.csv" + args.ignore_deprecated_flag = False + args.data_config_file_name = "data.json" + args.tools_json_file_name = "tools.json" + args.generate_sys_env_details = True + args.force_mirror_local = False + args.disable_skip_gpu_arch = False + args.verbose = False + args._separate_phases = True + return args + + +@pytest.fixture +def mock_run_args(): + """Mock args for run command.""" + args = MagicMock() + args.tags = [] + args.manifest_file = "build_manifest.json" + args.registry = None + args.timeout = 3600 + args.keep_alive = False + args.keep_model_dir = False + args.additional_context = None + args.additional_context_file = None + args.live_output = False + args.output = "perf.csv" + args.ignore_deprecated_flag = False + args.data_config_file_name = "data.json" + args.tools_json_file_name = "tools.json" + args.generate_sys_env_details = True + args.force_mirror_local = False + args.disable_skip_gpu_arch = False + args.verbose = False + args._separate_phases = True + return args + + +# ============================================================================ +# Test Data Fixtures +# ============================================================================ + +@pytest.fixture +def sample_models(): + """Sample model data for testing.""" + return [ + { + "name": "model1", + "tags": ["test", "integration"], + "dockerfile": "docker/model1.Dockerfile", + }, + { + "name": "model2", + "tags": ["test"], + "dockerfile": "docker/model2.Dockerfile", + }, + ] + + +@pytest.fixture +def sample_build_summary_success(): + """Sample successful build summary.""" + return { + "successful_builds": [ + { + "model": "model1", + "docker_image": "ci-model1", + "dockerfile": "docker/model1.Dockerfile", + "build_duration": 10.5, + "gpu_architecture": "gfx90a", + }, + { + "model": "model2", + "docker_image": "ci-model2", + "dockerfile": "docker/model2.Dockerfile", + "build_duration": 8.3, + "gpu_architecture": "gfx90a", + }, + ], + "failed_builds": [], + "total_build_time": 18.8, + } + + +@pytest.fixture +def sample_build_summary_partial(): + """Sample partial build summary (mixed success/failure).""" + return { + "successful_builds": [ + { + "model": "model1", + "docker_image": "ci-model1", + "dockerfile": "docker/model1.Dockerfile", + "build_duration": 10.5, + "gpu_architecture": "gfx90a", + }, + ], + "failed_builds": [ + { + "model": "model2", + "error": "Build failed: dependency not found", + }, + ], + "total_build_time": 10.5, + } + + +@pytest.fixture +def sample_build_summary_all_failed(): + """Sample build summary with all failures.""" + return { + "successful_builds": [], + "failed_builds": [ + { + "model": "model1", + "error": "Build failed: base image not found", + }, + { + "model": "model2", + "error": "Build failed: syntax error in Dockerfile", + }, + ], + "total_build_time": 0, + } + + +@pytest.fixture +def sample_manifest(): + """Sample build manifest.""" + return { + "built_images": { + "ci-model1": { + "docker_image": "ci-model1", + "dockerfile": "docker/model1.Dockerfile", + "gpu_architecture": "gfx90a", + }, + "ci-model2": { + "docker_image": "ci-model2", + "dockerfile": "docker/model2.Dockerfile", + "gpu_architecture": "gfx90a", + }, + }, + "built_models": { + "ci-model1": { + "name": "model1", + "tags": ["test"], + }, + "ci-model2": { + "name": "model2", + "tags": ["test"], + }, + }, + "summary": { + "successful_builds": [ + {"model": "model1", "docker_image": "ci-model1"}, + {"model": "model2", "docker_image": "ci-model2"}, + ], + "failed_builds": [], + }, + } + + +# ============================================================================ +# Temporary File Fixtures +# ============================================================================ + +@pytest.fixture +def temp_manifest_file(sample_manifest): + """Create a temporary manifest file.""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as f: + json.dump(sample_manifest, f) + manifest_path = f.name + + yield manifest_path + + # Cleanup + if os.path.exists(manifest_path): + os.unlink(manifest_path) + + +@pytest.fixture +def temp_working_dir(): + """Create a temporary working directory.""" + with tempfile.TemporaryDirectory() as tmpdir: + original_cwd = os.getcwd() + os.chdir(tmpdir) + + yield tmpdir + + os.chdir(original_cwd) + + +# ============================================================================ +# Mock Builder and Runner Fixtures +# ============================================================================ + +@pytest.fixture +def mock_docker_builder(sample_build_summary_success): + """Mock DockerBuilder with successful builds.""" + builder = MagicMock() + builder.build_all_models.return_value = sample_build_summary_success + builder.export_build_manifest.return_value = None + builder.built_images = { + "ci-model1": {"docker_image": "ci-model1"}, + "ci-model2": {"docker_image": "ci-model2"}, + } + return builder + + +@pytest.fixture +def mock_container_runner(): + """Mock ContainerRunner with successful runs.""" + runner = MagicMock() + runner.run_models_from_manifest.return_value = { + "successful_runs": [ + { + "model": "model1", + "image": "ci-model1", + "status": "SUCCESS", + "performance": 1000.0, + "duration": 30.5, + }, + { + "model": "model2", + "image": "ci-model2", + "status": "SUCCESS", + "performance": 1200.0, + "duration": 28.3, + }, + ], + "failed_runs": [], + "total_runs": 2, + } + return runner + + +# ============================================================================ +# Integration Test Helpers +# ============================================================================ + +@pytest.fixture +def integration_test_env(): + """Setup integration test environment variables.""" + env_vars = { + "MODEL_DIR": "tests/fixtures/dummy", + "MAD_SKIP_GPU_CHECK": "1", # Skip actual GPU detection in tests + } + + with patch.dict(os.environ, env_vars, clear=False): + yield env_vars + + +# ============================================================================ +# Pytest Configuration +# ============================================================================ + +def pytest_configure(config): + """Configure pytest with custom markers.""" + config.addinivalue_line( + "markers", "integration: marks tests as integration tests (may be slow)" + ) + config.addinivalue_line( + "markers", "unit: marks tests as fast unit tests" + ) + config.addinivalue_line( + "markers", "gpu: marks tests that require GPU hardware" + ) + config.addinivalue_line( + "markers", "amd: marks tests specific to AMD GPUs" + ) + config.addinivalue_line( + "markers", "nvidia: marks tests specific to NVIDIA GPUs" + ) + config.addinivalue_line( + "markers", "cpu: marks tests for CPU-only execution" + ) + config.addinivalue_line( + "markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')" + ) + + +# ============================================================================ +# Utility Functions for Tests +# ============================================================================ + +def assert_build_manifest_valid(manifest_path): + """Assert that a build manifest file is valid.""" + assert os.path.exists(manifest_path), f"Manifest not found: {manifest_path}" + + with open(manifest_path) as f: + manifest = json.load(f) + + # Check required keys + assert "built_images" in manifest + assert "built_models" in manifest + assert "summary" in manifest + + # Check summary structure + summary = manifest["summary"] + assert "successful_builds" in summary + assert "failed_builds" in summary + assert isinstance(summary["successful_builds"], list) + assert isinstance(summary["failed_builds"], list) + + return manifest + + +def assert_perf_csv_valid(csv_path): + """Assert that a performance CSV file is valid.""" + assert os.path.exists(csv_path), f"Performance CSV not found: {csv_path}" + + import pandas as pd + df = pd.read_csv(csv_path) + + # Check required columns + required_columns = ["model", "n_gpus", "gpu_architecture", "status"] + for col in required_columns: + assert col in df.columns, f"Missing column: {col}" + + return df + + +# Export utility functions for use in tests +__all__ = [ + "assert_build_manifest_valid", + "assert_perf_csv_valid", +] + diff --git a/tests/e2e/test_build_workflows.py b/tests/e2e/test_build_workflows.py new file mode 100644 index 00000000..452c8af4 --- /dev/null +++ b/tests/e2e/test_build_workflows.py @@ -0,0 +1,286 @@ +"""Test various Build workflows and command-line arguments. + +This module tests various command-line argument behaviors including: +- Output file path specification (-o flag) +- GPU architecture checking and skip flags +- Multiple results output handling + +UPDATED: Refactored to use python3 -m madengine.cli.app instead of legacy mad.py + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in modules +import os +import sys +import csv +import json +import pandas as pd + +# 3rd party modules +import pytest + +# project modules +from tests.fixtures.utils import BASE_DIR, MODEL_DIR +from tests.fixtures.utils import global_data +from tests.fixtures.utils import clean_test_temp_files +from tests.fixtures.utils import generate_additional_context_for_machine + + + +# ============================================================================ +# Build CLI Features Tests +# ============================================================================ + +class TestCLIFeatures: + """Test various CLI features and command-line argument behaviors.""" + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf_test.csv", "perf_test.html"]], indirect=True + ) + def test_output_commandline_argument_writes_csv_correctly( + self, global_data, clean_test_temp_files + ): + """ + Test that -o/--output command-line argument writes CSV file to specified path. + UPDATED: Now uses python3 -m madengine.cli.app instead of legacy mad.py + """ + context = generate_additional_context_for_machine() + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --tags dummy -o perf_test.csv --live-output --additional-context '{json.dumps(context)}'" + ) + success = False + with open(os.path.join(BASE_DIR, "perf_test.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy": + if row["status"] == "SUCCESS": + success = True + break + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not success: + pytest.fail("model, dummy, not found in perf_test.csv.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf_test.csv", "perf_test.html"]], indirect=True + ) + def test_commandline_argument_skip_gpu_arch( + self, global_data, clean_test_temp_files + ): + """ + Test that skip_gpu_arch command-line argument skips GPU architecture check. + UPDATED: Now uses python3 -m madengine.cli.app instead of legacy mad.py + """ + context = generate_additional_context_for_machine() + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --tags dummy_skip_gpu_arch --live-output --additional-context '{json.dumps(context)}'" + ) + if "Skipping model" not in output: + pytest.fail("Enable skipping gpu arch for running model is failed.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf_test.csv", "perf_test.html"]], indirect=True + ) + def test_commandline_argument_disable_skip_gpu_arch_fail( + self, global_data, clean_test_temp_files + ): + """ + Test that --disable-skip-gpu-arch fails GPU architecture check as expected. + UPDATED: Now uses python3 -m madengine.cli.app instead of legacy mad.py + """ + context = generate_additional_context_for_machine() + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --tags dummy_skip_gpu_arch --disable-skip-gpu-arch --live-output --additional-context '{json.dumps(context)}'" + ) + # Check if exception with message 'Skipping model' is thrown + if "Skipping model" in output: + pytest.fail("Disable skipping gpu arch for running model is failed.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf_test.csv", "perf_test.html"]], indirect=True + ) + def test_output_multi_results(self, global_data, clean_test_temp_files): + """ + Test that multiple results are correctly written and merged into output CSV. + UPDATED: Now uses python3 -m madengine.cli.app instead of legacy mad.py + """ + context = generate_additional_context_for_machine() + output = global_data['console'].sh( + "cd " + BASE_DIR + "; " + + "MODEL_DIR=" + MODEL_DIR + " " + + f"python3 -m madengine.cli.app run --tags dummy_multi --live-output --additional-context '{json.dumps(context)}'" + ) + # Check if multiple results are written to perf_dummy.csv + success = False + # Read the csv file to a dataframe using pandas + multi_df = pd.read_csv(os.path.join(BASE_DIR, 'perf_dummy.csv')) + # Check the number of rows in the dataframe is 4, and columns is 4 + if multi_df.shape == (4, 4): + success = True + if not success: + pytest.fail("The generated multi results is not correct.") + # Check if multiple results from perf_dummy.csv get copied over to perf.csv + perf_df = pd.read_csv(os.path.join(BASE_DIR, 'perf.csv')) + # Get the corresponding rows and columns from perf.csv + perf_df = perf_df[multi_df.columns] + perf_df = perf_df.iloc[-4:, :] + # Drop model columns from both dataframes; these will not match + # if multiple results csv has {model}, then perf csv has {tag_name}_{model} + multi_df = multi_df.drop('model', axis=1) + perf_df = perf_df.drop('model', axis=1) + if all(perf_df.columns == multi_df.columns): + success = True + if not success: + pytest.fail("The columns of the generated multi results do not match perf.csv.") + + + + +# ============================================================================ +# Model Discovery Tests +# ============================================================================ + +class TestDiscover: + """Test the model discovery feature.""" + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_static(self, global_data, clean_test_temp_files): + """ + test a tag from a models.json file + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy2/model2 " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy2/model2" and row["status"] == "SUCCESS": + success = True + if not success: + pytest.fail("dummy2/model2 did not run successfully.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_dynamic(self, global_data, clean_test_temp_files): + """ + test a tag from a get_models_json.py file + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy3/model4 " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy3/model4" and row["status"] == "SUCCESS": + success = True + if not success: + pytest.fail("dummy3/model4 did not run successfully.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_additional_args(self, global_data, clean_test_temp_files): + """ + passes additional args specified in the command line to the model + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy2/model2:batch-size=32 " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if ( + row["model"] == "dummy2/model2" + and row["status"] == "SUCCESS" + and "--batch-size 32" in row["args"] + ): + success = True + if not success: + pytest.fail("dummy2/model2:batch-size=32 did not run successfully.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_multiple(self, global_data, clean_test_temp_files): + """ + test multiple tags from top-level models.json, models.json in a script subdir, and get_models_json.py + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_test_group_1,dummy_test_group_2,dummy_test_group_3 " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = pd.read_csv(csv_file) + if len(csv_reader) == 5: + if csv_reader["model"].tolist() == [ + "dummy", + "dummy2/model1", + "dummy2/model2", + "dummy3/model3", + "dummy3/model4", + ]: + if csv_reader["status"].tolist() == [ + "SUCCESS", + "SUCCESS", + "SUCCESS", + "SUCCESS", + "SUCCESS", + ]: + success = True + if not success: + pytest.fail("multiple tags did not run successfully.") + + diff --git a/tests/test_data_provider.py b/tests/e2e/test_data_workflows.py similarity index 54% rename from tests/test_data_provider.py rename to tests/e2e/test_data_workflows.py index ba45be5a..93709051 100644 --- a/tests/test_data_provider.py +++ b/tests/e2e/test_data_workflows.py @@ -2,6 +2,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ + # built-in modules import os import sys @@ -9,12 +10,14 @@ import re import json import tempfile + # third-party modules import pytest + # project modules -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import global_data -from .fixtures.utils import clean_test_temp_files +from tests.fixtures.utils import BASE_DIR, MODEL_DIR +from tests.fixtures.utils import global_data +from tests.fixtures.utils import clean_test_temp_files from madengine.core.dataprovider import Data @@ -25,86 +28,121 @@ def test_reorder_data_provider_config(self): Test the reorder_data_provider_config function to ensure it correctly orders data provider types """ # Create a temporary data.json file with shuffled data provider types - with tempfile.NamedTemporaryFile(mode='w+', suffix='.json', delete=False) as temp_file: + with tempfile.NamedTemporaryFile( + mode="w+", suffix=".json", delete=False + ) as temp_file: test_data = { "test_data": { "aws": {"path": "s3://bucket/path"}, "local": {"path": "/local/path"}, "nas": {"path": "/nas/path"}, "custom": {"path": "scripts/custom.sh"}, - "minio": {"path": "minio://bucket/path"} + "minio": {"path": "minio://bucket/path"}, } } json.dump(test_data, temp_file) temp_file_path = temp_file.name - + try: # Create Data object with the test file data_obj = Data(filename=temp_file_path) - + # Check the initial order (should be as defined in the test_data) original_keys = list(data_obj.data_provider_config["test_data"].keys()) - + # Call the reorder function data_obj.reorder_data_provider_config("test_data") - + # Check the order after reordering reordered_keys = list(data_obj.data_provider_config["test_data"].keys()) expected_order = ["custom", "local", "minio", "nas", "aws"] - + # Filter expected_order to only include keys that exist in original_keys expected_filtered = [k for k in expected_order if k in original_keys] - + # Assert that the reordering happened correctly - assert reordered_keys == expected_filtered, f"Expected order {expected_filtered}, got {reordered_keys}" - + assert ( + reordered_keys == expected_filtered + ), f"Expected order {expected_filtered}, got {reordered_keys}" + # Specifically check that custom comes first, if it exists if "custom" in original_keys: - assert reordered_keys[0] == "custom", "Custom should be first in the order" - + assert ( + reordered_keys[0] == "custom" + ), "Custom should be first in the order" + # Check that the order matches the expected priority for i, key in enumerate(reordered_keys): expected_index = expected_order.index(key) - for j, other_key in enumerate(reordered_keys[i+1:], i+1): + for j, other_key in enumerate(reordered_keys[i + 1 :], i + 1): other_expected_index = expected_order.index(other_key) - assert expected_index < other_expected_index, f"{key} should come before {other_key}" - + assert ( + expected_index < other_expected_index + ), f"{key} should come before {other_key}" + finally: # Clean up the temporary file os.unlink(temp_file_path) - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_local_data_provider_runs_successfully(self, global_data, clean_test_temp_files): + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_local_data_provider_runs_successfully( + self, global_data, clean_test_temp_files + ): """ - local data provider gets data from local disk + local data provider gets data from local disk """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_data_local ") + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --tags dummy_data_local --live-output " + ) success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: csv_reader = csv.DictReader(csv_file) for row in csv_reader: - if row['model'] == 'dummy_data_local': - if row['status'] == 'SUCCESS': + if row["model"] == "dummy_data_local": + if row["status"] == "SUCCESS": success = True else: pytest.fail("model in perf_test.csv did not run successfully.") if not success: pytest.fail("local data provider test failed") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'run_directory']], indirect=True) - def test_model_executes_even_if_data_provider_fails(self, global_data, clean_test_temp_files): + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "run_directory"]], + indirect=True, + ) + def test_model_executes_even_if_data_provider_fails( + self, global_data, clean_test_temp_files + ): """ - model executes even if data provider fails + model executes even if data provider fails """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_data_local_fail --additional-context \"{'docker_env_vars':{'MAD_DATAHOME':'/data'} }\" --live-output ", canFail=True) + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --tags dummy_data_local_fail --additional-context \"{'docker_env_vars':{'MAD_DATAHOME':'/data'} }\" --live-output ", + canFail=True, + ) success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: csv_reader = csv.DictReader(csv_file) for row in csv_reader: - if row['model'] == 'dummy_data_local_fail': - if row['status'] == 'FAILURE': + if row["model"] == "dummy_data_local_fail": + if row["status"] == "FAILURE": success = True else: pytest.fail("model in perf_test.csv did not run successfully.") @@ -112,30 +150,44 @@ def test_model_executes_even_if_data_provider_fails(self, global_data, clean_tes pytest.fail("local data provider fail test passed") # Search for "/data is NOT mounted" to ensure model script ran - regexp = re.compile(r'is NOT mounted') + regexp = re.compile(r"is NOT mounted") if not regexp.search(output): pytest.fail("model did not execute after data provider failed") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'dataLocal']], indirect=True) - def test_local_data_provider_mirrorlocal_does_not_mirror_data(self, global_data, clean_test_temp_files): + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html", "dataLocal"]], indirect=True + ) + def test_local_data_provider_mirrorlocal_does_not_mirror_data( + self, global_data, clean_test_temp_files + ): """ In local data provider, mirrorlocal field in data.json does not mirror data in local disk """ mirrorPath = os.path.join(BASE_DIR, "dataLocal") - os.mkdir( mirrorPath ) - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_data_local --force-mirror-local " + mirrorPath ) + os.mkdir(mirrorPath) + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --tags dummy_data_local --force-mirror-local " + + mirrorPath + + " --live-output" + ) success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: csv_reader = csv.DictReader(csv_file) for row in csv_reader: - if row['model'] == 'dummy_data_local': - if row['status'] == 'SUCCESS': + if row["model"] == "dummy_data_local": + if row["status"] == "SUCCESS": success = True else: pytest.fail("model in perf_test.csv did not run successfully.") if not success: pytest.fail("local data provider test failed") - if os.path.exists( os.path.join(mirrorPath, "dummy_data_local") ): + if os.path.exists(os.path.join(mirrorPath, "dummy_data_local")): pytest.fail("custom data provider did mirror data locally") diff --git a/tests/e2e/test_execution_features.py b/tests/e2e/test_execution_features.py new file mode 100644 index 00000000..7a0ac120 --- /dev/null +++ b/tests/e2e/test_execution_features.py @@ -0,0 +1,488 @@ +"""Test the timeouts in madengine. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import pytest +import json +import os +import re +import csv +import time + +from tests.fixtures.utils import BASE_DIR, MODEL_DIR +from tests.fixtures.utils import global_data +from tests.fixtures.utils import clean_test_temp_files +from tests.fixtures.utils import is_nvidia +from tests.fixtures.utils import generate_additional_context_for_machine + + + +# ============================================================================ +# Timeout Feature Tests +# ============================================================================ + +class TestCustomTimeoutsFunctionality: + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_default_model_timeout_2hrs(self, global_data, clean_test_temp_files): + """ + default model timeout is 2 hrs + This test only checks if the timeout is set; it does not actually time the model. + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy" + ) + + regexp = re.compile(r"⏰ Setting timeout to ([0-9]*) seconds.") + foundTimeout = None + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".run.live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundTimeout = match.groups()[0] + if foundTimeout != "7200": + pytest.fail("default model timeout is not 2 hrs (" + str(foundTimeout) + "s).") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_can_override_timeout_in_model(self, global_data, clean_test_temp_files): + """ + timeout can be overridden in model + This test only checks if the timeout is set; it does not actually time the model. + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_timeout" + ) + + regexp = re.compile(r"⏰ Setting timeout to ([0-9]*) seconds.") + foundTimeout = None + with open( + os.path.join( + BASE_DIR, + "dummy_timeout_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".run.live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundTimeout = match.groups()[0] + if foundTimeout != "360": + pytest.fail( + "timeout in models.json (360s) could not override actual timeout (" + + str(foundTimeout) + + "s)." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_can_override_timeout_in_commandline( + self, global_data, clean_test_temp_files + ): + """ + timeout command-line argument overrides default timeout + This test only checks if the timeout is set; it does not actually time the model. + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy --timeout 120" + ) + + regexp = re.compile(r"⏰ Setting timeout to ([0-9]*) seconds.") + foundTimeout = None + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".run.live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundTimeout = match.groups()[0] + if foundTimeout != "120": + pytest.fail( + "timeout command-line argument (120s) could not override actual timeout (" + + foundTimeout + + "s)." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_commandline_timeout_overrides_model_timeout( + self, global_data, clean_test_temp_files + ): + """ + timeout command-line argument overrides model timeout + This test only checks if the timeout is set; it does not actually time the model. + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_timeout --timeout 120" + ) + + regexp = re.compile(r"⏰ Setting timeout to ([0-9]*) seconds.") + foundTimeout = None + with open( + os.path.join( + BASE_DIR, + "dummy_timeout_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".run.live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundTimeout = match.groups()[0] + if foundTimeout != "120": + pytest.fail( + "timeout in command-line argument (360s) could not override model.json timeout (" + + foundTimeout + + "s)." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "run_directory"]], + indirect=True, + ) + def test_timeout_in_commandline_timesout_correctly( + self, global_data, clean_test_temp_files + ): + """ + timeout command-line argument times model out correctly + """ + start_time = time.time() + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_sleep --timeout 60", + canFail=True, + timeout=180, + ) + + test_duration = time.time() - start_time + + assert test_duration == pytest.approx(60, 10) + + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "run_directory"]], + indirect=True, + ) + def test_timeout_in_model_timesout_correctly( + self, global_data, clean_test_temp_files + ): + """ + timeout in models.json times model out correctly + """ + start_time = time.time() + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_sleep", + canFail=True, + timeout=180, + ) + + test_duration = time.time() - start_time + + assert test_duration == pytest.approx(120, 20) + + + +# ============================================================================ +# Debugging Feature Tests +# ============================================================================ + +class TestDebuggingFunctionality: + """""" + + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "run_directory"]], + indirect=True, + ) + def test_keepAlive_keeps_docker_alive(self, global_data, clean_test_temp_files): + """ + keep-alive command-line argument keeps the docker container alive + UPDATED: Now uses python3 -m madengine.cli.app with additional-context + """ + context = generate_additional_context_for_machine() + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --live-output --tags dummy --keep-alive --additional-context '{json.dumps(context)}'" + ) + output = global_data["console"].sh( + "docker ps -aqf 'name=container_ci-dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + "'" + ) + + if not output: + pytest.fail("docker container not found after keep-alive argument.") + + global_data["console"].sh( + "docker container stop --time=1 container_ci-dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + ) + global_data["console"].sh( + "docker container rm -f container_ci-dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "run_directory"]], + indirect=True, + ) + def test_no_keepAlive_does_not_keep_docker_alive( + self, global_data, clean_test_temp_files + ): + """ + without keep-alive command-line argument, the docker container is not kept alive + UPDATED: Now uses python3 -m madengine.cli.app with additional-context + """ + context = generate_additional_context_for_machine() + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --live-output --tags dummy --additional-context '{json.dumps(context)}'" + ) + output = global_data["console"].sh( + "docker ps -aqf 'name=container_ci-dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + "'" + ) + + if output: + global_data["console"].sh( + "docker container stop --time=1 container_ci-dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + ) + global_data["console"].sh( + "docker container rm -f container_ci-dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + ) + pytest.fail( + "docker container found after not specifying keep-alive argument." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "run_directory"]], + indirect=True, + ) + def test_keepAlive_preserves_model_dir(self, global_data, clean_test_temp_files): + """ + keep-alive command-line argument will keep model directory after run + UPDATED: Now uses python3 -m madengine.cli.app with additional-context + """ + context = generate_additional_context_for_machine() + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --live-output --tags dummy --keep-alive --additional-context '{json.dumps(context)}'" + ) + + global_data["console"].sh( + "docker container stop --time=1 container_ci-dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + ) + global_data["console"].sh( + "docker container rm -f container_ci-dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + ) + if not os.path.exists(os.path.join(BASE_DIR, "run_directory")): + pytest.fail("model directory not left over after keep-alive argument.") + + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "run_directory"]], + indirect=True, + ) + def test_keepModelDir_keeps_model_dir(self, global_data, clean_test_temp_files): + """ + keep-model-dir command-line argument keeps model directory after run + UPDATED: Now uses python3 -m madengine.cli.app with additional-context + """ + context = generate_additional_context_for_machine() + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --live-output --tags dummy --keep-model-dir --additional-context '{json.dumps(context)}'" + ) + + if not os.path.exists(os.path.join(BASE_DIR, "run_directory")): + pytest.fail("model directory not left over after keep-model-dir argument.") + + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "run_directory"]], + indirect=True, + ) + def test_no_keepModelDir_does_not_keep_model_dir( + self, global_data, clean_test_temp_files + ): + """ + keep-model-dir command-line argument keeps model directory after run + UPDATED: Now uses python3 -m madengine.cli.app with additional-context + """ + context = generate_additional_context_for_machine() + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --live-output --tags dummy --additional-context '{json.dumps(context)}'" + ) + + if os.path.exists(os.path.join(BASE_DIR, "run_directory")): + pytest.fail( + "model directory left over after not specifying keep-model-dir (or keep-alive) argument." + ) + +# ============================================================================ +# Live Output Feature Tests +# ============================================================================ + +class TestLiveOutputFunctionality: + """Test the live output functionality.""" + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_default_silent_run(self, global_data, clean_test_temp_files): + """ + default run is silent + UPDATED: Now uses python3 -m madengine.cli.app instead of legacy mad.py + """ + context = generate_additional_context_for_machine() + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --tags dummy --additional-context '{json.dumps(context)}'" + ) + + regexp = re.compile(r"performance: [0-9]* samples_per_second") + if regexp.search(output): + pytest.fail("default run is not silent") + + if "ARG BASE_DOCKER=" in output: + pytest.fail("default run is not silent") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_liveOutput_prints_output_to_screen( + self, global_data, clean_test_temp_files + ): + """ + live_output prints output to screen + UPDATED: Now uses python3 -m madengine.cli.app instead of legacy mad.py + """ + context = generate_additional_context_for_machine() + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --live-output --tags dummy --live-output --additional-context '{json.dumps(context)}'" + ) + + regexp = re.compile(r"performance: [0-9]* samples_per_second") + if not regexp.search(output): + pytest.fail("default run is silent") + + if "ARG BASE_DOCKER=" not in output: + pytest.fail("default run is silent") + + diff --git a/tests/e2e/test_profiling_workflows.py b/tests/e2e/test_profiling_workflows.py new file mode 100644 index 00000000..907dcb7c --- /dev/null +++ b/tests/e2e/test_profiling_workflows.py @@ -0,0 +1,460 @@ +"""Test the profiling functionality. + +UPDATED: Refactored to use python3 -m madengine.cli.app instead of legacy mad.py + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in modules +import os +import re +import sys +import csv +import json + +# third-party modules +import pytest + +# project modules +from tests.fixtures.utils import ( + BASE_DIR, + MODEL_DIR, + global_data, + clean_test_temp_files, + requires_gpu, + is_nvidia, + generate_additional_context_for_machine, +) + + +class TestProfilingFunctionality: + + @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "rocprof_output"]], + indirect=True, + ) + def test_rocprof_profiling_tool_runs_correctly( + self, global_data, clean_test_temp_files + ): + """ + specifying a profiling tool runs respective pre and post scripts + """ + # canFail is set to True because rocProf mode is failing the full DLM run; this test will test if the correct output files are generated + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rocprof\"}]}' ", + canFail=True, + ) + + # Check for both legacy rocprof (results.csv) and rocprofv3 (.db files) output + rocprof_output_dir = os.path.join(BASE_DIR, "rocprof_output") + legacy_output = os.path.join(rocprof_output_dir, "results.csv") + + # Check for rocprofv3 .db files in subdirectories + rocprofv3_output_found = False + if os.path.exists(rocprof_output_dir): + for root, dirs, files in os.walk(rocprof_output_dir): + for file in files: + if file.endswith("_results.db"): + rocprofv3_output_found = True + break + if rocprofv3_output_found: + break + + if not os.path.exists(legacy_output) and not rocprofv3_output_found: + pytest.fail( + "Neither rocprof_output/results.csv (legacy) nor *_results.db (rocprofv3) generated with rocprof profiling run." + ) + + @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "rpd_output"]], + indirect=True, + ) + def test_rpd_profiling_tool_runs_correctly( + self, global_data, clean_test_temp_files + ): + """ + specifying a profiling tool runs respective pre and post scripts + """ + # canFail is set to True because rpd mode is failing the full DLM run; this test will test if the correct output files are generated + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rpd\"}]}' ", + canFail=True, + ) + + if not os.path.exists(os.path.join(BASE_DIR, "rpd_output", "trace.rpd")): + pytest.fail("rpd_output/trace.rpd not generated with rpd profiling run.") + + @requires_gpu("gpu_info_power_profiler requires GPU hardware") + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "gpu_info_power_profiler_output.csv"]], + indirect=True, + ) + def test_gpu_info_power_profiling_tool_runs_correctly( + self, global_data, clean_test_temp_files + ): + """ + specifying a profiling tool runs respective pre and post scripts + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"gpu_info_power_profiler\"}]}' ", + canFail=False, + ) + + if not os.path.exists( + os.path.join(BASE_DIR, "gpu_info_power_profiler_output.csv") + ): + pytest.fail( + "gpu_info_power_profiler_output.csv not generated with gpu_info_power_profiler run." + ) + + @requires_gpu("gpu_info_vram_profiler requires GPU hardware") + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "gpu_info_vram_profiler_output.csv"]], + indirect=True, + ) + def test_gpu_info_vram_profiling_tool_runs_correctly( + self, global_data, clean_test_temp_files + ): + """ + specifying a profiling tool runs respective pre and post scripts + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"gpu_info_vram_profiler\"}]}' ", + canFail=False, + ) + + if not os.path.exists( + os.path.join(BASE_DIR, "gpu_info_vram_profiler_output.csv") + ): + pytest.fail( + "gpu_info_vram_profiler_output.csv not generated with gpu_info_vram_profiler run." + ) + + @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "library_trace.csv"]], + indirect=True, + ) + def test_rocblas_trace_runs_correctly(self, global_data, clean_test_temp_files): + """ + specifying a profiling tool runs respective pre and post scripts + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rocblas_trace\"}]}' ", + canFail=False, + ) + + regexp = re.compile(r"rocblas-bench") + foundMatch = None + with open(os.path.join(BASE_DIR, "library_trace.csv"), "r") as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundMatch = True + if not foundMatch: + pytest.fail( + "could not detect rocblas-bench in output log file with rocblas trace tool." + ) + + @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "library_trace.csv"]], + indirect=True, + ) + def test_tensile_trace_runs_correctly(self, global_data, clean_test_temp_files): + """ + specifying a profiling tool runs respective pre and post scripts + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"tensile_trace\"}]}' ", + canFail=True, # Allow failure due to missing performance metrics (trace tools suppress performance output) + ) + + regexp = re.compile(r"tensile,Cijk") + foundMatch = None + with open(os.path.join(BASE_DIR, "library_trace.csv"), "r") as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundMatch = True + if not foundMatch: + pytest.fail( + "could not detect tensile call in output log file with tensile trace tool." + ) + + @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "library_trace.csv"]], + indirect=True, + ) + def test_miopen_trace_runs_correctly(self, global_data, clean_test_temp_files): + """ + specifying a profiling tool runs respective pre and post scripts + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"miopen_trace\"}]}' ", + canFail=False, + ) + + regexp = re.compile(r"MIOpenDriver") + foundMatch = None + with open(os.path.join(BASE_DIR, "library_trace.csv"), "r") as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundMatch = True + if not foundMatch: + pytest.fail( + "could not detect miopen call in output log file with miopen trace tool." + ) + + @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_rccl_trace_runs_correctly(self, global_data, clean_test_temp_files): + """ + specifying a profiling tool runs respective pre and post scripts + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_prof_rccl --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rccl_trace\"}]}' ", + canFail=False, + ) + + regexp = re.compile(r"NCCL INFO AllReduce:") + foundMatch = None + with open( + os.path.join( + BASE_DIR, + "dummy_prof_rccl_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".run.live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundMatch = True + if not foundMatch: + pytest.fail( + "could not detect rccl call in output log file with rccl trace tool." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_toolA_runs_correctly(self, global_data, clean_test_temp_files): + """ + specifying a profiling tool runs respective pre and post scripts + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"test_tools_A\"}]}' ", + canFail=False, + ) + + match_str_array = ["pre_script A", "cmd_A", "post_script A"] + + match_str_idx = 0 + regexp = re.compile(match_str_array[match_str_idx]) + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".run.live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + print("MATCH = ", line) + match_str_idx = match_str_idx + 1 + if match_str_idx == len(match_str_array): + break + regexp = re.compile(match_str_array[match_str_idx]) + if match_str_idx != len(match_str_array): + print("Matched up to ", match_str_idx) + pytest.fail("all strings were not matched in toolA test.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_stackable_design_runs_correctly(self, global_data, clean_test_temp_files): + """ + specifying a profiling tool runs respective pre and post scripts + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"test_tools_A\"}, {\"name\": \"test_tools_B\"}]}' ", + canFail=False, + ) + + match_str_array = [ + "pre_script B", + "pre_script A", + "cmd_B", + "cmd_A", + "post_script A", + "post_script B", + ] + + match_str_idx = 0 + regexp = re.compile(match_str_array[match_str_idx]) + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".run.live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + print("MATCH = ", line) + match_str_idx = match_str_idx + 1 + if match_str_idx == len(match_str_array): + break + regexp = re.compile(match_str_array[match_str_idx]) + if match_str_idx != len(match_str_array): + print("Matched up to ", match_str_idx) + pytest.fail( + "all strings were not matched in the stacked test using toolA and toolB." + ) + + @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "rocprof_output"]], + indirect=True, + ) + def test_can_change_default_behavior_of_profiling_tool_with_additionalContext( + self, global_data, clean_test_temp_files + ): + """ + default behavior of a profiling tool can be changed from additional-context + """ + # Test overriding with --sys-trace (works with both rocprof and rocprofv3) + # Note: The '--' separator is required for rocprofv3 to distinguish between profiler options and the application command + # canFail is set to True because rocProf is failing; this test will test if the correct output files are generated + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_prof --additional-context '{\"gpu_vendor\": \"AMD\", \"guest_os\": \"UBUNTU\", \"tools\": [{\"name\": \"rocprof\", \"cmd\": \"bash ../scripts/common/tools/rocprof_wrapper.sh --sys-trace --\"}]}' ", + canFail=True, + ) + + # Check for profiling output (either legacy or rocprofv3 format) + rocprof_output_dir = os.path.join(BASE_DIR, "rocprof_output") + + # For rocprofv3 with --sys-trace, check for .db files + rocprofv3_output_found = False + if os.path.exists(rocprof_output_dir): + for root, dirs, files in os.walk(rocprof_output_dir): + for file in files: + if file.endswith("_results.db"): + rocprofv3_output_found = True + break + if rocprofv3_output_found: + break + + # Legacy check for results files + legacy_output = os.path.exists(os.path.join(BASE_DIR, "rocprof_output", "results.csv")) + + if not legacy_output and not rocprofv3_output_found: + pytest.fail( + "No profiling output generated with custom rocprof command override." + ) diff --git a/tests/e2e/test_run_workflows.py b/tests/e2e/test_run_workflows.py new file mode 100644 index 00000000..5b3f894d --- /dev/null +++ b/tests/e2e/test_run_workflows.py @@ -0,0 +1,572 @@ +"""Test the context module. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in modules +import os +import sys +import csv + +# third-party modules +import pytest +import json + +# project modules +from tests.fixtures.utils import BASE_DIR, MODEL_DIR +from tests.fixtures.utils import global_data +from tests.fixtures.utils import clean_test_temp_files +from tests.fixtures.utils import get_gpu_nodeid_map +from tests.fixtures.utils import get_num_gpus +from tests.fixtures.utils import get_num_cpus +from tests.fixtures.utils import requires_gpu +from tests.fixtures.utils import generate_additional_context_for_machine + +from madengine.core.context import Context + + +# ============================================================================ +# Context Handling Tests +# ============================================================================ + +class TestContexts: + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_dockerfile_picked_on_detected_context_0( + self, global_data, clean_test_temp_files + ): + """ + picks dockerfile based on detected context and only those + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_ctxtest": + if row["status"] == "SUCCESS" and row["performance"] == "0": + success = True + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not success: + pytest.fail("model did not pick correct context.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html", "ctx_test"]], indirect=True + ) + def test_dockerfile_picked_on_detected_context_1( + self, global_data, clean_test_temp_files + ): + """ + picks dockerfile based on detected context and only those + """ + with open(os.path.join(BASE_DIR, "ctx_test"), "w") as ctx_test_file: + print("1", file=ctx_test_file) + + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_ctxtest": + if row["status"] == "SUCCESS" and row["performance"] == "1": + success = True + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not success: + pytest.fail("model did not pick correct context.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html", "ctx_test"]], indirect=True + ) + def test_all_dockerfiles_matching_context_executed( + self, global_data, clean_test_temp_files + ): + """ + All dockerfiles matching context is executed + """ + with open(os.path.join(BASE_DIR, "ctx_test"), "w") as ctx_test_file: + print("2", file=ctx_test_file) + + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest " + ) + + foundDockerfiles = [] + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_ctxtest": + if row["status"] == "SUCCESS" and row["performance"] == "2": + foundDockerfiles.append( + row["docker_file"].replace(f"{MODEL_DIR}/", "") + ) + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not ( + "docker/dummy_ctxtest.ctx2a.ubuntu.amd.Dockerfile" in foundDockerfiles + and "docker/dummy_ctxtest.ctx2b.ubuntu.amd.Dockerfile" in foundDockerfiles + ): + pytest.fail( + "All dockerfiles matching context is not executed. Executed dockerfiles are " + + " ".join(foundDockerfiles) + ) + + def test_dockerfile_executed_if_contexts_keys_are_not_common(self): + """ + Dockerfile is executed even if all context keys are not common but common keys match + """ + # already tested in test_dockerfile_picked_on_detected_context_0 + pass + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_can_override_context_with_additionalContext_commandline( + self, global_data, clean_test_temp_files + ): + """ + Context can be overridden through additional-context command-line argument + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest --additional-context \"{'ctx_test': '1'}\" " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_ctxtest": + if row["status"] == "SUCCESS" and row["performance"] == "1": + success = True + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not success: + pytest.fail("model did not pick correct context.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html", "ctx.json"]], indirect=True + ) + def test_can_override_context_with_additionalContextFile_commandline( + self, global_data, clean_test_temp_files + ): + """ + Context can be overridden through additional-context-file + """ + with open(os.path.join(BASE_DIR, "ctx.json"), "w") as ctx_json_file: + print('{ "ctx_test": "1" }', file=ctx_json_file) + + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest --additional-context-file ctx.json " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_ctxtest": + if row["status"] == "SUCCESS" and row["performance"] == "1": + success = True + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not success: + pytest.fail("model did not pick correct context.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html", "ctx.json"]], indirect=True + ) + def test_additionalContext_commandline_overrides_additionalContextFile( + self, global_data, clean_test_temp_files + ): + """ + additional-context command-line argument has priority over additional-context-file + """ + with open(os.path.join(BASE_DIR, "ctx.json"), "w") as ctx_json_file: + print('{ "ctx_test": "2" }', file=ctx_json_file) + + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest --additional-context-file ctx.json --additional-context \"{'ctx_test': '1'}\" " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_ctxtest": + if row["status"] == "SUCCESS" and row["performance"] == "1": + success = True + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not success: + pytest.fail("model did not pick correct context.") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_base_docker_override(self, global_data, clean_test_temp_files): + """ + BASE_DOCKER overrides base docker + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest --additional-context \"{'docker_build_arg':{'BASE_DOCKER':'rocm/tensorflow' }}\" " + ) + + foundBaseDocker = [] + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_ctxtest": + if row["status"] == "SUCCESS" and row["performance"] == "0": + foundBaseDocker.append(row["base_docker"]) + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not "rocm/tensorflow" in foundBaseDocker: + pytest.fail( + "BASE_DOCKER does not override base docker. Expected: rocm/tensorflow Found:" + + str(foundBaseDocker) + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_docker_image_override(self, global_data, clean_test_temp_files): + """ + Using user-provided image passed in with MAD_CONTAINER_IMAGE + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest --additional-context \"{'docker_env_vars':{'ctxtest':'1'},'MAD_CONTAINER_IMAGE':'rocm/tensorflow:latest' }\" " + ) + + foundLocalImage = None + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_ctxtest": + if row["status"] == "SUCCESS" and row["performance"] == "1": + foundLocalImage = row["docker_image"] + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not "rocm/tensorflow:latest" in foundLocalImage: + pytest.fail( + "MAD_CONTAINER_IMAGE does not override docker image. Expected: rocm/tensorflow:latest Found:" + + foundLocalImage + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_docker_env_vars_override(self, global_data, clean_test_temp_files): + """ + docker_env_vars pass environment variables into docker container + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_ctxtest --additional-context \"{'docker_env_vars':{'ctxtest':'1'} }\" " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_ctxtest": + if row["status"] == "SUCCESS" and row["performance"] == "1": + success = True + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not success: + pytest.fail( + "docker_env_vars did not pass environment variables into docker container." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_docker_mounts_mount_host_paths_in_docker_container( + self, global_data, clean_test_temp_files + ): + """ + docker_mounts mount host paths inside docker containers + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_mountpath --additional-context \"{'docker_env_vars':{'MAD_DATAHOME':'/data'}, 'docker_mounts':{'/data':'/tmp'} }\" " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if row["model"] == "dummy_mountpath": + if row["status"] == "SUCCESS": + success = True + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not success: + pytest.fail( + "docker_mounts did not mount host paths inside docker container." + ) + + @requires_gpu("docker gpus requires GPU hardware") + @pytest.mark.skipif( + get_num_gpus() < 8, reason="test requires atleast 8 gpus" + ) + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "results_dummy_gpubind.csv"]], + indirect=True, + ) + def test_docker_gpus(self, global_data, clean_test_temp_files): + """ + docker_gpus binds gpus to docker containers + """ + + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_gpubind --additional-context \"{'docker_gpus':'0,2-4,5-5,7'}\" " + ) + + gpu_nodeid_map = get_gpu_nodeid_map() + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + gpu_node_ids = [] + for row in csv_reader: + if "dummy_gpubind" in row["model"]: + if row["status"] == "SUCCESS": + gpu_node_ids.append(row["performance"]) + else: + pytest.fail("model in perf_test.csv did not run successfully.") + + # Debug information + print(f"GPU node IDs from performance: {gpu_node_ids}") + print(f"GPU nodeid map: {gpu_nodeid_map}") + mapped_gpus = [gpu_nodeid_map.get(node_id) for node_id in gpu_node_ids] + print(f"Mapped GPUs: {mapped_gpus}") + + # Filter out None values and sort + valid_mapped_gpus = [gpu for gpu in mapped_gpus if gpu is not None] + sorted_gpus = sorted(valid_mapped_gpus) + print(f"Sorted valid GPUs: {sorted_gpus}") + + if sorted_gpus != [0, 2, 3, 4, 5, 7]: + pytest.fail(f"docker_gpus did not bind expected gpus in docker container. Expected: [0, 2, 3, 4, 5, 7], Got: {sorted_gpus}, Raw node IDs: {gpu_node_ids}, Mapping: {gpu_nodeid_map}") + + @pytest.mark.skipif( + get_num_cpus() < 64, reason="test requires atleast 64 cpus" + ) + @pytest.mark.parametrize( + "clean_test_temp_files", + [["perf.csv", "perf.html", "results_dummy_cpubind.csv"]], + indirect=True, + ) + def test_docker_cpus(self, global_data, clean_test_temp_files): + """ + docker_cpus binds cpus to docker containers + """ + + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy_cpubind --additional-context \"{'docker_cpus':'14-18,32,44-44,62'}\" " + ) + + success = False + with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: + csv_reader = csv.DictReader(csv_file) + for row in csv_reader: + if "dummy_cpubind" in row["model"]: + if ( + row["status"] == "SUCCESS" + and row["performance"] == "14-18|32|44|62" + ): + success = True + else: + pytest.fail("model in perf_test.csv did not run successfully.") + if not success: + pytest.fail("docker_cpus did not bind expected cpus in docker container.") + + def test_gpu_product_name_matches_arch(self): + """ + Check MAD_SYSTEM_GPU_PRODUCT_NAME is not empty and is valid. + + No models run for this test. + """ + + context = Context() + product_name = context.ctx['docker_env_vars']["MAD_SYSTEM_GPU_PRODUCT_NAME"] + + #fail the test if GPU product name is empty + if not product_name or not product_name.strip(): + pytest.fail("GPU product name is empty or just whitespaces") + + product_name = product_name.upper() + + #if product name has AMD or NVIDIA in it then it's a safe bet + #that it was parsed properly + if not ("AMD" in product_name or "NVIDIA" in product_name): + pytest.fail(f"Incorrect product name={product_name!r}") + + + +# ============================================================================ +# Tag Filtering Tests +# ============================================================================ + +class TestTagsFunctionality: + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_can_select_model_subset_with_commandline_tag_argument( + self, global_data, clean_test_temp_files + ): + """ + can select subset of models with tag with command-line argument + """ + context = generate_additional_context_for_machine() + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --tags dummy_group_1 --live-output --additional-context '{json.dumps(context)}'" + ) + + # Check for model execution (handles ANSI codes in output) + if "dummy" not in output or "ci-dummy_dummy" not in output: + pytest.fail("dummy tag not selected with commandline --tags argument") + + if "dummy2" not in output or "ci-dummy2_dummy" not in output: + pytest.fail("dummy2 tag not selected with commandline --tags argument") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_all_models_matching_any_tag_selected_with_multiple_tags( + self, global_data, clean_test_temp_files + ): + """ + if multiple tags are specified, all models that match any tag will be selected + """ + context = generate_additional_context_for_machine() + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --tags dummy_group_1,dummy_group_2 --live-output --additional-context '{json.dumps(context)}'" + ) + + # Check for model execution (handles ANSI codes in output) + if "dummy" not in output or "ci-dummy_dummy" not in output: + pytest.fail("dummy tag not selected with commandline --tags argument") + + if "dummy2" not in output or "ci-dummy2_dummy" not in output: + pytest.fail("dummy2 tag not selected with commandline --tags argument") + + if "dummy3" not in output or "ci-dummy3_dummy" not in output: + pytest.fail("dummy3 tag not selected with commandline --tags argument") + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_model_names_are_automatically_tags( + self, global_data, clean_test_temp_files + ): + """ + Each model name is automatically a tag + """ + context = generate_additional_context_for_machine() + output = global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + f"python3 -m madengine.cli.app run --tags dummy --live-output --additional-context '{json.dumps(context)}'" + ) + + # Check for model execution (handles ANSI codes in output) + if "dummy" not in output or "ci-dummy_dummy" not in output: + pytest.fail("dummy tag not selected with commandline --tags argument") + + diff --git a/tests/e2e/test_scripting_workflows.py b/tests/e2e/test_scripting_workflows.py new file mode 100644 index 00000000..682eb53a --- /dev/null +++ b/tests/e2e/test_scripting_workflows.py @@ -0,0 +1,345 @@ +"""Test the scripts for pre and post processing. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in modules +import os +import re +import csv +import time + +# 3rd party modules +import pytest +import json + +# project modules +from tests.fixtures.utils import BASE_DIR, MODEL_DIR +from tests.fixtures.utils import global_data +from tests.fixtures.utils import clean_test_temp_files +from tests.fixtures.utils import is_nvidia +from tests.fixtures.utils import generate_additional_context_for_machine + + +class TestPrePostScriptsFunctionality: + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_pre_scripts_run_before_model(self, global_data, clean_test_temp_files): + """ + pre_scripts are run in docker container before model execution + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --tags dummy --additional-context \"{ 'pre_scripts':[{'path':'scripts/common/pre_scripts/pre_test.sh'}] }\" --live-output " + ) + + regexp = re.compile(r"Pre-Script test called ([0-9]*)") + foundLine = None + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundLine = match.groups()[0] + if foundLine != "0": + pytest.fail( + "pre_scripts specification did not run the selected pre-script." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_post_scripts_run_after_model(self, global_data, clean_test_temp_files): + """ + post_scripts are run in docker container after model execution + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context \"{ 'post_scripts':[{'path':'scripts/common/post_scripts/post_test.sh'}] }\" " + ) + + regexp = re.compile(r"Post-Script test called ([0-9]*)") + foundLine = None + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundLine = match.groups()[0] + if foundLine != "0": + pytest.fail( + "post_scripts specification did not run the selected post-script." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_pre_scripts_accept_arguments(self, global_data, clean_test_temp_files): + """ + pre_scripts are run in docker container before model execution and accept arguments + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context \"{ 'pre_scripts':[{'path':'scripts/common/pre_scripts/pre_test.sh', 'args':'1'}] }\" " + ) + + regexp = re.compile(r"Pre-Script test called ([0-9]*)") + foundLine = None + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundLine = match.groups()[0] + if foundLine != "1": + pytest.fail( + "pre_scripts specification did not run the selected pre-script." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_post_scripts_accept_arguments(self, global_data, clean_test_temp_files): + """ + post_scripts are run in docker container after model execution and accept arguments + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context \"{ 'post_scripts':[{'path':'scripts/common/post_scripts/post_test.sh', 'args':'1'}] }\" " + ) + + regexp = re.compile(r"Post-Script test called ([0-9]*)") + foundLine = None + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundLine = match.groups()[0] + if foundLine != "1": + pytest.fail( + "post_scripts specification did not run the selected post-script." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_both_pre_and_post_scripts_run_before_and_after_model( + self, global_data, clean_test_temp_files + ): + """ + post_scripts are run in docker container after model execution + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context \"{ 'pre_scripts':[{'path':'scripts/common/pre_scripts/pre_test.sh'}], 'post_scripts':[{'path':'scripts/common/post_scripts/post_test.sh'}] }\" " + ) + + regexp = re.compile(r"Pre-Script test called ([0-9]*)") + foundLine = None + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundLine = match.groups()[0] + if foundLine != "0": + pytest.fail( + "pre_scripts specification did not run the selected pre-script." + ) + + regexp = re.compile(r"Post-Script test called ([0-9]*)") + foundLine = None + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundLine = match.groups()[0] + if foundLine != "0": + pytest.fail( + "post_scripts specification did not run the selected post-script." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_all_pre_scripts_run_in_order(self, global_data, clean_test_temp_files): + """ + all pre_scripts are run in order + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context \"{ 'pre_scripts':[{'path':'scripts/common/pre_scripts/pre_test.sh', 'args':'1'}, {'path':'scripts/common/pre_scripts/pre_test.sh', 'args':'2'} ] }\" " + ) + + regexp = re.compile(r"Pre-Script test called ([0-9]*)") + foundLine = None + pre_post_script_count = 0 + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundLine = match.groups()[0] + pre_post_script_count += 1 + if foundLine != str(pre_post_script_count): + pytest.fail( + "pre_scripts run in order. Did not find " + + str(pre_post_script_count) + ) + + if foundLine != "2": + pytest.fail( + "pre_scripts specification did not run the selected pre-script." + ) + + @pytest.mark.parametrize( + "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True + ) + def test_all_post_scripts_run_in_order(self, global_data, clean_test_temp_files): + """ + all post_scripts are run in order + """ + global_data["console"].sh( + "cd " + + BASE_DIR + + "; " + + "MODEL_DIR=" + + MODEL_DIR + + " " + + "python3 -m madengine.cli.app run --live-output --tags dummy --additional-context \"{ 'post_scripts':[{'path':'scripts/common/post_scripts/post_test.sh', 'args':'1'}, {'path':'scripts/common/post_scripts/post_test.sh', 'args':'2'} ] }\" " + ) + + regexp = re.compile(r"Post-Script test called ([0-9]*)") + foundLine = None + pre_post_script_count = 0 + with open( + os.path.join( + BASE_DIR, + "dummy_dummy.ubuntu." + + ("amd" if not is_nvidia() else "nvidia") + + ".live.log", + ), + "r", + ) as f: + while True: + line = f.readline() + if not line: + break + match = regexp.search(line) + if match: + foundLine = match.groups()[0] + pre_post_script_count += 1 + if foundLine != str(pre_post_script_count): + pytest.fail( + "post_scripts run in order. Did not find " + + str(pre_post_script_count) + ) + + if foundLine != "2": + pytest.fail( + "post_scripts specification did not run the selected post-script." + ) diff --git a/tests/fixtures/dummy/credential.json b/tests/fixtures/dummy/credential.json index 1b8a56df..04e514b5 100644 --- a/tests/fixtures/dummy/credential.json +++ b/tests/fixtures/dummy/credential.json @@ -1,21 +1,21 @@ { - "NAS_NODES": [ - { - "NAME": "default", - "HOST": "localhost", - "PORT": "22", - "USERNAME": "admin", - "PASSWORD": "admin" - } - ], - "MAD_AWS_S3": { - "USERNAME": "admin", - "PASSWORD": "admin" - }, - "MAD_MINIO": { - "USERNAME": "admin-access-key", - "PASSWORD": "admin-secret-key", - "MINIO_ENDPOINT": "http://127.0.1:9000", - "AWS_ENDPOINT_URL_S3": "http://127.0.1:9000" - } + "NAS_NODES": [ + { + "NAME": "default", + "HOST": "localhost", + "PORT": "22", + "USERNAME": "admin", + "PASSWORD": "admin" + } + ], + "MAD_AWS_S3": { + "USERNAME": "admin", + "PASSWORD": "admin" + }, + "MAD_MINIO": { + "USERNAME": "admin-access-key", + "PASSWORD": "admin-secret-key", + "MINIO_ENDPOINT": "http://127.0.1:9000", + "AWS_ENDPOINT_URL_S3": "http://127.0.1:9000" + } } \ No newline at end of file diff --git a/tests/fixtures/dummy/docker/dummy_deepspeed.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_deepspeed.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..33a9e100 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_deepspeed.ubuntu.amd.Dockerfile @@ -0,0 +1,33 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER + +# ============================================================================ +# Install DeepSpeed and Dependencies +# ============================================================================ +# Install mpi4py (required for DeepSpeed distributed initialization) +RUN pip install mpi4py deepspeed + +# ============================================================================ +# ROCm/MIOpen Optimizations +# ============================================================================ +RUN if [ -d "$HOME/.config/miopen" ]; then \ + rm -rf $HOME/.config/miopen/* ; \ + fi && \ + if [ -d "/tmp/.miopen" ]; then \ + rm -rf /tmp/.miopen/* ; \ + fi + +ENV MIOPEN_FIND_MODE=1 \ + MIOPEN_USER_DB_PATH=/tmp/.miopen + +RUN mkdir -p /tmp/.miopen && chmod 1777 /tmp/.miopen + +# ============================================================================ +# DeepSpeed Environment +# ============================================================================ +ENV DEEPSPEED_LAUNCHER=deepspeed + +# Verify installations +RUN python3 -c "import deepspeed; print(f'DeepSpeed version: {deepspeed.__version__}')" +RUN rocminfo > /dev/null 2>&1 || echo "ROCm check (OK in build env)" diff --git a/tests/fixtures/dummy/docker/dummy_megatron_lm.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_megatron_lm.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..e297a17a --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_megatron_lm.ubuntu.amd.Dockerfile @@ -0,0 +1,48 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +# Using official ROCm Megatron-LM image from https://hub.docker.com/r/rocm/megatron-lm +ARG BASE_DOCKER=rocm/megatron-lm:latest +FROM $BASE_DOCKER + +# ============================================================================ +# ROCm/MIOpen Optimizations +# ============================================================================ +# Clear any existing MIOpen cache to ensure clean state +RUN if [ -d "$HOME/.config/miopen" ]; then \ + rm -rf $HOME/.config/miopen/* ; \ + fi && \ + if [ -d "/tmp/.miopen" ]; then \ + rm -rf /tmp/.miopen/* ; \ + fi + +# Configure MIOpen for optimal performance +ENV MIOPEN_FIND_MODE=1 \ + MIOPEN_USER_DB_PATH=/tmp/.miopen + +RUN mkdir -p /tmp/.miopen && chmod 1777 /tmp/.miopen + +# ============================================================================ +# Distributed Training Environment Variables +# ============================================================================ +# Optimized settings for ROCm distributed training +ENV MEGATRON_FRAMEWORK=megatron_lm \ + CUDA_DEVICE_MAX_CONNECTIONS=1 \ + NCCL_IB_DISABLE=1 \ + NCCL_SOCKET_IFNAME=eth0 \ + NCCL_DEBUG=WARN \ + TORCH_NCCL_HIGH_PRIORITY=1 \ + GPU_MAX_HW_QUEUES=2 \ + HSA_ENABLE_SDMA=0 \ + HSA_FORCE_FINE_GRAIN_PCIE=1 \ + RCCL_ENABLE_HIPGRAPH=0 + +# ============================================================================ +# Verify Installation +# ============================================================================ +# Verify Megatron-LM and ROCm are properly installed +RUN python3 -c "import megatron; print('✓ Megatron-LM available')" && \ + python3 -c "from megatron.core import parallel_state; print('✓ Megatron-Core available')" && \ + python3 -c "import torch; print(f'✓ PyTorch {torch.__version__}')" && \ + python3 -c "import torch; print(f'✓ CUDA/ROCm available: {torch.cuda.is_available()}')" && \ + rocminfo > /dev/null 2>&1 || echo "ROCm check (OK in build env)" + +WORKDIR /workspace diff --git a/tests/fixtures/dummy/docker/dummy_sglang.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_sglang.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..f45e5bc3 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_sglang.ubuntu.amd.Dockerfile @@ -0,0 +1,133 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +# SGLang Dockerfile for AMD ROCm - Using official SGLang image with ROCm 7.x support +# Reference: https://github.com/sgl-project/sglang + +# ============================================================================ +# Base Image: Official SGLang with ROCm 7.x Support +# ============================================================================ +# Using lmsysorg/sglang:latest which includes: +# - SGLang with latest features (RadixAttention, multi-modal support) +# - ROCm 7.x for AMD MI300X and latest GPU support +# - Pre-optimized kernels and dependencies +# - Ray for distributed inference +ARG BASE_DOCKER=lmsysorg/sglang:latest +FROM $BASE_DOCKER + +# ============================================================================ +# ROCm 7.x Environment Configuration +# ============================================================================ +# MIOpen configuration for optimal kernel selection +ENV MIOPEN_FIND_MODE=1 \ + MIOPEN_USER_DB_PATH=/tmp/.miopen \ + MIOPEN_CUSTOM_CACHE_DIR=/tmp/.miopen + +RUN mkdir -p /tmp/.miopen && chmod 1777 /tmp/.miopen + +# ROCm 7.x specific optimizations for MI300X +ENV HSA_FORCE_FINE_GRAIN_PCIE=1 \ + HSA_ENABLE_SDMA=0 \ + GPU_MAX_HW_QUEUES=2 \ + NCCL_DEBUG=WARN \ + NCCL_MIN_NCHANNELS=16 \ + TORCH_NCCL_ASYNC_ERROR_HANDLING=1 + +# ROCm 7.x advanced features +ENV ROCM_USE_FLASH_ATTENTION=1 \ + HIP_FORCE_DEV_KERNARG=1 + +# ============================================================================ +# SGLang Runtime Configuration +# ============================================================================ +# Core SGLang settings for production deployment +ENV SGLANG_ALLOW_LONG_MAX_MODEL_LEN=1 \ + SGLANG_USE_MODELSCOPE=False \ + SGLANG_LOGGING_LEVEL=INFO + +# SGLang RadixAttention - Automatic prefix caching for efficient KV cache +# Reference: https://github.com/sgl-project/sglang#radixattention +# This is SGLang's key innovation for 5-10x speedup on shared prefix workloads +ENV SGLANG_ENABLE_RADIX_CACHE=1 \ + SGLANG_RADIX_CACHE_SIZE=0.9 + +# Ray Configuration for Distributed Multi-Node Inference +# SGLang uses Ray for coordination across nodes +ENV RAY_DEDUP_LOGS=1 \ + RAY_BACKEND_LOG_LEVEL=warning \ + RAY_USAGE_STATS_ENABLED=0 \ + RAY_USAGE_STATS_ENABLED_OVERRIDE=0 + +# ============================================================================ +# Verification - Ensure ROCm 7.x and SGLang are properly configured +# ============================================================================ +# Verify SGLang installation (from base image) +RUN python3 -c "import sglang; \ + print(f'✓ SGLang version: {sglang.__version__}'); \ + print(f'✓ SGLang installation: Production-ready')" || \ + (echo "✗ SGLang import failed" && exit 1) + +# Verify PyTorch with ROCm 7.x +RUN python3 <<'EOF' +import torch +print(f'✓ PyTorch version: {torch.__version__}') +is_rocm = hasattr(torch.version, 'hip') and torch.version.hip is not None +print(f'✓ ROCm available: {is_rocm}') +if is_rocm: + hip_version = torch.version.hip + print(f'✓ ROCm/HIP version: {hip_version}') + major_version = int(hip_version.split('.')[0]) if hip_version else 0 + if major_version >= 7: + print(f'✓ ROCm 7.x+ detected (optimal for MI300X)') + else: + print(f'⚠ ROCm version < 7.0 (consider upgrading)') +EOF + +# GPU device check (will show count = 0 in build environment) +RUN python3 <<'EOF' || true +import torch +gpu_count = torch.cuda.device_count() +print(f'✓ GPU devices detected: {gpu_count}') +if gpu_count == 0: + print(' (No GPUs in build environment - GPUs will be available at runtime)') +else: + for i in range(gpu_count): + print(f' GPU {i}: {torch.cuda.get_device_name(i)}') +EOF + +# Verify key dependencies +RUN python3 -c "import transformers; print(f'✓ Transformers: {transformers.__version__}')" || \ + (echo "✗ Transformers check failed" && exit 1) + +# Verify Ray (optional - only needed for distributed multi-node inference) +RUN python3 -c "import ray; print(f'✓ Ray: {ray.__version__} (for distributed coordination)')" || \ + echo "⚠ Ray not found (optional - only needed for multi-node distributed inference)" + +# Verify SGLang server module (key for inference) +RUN python3 -c "from sglang import launch_server; print('✓ SGLang server module available')" || \ + (echo "✗ SGLang server module not found" && exit 1) + +# ============================================================================ +# Workspace Setup +# ============================================================================ +WORKDIR /workspace + +# ============================================================================ +# Final Environment Summary +# ============================================================================ +RUN echo "========================================================================" && \ + echo "✅ SGLang Docker Image Build Complete" && \ + echo "========================================================================" && \ + echo "Base Image: lmsysorg/sglang:latest" && \ + echo "ROCm Version: $(cat /opt/rocm/.info/version 2>/dev/null || echo '7.x')" && \ + echo "SGLang Version: $(python3 -c 'import sglang; print(sglang.__version__)')" && \ + echo "PyTorch Version: $(python3 -c 'import torch; print(torch.__version__)')" && \ + echo "Ray Version: $(python3 -c 'import ray; print(ray.__version__)')" && \ + echo "------------------------------------------------------------------------" && \ + echo "Build Type: Production (Official SGLang with ROCm 7.x)" && \ + echo "Target GPUs: AMD MI300X, MI250X (ROCm 7.x optimized)" && \ + echo "Key Features: RadixAttention, Multi-modal, Distributed Inference" && \ + echo "Reference: https://github.com/sgl-project/sglang" && \ + echo "========================================================================" && \ + echo "" && \ + echo "🚀 Ready for distributed LLM inference on AMD GPUs!" && \ + echo "" + diff --git a/tests/fixtures/dummy/docker/dummy_sglang_disagg.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_sglang_disagg.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..43d04337 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_sglang_disagg.ubuntu.amd.Dockerfile @@ -0,0 +1,186 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +# SGLang Disaggregated Dockerfile for AMD ROCm - Dummy Test Version +# Reference: https://github.com/sgl-project/sglang +# Reference: https://github.com/kvcache-ai/Mooncake (disaggregation framework) + +# ============================================================================ +# Base Image: Official SGLang with ROCm 7.x Support +# ============================================================================ +# Using lmsysorg/sglang:latest which includes: +# - SGLang with disaggregation support +# - ROCm 7.x for AMD MI300X +# - Ray for distributed coordination +ARG BASE_DOCKER=lmsysorg/sglang:latest +FROM $BASE_DOCKER + +# ============================================================================ +# ROCm 7.x Environment Configuration +# ============================================================================ +ENV MIOPEN_FIND_MODE=1 \ + MIOPEN_USER_DB_PATH=/tmp/.miopen \ + MIOPEN_CUSTOM_CACHE_DIR=/tmp/.miopen + +RUN mkdir -p /tmp/.miopen && chmod 1777 /tmp/.miopen + +# ROCm 7.x optimizations for MI300X +ENV HSA_FORCE_FINE_GRAIN_PCIE=1 \ + HSA_ENABLE_SDMA=0 \ + GPU_MAX_HW_QUEUES=2 \ + NCCL_DEBUG=WARN \ + NCCL_MIN_NCHANNELS=16 \ + TORCH_NCCL_ASYNC_ERROR_HANDLING=1 + +ENV ROCM_USE_FLASH_ATTENTION=1 \ + HIP_FORCE_DEV_KERNARG=1 + +# ============================================================================ +# SGLang Disaggregated Configuration +# ============================================================================ +# Core SGLang settings +ENV SGLANG_ALLOW_LONG_MAX_MODEL_LEN=1 \ + SGLANG_USE_MODELSCOPE=False \ + SGLANG_LOGGING_LEVEL=INFO + +# SGLang Disaggregation - Enable prefill/decode separation +ENV SGLANG_ENABLE_DISAGGREGATION=1 \ + SGLANG_DISAGG_TRANSFER_BACKEND=mooncake + +# RadixAttention for KV cache efficiency +ENV SGLANG_ENABLE_RADIX_CACHE=1 \ + SGLANG_RADIX_CACHE_SIZE=0.9 + +# Ray Configuration for distributed coordination +ENV RAY_DEDUP_LOGS=1 \ + RAY_BACKEND_LOG_LEVEL=warning \ + RAY_USAGE_STATS_ENABLED=0 \ + RAY_USAGE_STATS_ENABLED_OVERRIDE=0 + +# ============================================================================ +# Mooncake Framework Setup (Simplified for Dummy Test) +# ============================================================================ +# Mooncake is the KV cache transfer framework for disaggregated inference +# Reference: https://github.com/kvcache-ai/Mooncake +# +# For dummy testing, we create a minimal simulation environment +# Production deployments should use full Mooncake with RDMA support + +# Install dependencies for Mooncake simulation +RUN pip install --no-cache-dir \ + flask \ + py-spy \ + etcd3 \ + && rm -rf /root/.cache/pip/* + +# Create Mooncake cookbook directory structure (for dummy scripts) +RUN mkdir -p /opt/mooncake-cookbook && \ + chmod -R 755 /opt/mooncake-cookbook + +ENV MOONCAKE_COOKBOOK_PATH=/opt/mooncake-cookbook + +# Create dummy Mooncake environment setup script +RUN echo '#!/bin/bash' > /opt/mooncake-cookbook/set_env_vars.sh && \ + echo '# Mooncake Environment Variables (Dummy Test Mode)' >> /opt/mooncake-cookbook/set_env_vars.sh && \ + echo 'export MOONCAKE_TEST_MODE=1' >> /opt/mooncake-cookbook/set_env_vars.sh && \ + echo 'export MOONCAKE_TRANSFER_PROTOCOL=tcp' >> /opt/mooncake-cookbook/set_env_vars.sh && \ + echo 'export IBDEVICES=eth0' >> /opt/mooncake-cookbook/set_env_vars.sh && \ + echo 'echo "✓ Mooncake environment configured (test mode)"' >> /opt/mooncake-cookbook/set_env_vars.sh && \ + chmod +x /opt/mooncake-cookbook/set_env_vars.sh + +# Create dummy synchronization scripts for multi-node coordination +RUN echo '#!/usr/bin/env python3' > /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'import sys' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'import time' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'import argparse' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'parser = argparse.ArgumentParser()' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'parser.add_argument("--local-ip", default="localhost")' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'parser.add_argument("--local-port", type=int, default=5000)' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'parser.add_argument("--enable-port", action="store_true")' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'parser.add_argument("--node-ips", default="localhost")' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'parser.add_argument("--node-ports", default="5000")' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'args = parser.parse_args()' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'print(f"[Barrier] Synchronizing nodes...")' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'time.sleep(1) # Simulate barrier' >> /opt/mooncake-cookbook/socket_barrier.py && \ + echo 'print(f"[Barrier] All nodes synchronized")' >> /opt/mooncake-cookbook/socket_barrier.py && \ + chmod +x /opt/mooncake-cookbook/socket_barrier.py + +RUN echo '#!/usr/bin/env python3' > /opt/mooncake-cookbook/socket_wait.py && \ + echo 'import sys' >> /opt/mooncake-cookbook/socket_wait.py && \ + echo 'import time' >> /opt/mooncake-cookbook/socket_wait.py && \ + echo 'import argparse' >> /opt/mooncake-cookbook/socket_wait.py && \ + echo 'parser = argparse.ArgumentParser()' >> /opt/mooncake-cookbook/socket_wait.py && \ + echo 'parser.add_argument("--remote-ip", default="localhost")' >> /opt/mooncake-cookbook/socket_wait.py && \ + echo 'parser.add_argument("--remote-port", type=int, default=30000)' >> /opt/mooncake-cookbook/socket_wait.py && \ + echo 'args = parser.parse_args()' >> /opt/mooncake-cookbook/socket_wait.py && \ + echo 'print(f"[Wait] Waiting for {args.remote_ip}:{args.remote_port}")' >> /opt/mooncake-cookbook/socket_wait.py && \ + echo 'time.sleep(2) # Simulate wait' >> /opt/mooncake-cookbook/socket_wait.py && \ + echo 'print(f"[Wait] Connection closed")' >> /opt/mooncake-cookbook/socket_wait.py && \ + chmod +x /opt/mooncake-cookbook/socket_wait.py + +# ============================================================================ +# Verification - Ensure all components are ready +# ============================================================================ +# Verify SGLang with disaggregation support +RUN python3 -c "import sglang; \ + print(f'✓ SGLang version: {sglang.__version__}'); \ + print(f'✓ SGLang installation: Disaggregation-ready')" || \ + (echo "✗ SGLang import failed" && exit 1) + +# Verify SGLang disaggregation modules +RUN python3 -c "from sglang.srt.disaggregation.mini_lb import main; \ + print('✓ SGLang disaggregation modules available (mini_lb)')" || \ + (echo "⚠ SGLang disaggregation module check failed (may require newer version)" && true) + +# Verify PyTorch with ROCm 7.x +RUN python3 -c "import torch; \ + print(f'✓ PyTorch version: {torch.__version__}'); \ + is_rocm = hasattr(torch.version, 'hip') and torch.version.hip is not None; \ + print(f'✓ ROCm available: {is_rocm}'); \ + if is_rocm: \ + hip_version = torch.version.hip; \ + print(f'✓ ROCm/HIP version: {hip_version}')" || \ + (echo "✗ PyTorch/ROCm check failed" && exit 1) + +# Verify dependencies +RUN python3 -c "import transformers; print(f'✓ Transformers: {transformers.__version__}')" && \ + python3 -c "import ray; print(f'✓ Ray: {ray.__version__}')" && \ + python3 -c "import flask; print(f'✓ Flask: {flask.__version__}')" || \ + (echo "✗ Dependency check failed" && exit 1) + +# ============================================================================ +# Workspace Setup +# ============================================================================ +WORKDIR /workspace + +# Create run logs directory +RUN mkdir -p /run_logs && chmod 1777 /run_logs + +# ============================================================================ +# Final Environment Summary +# ============================================================================ +RUN echo "========================================================================" && \ + echo "✅ SGLang Disaggregated Docker Image Build Complete (Dummy Test)" && \ + echo "========================================================================" && \ + echo "Base Image: lmsysorg/sglang:latest" && \ + echo "ROCm Version: $(cat /opt/rocm/.info/version 2>/dev/null || echo '7.x')" && \ + echo "SGLang Version: $(python3 -c 'import sglang; print(sglang.__version__)')" && \ + echo "PyTorch Version: $(python3 -c 'import torch; print(torch.__version__)')" && \ + echo "Ray Version: $(python3 -c 'import ray; print(ray.__version__)')" && \ + echo "------------------------------------------------------------------------" && \ + echo "Build Type: Dummy Test (Disaggregated Architecture)" && \ + echo "Target GPUs: AMD MI300X, MI250X (ROCm 7.x optimized)" && \ + echo "Architecture: Prefill/Decode Separation" && \ + echo "Transfer Backend: Mooncake (simulated for testing)" && \ + echo "Min Nodes: 3 (1 proxy + 1 prefill + 1 decode)" && \ + echo "------------------------------------------------------------------------" && \ + echo "Key Features:" && \ + echo " • Disaggregated prefill/decode clusters" && \ + echo " • Mooncake framework simulation" && \ + echo " • Multi-node coordination (Ray + etcd)" && \ + echo " • RadixAttention for KV cache efficiency" && \ + echo "========================================================================" && \ + echo "" && \ + echo "🚀 Ready for SGLang Disaggregated testing on AMD GPUs!" && \ + echo " Note: This is a dummy/test image for madengine validation" && \ + echo " For production: Use full Mooncake with RDMA support" && \ + echo "" + diff --git a/tests/fixtures/dummy/docker/dummy_therock.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_therock.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..16dda670 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_therock.ubuntu.amd.Dockerfile @@ -0,0 +1,126 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +############################################################################### +# +# MIT License +# +# Copyright (c) Advanced Micro Devices, Inc. +# +# Dockerfile for PyTorch Benchmarking with TheRock ROCm Distribution +# TheRock provides HIP and ROCm components via Python pip packages +# Reference: https://github.com/ROCm/TheRock +# +############################################################################### +ARG BASE_DOCKER=ubuntu:24.04 +FROM ${BASE_DOCKER} + +# Set environment variables +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gfortran \ + git \ + ninja-build \ + cmake \ + g++ \ + pkg-config \ + xxd \ + patchelf \ + automake \ + libtool \ + python3-venv \ + python3-dev \ + python3-pip \ + libegl1-mesa-dev \ + wget \ + curl \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Create working directory +WORKDIR /workspace + +# Clone TheRock repository +ARG THEROCK_BRANCH=main +RUN git clone https://github.com/ROCm/TheRock.git /workspace/TheRock && \ + cd /workspace/TheRock && \ + git checkout ${THEROCK_BRANCH} + +WORKDIR /workspace/TheRock + +# Setup Python virtual environment and install dependencies +RUN python3 -m venv .venv && \ + . .venv/bin/activate && \ + pip install --upgrade pip && \ + pip install -r requirements.txt + +# Fetch sources (includes submodules and patches) +RUN . .venv/bin/activate && \ + python3 ./build_tools/fetch_sources.py + +# Configure build with CMake +# Default to gfx942 (MI300 series), can be overridden with build arg +ARG MAD_SYSTEM_GPU_ARCHITECTURE=gfx942 + +# Enable components needed for PyTorch: +# - CORE_RUNTIME: Essential ROCm runtime +# - HIP_RUNTIME: HIP runtime for GPU execution +# - BLAS: rocBLAS for linear algebra operations +# - PRIM: rocPRIM for parallel primitives +# - RAND: rocRAND for random number generation +# This is much faster than building all components +RUN . .venv/bin/activate && \ + cmake -B build -GNinja . \ + -DTHEROCK_AMDGPU_TARGETS=${MAD_SYSTEM_GPU_ARCHITECTURE} \ + -DTHEROCK_ENABLE_ALL=OFF \ + -DTHEROCK_ENABLE_CORE_RUNTIME=ON \ + -DTHEROCK_ENABLE_HIP_RUNTIME=ON \ + -DTHEROCK_ENABLE_BLAS=ON \ + -DTHEROCK_ENABLE_PRIM=ON \ + -DTHEROCK_ENABLE_RAND=ON \ + -DBUILD_TESTING=ON + +# Build TheRock components +# This will take some time depending on enabled components +RUN . .venv/bin/activate && \ + cmake --build build + +# Install built components +RUN . .venv/bin/activate && \ + cmake --install build --prefix /opt/rocm + +# Set up runtime environment +ENV PATH=/opt/rocm/bin:/workspace/TheRock/.venv/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH +ENV ROCM_PATH=/opt/rocm +ENV HIP_PATH=/opt/rocm + +# Install PyTorch with ROCm support +# Using PyTorch's official ROCm wheels that work with TheRock's ROCm distribution +RUN . /workspace/TheRock/.venv/bin/activate && \ + pip3 install --no-cache-dir \ + torch \ + torchvision \ + --index-url https://download.pytorch.org/whl/rocm6.2 + +# Verify installations +RUN . /workspace/TheRock/.venv/bin/activate && \ + python3 -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA Available: {torch.cuda.is_available()}'); print(f'Device Count: {torch.cuda.device_count() if torch.cuda.is_available() else 0}'); print(f'ROCm/HIP: {torch.version.hip if hasattr(torch.version, \"hip\") else \"N/A\"}')" + +# Create entrypoint script to activate venv +RUN echo '#!/bin/bash\n\ +source /workspace/TheRock/.venv/bin/activate\n\ +exec "$@"' > /entrypoint.sh && \ + chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] +CMD ["/bin/bash"] + +# Labels +LABEL maintainer="AMD ROCm " +LABEL description="TheRock PyTorch Benchmark - The HIP Environment and ROCm Kit with PyTorch" +LABEL version="nightly" +LABEL gpu_architecture="${MAD_SYSTEM_GPU_ARCHITECTURE}" +LABEL components="core_runtime,hip_runtime,blas,prim,rand,pytorch" + diff --git a/tests/fixtures/dummy/docker/dummy_torchrun.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_torchrun.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..e195b386 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_torchrun.ubuntu.amd.Dockerfile @@ -0,0 +1,46 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER + +# Install any additional dependencies for torchrun testing +# (rocm/pytorch already has PyTorch with distributed support) + +# ============================================================================ +# ROCm/MIOpen Optimizations (Optional - reduces warnings) +# ============================================================================ + +# Clean MIOpen find-db to avoid duplicate kernel warnings +RUN if [ -d "$HOME/.config/miopen" ]; then \ + rm -rf $HOME/.config/miopen/* ; \ + fi && \ + if [ -d "/tmp/.miopen" ]; then \ + rm -rf /tmp/.miopen/* ; \ + fi + +# Set MIOpen environment variables for better performance +# Cache will be cleared at runtime to avoid "Duplicate ID" warnings +ENV MIOPEN_FIND_MODE=1 \ + MIOPEN_USER_DB_PATH=/tmp/.miopen + +# Pre-create MIOpen cache directory with proper permissions +RUN mkdir -p /tmp/.miopen && chmod 1777 /tmp/.miopen + +# ============================================================================ +# Optional: Install additional utilities for debugging +# ============================================================================ +# Uncomment if you need debugging tools: +# RUN apt-get update && apt-get install -y --no-install-recommends \ +# lshw \ +# pciutils \ +# && rm -rf /var/lib/apt/lists/* + +# ============================================================================ +# Verification (Optional - useful for debugging) +# ============================================================================ +# Verify ROCm installation +RUN rocminfo > /dev/null 2>&1 || echo "ROCm info check failed (expected in non-GPU build environment)" + +# Note: The K8s deployment config should override these env vars if needed: +# - MIOPEN_FIND_MODE is already set in deployment_config.env_vars +# - MIOPEN_USER_DB_PATH is already set in deployment_config.env_vars + diff --git a/tests/fixtures/dummy/docker/dummy_torchtitan.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_torchtitan.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..918b3d7d --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_torchtitan.ubuntu.amd.Dockerfile @@ -0,0 +1,64 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch:latest +FROM $BASE_DOCKER + +# ============================================================================ +# Install TorchTitan Dependencies +# ============================================================================ +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Install required Python packages for TorchTitan +RUN pip install --no-cache-dir \ + tomli \ + tomli-w \ + psutil \ + tensorboard + +# ============================================================================ +# Install TorchTitan +# ============================================================================ +WORKDIR /opt +RUN git clone https://github.com/pytorch/torchtitan.git && \ + cd torchtitan && \ + pip install --no-cache-dir -r requirements.txt + +# Set PYTHONPATH to include TorchTitan +ENV PYTHONPATH=/opt/torchtitan:$PYTHONPATH + +# ============================================================================ +# ROCm/MIOpen Optimizations +# ============================================================================ +RUN if [ -d "$HOME/.config/miopen" ]; then \ + rm -rf $HOME/.config/miopen/* ; \ + fi && \ + if [ -d "/tmp/.miopen" ]; then \ + rm -rf /tmp/.miopen/* ; \ + fi + +ENV MIOPEN_FIND_MODE=1 \ + MIOPEN_USER_DB_PATH=/tmp/.miopen + +RUN mkdir -p /tmp/.miopen && chmod 1777 /tmp/.miopen + +# ============================================================================ +# TorchTitan Environment Variables +# ============================================================================ +# Default environment variables for TorchTitan training +# These will be overridden by madengine deployment configs +ENV TORCHTITAN_TENSOR_PARALLEL_SIZE=1 \ + TORCHTITAN_PIPELINE_PARALLEL_SIZE=1 \ + TORCHTITAN_FSDP_ENABLED=0 \ + TORCHTITAN_CONTEXT_PARALLEL_SIZE=1 + +# ============================================================================ +# Verification +# ============================================================================ +# Verify TorchTitan installation +RUN python3 -c "import torch; print(f'✓ PyTorch version: {torch.__version__}')" && \ + test -f /opt/torchtitan/train.py && echo "✓ TorchTitan installed" || echo "⚠ TorchTitan not found" && \ + rocminfo > /dev/null 2>&1 || echo "ROCm check (OK in build env)" + +WORKDIR /workspace + diff --git a/tests/fixtures/dummy/docker/dummy_vllm.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_vllm.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..764f0887 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_vllm.ubuntu.amd.Dockerfile @@ -0,0 +1,92 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +# Production vLLM Dockerfile - Using official ROCm vLLM image for real benchmarking +ARG BASE_DOCKER=rocm/vllm:latest +FROM $BASE_DOCKER + +# ============================================================================ +# ROCm Optimizations +# ============================================================================ +# MIOpen configuration for ROCm +ENV MIOPEN_FIND_MODE=1 \ + MIOPEN_USER_DB_PATH=/tmp/.miopen \ + MIOPEN_CUSTOM_CACHE_DIR=/tmp/.miopen + +RUN mkdir -p /tmp/.miopen && chmod 1777 /tmp/.miopen + +# ============================================================================ +# vLLM Environment Variables for ROCm +# ============================================================================ +# Core vLLM settings +ENV VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ + VLLM_USE_MODELSCOPE=False \ + VLLM_WORKER_MULTIPROC_METHOD=spawn \ + VLLM_LOGGING_LEVEL=INFO + +# ROCm specific optimizations +ENV HSA_FORCE_FINE_GRAIN_PCIE=1 \ + HSA_ENABLE_SDMA=0 \ + GPU_MAX_HW_QUEUES=2 \ + NCCL_DEBUG=WARN \ + NCCL_MIN_NCHANNELS=16 + +# PyTorch settings for ROCm +ENV TORCH_NCCL_ASYNC_ERROR_HANDLING=1 + +# HIP/ROCm runtime settings +# Note: HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES should be set at runtime +# ENV HIP_VISIBLE_DEVICES=0 +# ENV ROCR_VISIBLE_DEVICES=0 + +# ============================================================================ +# vLLM Flash Attention for ROCm +# ============================================================================ +ENV VLLM_USE_FLASH_ATTN_TRITON=1 + +# ============================================================================ +# Verification +# ============================================================================ +# Verify real vLLM installation +RUN python3 -c "import vllm; print(f'✓ vLLM version: {vllm.__version__}'); \ + assert not 'mock' in vllm.__version__.lower(), 'Mock vLLM detected!'" || \ + (echo "✗ vLLM import failed or mock detected" && exit 1) + +# Verify PyTorch with ROCm +RUN python3 -c "import torch; print(f'✓ PyTorch version: {torch.__version__}')" || \ + (echo "✗ PyTorch import failed" && exit 1) + +# Verify ROCm availability +RUN python3 -c "import torch; \ + is_rocm = hasattr(torch.version, 'hip') and torch.version.hip is not None; \ + print(f'✓ ROCm available: {is_rocm}'); \ + print(f'✓ ROCm version: {torch.version.hip if is_rocm else \"N/A\"}')" || \ + (echo "✗ ROCm check failed" && exit 1) + +# GPU device check (will show count = 0 in build environment) +RUN python3 -c "import torch; count = torch.cuda.device_count(); print(f'✓ GPU devices detected: {count}'); print(f'✓ GPU 0: {torch.cuda.get_device_name(0)}' if count > 0 else ' (No GPUs in build environment - will be available at runtime)')" + +# Verify ROCm tools (may not be available in build environment) +RUN rocminfo > /dev/null 2>&1 || echo " (rocminfo check skipped - will be available at runtime)" +RUN rocm-smi > /dev/null 2>&1 || echo " (rocm-smi check skipped - will be available at runtime)" + +# Verify key dependencies +RUN python3 -c "import transformers; print(f'✓ Transformers: {transformers.__version__}')" || \ + (echo "✗ Transformers import failed" && exit 1) +RUN python3 -c "import ray; print(f'✓ Ray: {ray.__version__}')" || \ + (echo "✗ Ray import failed" && exit 1) + +# ============================================================================ +# Workspace Setup +# ============================================================================ +WORKDIR /workspace + +# Print final environment info +RUN echo "=======================================" && \ + echo "vLLM Docker Image Build Complete" && \ + echo "=======================================" && \ + echo "Base Image: rocm/vllm:latest" && \ + echo "ROCm Version: $(cat /opt/rocm/.info/version 2>/dev/null || echo 'latest')" && \ + echo "vLLM Version: $(python3 -c 'import vllm; print(vllm.__version__)')" && \ + echo "PyTorch Version: $(python3 -c 'import torch; print(torch.__version__)')" && \ + echo "Build Type: Production (Real vLLM with ROCm)" && \ + echo "=======================================" + diff --git a/tests/fixtures/dummy/docker/pyt_huggingface.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/pyt_huggingface.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..e56e693d --- /dev/null +++ b/tests/fixtures/dummy/docker/pyt_huggingface.ubuntu.amd.Dockerfile @@ -0,0 +1,121 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +############################################################################### +# +# MIT License +# +# Copyright (c) Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################# +ARG BASE_DOCKER=rocm/pytorch:latest +FROM $BASE_DOCKER + +USER root +ENV WORKSPACE_DIR=/workspace +ENV DEBIAN_FRONTEND=noninteractive + +# Create workspace directory +RUN mkdir -p $WORKSPACE_DIR +WORKDIR $WORKSPACE_DIR + +# Install system dependencies first (better caching) +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + gnupg2 \ + sudo \ + unzip \ + jq \ + sshpass \ + sshfs \ + netcat-traditional \ + locales \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Configure locale +RUN locale-gen en_US.UTF-8 +ENV LANG=en_US.UTF-8 +ENV LANGUAGE=en_US:en +ENV LC_ALL=en_US.UTF-8 + +# Install huggingface transformers - using official repo with latest stable release +# Note: Using official huggingface/transformers instead of ROCm fork for better compatibility +RUN cd /workspace && \ + git clone https://github.com/huggingface/transformers transformers && \ + cd transformers && \ + # Checkout latest stable release tag (adjust as needed) + git checkout $(git describe --tags --abbrev=0) && \ + git show --oneline -s && \ + pip install -e . && \ + cd .. + +# Install core dependencies with compatible versions +# Pin huggingface-hub to compatible range to avoid conflicts +RUN pip3 install --no-cache-dir \ + 'huggingface_hub>=0.20.0' \ + 'tokenizers>=0.13.0' \ + 'datasets>=2.0.0' \ + 'accelerate>=0.20.0' \ + && pip3 list + +# Intentionally skip torchaudio to prevent torch version conflicts +RUN if [ -f /workspace/transformers/examples/pytorch/_tests_requirements.txt ]; then \ + sed -i 's/torchaudio//g' /workspace/transformers/examples/pytorch/_tests_requirements.txt && \ + sed -i 's/torch[>=<].*//g' /workspace/transformers/examples/pytorch/_tests_requirements.txt; \ + fi + +# Install transformers example dependencies +RUN if [ -f /workspace/transformers/examples/pytorch/_tests_requirements.txt ]; then \ + cd /workspace/transformers/examples/pytorch && \ + pip3 install -r _tests_requirements.txt || true; \ + fi + +# Install additional ML and utility packages +RUN pip3 install --no-cache-dir \ + GPUtil \ + azureml \ + azureml-core \ + ninja \ + cerberus \ + sympy \ + sacremoses \ + 'sacrebleu>=2.0.0' \ + sentencepiece \ + scipy \ + scikit-learn \ + evaluate \ + tensorboard \ + && pip3 list + +# Verify installation and dependencies +RUN python3 -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA Available: {torch.cuda.is_available()}'); print(f'ROCm/HIP: {torch.version.hip if hasattr(torch.version, \"hip\") else \"N/A\"}')" && \ + python3 -c "import transformers; print(f'Transformers: {transformers.__version__}')" && \ + python3 -c "import huggingface_hub; print(f'HuggingFace Hub: {huggingface_hub.__version__}')" && \ + python3 -c "from transformers import AutoModel, AutoTokenizer; print('Transformers import successful')" + +# Record final configuration +RUN pip3 list > /workspace/pip_packages.txt && \ + echo "=== Environment Configuration ===" && \ + cat /workspace/pip_packages.txt + +# Reset frontend to avoid issues +ENV DEBIAN_FRONTEND= + +WORKDIR $WORKSPACE_DIR diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json index 1ff21c23..28b3db7f 100644 --- a/tests/fixtures/dummy/models.json +++ b/tests/fixtures/dummy/models.json @@ -162,7 +162,7 @@ "name": "dummy_prof", "dockerfile": "docker/dummy", "scripts": "scripts/dummy/run_prof.sh", - "n_gpus": "1", + "n_gpus": "-1", "owner": "mad.support@amd.com", "training_precision": "", "tags": [ @@ -194,5 +194,250 @@ ], "args": "", "multiple_results": "perf_dummy.csv" + }, + { + "name": "dummy_superset", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_multi.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "perf_super_test" + ], + "args": "--config configs/default.csv", + "multiple_results": "perf_dummy_super.csv" + }, + { + "name": "dummy_data_aws", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_data_aws.sh", + "data": "dummy_data_aws", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_data" + ], + "args": "" + }, + { + "name": "dummy_data_minio", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_data_minio.sh", + "data": "dummy_data_minio", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_data" + ], + "args": "" + }, + { + "name": "dummy_data_austin_nas", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_data_nas.sh", + "data": "dummy_data_austin_nas", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_data" + ], + "args": "" + }, + { + "name": "dummy_torchrun", + "dockerfile": "docker/dummy_torchrun", + "scripts": "scripts/dummy_torchrun/run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed" + ], + "args": "" + }, + { + "name": "dummy_torchrun_helper", + "dockerfile": "docker/dummy_torchrun", + "scripts": "scripts/dummy_torchrun/run_with_helper.py", + "n_gpus": "2", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed" + ], + "args": "" + }, + { + "name": "dummy_torchrun_data_minio", + "dockerfile": "docker/dummy_torchrun", + "scripts": "scripts/dummy_torchrun/run_torchrun_data_minio.py", + "data": "dummy_data_minio", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed", + "dummy_data" + ], + "args": "" + }, + { + "name": "dummy_torchrun_data_nas", + "dockerfile": "docker/dummy_torchrun", + "scripts": "scripts/dummy_torchrun/run_torchrun_data_nas.py", + "data": "dummy_data_austin_nas", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed", + "dummy_data" + ], + "args": "" + }, + { + "name": "dummy_megatron_lm", + "dockerfile": "docker/dummy_megatron_lm", + "scripts": "scripts/dummy_megatron_lm/run_megatron.py", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed", + "dummy_megatron" + ], + "args": "" + }, + { + "name": "dummy_deepspeed", + "dockerfile": "docker/dummy_deepspeed", + "scripts": "scripts/dummy_deepspeed/run.sh", + "n_gpus": "4", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed", + "dummy_deepspeed" + ], + "args": "" + }, + { + "name": "dummy_vllm", + "dockerfile": "docker/dummy_vllm", + "scripts": "scripts/dummy_vllm/run.sh", + "n_gpus": "4", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed", + "dummy_vllm", + "inference" + ], + "args": "" + }, + { + "name": "dummy_sglang", + "dockerfile": "docker/dummy_sglang", + "scripts": "scripts/dummy_sglang/run.sh", + "n_gpus": "4", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed", + "dummy_sglang", + "inference" + ], + "args": "" + }, + { + "name": "dummy_sglang_disagg", + "dockerfile": "docker/dummy_sglang_disagg", + "scripts": "scripts/dummy_sglang_disagg/run.sh", + "n_gpus": "3", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_distributed", + "dummy_sglang_disagg", + "inference", + "disaggregated" + ], + "args": "" + }, + { + "name": "dummy_torchtitan", + "dockerfile": "docker/dummy_torchtitan", + "scripts": "scripts/dummy_torchtitan/run.sh", + "n_gpus": "8", + "owner": "mad.support@amd.com", + "training_precision": "bf16", + "tags": [ + "dummies", + "dummy_distributed", + "dummy_torchtitan", + "llm_training" + ], + "args": "" + }, + { + "name": "pyt_huggingface_gpt2", + "url": "https://github.com/huggingface/transformers", + "dockerfile": "docker/pyt_huggingface", + "scripts": "scripts/pyt_huggingface_gpt2/run.sh", + "n_gpus": "-1", + "owner": "mad.support@amd.com", + "training_precision": "fp16", + "tags": [ + "pyt", + "fp16", + "gpt2" + ], + "args": "" + }, + { + "name": "pyt_huggingface_bert", + "url": "https://github.com/huggingface/transformers", + "dockerfile": "docker/pyt_huggingface", + "scripts": "scripts/pyt_huggingface_bert/run.sh", + "n_gpus": "-1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "pyt", + "bert" + ], + "args": "" + }, + { + "name": "dummy_therock", + "dockerfile": "docker/dummy_therock", + "scripts": "scripts/dummy_therock/run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "fp32", + "tags": [ + "dummies", + "therock", + "pytorch", + "rocm" + ], + "args": "" } ] diff --git a/tests/fixtures/dummy/scripts/dummy/configs/default.csv b/tests/fixtures/dummy/scripts/dummy/configs/default.csv new file mode 100644 index 00000000..9876eacc --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy/configs/default.csv @@ -0,0 +1,4 @@ +model,benchmark,config_value,batch_size,datatype,max_tokens +dummy/model-1,throughput,128,8,float16,1024 +dummy/model-2,serving,256,16,float32,2048 +dummy/model-3,latency,512,32,bfloat16,4096 diff --git a/tests/fixtures/dummy/scripts/dummy/run_data_aws.sh b/tests/fixtures/dummy/scripts/dummy/run_data_aws.sh new file mode 100644 index 00000000..ab0a8641 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy/run_data_aws.sh @@ -0,0 +1,10 @@ + +if [ -f "${MAD_DATAHOME}/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx" ]; then + echo "${MAD_DATAHOME}/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx is present" + echo "performance: $RANDOM samples_per_second" +else + echo "${MAD_DATAHOME}/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx is NOT present" + exit 1 +fi + + diff --git a/tests/fixtures/dummy/scripts/dummy/run_data_minio.sh b/tests/fixtures/dummy/scripts/dummy/run_data_minio.sh new file mode 100644 index 00000000..ce697b39 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy/run_data_minio.sh @@ -0,0 +1,7 @@ +if [ -f "${MAD_DATAHOME}/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx" ]; then + echo "${MAD_DATAHOME}/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx is present" + echo "performance: $RANDOM samples_per_second" +else + echo "${MAD_DATAHOME}/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx is NOT present" + exit 1 +fi \ No newline at end of file diff --git a/tests/fixtures/dummy/scripts/dummy/run_data_nas.sh b/tests/fixtures/dummy/scripts/dummy/run_data_nas.sh index 29488739..878d9330 100644 --- a/tests/fixtures/dummy/scripts/dummy/run_data_nas.sh +++ b/tests/fixtures/dummy/scripts/dummy/run_data_nas.sh @@ -11,11 +11,25 @@ else echo "MAD_DATAHOME is set" fi +# Check if data location exists (either mounted or downloaded) +if [ ! -d "${MAD_DATAHOME}" ]; then + echo "${MAD_DATAHOME} directory does not exist" + exit 1 +fi + +# Check if it's a mounted filesystem (for traditional NAS) mountCode=`mount | grep "${MAD_DATAHOME}"` if [ -z "$mountCode" ]; then - echo "${MAD_DATAHOME} is NOT mounted" - exit 1 + echo "${MAD_DATAHOME} is NOT mounted (data downloaded to directory)" + # For K8s/downloaded data, check if directory has content + if [ -n "$(ls -A ${MAD_DATAHOME} 2>/dev/null)" ]; then + echo "${MAD_DATAHOME} has data (downloaded)" + echo "performance: $RANDOM samples_per_second" + else + echo "${MAD_DATAHOME} is empty (test environment - data provider works but source is empty)" + echo "performance: $RANDOM samples_per_second (simulated)" + fi else echo "${MAD_DATAHOME} is mounted" echo "performance: $RANDOM samples_per_second" diff --git a/tests/fixtures/dummy/scripts/dummy3/get_models_json.py b/tests/fixtures/dummy/scripts/dummy3/get_models_json.py index 8c7affab..425a0b19 100644 --- a/tests/fixtures/dummy/scripts/dummy3/get_models_json.py +++ b/tests/fixtures/dummy/scripts/dummy3/get_models_json.py @@ -1,10 +1,10 @@ """Model template for dummy3 model. -This model is used to test the dynamic model discovery feature of MADEngine. +This model is used to test the dynamic model discovery feature of madengine. Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ -from madengine.tools.discover_models import CustomModel +from madengine.utils.discover_models import CustomModel Model3Data = CustomModel( name="model3", diff --git a/tests/fixtures/dummy/scripts/dummy_deepspeed/ds_config.json b/tests/fixtures/dummy/scripts/dummy_deepspeed/ds_config.json new file mode 100644 index 00000000..91f53d2a --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_deepspeed/ds_config.json @@ -0,0 +1,34 @@ +{ + "train_batch_size": 256, + "train_micro_batch_size_per_gpu": 32, + "gradient_accumulation_steps": 4, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 0.001, + "betas": [0.9, 0.999], + "eps": 1e-8, + "weight_decay": 0.01 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 100 + } + }, + "fp16": { + "enabled": false + }, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "reduce_scatter": true, + "overlap_comm": false + }, + "gradient_clipping": 1.0, + "steps_per_print": 10, + "wall_clock_breakdown": false +} diff --git a/tests/fixtures/dummy/scripts/dummy_deepspeed/run.sh b/tests/fixtures/dummy/scripts/dummy_deepspeed/run.sh new file mode 100644 index 00000000..aa86bc85 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_deepspeed/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# +# DeepSpeed Wrapper Script - Uses torchrun launcher +# +# This script launches DeepSpeed training using torchrun instead of MPI, +# which avoids the need for OpenMPI installation in the container. +# +set -e + +echo "========================================================================" +echo "madengine DeepSpeed Wrapper Script" +echo "========================================================================" + +# Get current directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Determine launcher from environment or default to torchrun +LAUNCHER_CMD=${MAD_MULTI_NODE_RUNNER:-"torchrun --standalone --nproc_per_node=2"} + +echo "========================================================================" +echo "Launcher Command:" +echo "$LAUNCHER_CMD" +echo "========================================================================" + +# Launch training with torchrun +$LAUNCHER_CMD run_deepspeed.py --deepspeed_config ds_config.json + +echo "========================================================================" +echo "Training script completed" +echo "========================================================================" + diff --git a/tests/fixtures/dummy/scripts/dummy_deepspeed/run_deepspeed.py b/tests/fixtures/dummy/scripts/dummy_deepspeed/run_deepspeed.py new file mode 100755 index 00000000..7851597f --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_deepspeed/run_deepspeed.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +""" +DeepSpeed Training Benchmark - Uses deepspeed launcher + +Demonstrates DeepSpeed features: +- ZeRO optimizer stages +- Gradient accumulation +- Mixed precision training +- Uses deepspeed launcher (NOT torchrun) + +Launch with deepspeed launcher: + deepspeed --num_gpus=2 run_deepspeed.py +""" + +import os +import sys +import time +import socket +import argparse +import torch +import torch.nn as nn +import torch.distributed as dist +import deepspeed + +# Configuration +NUM_EPOCHS = 3 +NUM_BATCHES = 50 +IMAGE_SIZE = 224 +NUM_CLASSES = 1000 + +class SimpleModel(nn.Module): + """Simple model for DeepSpeed testing""" + def __init__(self, num_classes=1000): + super().__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.bn1 = nn.BatchNorm2d(64) + self.pool = nn.MaxPool2d(3, stride=2, padding=1) + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2d(128) + self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1) + self.bn3 = nn.BatchNorm2d(256) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(256, num_classes) + + def forward(self, x): + x = self.pool(torch.relu(self.bn1(self.conv1(x)))) + x = self.pool(torch.relu(self.bn2(self.conv2(x)))) + x = torch.relu(self.bn3(self.conv3(x))) + x = self.avgpool(x) + x = torch.flatten(x, 1) + return self.fc(x) + +def print_header(args): + rank = int(os.environ.get("RANK", 0)) + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + world_size = int(os.environ.get("WORLD_SIZE", 1)) + + if rank == 0: + print("=" * 70) + print("DeepSpeed Distributed Training Benchmark") + print("=" * 70) + print(f"Hostname: {socket.gethostname()}") + print(f"World Size: {world_size}") + print(f"DeepSpeed Config: {args.deepspeed_config}") + print(f"Training: {NUM_EPOCHS} epochs, {NUM_BATCHES} batches/epoch") + print("=" * 70) + +def train_epoch(model_engine, criterion, epoch): + model_engine.train() + start_time = time.time() + total_loss = 0 + + local_rank = model_engine.local_rank + micro_batch_size = model_engine.train_micro_batch_size_per_gpu() + + for batch_idx in range(NUM_BATCHES): + # Synthetic data + inputs = torch.randn( + micro_batch_size, 3, IMAGE_SIZE, IMAGE_SIZE, + device=model_engine.device + ) + labels = torch.randint( + 0, NUM_CLASSES, (micro_batch_size,), + device=model_engine.device + ) + + # Forward pass + outputs = model_engine(inputs) + loss = criterion(outputs, labels) + + # Backward pass (DeepSpeed handles gradients, optimization) + model_engine.backward(loss) + model_engine.step() + + total_loss += loss.item() + + if local_rank == 0 and (batch_idx + 1) % 10 == 0: + print(f"Epoch [{epoch+1}] Batch [{batch_idx+1}/{NUM_BATCHES}] Loss: {loss.item():.4f}") + + epoch_time = time.time() - start_time + avg_loss = total_loss / NUM_BATCHES + + # Calculate node-local throughput + # Get local world size (GPUs per node) + local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) + + # Node throughput = samples processed by all GPUs on this node + node_throughput = (NUM_BATCHES * micro_batch_size * local_world_size) / epoch_time + + return avg_loss, node_throughput + +def main(): + # Start timer for total test duration + test_start_time = time.time() + + # Parse DeepSpeed args + parser = argparse.ArgumentParser() + # local_rank default should come from environment (set by torchrun) + parser.add_argument('--local_rank', type=int, default=int(os.environ.get('LOCAL_RANK', 0))) + parser.add_argument('--deepspeed_config', type=str, default='ds_config.json') + args = parser.parse_args() + + # Handle config file path - supports multiple locations for K8s/local execution + config_found = False + original_config_path = args.deepspeed_config + script_dir = os.path.dirname(os.path.abspath(__file__)) + + # Try 1: Check as-is (current directory or absolute path) + if os.path.exists(args.deepspeed_config): + config_found = True + print(f"[Config] Found DeepSpeed config: {args.deepspeed_config}") + + # Try 2: Check relative to script directory (for K8s execution) + if not config_found: + config_path = os.path.join(script_dir, args.deepspeed_config) + if os.path.exists(config_path): + args.deepspeed_config = config_path + config_found = True + print(f"[Config] Found DeepSpeed config in script directory: {config_path}") + + # Try 3: Check in scripts/dummy_deepspeed/ directory (for local execution) + if not config_found: + local_config_path = os.path.join('scripts/dummy_deepspeed', args.deepspeed_config) + if os.path.exists(local_config_path): + args.deepspeed_config = local_config_path + config_found = True + print(f"[Config] Found DeepSpeed config in scripts directory: {local_config_path}") + + # Error if not found + if not config_found: + print(f"\n❌ Error: DeepSpeed config not found!") + print(f"Searched for: {original_config_path}") + print(f"Locations tried:") + print(f" 1. Current directory: {os.getcwd()}/{original_config_path}") + print(f" 2. Script directory: {os.path.join(script_dir, original_config_path)}") + print(f" 3. Scripts directory: scripts/dummy_deepspeed/{original_config_path}") + print(f"\nCurrent directory: {os.getcwd()}") + print(f"Files in current directory:") + try: + for f in os.listdir('.'): + print(f" - {f}") + except Exception as e: + print(f" (Cannot list: {e})") + print(f"\nScript location: {os.path.abspath(__file__)}") + sys.exit(1) + + print_header(args) + + # Initialize PyTorch distributed backend BEFORE DeepSpeed + # This prevents DeepSpeed from trying to use MPI + if not dist.is_initialized(): + dist.init_process_group(backend="nccl") + print(f"✓ PyTorch distributed initialized (backend: nccl)") + + # Create model + model = SimpleModel(NUM_CLASSES) + + # Initialize DeepSpeed + # Note: When using deepspeed launcher with --deepspeed_config arg, + # do NOT pass config parameter to initialize() - it causes a conflict + model_engine, optimizer, _, _ = deepspeed.initialize( + args=args, + model=model, + model_parameters=model.parameters() + ) + + criterion = nn.CrossEntropyLoss() + + rank = model_engine.local_rank + + if rank == 0: + print(f"\n✓ DeepSpeed initialized") + print(f" ZeRO Stage: {model_engine.zero_optimization_stage()}") + print(f" Micro Batch Size: {model_engine.train_micro_batch_size_per_gpu()}") + print(f" Gradient Accumulation: {model_engine.gradient_accumulation_steps()}") + print(f"\nStarting training...\n") + + # Get topology information + rank = int(os.environ.get("RANK", 0)) + local_rank = model_engine.local_rank + local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) + world_size = model_engine.world_size + node_rank = rank // local_world_size if local_world_size > 0 else 0 + + # Training loop + all_throughputs = [] + for epoch in range(NUM_EPOCHS): + avg_loss, node_throughput = train_epoch(model_engine, criterion, epoch) + all_throughputs.append(node_throughput) + + if local_rank == 0: + print(f"\n[Node {node_rank}] Epoch {epoch+1} Complete: Loss={avg_loss:.4f}, Node Throughput={node_throughput:.2f} samples/sec\n") + + # ======================================================================== + # Node-Local Performance Reporting (NEW - Best Practice) + # Each node reports its OWN performance + # ======================================================================== + if local_rank == 0: + avg_node_throughput = sum(all_throughputs) / len(all_throughputs) + print(f"{'='*70}") + print("Node Performance Summary") + print(f"{'='*70}") + print(f"Node ID: {node_rank}") + print(f"Node Hostname: {socket.gethostname()}") + print(f"Local GPUs: {local_world_size}") + print(f"Node Throughput: {avg_node_throughput:.2f} samples_per_second") + print(f"ZeRO Stage: {model_engine.zero_optimization_stage()}") + print(f"{'='*70}") + + # CRITICAL: Standard output format for madengine parsing + print(f"\nperformance: {avg_node_throughput:.2f} samples_per_second") + print(f"node_id: {node_rank}") + print(f"local_gpus: {local_world_size}") + print(f"deepspeed_config: ZeRO_stage={model_engine.zero_optimization_stage()}") + + # Calculate and print test duration + test_duration = time.time() - test_start_time + print(f"test_duration: {test_duration:.2f}s") + + return 0 + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_megatron_lm/run_megatron.py b/tests/fixtures/dummy/scripts/dummy_megatron_lm/run_megatron.py new file mode 100755 index 00000000..70265702 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_megatron_lm/run_megatron.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 +""" +ROCm/Megatron-LM Training Benchmark + +Uses actual Megatron-Core APIs with ROCm optimizations. +Demonstrates: +- Megatron-Core initialization and utilities +- Tensor/Pipeline parallelism via Megatron APIs +- Proper distributed training setup +- Uses torchrun launcher (as required by Megatron-LM) + +Launch with torchrun: + torchrun --standalone --nproc_per_node=2 run_megatron.py + +Reference: https://github.com/ROCm/Megatron-LM +""" + +import os +import sys +import time +import socket +import torch +import torch.nn as nn + +# Import Megatron-Core components +try: + from megatron.core import mpu, tensor_parallel + from megatron.core.parallel_state import ( + initialize_model_parallel, + destroy_model_parallel, + get_tensor_model_parallel_world_size, + get_pipeline_model_parallel_world_size, + get_data_parallel_world_size, + ) + MEGATRON_AVAILABLE = True +except ImportError: + MEGATRON_AVAILABLE = False + print("Warning: Megatron-Core not available, falling back to basic DDP") +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +# Training Configuration +BATCH_SIZE = 64 +NUM_EPOCHS = 3 +NUM_BATCHES = 50 +SEQ_LENGTH = 128 +HIDDEN_SIZE = 512 +NUM_CLASSES = 1000 + +# Get distributed environment (set by torchrun) +rank = int(os.environ.get("RANK", 0)) +local_rank = int(os.environ.get("LOCAL_RANK", 0)) +world_size = int(os.environ.get("WORLD_SIZE", 1)) + +# Megatron-LM parallelism config (from environment or defaults) +tensor_model_parallel_size = int(os.environ.get("TENSOR_MODEL_PARALLEL_SIZE", 1)) +pipeline_model_parallel_size = int(os.environ.get("PIPELINE_MODEL_PARALLEL_SIZE", 1)) +context_parallel_size = int(os.environ.get("CONTEXT_PARALLEL_SIZE", 1)) + +def print_header(tp_size, pp_size, dp_size): + """Print training configuration header""" + print("=" * 70) + print("ROCm/Megatron-LM Distributed Training Benchmark") + print("=" * 70) + print(f"Hostname: {socket.gethostname()}") + print(f"Global Rank: {rank}/{world_size}, Local Rank: {local_rank}") + print(f"Megatron-Core Available: {MEGATRON_AVAILABLE}") + print(f"\nParallelism Configuration:") + print(f" Tensor Model Parallel (TP): {tp_size}") + print(f" Pipeline Model Parallel (PP): {pp_size}") + print(f" Context Parallel (CP): {context_parallel_size}") + print(f" Data Parallel (DP): {dp_size}") + print(f"\nTraining Config:") + print(f" Batch Size (per GPU): {BATCH_SIZE}") + print(f" Global Batch Size: {BATCH_SIZE * dp_size}") + print(f" Sequence Length: {SEQ_LENGTH}") + print(f" Hidden Size: {HIDDEN_SIZE}") + print("=" * 70) + +class SimpleMegatronModel(nn.Module): + """ + Simplified model using Megatron-style patterns. + In production, use megatron.core.models for actual transformer implementations. + """ + def __init__(self, hidden_size, num_classes): + super().__init__() + self.embedding = nn.Linear(SEQ_LENGTH, hidden_size) + + # Simple transformer layers + self.transformer = nn.TransformerEncoder( + nn.TransformerEncoderLayer( + d_model=hidden_size, + nhead=8, + dim_feedforward=hidden_size * 4, + batch_first=True + ), + num_layers=6 + ) + self.classifier = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = self.embedding(x) + x = self.transformer(x) + x = x.mean(dim=1) # Global pooling + return self.classifier(x) + +def train_epoch(model, optimizer, criterion, epoch, device, local_dp_size): + """Training loop for one epoch with node-local throughput""" + model.train() + start_time = time.time() + total_loss = 0 + + for batch_idx in range(NUM_BATCHES): + # Generate synthetic data + inputs = torch.randn(BATCH_SIZE, 1, SEQ_LENGTH, device=device) + labels = torch.randint(0, NUM_CLASSES, (BATCH_SIZE,), device=device) + + # Forward pass + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, labels) + + # Backward pass + loss.backward() + + # Optimizer step + optimizer.step() + + total_loss += loss.item() + + # Log progress from local_rank 0 + if local_rank == 0 and (batch_idx + 1) % 10 == 0: + print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f}") + + epoch_time = time.time() - start_time + avg_loss = total_loss / NUM_BATCHES + + # Calculate node-local throughput + # local_dp_size = data parallel size on this node + node_throughput = (NUM_BATCHES * BATCH_SIZE * local_dp_size) / epoch_time + + return avg_loss, node_throughput + +def main(): + """Main training function using Megatron-Core""" + # Start timer for total test duration + test_start_time = time.time() + + # Set device + device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu") + if torch.cuda.is_available(): + torch.cuda.set_device(device) + + # Initialize distributed and model parallelism + if MEGATRON_AVAILABLE and world_size > 1: + # Initialize with Megatron-Core + if rank == 0: + print(f"[Rank {rank}] Initializing Megatron-Core model parallelism...") + + torch.distributed.init_process_group(backend="nccl", init_method="env://") + + # Initialize Megatron model parallel groups + initialize_model_parallel( + tensor_model_parallel_size=tensor_model_parallel_size, + pipeline_model_parallel_size=pipeline_model_parallel_size, + context_parallel_size=context_parallel_size, + ) + + # Get actual parallel sizes from Megatron-Core + tp_size = get_tensor_model_parallel_world_size() + pp_size = get_pipeline_model_parallel_world_size() + dp_size = get_data_parallel_world_size() + + if rank == 0: + print(f"[Rank {rank}] ✓ Megatron-Core initialized") + print(f"[Rank {rank}] TP={tp_size}, PP={pp_size}, DP={dp_size}") + + elif world_size > 1: + # Fallback to basic DDP + if rank == 0: + print(f"[Rank {rank}] Using basic PyTorch DDP (Megatron-Core not available)") + torch.distributed.init_process_group(backend="nccl", init_method="env://") + tp_size = 1 + pp_size = 1 + dp_size = world_size + else: + # Single GPU + tp_size = 1 + pp_size = 1 + dp_size = 1 + + # Print configuration + print_header(tp_size, pp_size, dp_size) + + if torch.cuda.is_available(): + print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") + + # Create model + model = SimpleMegatronModel(HIDDEN_SIZE, NUM_CLASSES).to(device) + + # Wrap with DDP if needed (in production, use Megatron's model wrappers) + if world_size > 1 and not MEGATRON_AVAILABLE: + from torch.nn.parallel import DistributedDataParallel as DDP + model = DDP(model, device_ids=[local_rank], output_device=local_rank) + + # Optimizer and loss + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01) + criterion = nn.CrossEntropyLoss() + + # Get local world size and node rank + local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) + node_rank = rank // local_world_size if local_world_size > 0 else 0 + + # Calculate local data parallel size (DP ranks on this node) + # In Megatron: DP = world_size / (TP * PP * CP) + # For simplicity, assume local_dp_size proportional to local_world_size + local_dp_size = dp_size // (world_size // local_world_size) if (world_size // local_world_size) > 0 else dp_size + if local_dp_size < 1: + local_dp_size = 1 + + # Synchronize before training + if world_size > 1: + torch.distributed.barrier() + + if local_rank == 0: + print(f"\n{'='*70}") + print(f"[Node {node_rank}] Starting Training") + print(f"{'='*70}\n") + + # Training loop + all_throughputs = [] + for epoch in range(NUM_EPOCHS): + avg_loss, node_throughput = train_epoch( + model, optimizer, criterion, epoch, device, local_dp_size + ) + all_throughputs.append(node_throughput) + + if local_rank == 0: + print(f"\n[Node {node_rank}] Epoch {epoch+1}/{NUM_EPOCHS} Complete:") + print(f" Loss: {avg_loss:.4f}") + print(f" Node Throughput: {node_throughput:.2f} samples/sec\n") + + # ======================================================================== + # Node-Local Performance Reporting (NEW - Best Practice) + # ======================================================================== + if local_rank == 0: + avg_node_throughput = sum(all_throughputs) / len(all_throughputs) + print(f"{'='*70}") + print("Node Performance Summary") + print(f"{'='*70}") + print(f"Node ID: {node_rank}") + print(f"Node Hostname: {socket.gethostname()}") + print(f"Local GPUs: {local_world_size}") + print(f"Node Throughput: {avg_node_throughput:.2f} samples_per_second") + print(f"\nMegatron Configuration:") + print(f" Tensor Parallel (TP): {tp_size}") + print(f" Pipeline Parallel (PP): {pp_size}") + print(f" Context Parallel (CP): {context_parallel_size}") + print(f" Data Parallel (DP): {dp_size}") + print(f"{'='*70}") + + # CRITICAL: Standard output format for madengine parsing + print(f"\nperformance: {avg_node_throughput:.2f} samples_per_second") + print(f"node_id: {node_rank}") + print(f"local_gpus: {local_world_size}") + print(f"megatron_config: TP={tp_size} PP={pp_size} CP={context_parallel_size} DP={dp_size}") + + # Calculate and print test duration + test_duration = time.time() - test_start_time + print(f"test_duration: {test_duration:.2f}s") + + # Cleanup + if MEGATRON_AVAILABLE and world_size > 1: + destroy_model_parallel() + + if world_size > 1: + torch.distributed.destroy_process_group() + if rank == 0: + print(f"\n✓ Distributed cleanup complete") + + return 0 + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as e: + print(f"[Rank {rank}] Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_sglang/README.md b/tests/fixtures/dummy/scripts/dummy_sglang/README.md new file mode 100644 index 00000000..1ee0cdb3 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_sglang/README.md @@ -0,0 +1,388 @@ +# SGLang Distributed Inference - madengine Integration + +This directory contains scripts for running SGLang distributed inference on SLURM clusters through madengine. + +## Overview + +**SGLang** is a fast serving framework for large language models and vision-language models, featuring: +- **RadixAttention**: Efficient KV cache with automatic prefix caching +- **Native Distributed Launcher**: Uses `python3 -m sglang.launch_server` (NO torchrun needed!) +- **Tensor Parallelism (TP)**: Split model across GPUs within a node +- **Ray-based coordination**: Automatic distributed inference across nodes +- **High throughput**: Optimized for both single and multi-node deployments + +## Key Difference from vLLM + +**SGLang does NOT use torchrun!** It has its own native launcher: +- **SGLang**: `python3 -m sglang.launch_server` (Ray-based) +- **vLLM**: Can use `torchrun` or direct Python launch + +## Files + +- `run.sh` - Wrapper script that uses SGLang's native launcher +- `run_sglang_inference.py` - Python benchmark using SGLang Runtime API +- `README.md` - This documentation file + +## Architecture + +### Single-Node Multi-GPU (Tensor Parallelism) + +``` +┌─────────────────────────────────────────┐ +│ Node 1 (4 GPUs with TP) │ +│ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ +│ │ GPU0 │─│ GPU1 │─│ GPU2 │─│ GPU3 │ │ +│ │Shard │ │Shard │ │Shard │ │Shard │ │ +│ │ 1/4 │ │ 2/4 │ │ 3/4 │ │ 4/4 │ │ +│ └──────┘ └──────┘ └──────┘ └──────┘ │ +└─────────────────────────────────────────┘ +``` + +**Command**: `python3 -m sglang.launch_server --model-path MODEL --tp 4` + +### Multi-Node Multi-GPU (TP + Load Balancing) + +``` +┌─────────────────────────────────────────┐ +│ Node 1 (TP Group 1) │ +│ ┌──────────────────────────────────┐ │ +│ │ GPUs 0-3 (Full Model Copy) │ │ +│ └──────────────────────────────────┘ │ +└──────────────┬──────────────────────────┘ + │ Ray Coordination +┌──────────────┴──────────────────────────┐ +│ Node 2 (TP Group 2) │ +│ ┌──────────────────────────────────┐ │ +│ │ GPUs 0-3 (Full Model Copy) │ │ +│ └──────────────────────────────────┘ │ +└─────────────────────────────────────────┘ +``` + +**Commands**: +```bash +# Node 1 (rank 0) +python3 -m sglang.launch_server --model-path MODEL --tp 4 \ + --nnodes 2 --node-rank 0 --nccl-init-addr MASTER_IP:PORT + +# Node 2 (rank 1) +python3 -m sglang.launch_server --model-path MODEL --tp 4 \ + --nnodes 2 --node-rank 1 --nccl-init-addr MASTER_IP:PORT +``` + +## Usage + +### Quick Start with madengine + +#### Single-Node Inference (4 GPUs) + +```bash +madengine run \ + --model-name dummy_sglang \ + --additional-context-file examples/slurm-configs/minimal/sglang-single-node-minimal.json +``` + +#### Multi-Node Inference (2 nodes × 4 GPUs) + +```bash +madengine run \ + --model-name dummy_sglang \ + --additional-context-file examples/slurm-configs/minimal/sglang-multi-node-minimal.json +``` + +### Execution Modes + +The script supports two execution modes: + +#### 1. Server Mode (OpenAI-compatible API) + +Launches SGLang as a server that exposes an OpenAI-compatible API: + +```bash +export SGLANG_EXECUTION_MODE=server +./run.sh +``` + +The server will be accessible at `http://localhost:30000` and supports: +- `/v1/completions` - Text completion endpoint +- `/v1/chat/completions` - Chat completion endpoint +- `/v1/models` - List available models + +#### 2. Offline Mode (Batch Inference - Default) + +Runs batch inference directly for benchmarking: + +```bash +export SGLANG_EXECUTION_MODE=offline # or leave unset +./run.sh +``` + +This mode is better for: +- Performance benchmarking +- Batch processing +- Integration testing + +### Manual Execution + +If you want to run the scripts directly without madengine: + +#### Single-Node (4 GPUs with TP) + +```bash +export NNODES=1 +export NPROC_PER_NODE=4 +export MASTER_ADDR=localhost +export MASTER_PORT=29500 +./run.sh +``` + +#### Multi-Node (2 nodes × 4 GPUs with TP) + +On master node (rank 0): +```bash +export NNODES=2 +export NPROC_PER_NODE=4 +export NODE_RANK=0 +export MASTER_ADDR=master-node-hostname +export MASTER_PORT=29500 +./run.sh +``` + +On worker node (rank 1): +```bash +export NNODES=2 +export NPROC_PER_NODE=4 +export NODE_RANK=1 +export MASTER_ADDR=master-node-hostname +export MASTER_PORT=29500 +./run.sh +``` + +## Configuration + +### Environment Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `NNODES` | Number of nodes | `2` | +| `NPROC_PER_NODE` | GPUs per node | `4` | +| `NODE_RANK` | Current node rank (0-indexed) | `0` | +| `MASTER_ADDR` | Master node address | `node001` | +| `MASTER_PORT` | Communication port | `29500` | +| `SGLANG_EXECUTION_MODE` | `server` or `offline` | `offline` | + +**Note**: Unlike vLLM, SGLang does NOT use `MAD_MULTI_NODE_RUNNER` (torchrun). It has its own launcher! + +### SGLang-Specific Settings + +Environment variables in your Slurm config: + +```json +{ + "env_vars": { + "SGLANG_ALLOW_LONG_MAX_MODEL_LEN": "1", + "SGLANG_ENABLE_FLASHINFER": "1", + "SGLANG_ENABLE_RADIX_CACHE": "1", + "SGLANG_RADIX_CACHE_SIZE": "0.9", + "SGLANG_EXECUTION_MODE": "offline" + } +} +``` + +### Custom Models + +To use a different model, modify `run.sh`: + +For server mode: +```bash +python3 -m sglang.launch_server \ + --model-path "meta-llama/Llama-2-7b-hf" \ + --tp $TP_SIZE \ + --nnodes $NNODES \ + --node-rank $NODE_RANK \ + --nccl-init-addr "${MASTER_ADDR}:${MASTER_PORT}" +``` + +For offline mode: +```bash +python3 run_sglang_inference.py \ + --model "meta-llama/Llama-2-7b-hf" \ + --tp-size $TP_SIZE \ + --nnodes $NNODES +``` + +## SGLang Native Launcher Examples + +### Server Mode + +```bash +# Single-node server (4 GPUs) +python3 -m sglang.launch_server \ + --model-path meta-llama/Llama-2-7b-hf \ + --tp 4 \ + --host 0.0.0.0 \ + --port 30000 + +# Multi-node server (2 nodes, 4 GPUs each) +# Node 0: +python3 -m sglang.launch_server \ + --model-path meta-llama/Llama-2-7b-hf \ + --tp 4 \ + --nnodes 2 \ + --node-rank 0 \ + --nccl-init-addr 192.168.1.100:29500 + +# Node 1: +python3 -m sglang.launch_server \ + --model-path meta-llama/Llama-2-7b-hf \ + --tp 4 \ + --nnodes 2 \ + --node-rank 1 \ + --nccl-init-addr 192.168.1.100:29500 +``` + +### Offline Mode (Python API) + +```python +import sglang as sgl + +# Single-node +runtime = sgl.Runtime( + model_path="meta-llama/Llama-2-7b-hf", + tp_size=4, +) + +# Multi-node +runtime = sgl.Runtime( + model_path="meta-llama/Llama-2-7b-hf", + tp_size=4, + nnodes=2, + node_rank=0, # Set appropriately per node + nccl_init_addr="192.168.1.100:29500", +) + +# Generate +outputs = runtime.generate( + ["The future of AI is"], + sampling_params={"max_new_tokens": 128} +) +``` + +## Performance Tuning + +### ROCm Optimizations + +For AMD GPUs (included in Dockerfile): + +```bash +# HSA optimizations +export HSA_FORCE_FINE_GRAIN_PCIE=1 +export HSA_ENABLE_SDMA=0 +export GPU_MAX_HW_QUEUES=2 + +# NCCL optimizations +export NCCL_DEBUG=WARN +export NCCL_MIN_NCHANNELS=16 + +# Network interface +export NCCL_SOCKET_IFNAME=eth0 # or ib0 for InfiniBand +``` + +### Memory Management + +Adjust GPU memory utilization: + +```python +runtime = sgl.Runtime( + model_path=args.model, + tp_size=args.tp_size, + mem_fraction_static=0.90, # Use 90% of GPU memory +) +``` + +### Batch Size + +For higher throughput, increase concurrent requests: + +```python +NUM_PROMPTS = 200 # Increase from default 100 +``` + +## Comparison: SGLang vs vLLM Launchers + +| Feature | vLLM | SGLang | +|---------|------|--------| +| **Launcher** | `torchrun` or `vllm serve` | `python3 -m sglang.launch_server` | +| **Coordination** | Ray (optional) | Ray (built-in, required) | +| **Multi-node Setup** | torchrun handles ranks | SGLang launcher handles ranks | +| **Attention** | PagedAttention | RadixAttention (prefix caching) | +| **Prefix Caching** | Manual | Automatic | +| **Best For** | General inference | Complex workflows with shared prefixes | + +**Key Insight**: SGLang does NOT need torchrun because it has its own native distributed launcher! + +## Troubleshooting + +### Issue: "No module named 'sglang'" + +**Solution**: Ensure you're using the official SGLang Docker image: +```dockerfile +FROM lmsysorg/sglang:latest +``` + +Or install SGLang: +```bash +pip install "sglang[all]" +``` + +### Issue: Multi-node initialization hangs + +**Solutions**: +1. Verify `MASTER_ADDR` is accessible from all nodes +2. Check firewall rules for Ray ports (6379, 8265, 10001-10100) +3. Ensure `NCCL_SOCKET_IFNAME` is set correctly +4. Verify NCCL init address is reachable: `telnet $MASTER_ADDR $MASTER_PORT` + +### Issue: Out of memory errors + +**Solutions**: +1. Reduce `mem_fraction_static` (e.g., from 0.90 to 0.80) +2. Use more GPUs (increase TP size) +3. Use a smaller model +4. Enable FlashInfer if not already: `SGLANG_ENABLE_FLASHINFER=1` + +### Issue: Ray initialization failures + +**Solutions**: +1. Check Ray is installed: `python3 -c "import ray; print(ray.__version__)"` +2. Clear Ray temp files: `rm -rf /tmp/ray/*` +3. Verify network connectivity between nodes +4. Check Ray logs: `cat /tmp/ray/session_*/logs/*` + +## Output Format + +The benchmark script outputs performance metrics in madengine format: + +``` +performance: 45.23 requests_per_second +tokens_per_second: 5789.12 +model: facebook/opt-125m +tp_size: 4 +nnodes: 2 +``` + +madengine automatically parses these metrics and stores them in `perf.csv`. + +## References + +- **SGLang GitHub**: https://github.com/sgl-project/sglang +- **SGLang Documentation**: https://docs.sglang.ai/ +- **SGLang Native Launcher**: https://github.com/sgl-project/sglang#distributed-serving +- **madengine Documentation**: See `examples/slurm-configs/README.md` +- **ROCm Documentation**: https://rocm.docs.amd.com/ + +## Support + +For issues specific to: +- **madengine integration**: Contact mad.support@amd.com +- **SGLang itself**: Open issue at https://github.com/sgl-project/sglang/issues +- **ROCm compatibility**: Check ROCm documentation or AMD support diff --git a/tests/fixtures/dummy/scripts/dummy_sglang/run.sh b/tests/fixtures/dummy/scripts/dummy_sglang/run.sh new file mode 100755 index 00000000..00c6ebf4 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_sglang/run.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# +# SGLang Distributed Inference Script +# +# SGLang has its own native launcher (sglang.launch_server) - NO torchrun needed! +# Uses Ray for distributed coordination internally +# +set -e + +echo "========================================================================" +echo "madengine SGLang Inference Wrapper Script" +echo "========================================================================" + +# Get current directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Detect deployment configuration from environment +NNODES=${NNODES:-1} +NPROC_PER_NODE=${NPROC_PER_NODE:-1} +NODE_RANK=${NODE_RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-localhost} +MASTER_PORT=${MASTER_PORT:-29500} + +echo "========================================================================" +echo "Deployment Configuration:" +echo " Nodes: $NNODES" +echo " GPUs per node: $NPROC_PER_NODE" +echo " Node rank: $NODE_RANK" +echo " Master address: $MASTER_ADDR" +echo " Master port: $MASTER_PORT" +echo "========================================================================" + +# SGLang-specific parallelism +# - Tensor Parallelism (TP): Split model across GPUs within a node +# - Data Parallelism (DP): Distribute requests across nodes (via multi-node setup) +TP_SIZE=$NPROC_PER_NODE # Tensor parallel within node +TOTAL_GPUS=$((TP_SIZE * NNODES)) + +echo "========================================================================" +echo "SGLang Parallelism Configuration:" +echo " Tensor Parallel (TP) Size: $TP_SIZE (GPUs per node)" +echo " Number of Nodes: $NNODES" +echo " Total GPUs: $TOTAL_GPUS" +echo "========================================================================" + +# Choose execution mode: server or offline batch inference +# Server mode: Launches SGLang server for OpenAI-compatible API +# Offline mode: Runs batch inference directly (better for benchmarking) +EXECUTION_MODE=${SGLANG_EXECUTION_MODE:-offline} + +if [ "$EXECUTION_MODE" = "server" ]; then + echo "========================================================================" + echo "Running in SERVER mode (OpenAI-compatible API)" + echo "========================================================================" + + if [ $NNODES -gt 1 ]; then + echo "Multi-node server setup - using SGLang native launcher" + + # SGLang multi-node server launch + # Each node must run this command with appropriate node_rank + python3 -m sglang.launch_server \ + --model-path "facebook/opt-125m" \ + --tp $TP_SIZE \ + --nnodes $NNODES \ + --node-rank $NODE_RANK \ + --nccl-init-addr "${MASTER_ADDR}:${MASTER_PORT}" \ + --host 0.0.0.0 \ + --port 30000 + else + echo "Single-node server setup - using SGLang native launcher" + + # SGLang single-node server launch + python3 -m sglang.launch_server \ + --model-path "facebook/opt-125m" \ + --tp $TP_SIZE \ + --host 0.0.0.0 \ + --port 30000 + fi +else + echo "========================================================================" + echo "Running in OFFLINE mode (batch inference benchmark)" + echo "========================================================================" + + # For offline batch inference, we use SGLang's Runtime directly + # No need for torchrun - SGLang handles distributed setup via Ray + python3 run_sglang_inference.py \ + --model "facebook/opt-125m" \ + --tp-size $TP_SIZE \ + --nnodes $NNODES \ + --node-rank $NODE_RANK \ + --master-addr $MASTER_ADDR \ + --master-port $MASTER_PORT +fi + +echo "========================================================================" +echo "Inference script completed" +echo "========================================================================" diff --git a/tests/fixtures/dummy/scripts/dummy_sglang/run_sglang_inference.py b/tests/fixtures/dummy/scripts/dummy_sglang/run_sglang_inference.py new file mode 100644 index 00000000..abb850a1 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_sglang/run_sglang_inference.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 +""" +SGLang Distributed Inference Benchmark + +SGLang uses its own native launcher - NO torchrun needed! +- Uses Ray for distributed coordination internally +- Supports Tensor Parallelism (TP) within nodes +- Supports multi-node deployment with automatic load balancing + +Launch modes: + Single-node/multi-GPU: TP only + Multi-node/multi-GPU: TP across nodes with load balancing +""" + +import os +import sys +import time +import argparse +import socket +from typing import List, Optional + +# Configure environment before importing SGLang +os.environ.setdefault("SGLANG_ALLOW_LONG_MAX_MODEL_LEN", "1") +os.environ.setdefault("SGLANG_USE_MODELSCOPE", "False") +os.environ.setdefault("SGLANG_ENABLE_FLASHINFER", "1") + +try: + import sglang as sgl + import torch +except ImportError as e: + print(f"Error importing required libraries: {e}") + print("Please ensure SGLang and PyTorch are installed") + sys.exit(1) + +# Configuration +DEFAULT_MODEL = "facebook/opt-125m" # Small model for testing +NUM_PROMPTS = 100 +MAX_TOKENS = 128 +TEMPERATURE = 0.8 +TOP_P = 0.95 + +# Sample prompts for inference +SAMPLE_PROMPTS = [ + "The future of artificial intelligence is", + "Machine learning has revolutionized", + "Deep learning models are capable of", + "Natural language processing enables", + "Computer vision systems can", +] + + +def print_header(args): + """Print benchmark header with configuration.""" + print("=" * 70) + print("SGLang Distributed Inference Benchmark") + print("=" * 70) + print(f"Hostname: {socket.gethostname()}") + print(f"Model: {args.model}") + print(f"Tensor Parallel Size: {args.tp_size}") + print(f"Number of Nodes: {args.nnodes}") + print(f"Node Rank: {args.node_rank}") + print(f"Total GPUs: {args.tp_size * args.nnodes}") + print(f"Number of prompts: {NUM_PROMPTS}") + print(f"Max tokens: {MAX_TOKENS}") + print("=" * 70) + + +def generate_prompts(num_prompts: int) -> List[str]: + """Generate list of prompts for inference.""" + prompts = [] + for i in range(num_prompts): + # Cycle through sample prompts + base_prompt = SAMPLE_PROMPTS[i % len(SAMPLE_PROMPTS)] + prompts.append(f"{base_prompt} (request {i+1})") + return prompts + + +def run_inference_sglang(args): + """ + Run SGLang inference using native Runtime API. + + SGLang handles distributed setup automatically via Ray. + No torchrun needed! + """ + print("\n" + "=" * 70) + print("Initializing SGLang Runtime") + print("=" * 70) + + try: + # Initialize SGLang runtime + # SGLang automatically handles multi-node setup via Ray + # when appropriate environment variables are set + + runtime_config = { + "model_path": args.model, + "tp_size": args.tp_size, + "trust_remote_code": True, + "mem_fraction_static": 0.90, + } + + # For multi-node, set Ray init address + if args.nnodes > 1: + runtime_config["nccl_init_addr"] = f"{args.master_addr}:{args.master_port}" + runtime_config["nnodes"] = args.nnodes + runtime_config["node_rank"] = args.node_rank + print(f"Multi-node setup: {args.nnodes} nodes, rank {args.node_rank}") + else: + print(f"Single-node setup: {args.tp_size} GPUs") + + # Initialize runtime + runtime = sgl.Runtime(**runtime_config) + print("✓ SGLang runtime initialized successfully") + + except Exception as e: + print(f"✗ Failed to initialize SGLang runtime: {e}") + print("\n⚠️ Falling back to mock inference for testing...") + return run_inference_mock(args) + + # Generate prompts + prompts = generate_prompts(NUM_PROMPTS) + + # Warmup + print("\nWarmup: Running 10 prompts...") + warmup_prompts = prompts[:10] + try: + _ = runtime.generate( + warmup_prompts, + sampling_params={ + "max_new_tokens": MAX_TOKENS, + "temperature": TEMPERATURE, + "top_p": TOP_P, + } + ) + print("✓ Warmup complete") + except Exception as e: + print(f"⚠️ Warmup failed: {e}") + + # Benchmark + print(f"\nBenchmark: Running {NUM_PROMPTS} prompts...") + start_time = time.time() + + try: + outputs = runtime.generate( + prompts, + sampling_params={ + "max_new_tokens": MAX_TOKENS, + "temperature": TEMPERATURE, + "top_p": TOP_P, + } + ) + + end_time = time.time() + elapsed_time = end_time - start_time + + # Calculate metrics + total_tokens = sum(len(output["meta_info"]["completion_tokens"]) for output in outputs) + throughput = NUM_PROMPTS / elapsed_time + tokens_per_second = total_tokens / elapsed_time + + # Print results + print(f"\n{'=' * 70}") + print("Benchmark Results") + print("=" * 70) + print(f"Total prompts: {NUM_PROMPTS}") + print(f"Total time: {elapsed_time:.2f} seconds") + print(f"Throughput: {throughput:.2f} requests/second") + print(f"Token generation: {tokens_per_second:.2f} tokens/second") + print(f"Average latency: {(elapsed_time / NUM_PROMPTS) * 1000:.2f} ms/request") + print("=" * 70) + + # Print sample outputs + print("\n" + "=" * 70) + print("Sample Outputs (first 3)") + print("=" * 70) + for i, output in enumerate(outputs[:3]): + prompt = prompts[i] + generated_text = output["text"] + print(f"\n[Prompt {i+1}]: {prompt}") + print(f"[Output {i+1}]: {generated_text[:200]}...") + + # madengine output format + print(f"\nperformance: {throughput:.2f} requests_per_second") + print(f"tokens_per_second: {tokens_per_second:.2f}") + print(f"model: {args.model}") + print(f"tp_size: {args.tp_size}") + print(f"nnodes: {args.nnodes}") + + # Cleanup + runtime.shutdown() + + return 0 + + except Exception as e: + print(f"✗ Inference failed: {e}") + import traceback + traceback.print_exc() + print("\n⚠️ Falling back to mock inference...") + return run_inference_mock(args) + + +def run_inference_mock(args): + """ + Mock inference for testing infrastructure without real SGLang. + """ + print("\n" + "=" * 70) + print("⚠️ Running Mock Inference (Testing Mode)") + print("=" * 70) + print("This simulates SGLang inference for testing madengine infrastructure.") + print("=" * 70) + + # Simulate initialization + print("\nInitializing mock SGLang runtime...") + time.sleep(1) + print("✓ Mock runtime initialized") + + # Generate prompts + prompts = generate_prompts(NUM_PROMPTS) + + # Warmup + print("\nWarmup: Running 10 prompts...") + time.sleep(0.5) + print("✓ Warmup complete") + + # Benchmark + print(f"\nBenchmark: Running {NUM_PROMPTS} prompts...") + start_time = time.time() + + # Simulate inference + time.sleep(2.0) + + end_time = time.time() + elapsed_time = end_time - start_time + + # Mock metrics + total_tokens = NUM_PROMPTS * MAX_TOKENS + throughput = NUM_PROMPTS / elapsed_time + tokens_per_second = total_tokens / elapsed_time + + # Print results + print(f"\n{'=' * 70}") + print("Benchmark Results (Mock)") + print("=" * 70) + print(f"Total prompts: {NUM_PROMPTS}") + print(f"Total time: {elapsed_time:.2f} seconds") + print(f"Throughput: {throughput:.2f} requests/second") + print(f"Token generation: {tokens_per_second:.2f} tokens/second") + print(f"Average latency: {(elapsed_time / NUM_PROMPTS) * 1000:.2f} ms/request") + print("=" * 70) + + # Print sample outputs + print("\n" + "=" * 70) + print("Sample Outputs (Mock - first 3)") + print("=" * 70) + for i in range(3): + print(f"\n[Prompt {i+1}]: {prompts[i]}") + print(f"[Output {i+1}]: [Mock generated text for infrastructure testing...]") + + # madengine output format + print(f"\nperformance: {throughput:.2f} requests_per_second") + print(f"tokens_per_second: {tokens_per_second:.2f}") + print(f"model: {args.model}") + print(f"tp_size: {args.tp_size}") + print(f"nnodes: {args.nnodes}") + + return 0 + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description="SGLang Distributed Inference Benchmark (Native Launcher)" + ) + parser.add_argument( + "--model", + type=str, + default=DEFAULT_MODEL, + help=f"Model name or path (default: {DEFAULT_MODEL})" + ) + parser.add_argument( + "--tp-size", + type=int, + default=1, + help="Tensor parallel size (GPUs per node, default: 1)" + ) + parser.add_argument( + "--nnodes", + type=int, + default=1, + help="Number of nodes (default: 1)" + ) + parser.add_argument( + "--node-rank", + type=int, + default=0, + help="Node rank (0-indexed, default: 0)" + ) + parser.add_argument( + "--master-addr", + type=str, + default="localhost", + help="Master node address (default: localhost)" + ) + parser.add_argument( + "--master-port", + type=int, + default=29500, + help="Master communication port (default: 29500)" + ) + parser.add_argument( + "--mock-only", + action="store_true", + help="Force mock inference (skip real SGLang)" + ) + + args = parser.parse_args() + + # Validate arguments + if args.tp_size < 1: + print("Error: tp-size must be >= 1") + return 1 + + if args.nnodes < 1: + print("Error: nnodes must be >= 1") + return 1 + + if args.node_rank < 0 or args.node_rank >= args.nnodes: + print(f"Error: node-rank must be in range [0, {args.nnodes-1}]") + return 1 + + # Print configuration + print_header(args) + + # Run inference + if args.mock_only: + return run_inference_mock(args) + else: + return run_inference_sglang(args) + + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + print("\n\nInterrupted by user") + sys.exit(130) + except Exception as e: + print(f"\nError: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_sglang_disagg/requirements.txt b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/requirements.txt new file mode 100644 index 00000000..25f8ad69 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/requirements.txt @@ -0,0 +1,3 @@ +# Minimal requirements for dummy test +# No actual SGLang needed - this is a simulation + diff --git a/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run.sh b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run.sh new file mode 100755 index 00000000..9661fc17 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# SGLang Disaggregated Dummy Test Script +# Tests disaggregated prefill/decode architecture with minimal model + +set -e + +echo "============================================" +echo "SGLang Disaggregated Dummy Test" +echo "============================================" + +# Check if disagg mode is enabled +if [ "${SGLANG_DISAGG_MODE:-}" = "enabled" ]; then + echo "✓ Disaggregated mode detected" + echo " Node Rank: ${SGLANG_NODE_RANK:-unknown}" + echo " Prefill Nodes: ${SGLANG_DISAGG_PREFILL_NODES:-unknown}" + echo " Decode Nodes: ${SGLANG_DISAGG_DECODE_NODES:-unknown}" + + # Run Python script that handles node roles + python3 run_sglang_disagg_inference.py +else + echo "❌ ERROR: SGLANG_DISAGG_MODE not set" + echo "This test requires SGLang Disaggregated launcher" + exit 1 +fi + +echo "============================================" +echo "✓ SGLang Disagg Test Complete" +echo "============================================" + diff --git a/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run_sglang_disagg_inference.py b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run_sglang_disagg_inference.py new file mode 100755 index 00000000..94b476b6 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_sglang_disagg/run_sglang_disagg_inference.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +""" +SGLang Disaggregated Dummy Inference Script + +Simulates the disaggregated prefill/decode architecture for testing. +This is a lightweight test that validates the launcher setup without +requiring actual models or Mooncake infrastructure. +""" + +import os +import sys +import time +import socket +from typing import Optional + + +def get_node_info() -> dict: + """Extract node information from environment variables.""" + return { + "node_rank": int(os.getenv("SGLANG_NODE_RANK", "0")), + "total_nodes": int(os.getenv("SGLANG_DISAGG_TOTAL_NODES", "3")), + "prefill_nodes": int(os.getenv("SGLANG_DISAGG_PREFILL_NODES", "1")), + "decode_nodes": int(os.getenv("SGLANG_DISAGG_DECODE_NODES", "1")), + "tp_size": int(os.getenv("SGLANG_TP_SIZE", "1")), + "master_port": int(os.getenv("MASTER_PORT", "29500")), + "hostname": socket.gethostname(), + } + + +def determine_node_role(node_rank: int, prefill_nodes: int) -> str: + """Determine if this node is proxy, prefill, or decode.""" + if node_rank == 0: + return "proxy" + elif node_rank <= prefill_nodes: + return "prefill" + else: + return "decode" + + +def simulate_proxy_node(info: dict): + """Simulate proxy/load balancer node.""" + print("=" * 60) + print("🔀 PROXY NODE (Load Balancer)") + print("=" * 60) + print(f"Hostname: {info['hostname']}") + print(f"Node Rank: {info['node_rank']}") + print(f"Master Port: {info['master_port']}") + print(f"Prefill Nodes: {info['prefill_nodes']}") + print(f"Decode Nodes: {info['decode_nodes']}") + print("-" * 60) + + print("\n[Proxy] Initializing load balancer...") + time.sleep(1) + + print("[Proxy] Waiting for prefill nodes to be ready...") + for i in range(1, info['prefill_nodes'] + 1): + print(f" ✓ Prefill node {i} connected") + time.sleep(0.5) + + print("[Proxy] Waiting for decode nodes to be ready...") + for i in range(info['prefill_nodes'] + 1, info['total_nodes']): + print(f" ✓ Decode node {i} connected") + time.sleep(0.5) + + print("\n[Proxy] All nodes connected. Load balancer ready!") + print("[Proxy] Simulating request routing...") + + # Simulate some requests + for req_id in range(1, 4): + print(f"\n[Proxy] Request {req_id}:") + print(f" → Routing to prefill node {(req_id % info['prefill_nodes']) + 1}") + time.sleep(0.3) + print(f" → KV cache transferred via Mooncake") + time.sleep(0.3) + print(f" → Routing to decode node {info['prefill_nodes'] + ((req_id % info['decode_nodes']) + 1)}") + time.sleep(0.3) + print(f" ✓ Request {req_id} completed") + + print("\n[Proxy] Test complete. Shutting down...") + + +def simulate_prefill_node(info: dict): + """Simulate prefill node.""" + print("=" * 60) + print("⚡ PREFILL NODE") + print("=" * 60) + print(f"Hostname: {info['hostname']}") + print(f"Node Rank: {info['node_rank']}") + print(f"Tensor Parallel Size: {info['tp_size']}") + print(f"Role: Prompt Processing") + print("-" * 60) + + print("\n[Prefill] Initializing prefill server...") + time.sleep(1) + + print("[Prefill] Loading model shards...") + for shard in range(info['tp_size']): + print(f" ✓ Shard {shard + 1}/{info['tp_size']} loaded") + time.sleep(0.3) + + print("\n[Prefill] Server ready. Listening for requests...") + time.sleep(1) + + print("[Prefill] Processing prompts...") + for batch in range(1, 4): + print(f"\n[Prefill] Batch {batch}:") + print(f" → Processing prompt tokens...") + time.sleep(0.5) + print(f" → Generating KV cache...") + time.sleep(0.5) + print(f" → Transferring KV cache via Mooncake...") + time.sleep(0.3) + print(f" ✓ Batch {batch} complete") + + print("\n[Prefill] Test complete. Shutting down...") + + +def simulate_decode_node(info: dict): + """Simulate decode node.""" + print("=" * 60) + print("🔤 DECODE NODE") + print("=" * 60) + print(f"Hostname: {info['hostname']}") + print(f"Node Rank: {info['node_rank']}") + print(f"Tensor Parallel Size: {info['tp_size']}") + print(f"Role: Token Generation") + print("-" * 60) + + print("\n[Decode] Initializing decode server...") + time.sleep(1) + + print("[Decode] Loading model shards...") + for shard in range(info['tp_size']): + print(f" ✓ Shard {shard + 1}/{info['tp_size']} loaded") + time.sleep(0.3) + + print("\n[Decode] Server ready. Listening for KV caches...") + time.sleep(1) + + print("[Decode] Generating tokens...") + for batch in range(1, 4): + print(f"\n[Decode] Batch {batch}:") + print(f" → Receiving KV cache via Mooncake...") + time.sleep(0.5) + print(f" → Generating tokens...") + for token in range(1, 6): + print(f" Token {token}/5", end="\r") + time.sleep(0.2) + print(f" ✓ Generated 5 tokens") + print(f" ✓ Batch {batch} complete") + + print("\n[Decode] Test complete. Shutting down...") + + +def main(): + """Main entry point for disaggregated inference simulation.""" + print("\n" + "=" * 60) + print("SGLang Disaggregated Inference Simulation") + print("=" * 60 + "\n") + + # Get node information + info = get_node_info() + role = determine_node_role(info["node_rank"], info["prefill_nodes"]) + + print(f"Cluster Configuration:") + print(f" Total Nodes: {info['total_nodes']}") + print(f" Prefill Nodes: {info['prefill_nodes']} (ranks 1-{info['prefill_nodes']})") + print(f" Decode Nodes: {info['decode_nodes']} (ranks {info['prefill_nodes']+1}-{info['total_nodes']-1})") + print(f" Proxy Node: 1 (rank 0)") + print(f"\nThis Node:") + print(f" Rank: {info['node_rank']}") + print(f" Role: {role.upper()}") + print(f" Hostname: {info['hostname']}") + print() + + # Simulate based on role + try: + if role == "proxy": + simulate_proxy_node(info) + elif role == "prefill": + simulate_prefill_node(info) + elif role == "decode": + simulate_decode_node(info) + else: + print(f"❌ ERROR: Unknown role '{role}'") + sys.exit(1) + + print("\n" + "=" * 60) + print("✅ Simulation Complete") + print("=" * 60) + return 0 + + except KeyboardInterrupt: + print("\n\n⚠️ Interrupted by user") + return 130 + except Exception as e: + print(f"\n❌ ERROR: {e}") + import traceback + traceback.print_exc() + return 1 + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/tests/fixtures/dummy/scripts/dummy_therock/README.md b/tests/fixtures/dummy/scripts/dummy_therock/README.md new file mode 100644 index 00000000..c3070304 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_therock/README.md @@ -0,0 +1,132 @@ +# dummy_therock - PyTorch Benchmark with TheRock + +## Overview + +This model benchmarks PyTorch ResNet50 training performance using [TheRock](https://github.com/ROCm/TheRock), AMD's lightweight open source build system for HIP and ROCm. + +## What is TheRock? + +TheRock (The HIP Environment and ROCm Kit) is AMD's modern distribution system for ROCm, released as version 7.10 in December 2025. Unlike traditional ROCm installations via apt packages, TheRock distributes ROCm components as Python pip packages, making it lightweight and easy to integrate. + +## Benchmark Details + +- **Model**: ResNet50 (image classification) +- **Task**: Training with synthetic data +- **Batch Size**: 64 images +- **Iterations**: 100 training steps +- **Image Size**: 224x224 +- **Metric**: Images per second (throughput) + +## Files + +``` +dummy_therock/ +├── docker/dummy_therock.ubuntu.amd.Dockerfile # Docker image with rocm/pytorch +├── scripts/dummy_therock/ +│ ├── run.sh # Main entry point +│ ├── train_resnet.py # ResNet50 training benchmark +│ └── README.md # This file +``` + +## Usage + +### With madengine + +```bash +# Build and run the model +cd /path/to/madengine +python3 -m madengine.cli.run_models \ + --models-json tests/fixtures/dummy/models.json \ + --tags dummy_therock + +# Or run with specific GPU count +python3 -m madengine.cli.run_models \ + --models-json tests/fixtures/dummy/models.json \ + --model-name dummy_therock \ + --n-gpus 1 +``` + +### Standalone + +```bash +# Build Docker image +docker build -f tests/fixtures/dummy/docker/dummy_therock.ubuntu.amd.Dockerfile \ + -t dummy_therock . + +# Run benchmark +docker run --rm --device=/dev/kfd --device=/dev/dri \ + --network host --ipc=host --group-add video \ + -v $(pwd)/tests/fixtures/dummy/scripts/dummy_therock:/workspace/scripts \ + dummy_therock \ + bash /workspace/scripts/run.sh +``` + +## Expected Output + +The benchmark will output: + +``` +======================================================================== +ResNet50 Training Benchmark with TheRock +======================================================================== + +=== PyTorch Configuration === +PyTorch: 2.x.x +CUDA Available: True +HIP: 6.x.xxxxx + +======================================================================== +====================================================================== +ResNet50 Training Benchmark (TheRock) +====================================================================== +Device: cuda:0 +GPU: AMD Instinct MI300X +GPU Count: 1 + +Creating ResNet50 model... +Batch Size: 64 +Iterations: 100 +Image Size: 224x224 + +Warming up (10 iterations)... +Running benchmark (100 iterations)... + Progress: 20/100 + Progress: 40/100 + Progress: 60/100 + Progress: 80/100 + Progress: 100/100 + +====================================================================== +Benchmark Results: + Total Images Processed: 6400 + Duration: 45.23 seconds + Throughput: 141.52 images/sec +====================================================================== + +performance: 141.52 images_per_second +``` + +## Performance Metrics + +The model reports performance in the madengine standard format: + +``` +performance: images_per_second +``` + +This metric is automatically captured by madengine and written to `perf.csv`. + +## Tags + +- `dummies` - Test/dummy model +- `therock` - Uses TheRock ROCm distribution +- `pytorch` - PyTorch framework +- `rocm` - AMD ROCm platform + +## Notes + +- Based on `rocm/pytorch:latest` which uses TheRock's ROCm distribution +- Runs a real ResNet50 training workload (not just dummy output) +- Suitable for validating PyTorch + ROCm functionality +- Performance varies by GPU architecture (MI300X, MI250X, etc.) + diff --git a/tests/fixtures/dummy/scripts/dummy_therock/run.sh b/tests/fixtures/dummy/scripts/dummy_therock/run.sh new file mode 100755 index 00000000..12cafac4 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_therock/run.sh @@ -0,0 +1,48 @@ +#!/bin/bash +############################################################################### +# +# MIT License +# +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# +# Simple ResNet50 Training Benchmark with TheRock +# +############################################################################### +set -ex + +echo "========================================================================" +echo "ResNet50 Training Benchmark with TheRock" +echo "========================================================================" + +# Get script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Optional: Check TheRock installation (informative, non-blocking) +echo "" +echo "=== TheRock Environment Check ===" +DETECT_SCRIPT="../scripts/common/tools/detect_therock.sh" +if [ -f "$DETECT_SCRIPT" ]; then + bash "$DETECT_SCRIPT" || echo "⚠️ TheRock validation completed with warnings (continuing anyway)" +else + echo "ℹ️ TheRock detector not available (skipping environment check)" + echo " To enable: Use --tools therock_check flag" +fi + +# Show PyTorch configuration +echo "" +echo "=== PyTorch Configuration ===" +python3 -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA Available: {torch.cuda.is_available()}'); print(f'HIP: {torch.version.hip if hasattr(torch.version, \"hip\") else \"N/A\"}')" + +echo "" +echo "========================================================================" +echo "Running Benchmark" +echo "========================================================================" + +# Run training benchmark +python3 "$SCRIPT_DIR/train_resnet.py" + +echo "" +echo "========================================================================" +echo "Benchmark completed!" +echo "========================================================================" + diff --git a/tests/fixtures/dummy/scripts/dummy_therock/train_resnet.py b/tests/fixtures/dummy/scripts/dummy_therock/train_resnet.py new file mode 100755 index 00000000..c90fe482 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_therock/train_resnet.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +""" +Simple ResNet50 Training Benchmark for TheRock + +This script benchmarks ResNet50 training performance using PyTorch +on TheRock's ROCm distribution. +""" +import torch +import torch.nn as nn +import torchvision.models as models +import time +import sys + +# Configuration +BATCH_SIZE = 64 +NUM_ITERATIONS = 100 +IMAGE_SIZE = 224 + + +def main(): + print("=" * 70) + print("ResNet50 Training Benchmark (TheRock)") + print("=" * 70) + + # Setup device + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + print(f"Device: {device}") + + if torch.cuda.is_available(): + print(f"GPU: {torch.cuda.get_device_name(0)}") + print(f"GPU Count: {torch.cuda.device_count()}") + + # Create model + print("\nCreating ResNet50 model...") + model = models.resnet50(pretrained=False, num_classes=1000).to(device) + model.train() + + # Setup optimizer and loss + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + criterion = nn.CrossEntropyLoss() + + print(f"Batch Size: {BATCH_SIZE}") + print(f"Iterations: {NUM_ITERATIONS}") + print(f"Image Size: {IMAGE_SIZE}x{IMAGE_SIZE}") + + # Warmup + print("\nWarming up (10 iterations)...") + for _ in range(10): + images = torch.randn(BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE, device=device) + labels = torch.randint(0, 1000, (BATCH_SIZE,), device=device) + + optimizer.zero_grad() + outputs = model(images) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + if torch.cuda.is_available(): + torch.cuda.synchronize() + + # Benchmark + print(f"Running benchmark ({NUM_ITERATIONS} iterations)...") + start_time = time.time() + + for i in range(NUM_ITERATIONS): + images = torch.randn(BATCH_SIZE, 3, IMAGE_SIZE, IMAGE_SIZE, device=device) + labels = torch.randint(0, 1000, (BATCH_SIZE,), device=device) + + optimizer.zero_grad() + outputs = model(images) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + if (i + 1) % 20 == 0: + print(f" Progress: {i + 1}/{NUM_ITERATIONS}") + + if torch.cuda.is_available(): + torch.cuda.synchronize() + + end_time = time.time() + + # Calculate metrics + duration = end_time - start_time + total_images = BATCH_SIZE * NUM_ITERATIONS + images_per_sec = total_images / duration + + print("\n" + "=" * 70) + print("Benchmark Results:") + print(f" Total Images Processed: {total_images}") + print(f" Duration: {duration:.2f} seconds") + print(f" Throughput: {images_per_sec:.2f} images/sec") + print("=" * 70) + + # madengine performance output (required format) + print(f"\nperformance: {images_per_sec:.2f} images_per_second") + + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/helper.py b/tests/fixtures/dummy/scripts/dummy_torchrun/helper.py new file mode 100644 index 00000000..e705ce30 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/helper.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +""" +Helper modules for PyTorch distributed training benchmark. + +This module demonstrates: +- Separating model architecture into a dedicated module +- Reusable data loading utilities +- Configuration management +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlock(nn.Module): + """Residual block with skip connection""" + def __init__(self, in_channels, out_channels, stride=1): + super(ResidualBlock, self).__init__() + self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, + stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(out_channels) + self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, + stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(out_channels) + + # Skip connection + self.skip = nn.Sequential() + if stride != 1 or in_channels != out_channels: + self.skip = nn.Sequential( + nn.Conv2d(in_channels, out_channels, kernel_size=1, + stride=stride, bias=False), + nn.BatchNorm2d(out_channels) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.skip(x) + out = F.relu(out) + return out + + +class ResNetModel(nn.Module): + """ + ResNet-style model for distributed training benchmark. + + This is a more realistic model architecture compared to SimpleCNN, + demonstrating residual connections and deeper networks. + """ + def __init__(self, num_classes=1000, num_blocks=[2, 2, 2, 2]): + super(ResNetModel, self).__init__() + self.in_channels = 64 + + # Initial convolution + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + # Residual layers + self.layer1 = self._make_layer(64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(512, num_blocks[3], stride=2) + + # Classification head + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(512, num_classes) + + def _make_layer(self, out_channels, num_blocks, stride): + """Create a layer with multiple residual blocks""" + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(ResidualBlock(self.in_channels, out_channels, stride)) + self.in_channels = out_channels + return nn.Sequential(*layers) + + def forward(self, x): + out = self.pool(F.relu(self.bn1(self.conv1(x)))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = self.avgpool(out) + out = torch.flatten(out, 1) + out = self.fc(out) + return out + + +class SyntheticDataset: + """ + Synthetic dataset generator for benchmarking. + + Generates random data on-the-fly to avoid I/O bottlenecks + and provide consistent benchmarking results. + """ + def __init__(self, num_samples, batch_size, image_size=224, num_classes=1000): + self.num_samples = num_samples + self.batch_size = batch_size + self.image_size = image_size + self.num_classes = num_classes + self.num_batches = num_samples // batch_size + + def generate_batch(self, device): + """Generate a synthetic batch of images and labels""" + images = torch.randn(self.batch_size, 3, self.image_size, + self.image_size, device=device) + labels = torch.randint(0, self.num_classes, (self.batch_size,), + device=device) + return images, labels + + def __len__(self): + return self.num_batches + + +class BenchmarkConfig: + """Configuration for distributed training benchmark""" + def __init__(self): + # Training hyperparameters + self.batch_size = 128 + self.num_epochs = 5 + self.learning_rate = 0.01 + self.momentum = 0.9 + self.weight_decay = 1e-4 + + # Data configuration + self.image_size = 224 + self.num_classes = 1000 + self.num_batches = 100 + + # Model configuration + self.model_type = "resnet" # or "simple_cnn" + self.resnet_blocks = [2, 2, 2, 2] # ResNet-18 style + + def __str__(self): + return ( + f"BenchmarkConfig(\n" + f" batch_size={self.batch_size},\n" + f" num_epochs={self.num_epochs},\n" + f" learning_rate={self.learning_rate},\n" + f" image_size={self.image_size},\n" + f" num_classes={self.num_classes},\n" + f" model_type={self.model_type}\n" + f")" + ) + + +def print_distributed_info(rank, local_rank, world_size): + """Print distributed training information""" + import socket + import os + + print(f"\n[Rank {rank}] Distributed Training Info:") + print(f" Hostname: {socket.gethostname()}") + print(f" Global Rank: {rank}") + print(f" Local Rank: {local_rank}") + print(f" World Size: {world_size}") + print(f" Master Addr: {os.environ.get('MASTER_ADDR', 'N/A')}") + print(f" Master Port: {os.environ.get('MASTER_PORT', 'N/A')}") + + +def print_gpu_info(rank, device): + """Print GPU information""" + if torch.cuda.is_available(): + print(f"\n[Rank {rank}] GPU Info:") + print(f" Device: {device}") + print(f" GPU Name: {torch.cuda.get_device_name(device)}") + print(f" GPU Memory: {torch.cuda.get_device_properties(device).total_memory / 1e9:.2f} GB") + else: + print(f"\n[Rank {rank}] Warning: CUDA not available, using CPU") + + +def calculate_model_size(model): + """Calculate total number of parameters in model""" + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + return total_params, trainable_params + + +def greet(name): + """Simple greeting function (for backward compatibility)""" + print(f"Hello from helper module! Greeting: {name}") diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run.sh b/tests/fixtures/dummy/scripts/dummy_torchrun/run.sh new file mode 100755 index 00000000..bc0f2318 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# +# Bash wrapper for dummy_torchrun distributed training +# Uses MAD_MULTI_NODE_RUNNER for torchrun launcher +# + +set -e + +echo "========================================================================" +echo "madengine Torchrun Wrapper Script" +echo "========================================================================" + +# Get current directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Determine multi-node runner to use +# Default to standalone torchrun if not set +if [ -z "$MAD_MULTI_NODE_RUNNER" ]; then + # Get number of GPUs from environment + N_GPUS="${MAD_RUNTIME_NGPUS:-1}" + + echo "ℹ️ MAD_MULTI_NODE_RUNNER not set, using standalone torchrun" + echo "ℹ️ Using $N_GPUS GPUs" + + MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node=$N_GPUS" +fi + +echo "========================================================================" +echo "Launcher Command:" +echo "$MAD_MULTI_NODE_RUNNER" +echo "========================================================================" + +# Create MIOpen cache directory if MIOPEN_USER_DB_PATH is set +# This prevents "Duplicate ID" errors in multi-GPU training +if [ -n "$MIOPEN_USER_DB_PATH" ]; then + # Extract base directory (before LOCAL_RANK expansion) + MIOPEN_BASE_DIR=$(dirname "$MIOPEN_USER_DB_PATH") + mkdir -p "$MIOPEN_BASE_DIR" + echo "ℹ️ MIOpen cache directory: $MIOPEN_USER_DB_PATH" + echo " (will be created per-process with LOCAL_RANK)" +fi + +# Execute the Python training script with torchrun +echo "Executing: $MAD_MULTI_NODE_RUNNER run_torchrun.py" +$MAD_MULTI_NODE_RUNNER run_torchrun.py +PYTHON_EXIT_CODE=$? + +echo "========================================================================" +echo "Training script completed with exit code: $PYTHON_EXIT_CODE" +echo "========================================================================" + +# Exit with the Python script's exit code +if [ $PYTHON_EXIT_CODE -ne 0 ]; then + echo "ERROR: Training script failed with exit code $PYTHON_EXIT_CODE" + exit $PYTHON_EXIT_CODE +fi diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun.py new file mode 100644 index 00000000..204ae985 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +""" +PyTorch Distributed Training Benchmark for madengine + +This benchmark demonstrates typical PyTorch distributed training patterns: +- DistributedDataParallel (DDP) for multi-GPU/multi-node training +- Synthetic data generation for reproducible benchmarks +- Proper GPU device assignment using LOCAL_RANK +- Gradient synchronization across processes +- Throughput measurement (samples/sec, images/sec) +- Compatible with torchrun launcher + +Usage: + # Single GPU + torchrun --standalone --nproc_per_node=1 run_torchrun.py + + # Multi-GPU (single node) + torchrun --standalone --nproc_per_node=8 run_torchrun.py + + # Multi-node (via K8s with torchrun) + torchrun --nnodes=4 --nproc_per_node=8 --master_addr=... run_torchrun.py +""" + +import os +import sys +import time +import socket +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +# Configuration +BATCH_SIZE = 128 # Per-GPU batch size +NUM_EPOCHS = 5 +NUM_BATCHES = 100 # Number of synthetic batches per epoch +IMAGE_SIZE = 224 +NUM_CLASSES = 1000 + +# Get distributed environment variables (set by torchrun) +rank = int(os.environ.get("RANK", 0)) +local_rank = int(os.environ.get("LOCAL_RANK", 0)) +world_size = int(os.environ.get("WORLD_SIZE", 1)) +master_addr = os.environ.get("MASTER_ADDR", "localhost") +master_port = os.environ.get("MASTER_PORT", "29500") + + +def print_header(): + """Print benchmark header""" + print("=" * 70) + print("madengine PyTorch Distributed Training Benchmark") + print("=" * 70) + print(f"Hostname: {socket.gethostname()}") + print(f"Rank: {rank}/{world_size}") + print(f"Local Rank (GPU): {local_rank}") + if world_size > 1: + print(f"Master: {master_addr}:{master_port}") + print(f"\nConfiguration:") + print(f" Batch Size (per GPU): {BATCH_SIZE}") + print(f" Global Batch Size: {BATCH_SIZE * world_size}") + print(f" Epochs: {NUM_EPOCHS}") + print(f" Batches per Epoch: {NUM_BATCHES}") + print(f" Image Size: {IMAGE_SIZE}x{IMAGE_SIZE}") + print(f" Num Classes: {NUM_CLASSES}") + print("=" * 70) + + +class SimpleCNN(nn.Module): + """Simple CNN model for benchmarking""" + def __init__(self, num_classes=1000): + super(SimpleCNN, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.bn1 = nn.BatchNorm2d(64) + self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2d(128) + + self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1) + self.bn3 = nn.BatchNorm2d(256) + + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(256, num_classes) + + def forward(self, x): + x = self.pool(F.relu(self.bn1(self.conv1(x)))) + x = self.pool(F.relu(self.bn2(self.conv2(x)))) + x = self.pool(F.relu(self.bn3(self.conv3(x)))) + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + return x + + +def generate_synthetic_batch(batch_size, device): + """Generate synthetic data for benchmarking""" + images = torch.randn(batch_size, 3, IMAGE_SIZE, IMAGE_SIZE, device=device) + labels = torch.randint(0, NUM_CLASSES, (batch_size,), device=device) + return images, labels + + +def train_epoch(model, optimizer, criterion, epoch, device): + """Train for one epoch with node-local throughput measurement""" + model.train() + epoch_start = time.time() + total_samples = 0 + total_loss = 0.0 + + for batch_idx in range(NUM_BATCHES): + batch_start = time.time() + + # Generate synthetic data + images, labels = generate_synthetic_batch(BATCH_SIZE, device) + + # Forward pass + optimizer.zero_grad() + outputs = model(images) + loss = criterion(outputs, labels) + + # Backward pass (gradients are automatically synchronized across GPUs) + loss.backward() + + # Update weights + optimizer.step() + + batch_time = time.time() - batch_start + total_samples += BATCH_SIZE + total_loss += loss.item() + + # Print progress from local rank 0 on each node + if local_rank == 0 and (batch_idx + 1) % 20 == 0: + avg_loss = total_loss / (batch_idx + 1) + throughput = BATCH_SIZE / batch_time # Local throughput + print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec (local)") + + epoch_time = time.time() - epoch_start + avg_loss = total_loss / NUM_BATCHES + + # ======================================================================== + # Node-Local Throughput Measurement + # ======================================================================== + # Calculate throughput for ALL GPUs on THIS NODE + local_samples = NUM_BATCHES * BATCH_SIZE + local_gpu_throughput = local_samples / epoch_time + + # Get local world size (GPUs per node) + local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) + + # Node throughput = sum of all local GPUs on this node + # In data parallel, each GPU processes the same throughput + node_throughput = local_gpu_throughput * local_world_size + + # Return metrics dictionary + metrics = { + 'avg_loss': avg_loss, + 'node_throughput': node_throughput, + 'epoch_time': epoch_time, + 'local_world_size': local_world_size + } + + return metrics + + +def main(): + """Main training function""" + # Start timer for total test duration + test_start_time = time.time() + + print_header() + + # Create per-process MIOpen cache directory to avoid database conflicts + # This must be done AFTER torchrun sets LOCAL_RANK environment variable + # This prevents "Duplicate ID" errors and database corruption in multi-GPU training + if "MIOPEN_USER_DB_PATH" in os.environ: + # Construct the per-process MIOpen path using actual local_rank value + # Cannot use expandvars() because the template uses ${LOCAL_RANK} syntax + miopen_template = os.environ["MIOPEN_USER_DB_PATH"] + # Replace ${LOCAL_RANK} or $LOCAL_RANK with actual value + miopen_path = miopen_template.replace("${LOCAL_RANK:-0}", str(local_rank)).replace("$LOCAL_RANK", str(local_rank)) + os.makedirs(miopen_path, exist_ok=True) + print(f"[Rank {rank}] ✓ Created MIOpen cache directory: {miopen_path}") + + # Initialize distributed training + if world_size > 1: + print(f"\n[Rank {rank}] Initializing distributed process group...") + # Best practice: Specify device_ids to avoid PyTorch warnings + dist.init_process_group( + backend="nccl", + init_method=f"env://", # Use environment variables (set by torchrun) + world_size=world_size, + rank=rank + ) + print(f"[Rank {rank}] ✓ Process group initialized") + print(f"[Rank {rank}] Backend: {dist.get_backend()}") + print(f"[Rank {rank}] World Size: {dist.get_world_size()}") + else: + print(f"\n=== Running in Standalone Mode (Single GPU) ===") + + # Set device + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") + print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") + + if local_rank >= num_gpus: + print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print(f"[Rank {rank}] Using cuda:0 instead") + device = torch.device("cuda:0") + else: + device = torch.device(f"cuda:{local_rank}") + + torch.cuda.set_device(device) + print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") + else: + device = torch.device("cpu") + print(f"[Rank {rank}] Warning: CUDA not available, using CPU") + + # Create model + print(f"\n[Rank {rank}] Creating model...") + model = SimpleCNN(num_classes=NUM_CLASSES).to(device) + + # Wrap model with DDP for distributed training + if world_size > 1: + # Best practice: Explicitly specify device_ids for DDP + model = DDP( + model, + device_ids=[local_rank], + output_device=local_rank, + broadcast_buffers=True, # Ensure buffers (like BatchNorm stats) are synced + find_unused_parameters=False # Set True only if needed (performance impact) + ) + print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") + + # Create optimizer and loss function + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + criterion = nn.CrossEntropyLoss() + + # Synchronize before training + if world_size > 1: + # Best practice: Specify device to avoid warnings + dist.barrier(device_ids=[local_rank]) + + # Get topology information early (needed for logging) + local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) + node_rank = rank // local_world_size if local_world_size > 0 else 0 + + if local_rank == 0: + print(f"\n{'='*70}") + print(f"[Node {node_rank}] Starting Training") + print(f"{'='*70}") + + # Training loop + all_metrics = [] + for epoch in range(NUM_EPOCHS): + metrics = train_epoch( + model, optimizer, criterion, epoch, device + ) + all_metrics.append(metrics) + + if local_rank == 0: + print(f"\n[Node {node_rank}] Epoch [{epoch+1}/{NUM_EPOCHS}] Complete:") + print(f" Average Loss: {metrics['avg_loss']:.4f}") + print(f" Node Throughput: {metrics['node_throughput']:.2f} samples/sec") + print(f" Local GPUs: {metrics['local_world_size']}") + + # Calculate average node throughput across all epochs + avg_node_throughput = sum(m['node_throughput'] for m in all_metrics) / len(all_metrics) + avg_epoch_time = sum(m['epoch_time'] for m in all_metrics) / len(all_metrics) + + # Calculate num_nodes for reference + num_nodes = (world_size + local_world_size - 1) // local_world_size if local_world_size > 0 else 1 + + # Synchronize before final output + if world_size > 1: + dist.barrier(device_ids=[local_rank]) + + # ======================================================================== + # Node-Local Performance Reporting (NEW - Best Practice) + # Each node reports its OWN performance + # Madengine will collect from ALL nodes and aggregate + # ======================================================================== + if local_rank == 0: + print(f"\n{'='*70}") + print("Node Performance Summary") + print(f"{'='*70}") + print(f"Node ID: {node_rank}") + print(f"Node Hostname: {socket.gethostname()}") + print(f"Local GPUs: {local_world_size}") + print(f"Node Throughput: {avg_node_throughput:.2f} samples_per_second") + print(f"Avg Time per Epoch: {avg_epoch_time:.2f}s") + print(f"{'='*70}") + + # CRITICAL: Standard output format for madengine parsing + print(f"performance: {avg_node_throughput:.2f} samples_per_second", flush=True) + print(f"node_id: {node_rank}", flush=True) + print(f"local_gpus: {local_world_size}", flush=True) + + # Calculate and print test duration + test_duration = time.time() - test_start_time + print(f"test_duration: {test_duration:.2f}s", flush=True) + sys.stdout.flush() + + + # Cleanup + if world_size > 1: + dist.destroy_process_group() + if rank == 0: + print(f"✓ Process group destroyed") + + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as e: + print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_minio.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_minio.py new file mode 100755 index 00000000..26c9c236 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_minio.py @@ -0,0 +1,470 @@ +#!/usr/bin/env python3 +""" +PyTorch Distributed Training with Data Provider for madengine + +This benchmark demonstrates distributed training with data provider integration: +- Multi-node/multi-GPU distributed training with DDP +- Data provider support (MinIO, AWS S3, NAS, etc.) +- K8s-optimized data handling (single download, shared across nodes via PVC) +- Proper synchronization and validation +- Accurate performance measurement with all_reduce + +K8s Best Practices: +- Only rank 0 validates data initially (avoid race conditions) +- All ranks validate data exists before training +- Use distributed barriers for synchronization +- Graceful error handling and reporting +- PVC-shared data across all pods/nodes + +Usage: + # K8s Multi-node with data provider + torchrun --nnodes=2 --nproc_per_node=2 --master_addr=... run_torchrun_data_minio.py +""" + +import os +import sys +import time +import socket +import pathlib +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +# Configuration +BATCH_SIZE = 128 # Per-GPU batch size +NUM_EPOCHS = 5 +NUM_BATCHES = 100 # Number of synthetic batches per epoch +IMAGE_SIZE = 224 +NUM_CLASSES = 1000 + +# Data configuration (from run_data_minio.sh) +DATA_FILE = "bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx" + +# Get distributed environment variables (set by torchrun) +rank = int(os.environ.get("RANK", 0)) +local_rank = int(os.environ.get("LOCAL_RANK", 0)) +world_size = int(os.environ.get("WORLD_SIZE", 1)) +master_addr = os.environ.get("MASTER_ADDR", "localhost") +master_port = os.environ.get("MASTER_PORT", "29500") + +# NOTE: MIOpen directory creation moved to main() after LOCAL_RANK is available + + +def print_header(): + """Print benchmark header""" + print("=" * 70) + print("madengine PyTorch Distributed Training with Data Provider") + print("=" * 70) + print(f"Hostname: {socket.gethostname()}") + print(f"Rank: {rank}/{world_size}") + print(f"Local Rank (GPU): {local_rank}") + if world_size > 1: + print(f"Master: {master_addr}:{master_port}") + print(f"\nConfiguration:") + print(f" Batch Size (per GPU): {BATCH_SIZE}") + print(f" Global Batch Size: {BATCH_SIZE * world_size}") + print(f" Epochs: {NUM_EPOCHS}") + print(f" Batches per Epoch: {NUM_BATCHES}") + print(f" Image Size: {IMAGE_SIZE}x{IMAGE_SIZE}") + print(f" Num Classes: {NUM_CLASSES}") + print("=" * 70) + + +def validate_data_availability(): + """ + Validate that required data is available (K8s best practice). + + Strategy: + 1. Rank 0 checks data first and reports status + 2. All ranks independently validate data (no barrier needed before init_process_group) + 3. Exit gracefully if data missing + + Note: For K8s deployments, MAD_DATAHOME points to PVC mount point (/data). + This ensures data is shared across all pods (single-node and multi-node). + PVC must be configured with ReadWriteMany for multi-node deployments. + + Returns: + bool: True if data is available, False otherwise + """ + # K8s best practice: Data stored in PVC at /data (separate from compute pods) + data_home = os.environ.get("MAD_DATAHOME", "/data") + data_path = pathlib.Path(data_home) / DATA_FILE + + if rank == 0: + print(f"\n{'='*70}") + print("Data Provider Validation") + print(f"{'='*70}") + print(f"Data Home: {data_home}") + print(f"Expected File: {DATA_FILE}") + print(f"Full Path: {data_path}") + + if data_path.exists(): + file_size = data_path.stat().st_size + file_size_mb = file_size / (1024 * 1024) + print(f"✅ Data file found!") + print(f" Size: {file_size_mb:.2f} MB ({file_size:,} bytes)") + print(f" Path: {data_path}") + else: + print(f"❌ Data file NOT found!") + print(f" Expected at: {data_path}") + print(f" MAD_DATAHOME: {data_home}") + print(f"\n⚠️ Data provider should have downloaded this file.") + print(f" Check data provider configuration and logs.") + print(f"{'='*70}\n") + + # Note: Cannot use dist.barrier() here - process group not initialized yet + # Data validation happens before distributed initialization + # All ranks will independently validate data availability without synchronization + + # All ranks independently validate data exists + data_available = data_path.exists() + + if not data_available: + print(f"[Rank {rank}] ❌ ERROR: Data file not found at {data_path}") + else: + print(f"[Rank {rank}] ✅ Data file validated: {data_path}") + + return data_available + + +class SimpleCNN(nn.Module): + """Simple CNN model for benchmarking""" + def __init__(self, num_classes=1000): + super(SimpleCNN, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.bn1 = nn.BatchNorm2d(64) + self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2d(128) + + self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1) + self.bn3 = nn.BatchNorm2d(256) + + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(256, num_classes) + + def forward(self, x): + x = self.pool(F.relu(self.bn1(self.conv1(x)))) + x = self.pool(F.relu(self.bn2(self.conv2(x)))) + x = self.pool(F.relu(self.bn3(self.conv3(x)))) + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + return x + + +def generate_synthetic_batch(batch_size, device): + """Generate synthetic data for benchmarking""" + images = torch.randn(batch_size, 3, IMAGE_SIZE, IMAGE_SIZE, device=device) + labels = torch.randint(0, NUM_CLASSES, (batch_size,), device=device) + return images, labels + + +def train_epoch(model, optimizer, criterion, epoch, device): + """Train for one epoch with accurate distributed throughput measurement""" + model.train() + epoch_start = time.time() + total_samples = 0 + total_loss = 0.0 + + for batch_idx in range(NUM_BATCHES): + batch_start = time.time() + + # Generate synthetic data + images, labels = generate_synthetic_batch(BATCH_SIZE, device) + + # Forward pass + optimizer.zero_grad() + outputs = model(images) + loss = criterion(outputs, labels) + + # Backward pass (gradients are automatically synchronized across GPUs) + loss.backward() + + # Update weights + optimizer.step() + + batch_time = time.time() - batch_start + total_samples += BATCH_SIZE + total_loss += loss.item() + + # Print progress from rank 0 + if rank == 0 and (batch_idx + 1) % 20 == 0: + avg_loss = total_loss / (batch_idx + 1) + throughput = BATCH_SIZE * world_size / batch_time + print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec") + + epoch_time = time.time() - epoch_start + avg_loss = total_loss / NUM_BATCHES + + # ======================================================================== + # Accurate Distributed Throughput Measurement (Best Practice) + # ======================================================================== + # Calculate local throughput for this rank + local_samples = NUM_BATCHES * BATCH_SIZE + local_throughput = local_samples / epoch_time + + # Aggregate metrics across all ranks using all_reduce + if world_size > 1: + # Convert to tensors for all_reduce + local_throughput_tensor = torch.tensor([local_throughput], device=device) + epoch_time_tensor = torch.tensor([epoch_time], device=device) + + # Sum all local throughputs to get true global throughput + global_throughput_tensor = local_throughput_tensor.clone() + dist.all_reduce(global_throughput_tensor, op=dist.ReduceOp.SUM) + + # Get max epoch time (slowest node determines overall speed) + max_epoch_time_tensor = epoch_time_tensor.clone() + dist.all_reduce(max_epoch_time_tensor, op=dist.ReduceOp.MAX) + + # Get min epoch time (fastest node) + min_epoch_time_tensor = epoch_time_tensor.clone() + dist.all_reduce(min_epoch_time_tensor, op=dist.ReduceOp.MIN) + + global_throughput = global_throughput_tensor.item() + max_epoch_time = max_epoch_time_tensor.item() + min_epoch_time = min_epoch_time_tensor.item() + + # Calculate load imbalance + time_imbalance = ((max_epoch_time - min_epoch_time) / max_epoch_time) * 100 if max_epoch_time > 0 else 0.0 + + else: + # Single GPU + global_throughput = local_throughput + max_epoch_time = epoch_time + min_epoch_time = epoch_time + time_imbalance = 0.0 + + # Return metrics dictionary + metrics = { + 'avg_loss': avg_loss, + 'local_throughput': local_throughput, + 'global_throughput': global_throughput, + 'epoch_time': epoch_time, + 'max_epoch_time': max_epoch_time, + 'min_epoch_time': min_epoch_time, + 'time_imbalance': time_imbalance + } + + return metrics + + +def main(): + """Main training function""" + print_header() + + # Create per-process MIOpen cache directory to avoid database conflicts + # This must be done AFTER torchrun sets LOCAL_RANK environment variable + if "MIOPEN_USER_DB_PATH" in os.environ: + # Construct the per-process MIOpen path using actual local_rank value + miopen_template = os.environ["MIOPEN_USER_DB_PATH"] + miopen_path = miopen_template.replace("${LOCAL_RANK:-0}", str(local_rank)).replace("$LOCAL_RANK", str(local_rank)) + os.makedirs(miopen_path, exist_ok=True) + print(f"[Rank {rank}] ✓ Created MIOpen cache directory: {miopen_path}") + + # ======================================================================== + # K8s Best Practice: Validate Data Before Initializing Training + # ======================================================================== + if rank == 0: + print(f"\n{'='*70}") + print("Step 1: Data Provider Validation") + print(f"{'='*70}") + + # Validate data availability (all ranks) + data_available = validate_data_availability() + + if not data_available: + # Exit gracefully if data is not available + if rank == 0: + print(f"\n{'='*70}") + print("❌ FAILED: Required data not available") + print(f"{'='*70}") + print("Exiting...") + sys.exit(1) + + if rank == 0: + print(f"\n✅ Data validation complete - proceeding with training\n") + + # ======================================================================== + # Initialize Distributed Training + # ======================================================================== + if world_size > 1: + if rank == 0: + print(f"{'='*70}") + print("Step 2: Initialize Distributed Training") + print(f"{'='*70}") + + print(f"\n[Rank {rank}] Initializing distributed process group...") + # Best practice: Specify device_ids to avoid PyTorch warnings + dist.init_process_group( + backend="nccl", + init_method=f"env://", # Use environment variables (set by torchrun) + world_size=world_size, + rank=rank + ) + print(f"[Rank {rank}] ✓ Process group initialized") + print(f"[Rank {rank}] Backend: {dist.get_backend()}") + print(f"[Rank {rank}] World Size: {dist.get_world_size()}") + else: + print(f"\n=== Running in Standalone Mode (Single GPU) ===") + + # Set device + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") + print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") + + if local_rank >= num_gpus: + print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print(f"[Rank {rank}] Using cuda:0 instead") + device = torch.device("cuda:0") + else: + device = torch.device(f"cuda:{local_rank}") + + torch.cuda.set_device(device) + print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") + else: + device = torch.device("cpu") + print(f"[Rank {rank}] Warning: CUDA not available, using CPU") + + # Create model + print(f"\n[Rank {rank}] Creating model...") + model = SimpleCNN(num_classes=NUM_CLASSES).to(device) + + # Wrap model with DDP for distributed training + if world_size > 1: + # Best practice: Explicitly specify device_ids for DDP + model = DDP( + model, + device_ids=[local_rank], + output_device=local_rank, + broadcast_buffers=True, # Ensure buffers (like BatchNorm stats) are synced + find_unused_parameters=False # Set True only if needed (performance impact) + ) + print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") + + # Create optimizer and loss function + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + criterion = nn.CrossEntropyLoss() + + # Synchronize before training + if world_size > 1: + # Best practice: Specify device to avoid warnings + dist.barrier(device_ids=[local_rank]) + + if rank == 0: + print(f"\n{'='*70}") + print("Starting Training") + print(f"{'='*70}") + + # Training loop + all_metrics = [] + for epoch in range(NUM_EPOCHS): + metrics = train_epoch( + model, optimizer, criterion, epoch, device + ) + all_metrics.append(metrics) + + if rank == 0: + print(f"\nEpoch [{epoch+1}/{NUM_EPOCHS}] Complete:") + print(f" Average Loss: {metrics['avg_loss']:.4f}") + print(f" Global Throughput: {metrics['global_throughput']:.2f} samples/sec") + print(f" Images/sec: {metrics['global_throughput']:.2f}") + + # Show load imbalance warning if significant + if metrics['time_imbalance'] > 5.0: + print(f" ⚠️ Load Imbalance: {metrics['time_imbalance']:.1f}%") + + # Calculate average metrics across all epochs + avg_global_throughput = sum(m['global_throughput'] for m in all_metrics) / len(all_metrics) + avg_local_throughput = sum(m['local_throughput'] for m in all_metrics) / len(all_metrics) + avg_time_imbalance = sum(m['time_imbalance'] for m in all_metrics) / len(all_metrics) + + # Get topology information + nproc_per_node = int(os.environ.get("LOCAL_WORLD_SIZE", world_size)) + num_nodes = (world_size + nproc_per_node - 1) // nproc_per_node if nproc_per_node > 0 else 1 + node_rank = rank // nproc_per_node if nproc_per_node > 0 else 0 + + # Synchronize before final output + if world_size > 1: + dist.barrier(device_ids=[local_rank]) + + # Each node's rank 0 reports local performance + if local_rank == 0: + print(f"\n[Node {node_rank}] Local Performance Summary:") + print(f" Node Throughput: {avg_local_throughput * nproc_per_node:.2f} samples/sec") + print(f" GPUs on Node: {nproc_per_node}") + print(f" Avg Time per Epoch: {all_metrics[-1]['epoch_time']:.2f}s") + + # Synchronize again before global rank 0 output + if world_size > 1: + dist.barrier(device_ids=[local_rank]) + + # Global rank 0 reports aggregated performance + if rank == 0: + print(f"\n{'='*70}") + print("Training Complete - GLOBAL METRICS") + print(f"{'='*70}") + print(f"Topology: {num_nodes} nodes × {nproc_per_node} GPUs/node = {world_size} total GPUs") + print(f"Global Throughput: {avg_global_throughput:.2f} samples/sec") + print(f"Per-GPU Throughput: {avg_global_throughput/world_size:.2f} samples/sec") + print(f"Global Batch Size: {BATCH_SIZE * world_size}") + + # Calculate scaling efficiency + # Ideal throughput = single GPU throughput * number of GPUs + ideal_single_gpu_throughput = avg_global_throughput / world_size + ideal_throughput = ideal_single_gpu_throughput * world_size + scaling_efficiency = (avg_global_throughput / ideal_throughput) * 100 if ideal_throughput > 0 else 100.0 + print(f"Scaling Efficiency: {scaling_efficiency:.1f}%") + + if avg_time_imbalance > 5.0: + print(f"Average Load Imbalance: {avg_time_imbalance:.1f}%") + + print(f"{'='*70}") + + # Save results with topology information + with open("training_results.txt", "w") as f: + f.write(f"Training Results with Data Provider\n") + f.write(f"====================================\n") + f.write(f"Hostname: {socket.gethostname()}\n") + f.write(f"Data File: {DATA_FILE}\n") + f.write(f"Topology: {num_nodes} nodes × {nproc_per_node} GPUs/node\n") + f.write(f"World Size: {world_size}\n") + f.write(f"Global Batch Size: {BATCH_SIZE * world_size}\n") + f.write(f"Epochs: {NUM_EPOCHS}\n") + f.write(f"Global Throughput: {avg_global_throughput:.2f} samples/sec\n") + f.write(f"Scaling Efficiency: {scaling_efficiency:.1f}%\n") + + # Output performance metric for madengine (REQUIRED FORMAT) + # Use GLOBAL throughput (sum of all nodes - accurate measurement) + print(f"\nperformance: {avg_global_throughput:.2f} samples_per_second") + + # Output topology metadata for parsing + print(f"topology: {num_nodes} nodes {nproc_per_node} gpus_per_node {world_size} total_gpus") + print(f"scaling_efficiency: {scaling_efficiency:.2f}") + + # Cleanup + if world_size > 1: + dist.destroy_process_group() + if rank == 0: + print(f"✓ Process group destroyed") + + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as e: + print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_nas.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_nas.py new file mode 100755 index 00000000..5599981f --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_torchrun_data_nas.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 +""" +PyTorch Distributed Training with NAS Data Provider for madengine + +This benchmark demonstrates distributed training with NAS data provider integration: +- Multi-node/multi-GPU distributed training with DDP +- NAS data provider support (mounted filesystem or downloaded data) +- K8s-optimized data handling (single download, shared across nodes via PVC) +- Proper synchronization and validation +- Accurate performance measurement with all_reduce + +K8s Best Practices: +- Only rank 0 validates data initially (avoid race conditions) +- All ranks validate data exists before training +- Use distributed barriers for synchronization +- Graceful error handling and reporting +- PVC-shared data across all pods/nodes + +Usage: + # K8s Multi-node with NAS data provider + torchrun --nnodes=2 --nproc_per_node=2 --master_addr=... run_torchrun_data_nas.py +""" + +import os +import sys +import time +import socket +import pathlib +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +# Configuration +BATCH_SIZE = 128 # Per-GPU batch size +NUM_EPOCHS = 5 +NUM_BATCHES = 100 # Number of synthetic batches per epoch +IMAGE_SIZE = 224 +NUM_CLASSES = 1000 + +# Data configuration - NAS may contain any files +# We validate that MAD_DATAHOME exists and has content (like run_data_nas.sh) +# No specific file required - synthetic data used for training benchmark + +# Get distributed environment variables (set by torchrun) +rank = int(os.environ.get("RANK", 0)) +local_rank = int(os.environ.get("LOCAL_RANK", 0)) +world_size = int(os.environ.get("WORLD_SIZE", 1)) +master_addr = os.environ.get("MASTER_ADDR", "localhost") +master_port = os.environ.get("MASTER_PORT", "29500") + +# NOTE: MIOpen directory creation moved to main() after LOCAL_RANK is available + + +def print_header(): + """Print benchmark header""" + print("=" * 70) + print("madengine PyTorch Distributed Training with NAS Data Provider") + print("=" * 70) + print(f"Hostname: {socket.gethostname()}") + print(f"Rank: {rank}/{world_size}") + print(f"Local Rank (GPU): {local_rank}") + if world_size > 1: + print(f"Master: {master_addr}:{master_port}") + print(f"\nConfiguration:") + print(f" Batch Size (per GPU): {BATCH_SIZE}") + print(f" Global Batch Size: {BATCH_SIZE * world_size}") + print(f" Epochs: {NUM_EPOCHS}") + print(f" Batches per Epoch: {NUM_BATCHES}") + print(f" Image Size: {IMAGE_SIZE}x{IMAGE_SIZE}") + print(f" Num Classes: {NUM_CLASSES}") + print("=" * 70) + + +def validate_data_availability(): + """ + Validate that required data is available from NAS (K8s best practice). + + Strategy: + 1. Rank 0 checks data first and reports status + 2. All ranks independently validate data (no barrier needed before init_process_group) + 3. Exit gracefully if data missing + + Note: For K8s deployments, MAD_DATAHOME points to PVC mount point (/data). + This ensures data is shared across all pods (single-node and multi-node). + PVC must be configured with ReadWriteMany for multi-node deployments. + + NAS can be either: + - Mounted filesystem (traditional NAS) + - Downloaded data to directory (K8s with data provider) + + Similar to run_data_nas.sh: We just verify the data home directory exists and + optionally has content. No specific file is required - we use synthetic data for + training benchmarks. + + Returns: + bool: True if data is available, False otherwise + """ + # K8s best practice: Data stored in PVC at /data (separate from compute pods) + data_home = os.environ.get("MAD_DATAHOME", "/data") + data_home_path = pathlib.Path(data_home) + + if rank == 0: + print(f"\n{'='*70}") + print("NAS Data Provider Validation") + print(f"{'='*70}") + print(f"Data Home: {data_home}") + + # Check if data directory exists + if not data_home_path.exists(): + print(f"❌ Data home directory NOT found!") + print(f" Expected: {data_home}") + print(f" MAD_DATAHOME must be set and directory must exist") + else: + print(f"✅ Data home directory exists: {data_home}") + + # Check if directory has content (similar to run_data_nas.sh) + try: + dir_contents = list(data_home_path.iterdir()) + if not dir_contents: + print(f"⚠️ Data home directory is EMPTY") + print(f" This is okay for test environments") + print(f" (data provider works but source is empty)") + else: + print(f"✅ Data home has {len(dir_contents)} items") + # List first few files for verification + print(f" Contents:") + for i, item in enumerate(dir_contents[:5]): + item_type = "DIR" if item.is_dir() else "FILE" + size_info = "" + if item.is_file(): + size_mb = item.stat().st_size / (1024 * 1024) + size_info = f" ({size_mb:.2f} MB)" + print(f" - [{item_type}] {item.name}{size_info}") + if len(dir_contents) > 5: + print(f" ... and {len(dir_contents) - 5} more items") + except PermissionError: + print(f"⚠️ Cannot read directory contents (permission denied)") + print(f" Directory exists but contents not accessible") + + print(f"{'='*70}\n") + + # Note: Cannot use dist.barrier() here - process group not initialized yet + # Data validation happens before distributed initialization + # All ranks will independently validate data availability without synchronization + + # All ranks independently validate data home exists + # We don't require a specific file - just that the directory exists + data_available = data_home_path.exists() + + if not data_available: + print(f"[Rank {rank}] ❌ ERROR: Data home not found at {data_home}") + else: + print(f"[Rank {rank}] ✅ Data home validated: {data_home}") + + return data_available + + +class SimpleCNN(nn.Module): + """Simple CNN model for benchmarking""" + def __init__(self, num_classes=1000): + super(SimpleCNN, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.bn1 = nn.BatchNorm2d(64) + self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1) + self.bn2 = nn.BatchNorm2d(128) + + self.conv3 = nn.Conv2d(128, 256, kernel_size=3, padding=1) + self.bn3 = nn.BatchNorm2d(256) + + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(256, num_classes) + + def forward(self, x): + x = self.pool(F.relu(self.bn1(self.conv1(x)))) + x = self.pool(F.relu(self.bn2(self.conv2(x)))) + x = self.pool(F.relu(self.bn3(self.conv3(x)))) + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + return x + + +def generate_synthetic_batch(batch_size, device): + """Generate synthetic data for benchmarking""" + images = torch.randn(batch_size, 3, IMAGE_SIZE, IMAGE_SIZE, device=device) + labels = torch.randint(0, NUM_CLASSES, (batch_size,), device=device) + return images, labels + + +def train_epoch(model, optimizer, criterion, epoch, device): + """Train for one epoch with accurate distributed throughput measurement""" + model.train() + epoch_start = time.time() + total_samples = 0 + total_loss = 0.0 + + for batch_idx in range(NUM_BATCHES): + batch_start = time.time() + + # Generate synthetic data + images, labels = generate_synthetic_batch(BATCH_SIZE, device) + + # Forward pass + optimizer.zero_grad() + outputs = model(images) + loss = criterion(outputs, labels) + + # Backward pass (gradients are automatically synchronized across GPUs) + loss.backward() + + # Update weights + optimizer.step() + + batch_time = time.time() - batch_start + total_samples += BATCH_SIZE + total_loss += loss.item() + + # Print progress from rank 0 + if rank == 0 and (batch_idx + 1) % 20 == 0: + avg_loss = total_loss / (batch_idx + 1) + throughput = BATCH_SIZE * world_size / batch_time + print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] " + f"Batch [{batch_idx+1}/{NUM_BATCHES}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec") + + epoch_time = time.time() - epoch_start + avg_loss = total_loss / NUM_BATCHES + + # ======================================================================== + # Accurate Distributed Throughput Measurement (Best Practice) + # ======================================================================== + # Calculate local throughput for this rank + local_samples = NUM_BATCHES * BATCH_SIZE + local_throughput = local_samples / epoch_time + + # Aggregate metrics across all ranks using all_reduce + if world_size > 1: + # Convert to tensors for all_reduce + local_throughput_tensor = torch.tensor([local_throughput], device=device) + epoch_time_tensor = torch.tensor([epoch_time], device=device) + + # Sum all local throughputs to get true global throughput + global_throughput_tensor = local_throughput_tensor.clone() + dist.all_reduce(global_throughput_tensor, op=dist.ReduceOp.SUM) + + # Get max epoch time (slowest node determines overall speed) + max_epoch_time_tensor = epoch_time_tensor.clone() + dist.all_reduce(max_epoch_time_tensor, op=dist.ReduceOp.MAX) + + # Get min epoch time (fastest node) + min_epoch_time_tensor = epoch_time_tensor.clone() + dist.all_reduce(min_epoch_time_tensor, op=dist.ReduceOp.MIN) + + global_throughput = global_throughput_tensor.item() + max_epoch_time = max_epoch_time_tensor.item() + min_epoch_time = min_epoch_time_tensor.item() + + # Calculate load imbalance + time_imbalance = ((max_epoch_time - min_epoch_time) / max_epoch_time) * 100 if max_epoch_time > 0 else 0.0 + + else: + # Single GPU + global_throughput = local_throughput + max_epoch_time = epoch_time + min_epoch_time = epoch_time + time_imbalance = 0.0 + + # Return metrics dictionary + metrics = { + 'avg_loss': avg_loss, + 'local_throughput': local_throughput, + 'global_throughput': global_throughput, + 'epoch_time': epoch_time, + 'max_epoch_time': max_epoch_time, + 'min_epoch_time': min_epoch_time, + 'time_imbalance': time_imbalance + } + + return metrics + + +def main(): + """Main training function""" + print_header() + + # Create per-process MIOpen cache directory to avoid database conflicts + # This must be done AFTER torchrun sets LOCAL_RANK environment variable + if "MIOPEN_USER_DB_PATH" in os.environ: + # Construct the per-process MIOpen path using actual local_rank value + miopen_template = os.environ["MIOPEN_USER_DB_PATH"] + miopen_path = miopen_template.replace("${LOCAL_RANK:-0}", str(local_rank)).replace("$LOCAL_RANK", str(local_rank)) + os.makedirs(miopen_path, exist_ok=True) + print(f"[Rank {rank}] ✓ Created MIOpen cache directory: {miopen_path}") + + # ======================================================================== + # K8s Best Practice: Validate Data Before Initializing Training + # ======================================================================== + if rank == 0: + print(f"\n{'='*70}") + print("Step 1: NAS Data Provider Validation") + print(f"{'='*70}") + + # Validate data availability (all ranks) + data_available = validate_data_availability() + + if not data_available: + # Exit gracefully if data is not available + if rank == 0: + print(f"\n{'='*70}") + print("❌ FAILED: Required data not available") + print(f"{'='*70}") + print("Exiting...") + sys.exit(1) + + if rank == 0: + print(f"\n✅ Data validation complete - proceeding with training\n") + + # ======================================================================== + # Initialize Distributed Training + # ======================================================================== + if world_size > 1: + if rank == 0: + print(f"{'='*70}") + print("Step 2: Initialize Distributed Training") + print(f"{'='*70}") + + print(f"\n[Rank {rank}] Initializing distributed process group...") + # Best practice: Specify device_ids to avoid PyTorch warnings + dist.init_process_group( + backend="nccl", + init_method=f"env://", # Use environment variables (set by torchrun) + world_size=world_size, + rank=rank + ) + print(f"[Rank {rank}] ✓ Process group initialized") + print(f"[Rank {rank}] Backend: {dist.get_backend()}") + print(f"[Rank {rank}] World Size: {dist.get_world_size()}") + else: + print(f"\n=== Running in Standalone Mode (Single GPU) ===") + + # Set device + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") + print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") + + if local_rank >= num_gpus: + print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print(f"[Rank {rank}] Using cuda:0 instead") + device = torch.device("cuda:0") + else: + device = torch.device(f"cuda:{local_rank}") + + torch.cuda.set_device(device) + print(f"[Rank {rank}] Using GPU: {torch.cuda.get_device_name(device)}") + else: + device = torch.device("cpu") + print(f"[Rank {rank}] Warning: CUDA not available, using CPU") + + # Create model + print(f"\n[Rank {rank}] Creating model...") + model = SimpleCNN(num_classes=NUM_CLASSES).to(device) + + # Wrap model with DDP for distributed training + if world_size > 1: + # Best practice: Explicitly specify device_ids for DDP + model = DDP( + model, + device_ids=[local_rank], + output_device=local_rank, + broadcast_buffers=True, # Ensure buffers (like BatchNorm stats) are synced + find_unused_parameters=False # Set True only if needed (performance impact) + ) + print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") + + # Create optimizer and loss function + optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + criterion = nn.CrossEntropyLoss() + + # Synchronize before training + if world_size > 1: + # Best practice: Specify device to avoid warnings + dist.barrier(device_ids=[local_rank]) + + if rank == 0: + print(f"\n{'='*70}") + print("Starting Training") + print(f"{'='*70}") + + # Training loop + all_metrics = [] + for epoch in range(NUM_EPOCHS): + metrics = train_epoch( + model, optimizer, criterion, epoch, device + ) + all_metrics.append(metrics) + + if rank == 0: + print(f"\nEpoch [{epoch+1}/{NUM_EPOCHS}] Complete:") + print(f" Average Loss: {metrics['avg_loss']:.4f}") + print(f" Global Throughput: {metrics['global_throughput']:.2f} samples/sec") + print(f" Images/sec: {metrics['global_throughput']:.2f}") + + # Show load imbalance warning if significant + if metrics['time_imbalance'] > 5.0: + print(f" ⚠️ Load Imbalance: {metrics['time_imbalance']:.1f}%") + + # Calculate average metrics across all epochs + avg_global_throughput = sum(m['global_throughput'] for m in all_metrics) / len(all_metrics) + avg_local_throughput = sum(m['local_throughput'] for m in all_metrics) / len(all_metrics) + avg_time_imbalance = sum(m['time_imbalance'] for m in all_metrics) / len(all_metrics) + + # Get topology information + nproc_per_node = int(os.environ.get("LOCAL_WORLD_SIZE", world_size)) + num_nodes = (world_size + nproc_per_node - 1) // nproc_per_node if nproc_per_node > 0 else 1 + node_rank = rank // nproc_per_node if nproc_per_node > 0 else 0 + + # Synchronize before final output + if world_size > 1: + dist.barrier(device_ids=[local_rank]) + + # Each node's rank 0 reports local performance + if local_rank == 0: + print(f"\n[Node {node_rank}] Local Performance Summary:") + print(f" Node Throughput: {avg_local_throughput * nproc_per_node:.2f} samples/sec") + print(f" GPUs on Node: {nproc_per_node}") + print(f" Avg Time per Epoch: {all_metrics[-1]['epoch_time']:.2f}s") + + # Synchronize again before global rank 0 output + if world_size > 1: + dist.barrier(device_ids=[local_rank]) + + # Global rank 0 reports aggregated performance + if rank == 0: + print(f"\n{'='*70}") + print("Training Complete - GLOBAL METRICS") + print(f"{'='*70}") + print(f"Topology: {num_nodes} nodes × {nproc_per_node} GPUs/node = {world_size} total GPUs") + print(f"Global Throughput: {avg_global_throughput:.2f} samples/sec") + print(f"Per-GPU Throughput: {avg_global_throughput/world_size:.2f} samples/sec") + print(f"Global Batch Size: {BATCH_SIZE * world_size}") + + # Calculate scaling efficiency + # Ideal throughput = single GPU throughput * number of GPUs + ideal_single_gpu_throughput = avg_global_throughput / world_size + ideal_throughput = ideal_single_gpu_throughput * world_size + scaling_efficiency = (avg_global_throughput / ideal_throughput) * 100 if ideal_throughput > 0 else 100.0 + print(f"Scaling Efficiency: {scaling_efficiency:.1f}%") + + if avg_time_imbalance > 5.0: + print(f"Average Load Imbalance: {avg_time_imbalance:.1f}%") + + print(f"{'='*70}") + + # Save results with topology information + data_home = os.environ.get("MAD_DATAHOME", "/data") + with open("training_results.txt", "w") as f: + f.write(f"Training Results with NAS Data Provider\n") + f.write(f"========================================\n") + f.write(f"Hostname: {socket.gethostname()}\n") + f.write(f"Data Home: {data_home}\n") + f.write(f"Topology: {num_nodes} nodes × {nproc_per_node} GPUs/node\n") + f.write(f"World Size: {world_size}\n") + f.write(f"Global Batch Size: {BATCH_SIZE * world_size}\n") + f.write(f"Epochs: {NUM_EPOCHS}\n") + f.write(f"Global Throughput: {avg_global_throughput:.2f} samples/sec\n") + f.write(f"Scaling Efficiency: {scaling_efficiency:.1f}%\n") + + # Output performance metric for madengine (REQUIRED FORMAT) + # Use GLOBAL throughput (sum of all nodes - accurate measurement) + print(f"\nperformance: {avg_global_throughput:.2f} samples_per_second") + + # Output topology metadata for parsing + print(f"topology: {num_nodes} nodes {nproc_per_node} gpus_per_node {world_size} total_gpus") + print(f"scaling_efficiency: {scaling_efficiency:.2f}") + + # Cleanup + if world_size > 1: + dist.destroy_process_group() + if rank == 0: + print(f"✓ Process group destroyed") + + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as e: + print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + diff --git a/tests/fixtures/dummy/scripts/dummy_torchrun/run_with_helper.py b/tests/fixtures/dummy/scripts/dummy_torchrun/run_with_helper.py new file mode 100644 index 00000000..68329eb5 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchrun/run_with_helper.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +""" +PyTorch Distributed Training Benchmark with Helper Modules + +This script demonstrates: +- Multi-file Python project structure +- Importing model architecture from helper module +- Separating concerns (config, model, training) +- Best practices for distributed training +""" + +import os +import sys +import time +import socket +import torch +import torch.nn as nn +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +# Import from helper module +from helper import ( + ResNetModel, + SyntheticDataset, + BenchmarkConfig, + print_distributed_info, + print_gpu_info, + calculate_model_size +) + +# Get distributed environment variables (set by torchrun) +rank = int(os.environ.get("RANK", 0)) +local_rank = int(os.environ.get("LOCAL_RANK", 0)) +world_size = int(os.environ.get("WORLD_SIZE", 1)) + + +def print_header(config): + """Print benchmark header""" + print("=" * 70) + print("madengine PyTorch Benchmark (with Helper Modules)") + print("=" * 70) + print(f"Hostname: {socket.gethostname()}") + print(f"Rank: {rank}/{world_size}") + print(f"Local Rank (GPU): {local_rank}") + print(f"\n{config}") + print("=" * 70) + + +def train_epoch(model, dataset, optimizer, criterion, epoch, device, config): + """Train for one epoch""" + model.train() + epoch_start = time.time() + total_loss = 0.0 + + for batch_idx in range(dataset.num_batches): + batch_start = time.time() + + # Generate synthetic data + images, labels = dataset.generate_batch(device) + + # Forward pass + optimizer.zero_grad() + outputs = model(images) + loss = criterion(outputs, labels) + + # Backward pass (gradients automatically synchronized) + loss.backward() + + # Update weights + optimizer.step() + + batch_time = time.time() - batch_start + total_loss += loss.item() + + # Print progress from rank 0 + if rank == 0 and (batch_idx + 1) % 20 == 0: + avg_loss = total_loss / (batch_idx + 1) + throughput = config.batch_size * world_size / batch_time + print(f"Epoch [{epoch+1}/{config.num_epochs}] " + f"Batch [{batch_idx+1}/{dataset.num_batches}] " + f"Loss: {loss.item():.4f} " + f"Throughput: {throughput:.2f} samples/sec") + + epoch_time = time.time() - epoch_start + avg_loss = total_loss / dataset.num_batches + epoch_throughput = (dataset.num_batches * config.batch_size * world_size) / epoch_time + + return avg_loss, epoch_throughput + + +def main(): + """Main training function""" + # Load configuration + config = BenchmarkConfig() + + print_header(config) + + # Print distributed info + print_distributed_info(rank, local_rank, world_size) + + # Initialize distributed training + if world_size > 1: + print(f"\n[Rank {rank}] Initializing distributed process group...") + # Best practice: Specify device_ids to avoid PyTorch warnings + dist.init_process_group( + backend="nccl", + init_method=f"env://", # Use environment variables (set by torchrun) + world_size=world_size, + rank=rank + ) + print(f"[Rank {rank}] ✓ Process group initialized") + print(f"[Rank {rank}] Backend: {dist.get_backend()}") + print(f"[Rank {rank}] World Size: {dist.get_world_size()}") + else: + print(f"\n=== Running in Standalone Mode (Single GPU) ===") + + # Set device + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + print(f"[Rank {rank}] PyTorch sees {num_gpus} GPU(s)") + print(f"[Rank {rank}] LOCAL_RANK={local_rank}, attempting to use cuda:{local_rank}") + + if local_rank >= num_gpus: + print(f"[Rank {rank}] ERROR: LOCAL_RANK {local_rank} >= available GPUs {num_gpus}") + print(f"[Rank {rank}] Using cuda:0 instead") + device = torch.device("cuda:0") + else: + device = torch.device(f"cuda:{local_rank}") + + torch.cuda.set_device(device) + print_gpu_info(rank, device) + else: + device = torch.device("cpu") + print(f"[Rank {rank}] Warning: CUDA not available, using CPU") + + # Create model from helper module + print(f"\n[Rank {rank}] Creating ResNet model from helper module...") + model = ResNetModel( + num_classes=config.num_classes, + num_blocks=config.resnet_blocks + ).to(device) + + # Print model info + if rank == 0: + total_params, trainable_params = calculate_model_size(model) + print(f"\nModel Statistics:") + print(f" Total Parameters: {total_params:,}") + print(f" Trainable Parameters: {trainable_params:,}") + print(f" Model Size: {total_params * 4 / 1e6:.2f} MB (FP32)") + + # Wrap model with DDP for distributed training + if world_size > 1: + # Best practice: Explicitly specify device_ids for DDP + model = DDP( + model, + device_ids=[local_rank], + output_device=local_rank, + broadcast_buffers=True, # Ensure buffers (like BatchNorm stats) are synced + find_unused_parameters=False # Set True only if needed (performance impact) + ) + print(f"[Rank {rank}] ✓ Model wrapped with DistributedDataParallel") + + # Create dataset + dataset = SyntheticDataset( + num_samples=config.num_batches * config.batch_size, + batch_size=config.batch_size, + image_size=config.image_size, + num_classes=config.num_classes + ) + + # Create optimizer and loss function + optimizer = torch.optim.SGD( + model.parameters(), + lr=config.learning_rate, + momentum=config.momentum, + weight_decay=config.weight_decay + ) + criterion = nn.CrossEntropyLoss() + + # Synchronize before training + if world_size > 1: + # Best practice: Specify device to avoid warnings + dist.barrier(device_ids=[local_rank]) + + if rank == 0: + print(f"\n{'='*70}") + print("Starting Training") + print(f"{'='*70}") + + # Training loop + all_throughputs = [] + for epoch in range(config.num_epochs): + avg_loss, epoch_throughput = train_epoch( + model, dataset, optimizer, criterion, epoch, device, config + ) + all_throughputs.append(epoch_throughput) + + if rank == 0: + print(f"\nEpoch [{epoch+1}/{config.num_epochs}] Complete:") + print(f" Average Loss: {avg_loss:.4f}") + print(f" Throughput: {epoch_throughput:.2f} samples/sec") + + # Calculate final metrics + avg_throughput = sum(all_throughputs) / len(all_throughputs) + + # Synchronize before final output + if world_size > 1: + dist.barrier(device_ids=[local_rank]) + + if rank == 0: + print(f"\n{'='*70}") + print("Training Complete") + print(f"{'='*70}") + print(f"Average Throughput: {avg_throughput:.2f} samples/sec") + print(f"Global Batch Size: {config.batch_size * world_size}") + print(f"Number of GPUs: {world_size}") + print(f"Model: ResNet with {sum(config.resnet_blocks)} blocks") + print(f"{'='*70}") + + # Save results + with open("training_results_helper.txt", "w") as f: + f.write(f"Training Results (with Helper Modules)\n") + f.write(f"======================================\n") + f.write(f"Hostname: {socket.gethostname()}\n") + f.write(f"World Size: {world_size}\n") + f.write(f"Global Batch Size: {config.batch_size * world_size}\n") + f.write(f"Epochs: {config.num_epochs}\n") + f.write(f"Model: ResNet-{sum(config.resnet_blocks)*2+2}\n") + f.write(f"Average Throughput: {avg_throughput:.2f} samples/sec\n") + + # Output performance metric for madengine (REQUIRED FORMAT) + print(f"\nperformance: {avg_throughput:.2f} samples_per_second") + + # Cleanup + if world_size > 1: + dist.destroy_process_group() + if rank == 0: + print(f"✓ Process group destroyed") + + return 0 + + +if __name__ == "__main__": + try: + sys.exit(main()) + except Exception as e: + print(f"[Rank {rank}] ✗ Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) diff --git a/tests/fixtures/dummy/scripts/dummy_torchtitan/run.sh b/tests/fixtures/dummy/scripts/dummy_torchtitan/run.sh new file mode 100755 index 00000000..4408ec90 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchtitan/run.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# TorchTitan Training Test Script +# Minimal test for torchtitan launcher functionality + +set -e + +echo "======================================" +echo "TorchTitan madengine Test" +echo "======================================" +echo "Hostname: $(hostname)" +echo "Date: $(date)" +echo "" + +# Display distributed environment +echo "Distributed Environment:" +echo " RANK: ${RANK:-0}" +echo " LOCAL_RANK: ${LOCAL_RANK:-0}" +echo " WORLD_SIZE: ${WORLD_SIZE:-1}" +echo " MASTER_ADDR: ${MASTER_ADDR:-localhost}" +echo " MASTER_PORT: ${MASTER_PORT:-29500}" +echo "" + +echo "TorchTitan Configuration:" +echo " Tensor Parallel Size: ${TORCHTITAN_TENSOR_PARALLEL_SIZE:-1}" +echo " Pipeline Parallel Size: ${TORCHTITAN_PIPELINE_PARALLEL_SIZE:-1}" +echo " FSDP Enabled: ${TORCHTITAN_FSDP_ENABLED:-0}" +echo " Context Parallel Size: ${TORCHTITAN_CONTEXT_PARALLEL_SIZE:-1}" +echo "" + +# Create minimal torchtitan config +cat > /tmp/test_config.toml << 'EOF' +# Minimal TorchTitan test configuration +[job] +dump_folder = "/tmp/outputs" +description = "madengine torchtitan test" + +[profiling] +enable_profiling = false + +[model] +name = "llama3" +flavor = "debugmodel" # Minimal model for testing +norm_type = "rmsnorm" + +[optimizer] +name = "AdamW" +lr = 3e-4 + +[training] +batch_size = 1 +seq_len = 128 +steps = 10 +data_parallel_degree = -1 +tensor_parallel_degree = 1 +compile = false +dataset = "c4_test" + +[experimental] +enable_async_tensor_parallel = false + +[checkpoint] +enable_checkpoint = false + +[metrics] +log_freq = 1 +enable_tensorboard = false +EOF + +echo "Generated test config at /tmp/test_config.toml" +cat /tmp/test_config.toml +echo "" + +# Run torchtitan training +echo "Starting TorchTitan training..." +echo "Command: ${MAD_MULTI_NODE_RUNNER:-torchrun} /opt/torchtitan/train.py --job.config_file /tmp/test_config.toml" +echo "" + +# Execute via MAD_MULTI_NODE_RUNNER (set by deployment) or fallback to direct torchrun +if [ -n "$MAD_MULTI_NODE_RUNNER" ]; then + # Multi-GPU/Multi-node: Use launcher command from deployment + $MAD_MULTI_NODE_RUNNER /opt/torchtitan/train.py --job.config_file /tmp/test_config.toml +else + # Single GPU fallback + python /opt/torchtitan/train.py --job.config_file /tmp/test_config.toml +fi + +echo "" +echo "======================================" +echo "TorchTitan Test Complete" +echo "======================================" + +# Output performance metric for madengine +echo "performance: 100.0 tokens_per_second" + diff --git a/tests/fixtures/dummy/scripts/dummy_torchtitan/run_llama3_8b.sh b/tests/fixtures/dummy/scripts/dummy_torchtitan/run_llama3_8b.sh new file mode 100755 index 00000000..800a7b0d --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_torchtitan/run_llama3_8b.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# TorchTitan Llama 3.1 8B Training Script +# Full training example with model download and checkpointing + +set -e + +echo "======================================" +echo "TorchTitan Llama 3.1 8B Training" +echo "======================================" + +# Ensure torchtitan is available +if [ ! -d "/opt/torchtitan" ]; then + echo "Error: torchtitan not found at /opt/torchtitan" + exit 1 +fi + +cd /opt/torchtitan + +# Download tokenizer if not present (requires HF_TOKEN environment variable) +if [ -n "$HF_TOKEN" ] && [ ! -f "tokenizer.model" ]; then + echo "Downloading Llama 3.1 tokenizer..." + python scripts/download_hf_assets.py \ + --repo_id meta-llama/Llama-3.1-8B \ + --assets tokenizer \ + --hf_token=$HF_TOKEN +fi + +# Use config file if provided, otherwise use default 8B config +CONFIG_FILE=${TORCHTITAN_CONFIG:-"./torchtitan/models/llama3/train_configs/llama3_8b.toml"} + +echo "Using config: $CONFIG_FILE" +echo "Distributed setup: ${WORLD_SIZE:-1} GPUs across ${NNODES:-1} nodes" +echo "" + +# Run training via MAD launcher +if [ -n "$MAD_MULTI_NODE_RUNNER" ]; then + echo "Launching via: $MAD_MULTI_NODE_RUNNER" + $MAD_MULTI_NODE_RUNNER train.py --job.config_file $CONFIG_FILE +else + # Fallback to direct execution + python train.py --job.config_file $CONFIG_FILE +fi + +echo "" +echo "Training complete!" + +# Parse and output performance metric +if [ -f "/tmp/outputs/metrics.txt" ]; then + TOKENS_PER_SEC=$(grep "tokens/sec" /tmp/outputs/metrics.txt | tail -1 | awk '{print $NF}') + echo "performance: ${TOKENS_PER_SEC} tokens_per_second" +else + echo "performance: 0.0 tokens_per_second" +fi + diff --git a/tests/fixtures/dummy/scripts/dummy_vllm/README.md b/tests/fixtures/dummy/scripts/dummy_vllm/README.md new file mode 100644 index 00000000..1031c1bf --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_vllm/README.md @@ -0,0 +1,193 @@ +# vLLM Distributed Inference for madengine + +This directory contains vLLM inference benchmarking scripts for AMD ROCm GPUs. + +## ⚠️ IMPORTANT: ROCm Build Instructions + +**The current Dockerfile uses a mock vLLM module for testing infrastructure.** + +For **production deployments**, you must build vLLM from source with ROCm support: + +1. Uncomment the vLLM build section in `docker/dummy_vllm.ubuntu.amd.Dockerfile` +2. Or install manually: `pip install git+https://github.com/vllm-project/vllm.git` + +Note: vLLM's PyPI package (`pip install vllm`) is CUDA-only and will fail with ROCm. + +## Overview + +vLLM is a high-throughput and memory-efficient inference and serving engine for Large Language Models (LLMs). It features: + +- **PagedAttention**: Efficient KV cache management inspired by OS virtual memory paging +- **Continuous Batching**: Dynamic request batching for maximum throughput +- **Tensor Parallelism (TP)**: Split model weights across GPUs within a node +- **Pipeline Parallelism (PP)**: Split model layers across multiple nodes +- **ROCm Support**: Optimized for AMD Instinct GPUs (MI200/MI300 series) + +## Files + +- `run.sh`: Wrapper script that launches vLLM inference with proper environment setup +- `run_vllm_inference.py`: Main Python script that runs the vLLM benchmark +- `README.md`: This file + +## Architecture + +### Single-Node Multi-GPU (Tensor Parallelism) +``` +Node 1: [GPU0] [GPU1] [GPU2] [GPU3] + └──────── Model Split ────────┘ +``` +- Model weights split across all GPUs +- Each GPU holds a portion of the model +- Forward pass requires communication between GPUs + +### Multi-Node Multi-GPU (Tensor + Pipeline Parallelism) +``` +Node 1: [GPU0] [GPU1] [GPU2] [GPU3] <- Layers 1-N/2 +Node 2: [GPU0] [GPU1] [GPU2] [GPU3] <- Layers N/2+1-N +``` +- Pipeline parallelism splits layers across nodes +- Tensor parallelism splits weights within each node +- Optimized for very large models + +## Configuration + +### Environment Variables + +**vLLM Core Settings:** +- `VLLM_ALLOW_LONG_MAX_MODEL_LEN=1`: Allow longer sequence lengths +- `VLLM_USE_MODELSCOPE=False`: Disable ModelScope +- `VLLM_WORKER_MULTIPROC_METHOD=spawn`: Use spawn for multiprocessing +- `VLLM_LOGGING_LEVEL=INFO`: Set logging level + +**ROCm 7.x Optimizations:** +- `HSA_FORCE_FINE_GRAIN_PCIE=1`: Enable fine-grained PCIe access +- `HSA_ENABLE_SDMA=0`: Disable SDMA for stability +- `GPU_MAX_HW_QUEUES=2`: Optimize hardware queue configuration +- `NCCL_DEBUG=WARN`: NCCL debugging level +- `PYTORCH_ROCM_ARCH=gfx90a;gfx940;gfx941;gfx942`: Target AMD GPU architectures + +### Command Line Arguments + +The `run_vllm_inference.py` script accepts: + +- `--model`: Model name or path (default: `facebook/opt-125m`) +- `--tensor-parallel-size`: Number of GPUs for tensor parallelism +- `--pipeline-parallel-size`: Number of nodes for pipeline parallelism +- `--enforce-eager`: Disable CUDA graph for compatibility + +## Usage + +### Local Testing (Single GPU) +```bash +cd /path/to/scripts/dummy_vllm +python3 run_vllm_inference.py --model facebook/opt-125m +``` + +### Single-Node Multi-GPU (via madengine) +```bash +madengine run \ + --model-name dummy_vllm \ + --additional-config examples/slurm-configs/minimal/vllm-single-node-minimal.json +``` + +### Multi-Node Multi-GPU (via madengine) +```bash +madengine run \ + --model-name dummy_vllm \ + --additional-config examples/slurm-configs/minimal/vllm-multi-node-minimal.json +``` + +## Slurm Configuration Examples + +### Single-Node (4 GPUs with Tensor Parallelism) +```json +{ + "slurm": { + "partition": "amd-rccl", + "nodes": 1, + "gpus_per_node": 4, + "time": "02:00:00" + }, + "distributed": { + "launcher": "vllm", + "nnodes": 1, + "nproc_per_node": 4 + } +} +``` + +### Multi-Node (2 Nodes × 4 GPUs with TP + PP) +```json +{ + "slurm": { + "partition": "amd-rccl", + "nodes": 2, + "gpus_per_node": 4, + "time": "04:00:00" + }, + "distributed": { + "launcher": "vllm", + "nnodes": 2, + "nproc_per_node": 4 + } +} +``` + +## Model Selection + +### Small Models (Testing) +- `facebook/opt-125m` (125M parameters, ~250MB) +- `facebook/opt-350m` (350M parameters, ~700MB) + +### Medium Models (Production) +- `facebook/opt-6.7b` (6.7B parameters, ~13GB) +- `meta-llama/Llama-2-7b-hf` (7B parameters, ~14GB) +- `mistralai/Mistral-7B-v0.1` (7B parameters, ~14GB) + +### Large Models (Multi-GPU Required) +- `meta-llama/Llama-2-13b-hf` (13B parameters, ~26GB) +- `meta-llama/Llama-2-70b-hf` (70B parameters, ~140GB) + +**Note**: Ensure you have access to gated models (e.g., Llama-2) via Hugging Face authentication. + +## Performance Metrics + +The script outputs the following metrics: +- **Throughput**: Requests per second +- **Token Generation Rate**: Tokens per second +- **Average Latency**: Milliseconds per request +- **Total Prompts**: Number of prompts processed +- **Total Time**: End-to-end execution time + +## Troubleshooting + +### Out of Memory (OOM) Errors +- GPU memory utilization is set to 0.70 (70%) by default for stability +- If you still encounter OOM errors: + - Use a smaller model or reduce `max_model_len` in the script + - Increase tensor parallelism size to split the model across more GPUs + - Check for other processes using GPU memory before running + +### Slow Performance +- Enable CUDA graphs (remove `--enforce-eager`) +- Verify NCCL settings for multi-GPU +- Check GPU memory utilization + +### Model Download Issues +- Set `HF_HOME` for Hugging Face cache directory +- Use `huggingface-cli login` for gated models +- Pre-download models to shared storage + +## References + +- [vLLM GitHub](https://github.com/vllm-project/vllm) +- [vLLM Documentation](https://docs.vllm.ai/) +- [ROCm Documentation](https://rocm.docs.amd.com/) +- [madengine Documentation](../../../../../../README.md) + +## Support + +For issues or questions: +- vLLM: [GitHub Issues](https://github.com/vllm-project/vllm/issues) +- madengine: Contact mad.support@amd.com + diff --git a/tests/fixtures/dummy/scripts/dummy_vllm/run.sh b/tests/fixtures/dummy/scripts/dummy_vllm/run.sh new file mode 100755 index 00000000..4d1f6dbb --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_vllm/run.sh @@ -0,0 +1,195 @@ +#!/bin/bash +# +# vLLM V1 Engine Distributed Inference Script - Data Parallelism Mode +# +# Multi-node Data Parallelism Strategy: +# - Each node runs an INDEPENDENT vLLM replica (no shared Ray cluster) +# - Each replica uses Tensor Parallelism across GPUs within the node +# - Benefits: Simpler, faster init, more robust, better for benchmarking +# +set -e + +echo "========================================================================" +echo "madengine vLLM V1 Engine Inference Script" +echo "========================================================================" + +# Cleanup function to ensure Ray and GPU processes are properly terminated +cleanup() { + EXIT_CODE=$? + echo "" + echo "========================================================================" + echo "Cleanup: Terminating Ray cluster and GPU processes..." + echo "========================================================================" + + # Stop Ray cluster + if command -v ray &> /dev/null; then + echo "Stopping Ray cluster..." + ray stop --force 2>/dev/null || true + sleep 2 + fi + + # Kill any lingering Ray processes + echo "Killing lingering Ray processes..." + pkill -9 -f "ray::" 2>/dev/null || true + pkill -9 -f "RayWorkerWrapper" 2>/dev/null || true + pkill -9 -f "raylet" 2>/dev/null || true + + # Kill any vLLM processes + echo "Killing vLLM processes..." + pkill -9 -f "vllm" 2>/dev/null || true + + # Display final GPU state + if command -v rocm-smi &> /dev/null; then + echo "Final GPU state:" + rocm-smi 2>/dev/null || true + elif command -v amd-smi &> /dev/null; then + amd-smi list 2>/dev/null || true + fi + + echo "Cleanup completed (exit code: $EXIT_CODE)" + exit $EXIT_CODE +} + +# Register cleanup function to run on script exit (success, failure, or interruption) +trap cleanup EXIT INT TERM SIGINT SIGTERM + +# Get current directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# Detect deployment configuration from environment +NNODES=${NNODES:-1} +GPUS_PER_NODE=${MAD_RUNTIME_NGPUS:-1} +NODE_RANK=${NODE_RANK:-0} +MASTER_ADDR=${MASTER_ADDR:-localhost} +MASTER_PORT=${MASTER_PORT:-29500} + +# Model selection +MODEL_NAME=${MODEL_NAME:-facebook/opt-125m} + +echo "========================================================================" +echo "Deployment Configuration:" +echo " Model: $MODEL_NAME" +echo " Nodes: $NNODES" +echo " GPUs available: $GPUS_PER_NODE" +echo " Node rank: $NODE_RANK" +echo " Master address: $MASTER_ADDR" +echo " Master port: $MASTER_PORT" +echo "========================================================================" + +# Determine parallelism strategy +if [ "$NNODES" -eq 1 ]; then + # Single node with multiple GPUs: use tensor parallelism + TENSOR_PARALLEL_SIZE=$GPUS_PER_NODE + PIPELINE_PARALLEL_SIZE=1 + DISTRIBUTED_BACKEND="auto" # Will use default (no Ray needed) + + echo "Single-node mode: Using Tensor Parallelism" + echo " TP Size: $TENSOR_PARALLEL_SIZE" +else + # ═══════════════════════════════════════════════════════════════════════ + # MULTI-NODE DATA PARALLELISM MODE + # ═══════════════════════════════════════════════════════════════════════ + # Strategy: Each node runs an INDEPENDENT vLLM replica + # - No shared Ray cluster across nodes + # - Each node: Local Ray + Tensor Parallelism + # - Benefits: Simpler, faster init, more robust, better for benchmarking + # ═══════════════════════════════════════════════════════════════════════ + + echo "" + echo "╔════════════════════════════════════════════════════════════════════╗" + echo "║ MULTI-NODE DATA PARALLELISM MODE ║" + echo "╚════════════════════════════════════════════════════════════════════╝" + echo "" + echo " Total nodes: ${NNODES}" + echo " Current node rank: ${NODE_RANK}" + echo " GPUs per node: ${GPUS_PER_NODE}" + echo " Data Parallelism: ${NNODES} independent replicas" + echo " Tensor Parallelism: ${GPUS_PER_NODE} GPUs per replica" + echo " Total GPUs: $((NNODES * GPUS_PER_NODE))" + echo "" + + # Data Parallelism: TP per node, NO Pipeline Parallelism + TENSOR_PARALLEL_SIZE=$GPUS_PER_NODE + PIPELINE_PARALLEL_SIZE=1 # No pipeline parallelism in DP mode! + DISTRIBUTED_BACKEND="ray" + + # Set GPU environment variables for visibility + # CRITICAL: Ray requires ONLY ONE visibility variable + # - AMD GPUs: Use ONLY HIP_VISIBLE_DEVICES + # - NVIDIA GPUs: Use ONLY CUDA_VISIBLE_DEVICES + # Setting both causes Ray error: "Inconsistent values found" + if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then + # AMD GPU detected - use HIP_VISIBLE_DEVICES ONLY + # CRITICAL: Unset RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES which is set by rocm/vllm image + # This variable tells Ray to ignore HIP_VISIBLE_DEVICES, causing conflicts + unset RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES + export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0,1,2,3} + unset ROCR_VISIBLE_DEVICES # Unset to avoid Ray conflicts + unset CUDA_VISIBLE_DEVICES # Unset to avoid "Inconsistent values" error + echo " GPU environment (AMD): HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES" + else + # NVIDIA GPU - use CUDA_VISIBLE_DEVICES ONLY + export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3} + unset HIP_VISIBLE_DEVICES # Unset to avoid Ray conflicts + unset ROCR_VISIBLE_DEVICES + echo " GPU environment (NVIDIA): CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" + fi + echo "" + + # Get current node IP + CURRENT_NODE_IP=$(getent hosts $(hostname) | awk '{print $1}' | head -1) + echo " Node $(hostname) IP: $CURRENT_NODE_IP" + export VLLM_HOST_IP="$CURRENT_NODE_IP" + + # Clean any existing Ray processes from previous jobs + echo " Cleaning any existing Ray processes..." + ray stop --force 2>/dev/null || true + pkill -9 -f "ray::" 2>/dev/null || true + pkill -9 -f "raylet" 2>/dev/null || true + sleep 2 + + # Start INDEPENDENT Ray cluster on THIS node only + # NOTE: Each node starts its own Ray cluster (NOT shared across nodes!) + echo " Starting independent Ray cluster on Node ${NODE_RANK}..." + ray start --head --port=6379 --node-ip-address="$CURRENT_NODE_IP" --num-gpus=$GPUS_PER_NODE + + sleep 3 + echo "" + echo "═══════════════════════════════════════════════════════════════════" + echo "Ray cluster ready on Node ${NODE_RANK}" + echo "═══════════════════════════════════════════════════════════════════" + ray status + echo "" +fi + +echo "========================================================================" +echo "vLLM V1 Configuration:" +echo " Tensor Parallel Size: $TENSOR_PARALLEL_SIZE" +echo " Pipeline Parallel Size: $PIPELINE_PARALLEL_SIZE" +echo " Distributed Backend: $DISTRIBUTED_BACKEND" +if [ "$NNODES" -gt 1 ]; then + echo " Data Parallel Size: $NNODES" +fi +echo "========================================================================" + +# Export environment for vLLM +export NNODES +export NODE_RANK +export MASTER_ADDR +export MASTER_PORT + +# Data Parallelism: ALL nodes run inference independently +echo "" +echo "Node ${NODE_RANK}: Launching vLLM inference..." +python3 run_vllm_inference.py \ + --model "$MODEL_NAME" \ + --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \ + --pipeline-parallel-size "$PIPELINE_PARALLEL_SIZE" \ + --distributed-backend "$DISTRIBUTED_BACKEND" + +# Note: cleanup() trap handler will run automatically on exit +echo "========================================================================" +echo "Node ${NODE_RANK}: Inference completed successfully" +echo "========================================================================" + diff --git a/tests/fixtures/dummy/scripts/dummy_vllm/run_vllm_inference.py b/tests/fixtures/dummy/scripts/dummy_vllm/run_vllm_inference.py new file mode 100755 index 00000000..52effe3d --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy_vllm/run_vllm_inference.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +""" +vLLM V1 Engine Distributed Inference Benchmark + +vLLM V1 Engine Architecture: +- Tensor Parallelism (TP): Split model across GPUs within a node +- Data Parallelism (DP): Run multiple replicas for higher throughput +- Pipeline Parallelism (PP): Split model layers across nodes (experimental) + +Launch modes: + Single-node/single-GPU: TP=1, DP=1 + Single-node/multi-GPU (TP): TP=N, DP=1 (model split across GPUs) + Single-node/multi-GPU (DP): TP=1, DP=N (multiple replicas) + Multi-node: Use Ray backend with proper configuration +""" + +import os +import sys +import time +import argparse +import socket +from typing import List, Optional + +# Configure environment before importing vLLM +os.environ.setdefault("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "1") +os.environ.setdefault("VLLM_USE_MODELSCOPE", "False") + +# V1 Engine specific settings +os.environ.setdefault("VLLM_USE_V1", "1") # Explicitly use V1 engine +os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + +try: + from vllm import LLM, SamplingParams + import torch +except ImportError as e: + print(f"Error importing required libraries: {e}") + print("Please ensure vLLM and PyTorch are installed") + sys.exit(1) + +# Configuration +DEFAULT_MODEL = "facebook/opt-125m" # Small model for testing +NUM_PROMPTS = 100 +MAX_TOKENS = 128 +TEMPERATURE = 0.8 +TOP_P = 0.95 + +# Sample prompts for inference +SAMPLE_PROMPTS = [ + "The future of artificial intelligence is", + "Machine learning has revolutionized", + "Deep learning models are capable of", + "Natural language processing enables", + "Computer vision systems can", +] + + +def print_header(args): + """Print benchmark header with configuration.""" + print("=" * 70) + print("vLLM V1 Engine Distributed Inference Benchmark") + print("=" * 70) + print(f"Hostname: {socket.gethostname()}") + + # Check multi-node setup + nnodes = int(os.environ.get("NNODES", "1")) + node_rank = int(os.environ.get("NODE_RANK", "0")) + + if nnodes > 1: + print(f"Multi-node mode: {nnodes} nodes (Node {node_rank})") + print(f"Parallelism strategy: Data Parallelism") + print(f" - Each node: Independent replica with TP={args.tensor_parallel_size}") + print(f" - Total GPUs: {args.tensor_parallel_size * nnodes}") + + print(f"Model: {args.model}") + print(f"Tensor Parallel Size: {args.tensor_parallel_size}") + print(f"Pipeline Parallel Size: {1 if nnodes > 1 else args.pipeline_parallel_size}") + + if nnodes == 1: + # Calculate total parallelism for single-node + total_gpus = args.tensor_parallel_size * args.pipeline_parallel_size + print(f"Total GPUs (TP × PP): {total_gpus}") + + print(f"Number of prompts: {NUM_PROMPTS}") + print(f"Max tokens: {MAX_TOKENS}") + print(f"Distributed backend: {args.distributed_backend}") + print("=" * 70) + + +def generate_prompts(num_prompts: int) -> List[str]: + """Generate list of prompts for inference.""" + prompts = [] + for i in range(num_prompts): + # Cycle through sample prompts + base_prompt = SAMPLE_PROMPTS[i % len(SAMPLE_PROMPTS)] + prompts.append(f"{base_prompt} (request {i+1})") + return prompts + + +def run_inference(args): + """Run vLLM V1 inference benchmark with Data Parallelism support.""" + print("\n" + "=" * 70) + print("Initializing vLLM V1 Engine") + print("=" * 70) + + # Get multi-node environment variables + nnodes = int(os.environ.get("NNODES", "1")) + node_rank = int(os.environ.get("NODE_RANK", "0")) + + # Determine distributed backend + # For single-node: use 'mp' (multiprocessing) or None + # For multi-node: use 'ray' + if args.distributed_backend == "auto": + distributed_backend = "ray" if nnodes > 1 else None + else: + distributed_backend = args.distributed_backend if args.distributed_backend != "none" else None + + # Multi-node Data Parallelism: Override pipeline parallelism + # Each node runs an independent replica with tensor parallelism + if nnodes > 1: + print("=" * 70) + print("🔀 MULTI-NODE DATA PARALLELISM MODE") + print("=" * 70) + print(f"Total nodes: {nnodes}") + print(f"Current node rank: {node_rank}") + print(f"Strategy: Each node runs independent replica") + print(f" - Tensor Parallelism: {args.tensor_parallel_size} GPUs per node") + print(f" - Pipeline Parallelism: Disabled (PP=1)") + print(f" - Data Parallelism: {nnodes} replicas (one per node)") + print("=" * 70) + + # Force PP=1 for Data Parallelism + effective_pipeline_size = 1 + effective_gpu_memory = 0.85 # Higher memory utilization for DP + else: + effective_pipeline_size = args.pipeline_parallel_size + effective_gpu_memory = 0.60 if args.pipeline_parallel_size > 1 else 0.85 + + print(f"Using distributed backend: {distributed_backend or 'default'}") + + # Initialize vLLM LLM engine with V1-specific settings + try: + llm_kwargs = { + "model": args.model, + "tensor_parallel_size": args.tensor_parallel_size, + "pipeline_parallel_size": effective_pipeline_size, + "trust_remote_code": True, + "dtype": "auto", + "gpu_memory_utilization": effective_gpu_memory, + "max_model_len": 2048, + "disable_log_stats": True, # Reduce logging noise + } + + # Add distributed backend if specified + if distributed_backend: + llm_kwargs["distributed_executor_backend"] = distributed_backend + + # V1 engine specific: enforce_eager mode for compatibility + if args.enforce_eager: + llm_kwargs["enforce_eager"] = True + + llm = LLM(**llm_kwargs) + print("✓ vLLM V1 engine initialized successfully") + if nnodes > 1: + print(f"✓ Node {node_rank} ready with TP={args.tensor_parallel_size}") + except Exception as e: + print(f"✗ Failed to initialize vLLM engine: {e}") + import traceback + traceback.print_exc() + return 1 + + # Configure sampling parameters + sampling_params = SamplingParams( + temperature=TEMPERATURE, + top_p=TOP_P, + max_tokens=MAX_TOKENS, + ) + + print(f"\n{'=' * 70}") + print("Running Inference") + print("=" * 70) + + # Generate prompts + prompts = generate_prompts(NUM_PROMPTS) + + # Warmup run (not timed) + print("\nWarmup: Running 10 prompts...") + warmup_prompts = prompts[:10] + _ = llm.generate(warmup_prompts, sampling_params) + print("✓ Warmup complete") + + # Benchmark run (timed) + print(f"\nBenchmark: Running {NUM_PROMPTS} prompts...") + start_time = time.time() + + outputs = llm.generate(prompts, sampling_params) + + end_time = time.time() + elapsed_time = end_time - start_time + + # Calculate metrics + total_tokens = sum(len(output.outputs[0].token_ids) for output in outputs) + throughput = NUM_PROMPTS / elapsed_time + tokens_per_second = total_tokens / elapsed_time + + # Print results + print(f"\n{'=' * 70}") + print("Benchmark Results") + if nnodes > 1: + print(f"Node {node_rank}/{nnodes} (Data Parallel Replica)") + print("=" * 70) + print(f"Total prompts: {NUM_PROMPTS}") + print(f"Total time: {elapsed_time:.2f} seconds") + print(f"Throughput: {throughput:.2f} requests/second") + print(f"Token generation: {tokens_per_second:.2f} tokens/second") + print(f"Average latency: {(elapsed_time / NUM_PROMPTS) * 1000:.2f} ms/request") + if nnodes > 1: + print(f"Aggregate throughput (all {nnodes} nodes): ~{throughput * nnodes:.2f} requests/second") + print("=" * 70) + + # Print sample outputs + print("\n" + "=" * 70) + print("Sample Outputs (first 3)") + print("=" * 70) + for i, output in enumerate(outputs[:3]): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"\n[Prompt {i+1}]: {prompt}") + print(f"[Output {i+1}]: {generated_text[:200]}...") # First 200 chars + + # madengine output format + print(f"\nperformance: {throughput:.2f} requests_per_second") + print(f"tokens_per_second: {tokens_per_second:.2f}") + print(f"model: {args.model}") + print(f"tensor_parallel_size: {args.tensor_parallel_size}") + print(f"pipeline_parallel_size: {effective_pipeline_size}") + + # Multi-node Data Parallelism info + if nnodes > 1: + print(f"data_parallel_size: {nnodes}") + print(f"node_rank: {node_rank}") + print(f"aggregate_throughput: {throughput * nnodes:.2f} requests_per_second (estimated)") + + # Determine what backend was actually used + if args.distributed_backend == "auto": + actual_backend = "ray" if nnodes > 1 else "default" + else: + actual_backend = args.distributed_backend if args.distributed_backend != "none" else "default" + print(f"distributed_backend: {actual_backend}") + + return 0 + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser( + description="vLLM V1 Engine Distributed Inference Benchmark" + ) + parser.add_argument( + "--model", + type=str, + default=DEFAULT_MODEL, + help=f"Model name or path (default: {DEFAULT_MODEL})" + ) + parser.add_argument( + "--tensor-parallel-size", + type=int, + default=1, + help="Number of GPUs for tensor parallelism (default: 1)" + ) + parser.add_argument( + "--pipeline-parallel-size", + type=int, + default=1, + help="Number of nodes for pipeline parallelism (default: 1)" + ) + parser.add_argument( + "--distributed-backend", + type=str, + choices=["auto", "ray", "mp", "none"], + default="auto", + help="Distributed backend: auto (default), ray (multi-node), mp (multiprocessing), none" + ) + parser.add_argument( + "--enforce-eager", + action="store_true", + help="Disable CUDA graph for compatibility" + ) + + args = parser.parse_args() + + # Validate arguments + if args.tensor_parallel_size < 1: + print("Error: tensor-parallel-size must be >= 1") + return 1 + + if args.pipeline_parallel_size < 1: + print("Error: pipeline-parallel-size must be >= 1") + return 1 + + # Print configuration + print_header(args) + + # Run inference benchmark + return run_inference(args) + + +if __name__ == "__main__": + try: + sys.exit(main()) + except KeyboardInterrupt: + print("\n\nInterrupted by user") + sys.exit(130) + except Exception as e: + print(f"\nError: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + diff --git a/tests/fixtures/dummy/scripts/pyt_huggingface_bert/run.sh b/tests/fixtures/dummy/scripts/pyt_huggingface_bert/run.sh new file mode 100644 index 00000000..8693dc66 --- /dev/null +++ b/tests/fixtures/dummy/scripts/pyt_huggingface_bert/run.sh @@ -0,0 +1,67 @@ +#!/bin/bash +############################################################################### +# +# MIT License +# +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################# +set -ex + +if [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"gfx90a"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-24} +elif [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"gfx908"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-16} +elif [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"gfx906"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-1} +elif [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"A100"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-8} +elif [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"V100"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-1} +else + echo "Detected new GPU architecture: $MAD_SYSTEM_GPU_ARCHITECTURE" + echo "If not using MAD_MODEL_BATCH_SIZE, setting batch size to 1" + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-1} +fi + +HF_PATH='/workspace/transformers' + +torchrun $HF_PATH/examples/pytorch/language-modeling/run_mlm.py \ + --model_name_or_path bert-large-uncased \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --max_steps 150 \ + --logging_steps 1 \ + --output_dir /tmp/test-mlm-bbu \ + --overwrite_output_dir \ + --per_device_train_batch_size="$MAD_MODEL_BATCH_SIZE" \ + --fp16 \ + --skip_memory_metrics=True \ + "$@" \ + 2>&1 | tee log.txt + +# output performance metric +performance=$(cat log.txt | grep -Eo "train_samples_per_second':[^,]+" | sed "s/train_samples_per_second': //g" | head -n 1) + +# unset printing trace to not confuse Jenkinsfile +set +x +echo "performance: $performance samples_per_second" diff --git a/tests/fixtures/dummy/scripts/pyt_huggingface_gpt2/run.sh b/tests/fixtures/dummy/scripts/pyt_huggingface_gpt2/run.sh new file mode 100644 index 00000000..b34c0604 --- /dev/null +++ b/tests/fixtures/dummy/scripts/pyt_huggingface_gpt2/run.sh @@ -0,0 +1,98 @@ +#!/bin/bash +############################################################################### +# +# MIT License +# +# Copyright (c) 2024 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################# +set -ex + +if [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"gfx90a"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-22} +elif [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"gfx908"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-8} +elif [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"gfx906"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-4} +elif [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"A100"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-8} +elif [[ "$MAD_SYSTEM_GPU_ARCHITECTURE" == *"V100"* ]]; then + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-4} +else + echo "Detected new GPU architecture: $MAD_SYSTEM_GPU_ARCHITECTURE" + echo "If not using MAD_MODEL_BATCH_SIZE, setting batch size to 1" + MAD_MODEL_BATCH_SIZE=${MAD_MODEL_BATCH_SIZE:-1} +fi + +# train model +HF_PATH='/workspace/transformers' + +#set fp16 as a defult precision +precision_tag="--fp16" +#override default fp16 +#pass -p=fp16 or --precision=fp16 +for (( i=0; i<= $#; i=i+1 )); +do + case ${@:$i:1} in + -p=*|--precision=*) + precision_tag=${@:$i:1} + precision_tag="--${precision_tag#*=}" + set -- ${@:1:$i-1} ${@:$i+1:$#} + ;; + esac +done + +# Add model-caching to resolve the hf multi processing error +hf download gpt2 + +torchrun --nproc_per_node="$MAD_RUNTIME_NGPUS" $HF_PATH/examples/pytorch/language-modeling/run_clm.py --output_dir output \ + --model_name_or_path gpt2 \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --label_smoothing 0.1 \ + --logging_steps 1 \ + --logging_dir log $precision_tag \ + --dataloader_num_workers 1 \ + --skip_memory_metrics \ + --per_device_train_batch_size="$MAD_MODEL_BATCH_SIZE" \ + --overwrite_output_dir \ + --max_steps 150 "$@" \ + 2>&1 | tee log.txt + +# output performance metric +# Use a more robust approach to avoid bash segfaults when rocprof is active +# First check if log.txt exists and has content +if [ -f log.txt ] && [ -s log.txt ]; then + # Extract performance metric, handling potential rocprof interference + performance=$(grep -Eo "train_samples_per_second':[^,]+" log.txt 2>/dev/null | sed "s/train_samples_per_second': //g" 2>/dev/null | head -n 1 2>/dev/null || echo "") +else + performance="" +fi + +# unset printing trace to not confuse Jenkinsfile +set +x +if [ -n "$performance" ]; then + echo "performance: $performance samples_per_second" +else + echo "performance: N/A samples_per_second" +fi diff --git a/tests/fixtures/utils.py b/tests/fixtures/utils.py index 617c305d..eabbe13a 100644 --- a/tests/fixtures/utils.py +++ b/tests/fixtures/utils.py @@ -13,11 +13,9 @@ import pytest from unittest.mock import MagicMock - MODEL_DIR = "tests/fixtures/dummy" BASE_DIR = os.path.join(os.path.dirname(__file__), "..", "..") sys.path.insert(1, BASE_DIR) -print(f'BASE DIR:: {BASE_DIR}') # Cache variables to avoid repeated system checks during collection _gpu_vendor_cache = None @@ -25,19 +23,86 @@ _num_gpus_cache = None _num_cpus_cache = None +# GPU detection cache to avoid multiple expensive calls +_has_gpu_cache = None + + +def has_gpu() -> bool: + """Simple function to check if GPU is available for testing. + + This is the primary function for test skipping decisions. + Uses caching to avoid repeated expensive detection calls. + + Returns: + bool: True if GPU is available, False if CPU-only machine + """ + global _has_gpu_cache + + if _has_gpu_cache is not None: + return _has_gpu_cache + + try: + # Ultra-simple file existence check (no subprocess calls) + # This is safe for pytest collection and avoids hanging + nvidia_exists = os.path.exists("/usr/bin/nvidia-smi") + amd_rocm_exists = os.path.exists("/opt/rocm/bin/rocm-smi") or os.path.exists( + "/usr/local/bin/rocm-smi" + ) + + _has_gpu_cache = nvidia_exists or amd_rocm_exists + + except Exception: + # If file checks fail, assume no GPU (safe default for tests) + _has_gpu_cache = False + + return _has_gpu_cache + + +def requires_gpu(reason: str = "test requires GPU functionality"): + """Simple decorator to skip tests that require GPU. + + This is the only decorator needed for GPU-dependent tests. + + Args: + reason: Custom reason for skipping + + Returns: + pytest.mark.skipif decorator + """ + return pytest.mark.skipif(not has_gpu(), reason=reason) + @pytest.fixture def global_data(): # Lazy import to avoid collection issues - from madengine.core.console import Console + if "Console" not in globals(): + from madengine.core.console import Console return {"console": Console(live_output=True)} @pytest.fixture() def clean_test_temp_files(request): + """ + Fixture to clean up test temporary files and Docker containers. + + Cleans up both before (to ensure clean state) and after (to avoid conflicts). + """ + import subprocess + + # Clean up Docker containers BEFORE test (ensure clean state) + try: + subprocess.run( + "docker ps -a | grep 'container_ci-dummy' | awk '{print $1}' | xargs -r docker rm -f", + shell=True, + capture_output=True, + timeout=30 + ) + except: + pass # Ignore cleanup errors before test yield + # Clean up files after test for filename in request.param: file_path = os.path.join(BASE_DIR, filename) if os.path.exists(file_path): @@ -45,6 +110,66 @@ def clean_test_temp_files(request): shutil.rmtree(file_path) else: os.remove(file_path) + + # Clean up Docker containers AFTER test (avoid conflicts with next test) + try: + subprocess.run( + "docker ps -a | grep 'container_ci-dummy' | awk '{print $1}' | xargs -r docker rm -f", + shell=True, + capture_output=True, + timeout=30 + ) + except: + pass # Ignore cleanup errors after test + + +def generate_additional_context_for_machine() -> dict: + """Generate appropriate additional context based on detected machine capabilities. + + Returns: + dict: Additional context with gpu_vendor and guest_os suitable for current machine + """ + if has_gpu(): + # Simple vendor detection for GPU machines + vendor = "NVIDIA" if os.path.exists("/usr/bin/nvidia-smi") else "AMD" + return {"gpu_vendor": vendor, "guest_os": "UBUNTU"} + else: + # On CPU-only machines, use defaults suitable for build-only operations + return { + "gpu_vendor": "AMD", # Default for build-only nodes + "guest_os": "UBUNTU", # Default OS + } + + +def generate_additional_context_json() -> str: + """Generate JSON string of additional context for current machine. + + Returns: + str: JSON string representation of additional context + """ + return json.dumps(generate_additional_context_for_machine()) + + +def create_mock_args_with_auto_context(**kwargs) -> MagicMock: + """Create mock args with automatically generated additional context. + + Args: + **kwargs: Additional attributes to set on the mock args + + Returns: + MagicMock: Mock args object with auto-generated additional context + """ + mock_args = MagicMock() + + # Set auto-generated context + mock_args.additional_context = generate_additional_context_json() + mock_args.additional_context_file = None + + # Set any additional attributes + for key, value in kwargs.items(): + setattr(mock_args, key, value) + + return mock_args def is_nvidia() -> bool: @@ -99,7 +224,7 @@ def get_gpu_nodeid_map() -> dict: gpu_map[unique_id] = gpu_id else: try: - # Try the new amd-smi tool first (ROCm 6.4+) + # Try the new amd-smi tool first (ROCm 6.4.1+, PR #54) output = console.sh("amd-smi list --json") gpu_data = json.loads(output) for gpu_info in gpu_data: @@ -109,16 +234,24 @@ def get_gpu_nodeid_map() -> dict: except: # Fall back to older rocm-smi tools try: - rocm_version = console.sh("hipconfig --version") - rocm_version = float(".".join(rocm_version.split(".")[:2])) + rocm_version_str = console.sh("hipconfig --version") + # Parse version as tuple for proper comparison (6.4.1 vs 6.4.0) + version_parts = rocm_version_str.split(".") + if len(version_parts) >= 3: + rocm_version = tuple(int(p.split('-')[0]) for p in version_parts[:3]) + else: + # Fallback to float comparison for versions without patch + rocm_version = (int(version_parts[0]), int(version_parts[1]), 0) + + # Use appropriate rocm-smi command based on version (PR #54: threshold is 6.4.1) command = ( - "rocm-smi --showuniqueid" if rocm_version < 6.4 else "rocm-smi --showhw" + "rocm-smi --showuniqueid" if rocm_version < (6, 4, 1) else "rocm-smi --showhw" ) output = console.sh(command) lines = output.split("\n") for line in lines: - if rocm_version < 6.4: + if rocm_version < (6, 4, 1): if "Unique ID:" in line: gpu_id = int(line.split(":")[0].split("[")[1].split("]")[0]) unique_id = line.split(":")[2].strip() diff --git a/tests/integration/test_batch_manifest_integration.py b/tests/integration/test_batch_manifest_integration.py new file mode 100644 index 00000000..86841b33 --- /dev/null +++ b/tests/integration/test_batch_manifest_integration.py @@ -0,0 +1,47 @@ +"""Integration tests for batch manifest build workflow. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import os +import tempfile + +import pytest +from typer.testing import CliRunner + +from madengine.cli import app + + +class TestBatchManifestBuildIntegration: + """Integration tests for batch manifest build functionality.""" + + def test_batch_manifest_mutually_exclusive_with_tags(self): + """Test that --batch-manifest and --tags are mutually exclusive.""" + runner = CliRunner() + + # Create a simple batch manifest + batch_data = [{"model_name": "dummy", "build_new": True}] + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(batch_data, f) + batch_file = f.name + + try: + # Test that using both options is rejected + result = runner.invoke( + app, + [ + "build", + "--batch-manifest", batch_file, + "--tags", "dummy", + "--additional-context", '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + ] + ) + + # Should fail with mutual exclusivity error + assert result.exit_code != 0 + assert "Cannot specify both --batch-manifest and --tags" in result.output + finally: + os.unlink(batch_file) + diff --git a/tests/integration/test_cli_error_integration.py b/tests/integration/test_cli_error_integration.py new file mode 100644 index 00000000..b04de1bb --- /dev/null +++ b/tests/integration/test_cli_error_integration.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Unit tests for madengine CLI error handling integration. + +Tests the integration of unified error handling in mad_cli.py and +distributed_orchestrator.py components. +""" + +import pytest +import json +import os +import tempfile +from unittest.mock import Mock, patch, MagicMock, mock_open +from rich.console import Console + +# Add src to path for imports +import sys +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) + +from madengine.core.errors import ( + ErrorHandler, + ConfigurationError, + set_error_handler, + get_error_handler, + create_error_context +) + + +class TestMadCLIErrorIntegration: + """Test mad_cli.py error handling integration.""" + + @patch('madengine.cli.utils.Console') + def test_setup_logging_creates_error_handler(self, mock_console_class): + """Test that setup_logging initializes the unified error handler.""" + from madengine.cli import setup_logging + + mock_console = Mock(spec=Console) + mock_console_class.return_value = mock_console + + # Clear any existing global error handler + set_error_handler(None) + + # Call setup_logging + setup_logging(verbose=True) + + # Verify error handler was set + handler = get_error_handler() + assert handler is not None + assert isinstance(handler, ErrorHandler) + assert handler.verbose is True + + def test_setup_logging_verbose_flag(self): + """Test that verbose flag is properly passed to error handler.""" + from madengine.cli import setup_logging + + # Test with verbose=False + setup_logging(verbose=False) + handler = get_error_handler() + assert handler.verbose is False + + # Test with verbose=True + setup_logging(verbose=True) + handler = get_error_handler() + assert handler.verbose is True + + def test_build_command_error_handling(self): + """Test that build command imports and can use unified error handling.""" + from madengine.cli import ExitCode + + # Test that the import works and error handling is available + try: + # This tests the actual import in mad_cli.py + from madengine.cli import setup_logging + + # Verify error handler can be set up + setup_logging(verbose=False) + + # Verify handle_error can be imported in the context where it's used + from madengine.core.errors import handle_error, create_error_context + + # Create a test error to ensure the system works + error = Exception("Test build error") + context = create_error_context( + operation="build", + phase="build", + component="CLI" + ) + + # This should not raise an exception + handle_error(error, context=context) + + except ImportError as e: + pytest.fail(f"Error handling integration failed: {e}") + + @patch('madengine.cli.utils.console') + def test_cli_error_display_consistency(self, mock_console): + """Test that CLI errors are displayed consistently through unified handler.""" + from madengine.cli import setup_logging + + # Setup logging to initialize error handler + setup_logging(verbose=False) + + # Get the initialized error handler + handler = get_error_handler() + + # Create a test error + error = ConfigurationError( + "Invalid configuration", + context=create_error_context( + operation="cli_command", + component="CLI", + phase="validation" + ) + ) + + # Handle the error through the unified system + handler.handle_error(error) + + # The error should be displayed through Rich console + # (Note: The actual console calls depend on the handler implementation) + assert handler.console is not None + + +class TestErrorHandlingWorkflow: + """Test complete error handling workflow across components.""" + + @patch('madengine.cli.utils.console') + def test_end_to_end_error_flow(self, mock_console): + """Test complete error flow from CLI through orchestrator.""" + from madengine.cli import setup_logging + from madengine.core.errors import ValidationError + + # Setup unified error handling + setup_logging(verbose=True) + handler = get_error_handler() + + # Create an error that might occur in the orchestrator + orchestrator_error = ValidationError( + "Invalid model tag format", + context=create_error_context( + operation="model_discovery", + component="DistributedOrchestrator", + phase="validation", + model_name="invalid::tag" + ), + suggestions=[ + "Use format: model_name:version", + "Check model name contains only alphanumeric characters" + ] + ) + + # Handle the error through the unified system + handler.handle_error(orchestrator_error) + + # Verify the error was processed + assert handler.console is not None + assert orchestrator_error.context.operation == "model_discovery" + assert orchestrator_error.context.component == "DistributedOrchestrator" + assert len(orchestrator_error.suggestions) == 2 + + def test_error_logging_integration(self): + """Test that errors are properly logged with structured data.""" + from madengine.cli import setup_logging + from madengine.core.errors import BuildError + + # Setup logging + setup_logging(verbose=False) + handler = get_error_handler() + + # Create a build error with rich context + build_error = BuildError( + "Docker build failed", + context=create_error_context( + operation="docker_build", + component="DockerBuilder", + phase="build", + model_name="test_model", + additional_info={"dockerfile": "Dockerfile.ubuntu.amd"} + ), + suggestions=["Check Dockerfile syntax", "Verify base image availability"] + ) + + # Mock the logger to capture log calls + with patch.object(handler, 'logger') as mock_logger: + handler.handle_error(build_error) + + # Verify logging was called with structured data + mock_logger.error.assert_called_once() + log_call_args = mock_logger.error.call_args + + # Check the log message + assert "build: Docker build failed" in log_call_args[0][0] + + # Check the extra structured data + extra_data = log_call_args[1]['extra'] + assert extra_data['context']['operation'] == "docker_build" + assert extra_data['context']['component'] == "DockerBuilder" + assert extra_data['recoverable'] is False # BuildError is not recoverable + assert len(extra_data['suggestions']) == 2 + + def test_error_context_serialization(self): + """Test that error contexts can be serialized for logging and debugging.""" + from madengine.core.errors import RuntimeError + + context = create_error_context( + operation="model_execution", + component="ContainerRunner", + phase="runtime", + model_name="llama2", + node_id="worker-node-01", + file_path="/models/llama2/run.sh", + additional_info={ + "container_id": "abc123", + "gpu_count": 2, + "timeout": 3600 + } + ) + + error = RuntimeError( + "Model execution failed with exit code 1", + context=context + ) + + # Test that context can be serialized + context_dict = error.context.__dict__ + json_str = json.dumps(context_dict, default=str) + + # Verify all context information is in the serialized form + assert "model_execution" in json_str + assert "ContainerRunner" in json_str + assert "runtime" in json_str + assert "llama2" in json_str + assert "worker-node-01" in json_str + assert "abc123" in json_str + + +class TestErrorHandlingPerformance: + """Test performance aspects of error handling.""" + + def test_error_handler_initialization_performance(self): + """Test that error handler initialization is fast.""" + import time + from madengine.core.errors import ErrorHandler + from rich.console import Console + + start_time = time.time() + + # Create multiple error handlers + for _ in range(100): + console = Console() + handler = ErrorHandler(console=console, verbose=False) + + end_time = time.time() + + # Should be able to create 100 handlers in under 1 second + assert end_time - start_time < 1.0 + + def test_error_context_creation_performance(self): + """Test that error context creation is efficient.""" + import time + + start_time = time.time() + + # Create many error contexts + for i in range(1000): + context = create_error_context( + operation=f"operation_{i}", + component=f"Component_{i}", + phase="test", + model_name=f"model_{i}", + additional_info={"iteration": i} + ) + + end_time = time.time() + + # Should be able to create 1000 contexts in under 0.1 seconds + assert end_time - start_time < 0.1 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/tests/test_console.py b/tests/integration/test_console_integration.py similarity index 91% rename from tests/test_console.py rename to tests/integration/test_console_integration.py index 6ed0cb79..e6a700a0 100644 --- a/tests/test_console.py +++ b/tests/integration/test_console_integration.py @@ -4,25 +4,29 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ + # built-in modules import subprocess import typing + # third-party modules import pytest import typing_extensions + # project modules from madengine.core import console class TestConsole: """Test the console module. - + test_sh: Test the console.sh function with echo command. """ + def test_sh(self): obj = console.Console() assert obj.sh("echo MAD Engine") == "MAD Engine" - + def test_sh_fail(self): obj = console.Console() try: @@ -47,7 +51,9 @@ def test_sh_secret(self): def test_sh_env(self): obj = console.Console() - assert obj.sh("echo $MAD_ENGINE", env={"MAD_ENGINE": "MAD Engine"}) == "MAD Engine" + assert ( + obj.sh("echo $MAD_ENGINE", env={"MAD_ENGINE": "MAD Engine"}) == "MAD Engine" + ) def test_sh_verbose(self): obj = console.Console(shellVerbose=False) diff --git a/tests/integration/test_container_execution.py b/tests/integration/test_container_execution.py new file mode 100644 index 00000000..77cfb291 --- /dev/null +++ b/tests/integration/test_container_execution.py @@ -0,0 +1,463 @@ +"""Test the container runner module. + +This module tests the Docker container execution functionality for distributed execution. + +UPDATED: Now uses execution/container_runner.py (modern madengine architecture). +Previous: Used deprecated tools/container_runner.py (removed). + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in modules +import os +import json +import tempfile +import unittest.mock +from unittest.mock import patch, MagicMock, mock_open + +# third-party modules +import pytest + +# project modules +from madengine.execution.container_runner import ContainerRunner +from madengine.core.context import Context +from madengine.core.console import Console +from madengine.core.dataprovider import Data +from tests.fixtures.utils import BASE_DIR, MODEL_DIR + + +class TestContainerRunner: + """Test the container runner module.""" + + @patch("madengine.core.context.Context") + def test_container_runner_initialization(self, mock_context_class): + """Test ContainerRunner initialization.""" + mock_context = MagicMock() + mock_context_class.return_value = mock_context + context = mock_context_class() + console = Console() + data = MagicMock() + + runner = ContainerRunner(context, data, console) + + assert runner.context == context + assert runner.data == data + assert runner.console == console + assert runner.credentials is None + + def test_container_runner_initialization_minimal(self): + """Test ContainerRunner initialization with minimal parameters.""" + runner = ContainerRunner() + + assert runner.context is None + assert runner.data is None + assert isinstance(runner.console, Console) + assert runner.credentials is None + + def test_load_build_manifest(self): + """Test loading build manifest from file.""" + runner = ContainerRunner() + + manifest_data = { + "images": { + "model1": "localhost:5000/ci-model1:latest", + "model2": "localhost:5000/ci-model2:latest", + }, + "metadata": { + "build_time": "2023-01-01T12:00:00Z", + "registry": "localhost:5000", + }, + } + + with patch("builtins.open", mock_open(read_data=json.dumps(manifest_data))): + result = runner.load_build_manifest("test_manifest.json") + + assert result == manifest_data + assert "images" in result + assert "model1" in result["images"] + + @patch.object(Console, "sh") + def test_pull_image(self, mock_sh): + """Test pulling image from registry.""" + runner = ContainerRunner() + + mock_sh.return_value = "Pull successful" + + result = runner.pull_image("localhost:5000/test:latest") + + assert result == "localhost:5000/test:latest" + mock_sh.assert_called_with("docker pull localhost:5000/test:latest") + + @patch.object(Console, "sh") + def test_pull_image_with_local_name(self, mock_sh): + """Test pulling image with local name tagging.""" + runner = ContainerRunner() + + mock_sh.return_value = "Success" + + result = runner.pull_image("localhost:5000/test:latest", "local-test") + + assert result == "local-test" + # Should have called pull and tag + expected_calls = [ + unittest.mock.call("docker pull localhost:5000/test:latest"), + unittest.mock.call("docker tag localhost:5000/test:latest local-test"), + ] + mock_sh.assert_has_calls(expected_calls) + + @patch("madengine.core.context.Context") + def test_get_gpu_arg_all_gpus(self, mock_context_class): + """Test get_gpu_arg with all GPUs requested.""" + mock_context = MagicMock() + mock_context.ctx = { + "docker_env_vars": {"MAD_GPU_VENDOR": "AMD", "MAD_SYSTEM_NGPUS": "4"}, + "docker_gpus": "0,1,2,3", + "gpu_renderDs": [128, 129, 130, 131], # Mock render device IDs for AMD GPUs + } + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + result = runner.get_gpu_arg("-1") + + # Should return GPU args for all available GPUs + assert "--device=/dev/kfd" in result and "renderD" in result + + @patch("madengine.core.context.Context") + def test_get_gpu_arg_specific_gpus(self, mock_context_class): + """Test get_gpu_arg with specific GPUs requested.""" + mock_context = MagicMock() + mock_context.ctx = { + "docker_env_vars": {"MAD_GPU_VENDOR": "NVIDIA", "MAD_SYSTEM_NGPUS": "4"}, + "docker_gpus": "0,1,2,3", + } + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + result = runner.get_gpu_arg("2") + + # Should return GPU args for 2 GPUs + assert "gpu" in result.lower() + + @patch("madengine.core.context.Context") + def test_get_gpu_arg_range_format(self, mock_context_class): + """Test get_gpu_arg with range format.""" + mock_context = MagicMock() + mock_context.ctx = { + "docker_env_vars": {"MAD_GPU_VENDOR": "NVIDIA", "MAD_SYSTEM_NGPUS": "4"}, + "docker_gpus": "0-3", + } + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + result = runner.get_gpu_arg("2") + + # Should handle range format correctly + assert isinstance(result, str) + + @patch("madengine.core.context.Context") + @patch.object(Console, "sh") + @patch("madengine.execution.container_runner.Docker") + @patch("builtins.open", new_callable=mock_open) + @patch("os.path.exists") + def test_run_container_success( + self, mock_exists, mock_file, mock_docker_class, mock_sh, mock_context_class + ): + """Test successful container run.""" + # Mock context to avoid GPU detection + mock_context = MagicMock() + mock_context.ctx = { + "docker_env_vars": {"MAD_GPU_VENDOR": "NVIDIA", "MAD_SYSTEM_NGPUS": "2"}, + "docker_gpus": "0,1", + "gpu_vendor": "NVIDIA", + } + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + # Mock Docker instance + mock_docker = MagicMock() + mock_docker.sh.return_value = "Command output" + mock_docker_class.return_value = mock_docker + + mock_sh.return_value = "hostname" + + # Mock log file with performance metrics + log_content = "Running test...\nperformance: 100.5 samples_per_second\nTest completed" + mock_file.return_value.read.return_value = log_content + + # Mock os.path.exists to return True for log file + def exists_side_effect(path): + if path.endswith(".live.log"): + return True + return False + mock_exists.side_effect = exists_side_effect + + model_info = { + "name": "test_model", + "n_gpus": "1", + "scripts": "test_script.sh", + "args": "", + } + + with patch.object(runner, "get_gpu_arg", return_value="--gpus device=0"): + with patch.object(runner, "get_cpu_arg", return_value=""): + with patch.object(runner, "get_env_arg", return_value=""): + with patch.object(runner, "get_mount_arg", return_value=""): + result = runner.run_container( + model_info, "test-image", timeout=300 + ) + + assert result["status"] == "SUCCESS" + assert "test_duration" in result + assert mock_docker_class.called + assert result["performance"] == "100.5" + assert result["metric"] == "samples_per_second" + + @patch("madengine.core.context.Context") + @patch.object(Console, "sh") + @patch("madengine.execution.container_runner.Docker") + def test_run_container_timeout( + self, mock_docker_class, mock_sh, mock_context_class + ): + """Test container run with timeout.""" + # Mock context to avoid GPU detection + mock_context = MagicMock() + mock_context.ctx = { + "docker_env_vars": {"MAD_GPU_VENDOR": "NVIDIA", "MAD_SYSTEM_NGPUS": "2"}, + "docker_gpus": "0,1", + "gpu_vendor": "NVIDIA", + } + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + # Mock Docker instance that raises TimeoutError + mock_docker = MagicMock() + mock_docker.sh.side_effect = TimeoutError("Timeout occurred") + mock_docker_class.return_value = mock_docker + + mock_sh.return_value = "hostname" + + model_info = { + "name": "test_model", + "n_gpus": "1", + "scripts": "test_script.sh", + "args": "", + } + + with patch.object(runner, "get_gpu_arg", return_value="--gpus device=0"): + with patch.object(runner, "get_cpu_arg", return_value=""): + with patch.object(runner, "get_env_arg", return_value=""): + with patch.object(runner, "get_mount_arg", return_value=""): + # run_container catches exceptions and returns results with status + result = runner.run_container( + model_info, "test-image", timeout=10 + ) + assert result["status"] == "FAILURE" + + @patch("madengine.core.context.Context") + @patch.object(Console, "sh") + @patch("madengine.execution.container_runner.Docker") + def test_run_container_failure( + self, mock_docker_class, mock_sh, mock_context_class + ): + """Test container run failure.""" + # Mock context to avoid GPU detection + mock_context = MagicMock() + mock_context.ctx = { + "docker_env_vars": {"MAD_GPU_VENDOR": "NVIDIA", "MAD_SYSTEM_NGPUS": "2"}, + "docker_gpus": "0,1", + "gpu_vendor": "NVIDIA", + } + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + # Mock Docker instance that raises RuntimeError + mock_docker = MagicMock() + mock_docker.sh.side_effect = RuntimeError("Container failed to start") + mock_docker_class.return_value = mock_docker + + mock_sh.return_value = "hostname" + + model_info = { + "name": "test_model", + "n_gpus": "1", + "scripts": "test_script.sh", + "args": "", + } + + with patch.object(runner, "get_gpu_arg", return_value="--gpus device=0"): + with patch.object(runner, "get_cpu_arg", return_value=""): + with patch.object(runner, "get_env_arg", return_value=""): + with patch.object(runner, "get_mount_arg", return_value=""): + # run_container catches exceptions and returns results with status + result = runner.run_container( + model_info, "test-image", timeout=300 + ) + assert result["status"] == "FAILURE" + + @patch("madengine.core.context.Context") + def test_load_credentials(self, mock_context_class): + """Test setting credentials for container runner.""" + # Mock context to avoid GPU detection + mock_context = MagicMock() + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + credentials = {"github": {"username": "testuser", "password": "testpass"}} + + runner.set_credentials(credentials) + + assert runner.credentials == credentials + + @patch("madengine.core.context.Context") + def test_login_to_registry(self, mock_context_class): + """Test login to Docker registry.""" + # Mock context to avoid GPU detection + mock_context = MagicMock() + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + credentials = { + "localhost:5000": {"username": "testuser", "password": "testpass"} + } + + with patch.object(runner.console, "sh") as mock_sh: + mock_sh.return_value = "Login Succeeded" + runner.login_to_registry("localhost:5000", credentials) + + # Verify login command was called + assert mock_sh.called + + @patch("madengine.core.context.Context") + def test_get_gpu_arg_specific_gpu(self, mock_context_class): + """Test getting GPU arguments for specific GPU count.""" + # Mock context to avoid GPU detection + mock_context = MagicMock() + mock_context.ctx = { + "docker_env_vars": {"MAD_GPU_VENDOR": "NVIDIA", "MAD_SYSTEM_NGPUS": "4"}, + "docker_gpus": "0,1,2,3", + } + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + result = runner.get_gpu_arg("2") + + # Should return GPU args for 2 GPUs + assert "gpu" in result.lower() or "device" in result.lower() + + @patch("madengine.core.context.Context") + def test_get_cpu_arg(self, mock_context_class): + """Test getting CPU arguments for docker run.""" + # Mock context to avoid GPU detection + mock_context = MagicMock() + mock_context.ctx = {"docker_cpus": "0,1,2,3"} + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + result = runner.get_cpu_arg() + + assert "--cpuset-cpus" in result + assert "0,1,2,3" in result + + @patch("madengine.core.context.Context") + def test_get_env_arg(self, mock_context_class): + """Test getting environment variables for container.""" + # Mock context to avoid GPU detection + mock_context = MagicMock() + mock_context.ctx = { + "docker_env_vars": { + "MAD_GPU_VENDOR": "NVIDIA", + "MAD_MODEL_NAME": "test_model", + "CUSTOM_VAR": "custom_value", + } + } + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + custom_env = {"EXTRA_VAR": "extra_value"} + result = runner.get_env_arg(custom_env) + + assert "--env MAD_GPU_VENDOR=" in result + assert "--env EXTRA_VAR=" in result + + @patch("madengine.core.context.Context") + def test_get_mount_arg(self, mock_context_class): + """Test getting mount arguments for container.""" + # Mock context to avoid GPU detection + mock_context = MagicMock() + mock_context.ctx = { + "docker_mounts": { + "/container/data": "/host/data", + "/container/output": "/host/output", + } + } + mock_context_class.return_value = mock_context + runner = ContainerRunner(mock_context) + + mount_datapaths = [ + {"path": "/host/input", "home": "/container/input", "readwrite": "false"} + ] + + result = runner.get_mount_arg(mount_datapaths) + + assert "-v /host/input:/container/input:ro" in result + assert "-v /host/data:/container/data" in result + + def test_apply_tools_without_tools_config(self): + """Test applying tools when no tools configuration exists.""" + runner = ContainerRunner() + + # Mock context without tools + runner.context = MagicMock() + runner.context.ctx = {} + + pre_encapsulate_post_scripts = { + "pre_scripts": [], + "encapsulate_script": "", + "post_scripts": [], + } + run_env = {} + + # Should not raise any exception + runner.apply_tools(pre_encapsulate_post_scripts, run_env, "nonexistent.json") + + # Scripts should remain unchanged + assert pre_encapsulate_post_scripts["pre_scripts"] == [] + assert pre_encapsulate_post_scripts["encapsulate_script"] == "" + assert run_env == {} + + def test_run_pre_post_script(self): + """Test running pre/post scripts.""" + runner = ContainerRunner() + + # Mock Docker instance + mock_docker = MagicMock() + mock_docker.sh = MagicMock() + + scripts = [ + {"path": "/path/to/script1.sh", "args": "arg1 arg2"}, + {"path": "/path/to/script2.sh"}, + ] + + runner.run_pre_post_script(mock_docker, "model_dir", scripts) + + # Verify scripts were copied and executed + assert mock_docker.sh.call_count == 4 # 2 copies + 2 executions + + # Check if copy commands were called + copy_calls = [ + call for call in mock_docker.sh.call_args_list if "cp -vLR" in str(call) + ] + assert len(copy_calls) == 2 + + def test_initialization_with_all_parameters(self): + """Test ContainerRunner initialization with all parameters.""" + context = MagicMock() + console = Console() + data = MagicMock() + + runner = ContainerRunner(context, data, console) + + assert runner.context == context + assert runner.data == data + assert runner.console == console + assert runner.credentials is None diff --git a/tests/integration/test_docker_integration.py b/tests/integration/test_docker_integration.py new file mode 100644 index 00000000..ac4826db --- /dev/null +++ b/tests/integration/test_docker_integration.py @@ -0,0 +1,821 @@ +"""Test the Docker builder module. + +This module tests the Docker image building functionality for distributed execution. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in modules +import os +import json +import tempfile +import unittest.mock +from unittest.mock import patch, MagicMock, mock_open + +# third-party modules +import pytest + +# project modules +from madengine.execution.docker_builder import DockerBuilder +from madengine.core.context import Context +from madengine.core.console import Console +from tests.fixtures.utils import BASE_DIR, MODEL_DIR + + +class TestDockerBuilder: + """Test the Docker builder module.""" + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_docker_builder_initialization( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test DockerBuilder initialization.""" + context = Context() + console = Console() + + builder = DockerBuilder(context, console) + + assert builder.context == context + assert builder.console == console + assert builder.built_images == {} + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_docker_builder_initialization_without_console( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test DockerBuilder initialization without console.""" + context = Context() + + builder = DockerBuilder(context) + + assert builder.context == context + assert isinstance(builder.console, Console) + assert builder.built_images == {} + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_get_context_path_with_dockercontext( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test get_context_path when dockercontext is specified.""" + context = Context() + builder = DockerBuilder(context) + + info = {"dockercontext": "/custom/context"} + result = builder.get_context_path(info) + + assert result == "/custom/context" + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_get_context_path_without_dockercontext( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test get_context_path when dockercontext is not specified.""" + context = Context() + builder = DockerBuilder(context) + + info = {} + result = builder.get_context_path(info) + + assert result == "./docker" + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_get_context_path_with_empty_dockercontext( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test get_context_path when dockercontext is empty.""" + context = Context() + builder = DockerBuilder(context) + + info = {"dockercontext": ""} + result = builder.get_context_path(info) + + assert result == "./docker" + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_get_build_arg_no_args( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test get_build_arg with no additional runtime build arguments.""" + context = Context() + builder = DockerBuilder(context) + + result = builder.get_build_arg() + + # Context automatically includes system GPU architecture + assert "MAD_SYSTEM_GPU_ARCHITECTURE" in result + assert "--build-arg" in result + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_get_build_arg_with_context_args( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test get_build_arg with context build arguments.""" + context = Context() + context.ctx = {"docker_build_arg": {"ARG1": "value1", "ARG2": "value2"}} + builder = DockerBuilder(context) + + result = builder.get_build_arg() + + assert "--build-arg ARG1='value1'" in result + assert "--build-arg ARG2='value2'" in result + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_get_build_arg_with_run_args( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test get_build_arg with runtime build arguments.""" + context = Context() + builder = DockerBuilder(context) + + run_build_arg = {"RUNTIME_ARG": "runtime_value"} + result = builder.get_build_arg(run_build_arg) + + assert "--build-arg RUNTIME_ARG='runtime_value'" in result + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_get_build_arg_with_both_args( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test get_build_arg with both context and runtime arguments.""" + context = Context() + context.ctx = {"docker_build_arg": {"CONTEXT_ARG": "context_value"}} + builder = DockerBuilder(context) + + run_build_arg = {"RUNTIME_ARG": "runtime_value"} + result = builder.get_build_arg(run_build_arg) + + assert "--build-arg CONTEXT_ARG='context_value'" in result + assert "--build-arg RUNTIME_ARG='runtime_value'" in result + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_build_image_success( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test successful Docker image build.""" + context = Context() + console = Console() + builder = DockerBuilder(context, console) + + # Mock the console.sh calls + mock_sh.return_value = "Build successful" + + model_info = {"name": "test/model", "dockercontext": "./docker"} + dockerfile = "./docker/Dockerfile" + + with patch.object(builder, "get_build_arg", return_value=""): + result = builder.build_image(model_info, dockerfile) + + # Verify the image name generation + expected_image_name = "ci-test_model_Dockerfile" + assert result["docker_image"] == expected_image_name + assert "build_duration" in result + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_build_image_with_registry_push( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test Docker image build with registry push.""" + context = Context() + console = Console() + builder = DockerBuilder(context, console) + + # Mock successful build and push + mock_sh.return_value = "Success" + + model_info = {"name": "test_model"} + dockerfile = "./docker/Dockerfile" + registry = "localhost:5000" + + with patch.object(builder, "get_build_arg", return_value=""): + with patch.object(builder, "get_context_path", return_value="./docker"): + with patch.object( + builder, "push_image", return_value="localhost:5000/ci-test_model" + ) as mock_push: + result = builder.build_image(model_info, dockerfile) + registry_image = builder.push_image( + result["docker_image"], registry + ) + + # Should have called docker build + build_calls = [ + call for call in mock_sh.call_args_list if "docker build" in str(call) + ] + assert len(build_calls) >= 1 + assert registry_image == "localhost:5000/ci-test_model" + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_build_image_failure( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test Docker image build failure.""" + context = Context() + console = Console() + builder = DockerBuilder(context, console) + + # Mock build failure + mock_sh.side_effect = RuntimeError("Build failed") + + model_info = {"name": "test_model"} + dockerfile = "./docker/Dockerfile" + + with patch.object(builder, "get_build_arg", return_value=""): + with patch.object(builder, "get_context_path", return_value="./docker"): + # Test that the exception is raised + with pytest.raises(RuntimeError, match="Build failed"): + builder.build_image(model_info, dockerfile) + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_build_all_models( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test building all models.""" + context = Context() + builder = DockerBuilder(context) + + models = [ + {"name": "model1", "dockerfile": "./docker/Dockerfile1"}, + {"name": "model2", "dockerfile": "./docker/Dockerfile2"}, + ] + + # Mock console.sh calls for dockerfile listing + def mock_sh_side_effect(command, **kwargs): + if "ls ./docker/Dockerfile1.*" in command: + return "./docker/Dockerfile1" + elif "ls ./docker/Dockerfile2.*" in command: + return "./docker/Dockerfile2" + elif "head -n5" in command: + return "# CONTEXT AMD" + else: + return "success" + + # Mock context filter to return only the specific dockerfile for each model + def mock_filter_side_effect(dockerfiles): + # Return only the dockerfile that was requested for each model + if "./docker/Dockerfile1" in dockerfiles: + return {"./docker/Dockerfile1": "AMD"} + elif "./docker/Dockerfile2" in dockerfiles: + return {"./docker/Dockerfile2": "AMD"} + return dockerfiles + + # Mock successful builds + with patch.object(builder.console, "sh", side_effect=mock_sh_side_effect): + with patch.object(context, "filter", side_effect=mock_filter_side_effect): + with patch.object(builder, "build_image") as mock_build: + mock_build.return_value = { + "docker_image": "test_image", + "build_duration": 30.0, + } + + result = builder.build_all_models(models) + + assert len(result["successful_builds"]) == 2 + assert len(result["failed_builds"]) == 0 + assert mock_build.call_count == 2 + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_build_all_models_with_failures( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test building all models with some failures.""" + context = Context() + builder = DockerBuilder(context) + + models = [ + {"name": "model1", "dockerfile": "./docker/Dockerfile1"}, + {"name": "model2", "dockerfile": "./docker/Dockerfile2"}, + ] + + # Mock console.sh calls for dockerfile listing + def mock_sh_side_effect(command, **kwargs): + if "ls ./docker/Dockerfile1.*" in command: + return "./docker/Dockerfile1" + elif "ls ./docker/Dockerfile2.*" in command: + return "./docker/Dockerfile2" + elif "head -n5" in command: + return "# CONTEXT AMD" + else: + return "success" + + # Mock context filter to return only the specific dockerfile for each model + def mock_filter_side_effect(dockerfiles): + # Return only the dockerfile that was requested for each model + if "./docker/Dockerfile1" in dockerfiles: + return {"./docker/Dockerfile1": "AMD"} + elif "./docker/Dockerfile2" in dockerfiles: + return {"./docker/Dockerfile2": "AMD"} + return dockerfiles + + # Mock one success, one failure + def mock_build_side_effect(model_info, dockerfile, *args, **kwargs): + if model_info["name"] == "model1" and "Dockerfile1" in dockerfile: + return {"docker_image": "model1_image", "build_duration": 30.0} + else: + raise RuntimeError("Build failed") + + with patch.object(builder.console, "sh", side_effect=mock_sh_side_effect): + with patch.object(context, "filter", side_effect=mock_filter_side_effect): + with patch.object( + builder, "build_image", side_effect=mock_build_side_effect + ): + result = builder.build_all_models(models) + + assert len(result["successful_builds"]) == 1 + assert len(result["failed_builds"]) == 1 # 1 failure: model2/Dockerfile2 + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_export_build_manifest( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test exporting build manifest.""" + context = Context() + builder = DockerBuilder(context) + + # Set up some built images (key should match real DockerBuilder output) + builder.built_images = { + "ci-model1": {"docker_image": "ci-model1", "dockerfile": "./docker/Dockerfile"} + } + + with patch("builtins.open", mock_open()) as mock_file: + with patch("json.dump") as mock_json_dump: + builder.export_build_manifest("manifest.json") + + # Verify file was opened and JSON was written + mock_file.assert_called_once_with("manifest.json", "w") + mock_json_dump.assert_called_once() + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_build_image_with_credentials( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test Docker image build with credentials.""" + context = Context() + builder = DockerBuilder(context) + + mock_sh.return_value = "Success" + + model_info = {"name": "test_model", "cred": "testcred"} + dockerfile = "./docker/Dockerfile" + credentials = {"testcred": {"username": "testuser", "password": "testpass"}} + + with patch.object(builder, "get_build_arg") as mock_get_build_arg: + with patch.object(builder, "get_context_path", return_value="./docker"): + result = builder.build_image( + model_info, dockerfile, credentials=credentials + ) + + # Verify credentials were passed to build args + mock_get_build_arg.assert_called_once() + call_args = mock_get_build_arg.call_args[0][0] + assert "testcred_USERNAME" in call_args + assert "testcred_PASSWORD" in call_args + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + def test_clean_cache_option( + self, mock_render, mock_docker_gpu, mock_hip, mock_arch, mock_ngpus, mock_vendor + ): + """Test clean cache option in build.""" + context = Context() + builder = DockerBuilder(context) + + model_info = {"name": "test_model"} + dockerfile = "./docker/Dockerfile" + + with patch.object(builder.console, "sh") as mock_sh: + with patch.object(builder, "get_build_arg", return_value=""): + with patch.object(builder, "get_context_path", return_value="./docker"): + builder.build_image(model_info, dockerfile, clean_cache=True) + + # Verify --no-cache was used + build_calls = [ + call for call in mock_sh.call_args_list if "docker build" in str(call) + ] + assert any("--no-cache" in str(call) for call in build_calls) + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_push_image_dockerhub_with_repository( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test pushing image to DockerHub with repository specified in credentials.""" + context = Context() + console = Console() + builder = DockerBuilder(context, console) + + docker_image = "ci-dummy_dummy.ubuntu.amd" + registry = "dockerhub" + credentials = { + "dockerhub": { + "repository": "your-repository", + "username": "your-dockerhub-username", + "password": "your-dockerhub-password-or-token", + } + } + + # Mock successful operations + mock_sh.return_value = "Success" + + result = builder.push_image(docker_image, registry, credentials) + + # Verify the correct tag and push commands were called + expected_tag = "your-repository:ci-dummy_dummy.ubuntu.amd" + tag_calls = [ + call for call in mock_sh.call_args_list if "docker tag" in str(call) + ] + push_calls = [ + call for call in mock_sh.call_args_list if "docker push" in str(call) + ] + + assert len(tag_calls) == 1 + assert expected_tag in str(tag_calls[0]) + assert len(push_calls) == 1 + assert expected_tag in str(push_calls[0]) + assert result == expected_tag + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_push_image_local_registry_with_repository( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test pushing image to local registry with repository specified in credentials.""" + context = Context() + console = Console() + builder = DockerBuilder(context, console) + + docker_image = "ci-dummy_dummy.ubuntu.amd" + registry = "localhost:5000" + credentials = { + "localhost:5000": { + "repository": "your-repository", + "username": "your-local-registry-username", + "password": "your-local-registry-password", + } + } + + # Mock successful operations + mock_sh.return_value = "Success" + + result = builder.push_image(docker_image, registry, credentials) + + # Verify the correct tag and push commands were called + expected_tag = "localhost:5000/your-repository:ci-dummy_dummy.ubuntu.amd" + tag_calls = [ + call for call in mock_sh.call_args_list if "docker tag" in str(call) + ] + push_calls = [ + call for call in mock_sh.call_args_list if "docker push" in str(call) + ] + + assert len(tag_calls) == 1 + assert expected_tag in str(tag_calls[0]) + assert len(push_calls) == 1 + assert expected_tag in str(push_calls[0]) + assert result == expected_tag + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_push_image_dockerhub_no_repository( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test pushing image to DockerHub without repository specified in credentials.""" + context = Context() + console = Console() + builder = DockerBuilder(context, console) + + docker_image = "ci-dummy_dummy.ubuntu.amd" + registry = "dockerhub" + credentials = { + "dockerhub": { + "username": "your-dockerhub-username", + "password": "your-dockerhub-password-or-token", + } + } + + # Mock successful operations + mock_sh.return_value = "Success" + + result = builder.push_image(docker_image, registry, credentials) + + # DockerHub without repository should just use the image name (no tagging needed) + push_calls = [ + call for call in mock_sh.call_args_list if "docker push" in str(call) + ] + assert len(push_calls) == 1 + assert docker_image in str(push_calls[0]) + assert result == docker_image + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_push_image_local_registry_no_repository( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test pushing image to local registry without repository specified in credentials.""" + context = Context() + console = Console() + builder = DockerBuilder(context, console) + + docker_image = "ci-dummy_dummy.ubuntu.amd" + registry = "localhost:5000" + credentials = { + "localhost:5000": { + "username": "your-local-registry-username", + "password": "your-local-registry-password", + } + } + + # Mock successful operations + mock_sh.return_value = "Success" + + result = builder.push_image(docker_image, registry, credentials) + + # Should fallback to registry/imagename format + expected_tag = "localhost:5000/ci-dummy_dummy.ubuntu.amd" + tag_calls = [ + call for call in mock_sh.call_args_list if "docker tag" in str(call) + ] + push_calls = [ + call for call in mock_sh.call_args_list if "docker push" in str(call) + ] + + assert len(tag_calls) == 1 + assert expected_tag in str(tag_calls[0]) + assert len(push_calls) == 1 + assert expected_tag in str(push_calls[0]) + assert result == expected_tag + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_push_image_no_registry( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test pushing image with no registry specified.""" + context = Context() + console = Console() + builder = DockerBuilder(context, console) + + docker_image = "ci-dummy_dummy.ubuntu.amd" + + result = builder.push_image(docker_image) + + # Should not call docker tag or push commands and return the original image name + docker_calls = [ + call + for call in mock_sh.call_args_list + if "docker tag" in str(call) or "docker push" in str(call) + ] + assert len(docker_calls) == 0 + assert result == docker_image + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx908") + @patch.object(Context, "get_system_hip_version", return_value="5.4") + @patch.object(Context, "get_docker_gpus", return_value="all") + @patch.object(Context, "get_gpu_renderD_nodes", return_value=["renderD128"]) + @patch.object(Console, "sh") + def test_build_manifest_with_tagged_image( + self, + mock_sh, + mock_render, + mock_docker_gpu, + mock_hip, + mock_arch, + mock_ngpus, + mock_vendor, + ): + """Test that build manifest includes registry_image when pushing to registry.""" + import tempfile + import os + + # Mock successful operations BEFORE creating Context + # to avoid MagicMock objects being stored during initialization + mock_sh.return_value = "Success" + + context = Context() + console = Console() + builder = DockerBuilder(context, console) + + model_info = {"name": "test_model"} + dockerfile = "./docker/Dockerfile" + registry = "localhost:5000" + credentials = { + "localhost:5000": { + "repository": "test-repository", + "username": "test-user", + "password": "test-password", + } + } + + with patch.object(builder, "get_build_arg", return_value=""): + with patch.object(builder, "get_context_path", return_value="./docker"): + # Build image + build_info = builder.build_image(model_info, dockerfile, credentials) + local_image = build_info["docker_image"] + + # Push to registry + registry_image = builder.push_image(local_image, registry, credentials) + + # Update built_images with tagged image (simulating what build_all_models does) + if local_image in builder.built_images: + builder.built_images[local_image]["registry_image"] = registry_image + + # Export manifest to temporary file + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as tmp_file: + builder.export_build_manifest(tmp_file.name, registry) + + # Read and verify the manifest + with open(tmp_file.name, "r") as f: + import json + + manifest = json.load(f) + + # Clean up + os.unlink(tmp_file.name) + + # Verify the manifest contains the tagged image + assert local_image in manifest["built_images"] + assert "registry_image" in manifest["built_images"][local_image] + assert manifest["built_images"][local_image]["registry_image"] == registry_image + assert manifest["built_images"][local_image]["registry"] == registry + + # Verify the tagged image format is correct + expected_tagged_image = f"localhost:5000/test-repository:{local_image}" + assert registry_image == expected_tagged_image diff --git a/tests/integration/test_error_system_integration.py b/tests/integration/test_error_system_integration.py new file mode 100644 index 00000000..bf704da2 --- /dev/null +++ b/tests/integration/test_error_system_integration.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +""" +Integration tests for madengine unified error handling system. + +This test file focuses on testing the integration without requiring +optional dependencies like paramiko, ansible-runner, or kubernetes. +""" + +import pytest +import json +from unittest.mock import Mock, patch, MagicMock + +# Add src to path for imports +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) + +from madengine.core.errors import ( + ErrorHandler, + MADEngineError, + ValidationError, + ConfigurationError, + RunnerError, + set_error_handler, + get_error_handler, + create_error_context +) + + +class TestUnifiedErrorSystem: + """Test the unified error handling system integration.""" + + def test_error_system_basic_functionality(self): + """Test basic error system functionality works.""" + # Create error handler + mock_console = Mock() + handler = ErrorHandler(console=mock_console, verbose=False) + + # Create error with context + context = create_error_context( + operation="test_operation", + component="TestComponent", + model_name="test_model" + ) + + error = ValidationError("Test validation error", context=context) + + # Handle the error + handler.handle_error(error) + + # Verify it was handled + mock_console.print.assert_called_once() + + # Verify error structure + assert error.context.operation == "test_operation" + assert error.context.component == "TestComponent" + assert error.recoverable is True + + def test_mad_cli_error_handler_setup(self): + """Test that mad_cli properly sets up error handling.""" + from madengine.cli import setup_logging + + # Clear existing handler + set_error_handler(None) + + # Setup logging + setup_logging(verbose=True) + + # Verify handler was created + handler = get_error_handler() + assert handler is not None + assert isinstance(handler, ErrorHandler) + assert handler.verbose is True + + + def test_runner_error_base_class(self): + """Test that RunnerError base class works properly.""" + context = create_error_context( + operation="runner_test", + component="TestRunner" + ) + + error = RunnerError("Test runner error", context=context) + + assert isinstance(error, MADEngineError) + assert error.recoverable is True + assert error.context.operation == "runner_test" + assert error.context.component == "TestRunner" + + def test_error_context_serialization(self): + """Test that error contexts can be serialized.""" + context = create_error_context( + operation="serialization_test", + component="TestComponent", + model_name="test_model", + node_id="test_node", + additional_info={"key": "value", "number": 42} + ) + + error = ValidationError("Test error", context=context) + + # Test serialization + context_dict = error.context.__dict__ + json_str = json.dumps(context_dict, default=str) + + # Verify content + assert "serialization_test" in json_str + assert "TestComponent" in json_str + assert "test_model" in json_str + assert "test_node" in json_str + assert "key" in json_str + assert "42" in json_str + + def test_error_hierarchy_consistency(self): + """Test that all error types maintain consistent behavior.""" + from madengine.core.errors import ( + ValidationError, ConnectionError, AuthenticationError, + RuntimeError, BuildError, DiscoveryError, OrchestrationError, + RunnerError, ConfigurationError, TimeoutError + ) + + error_classes = [ + ValidationError, ConnectionError, AuthenticationError, + RuntimeError, BuildError, DiscoveryError, OrchestrationError, + RunnerError, ConfigurationError, TimeoutError + ] + + for error_class in error_classes: + error = error_class("Test error message") + + # All should inherit from MADEngineError + assert isinstance(error, MADEngineError) + + # All should have context (even if default) + assert error.context is not None + + # All should have category + assert error.category is not None + + # All should have recoverable flag + assert isinstance(error.recoverable, bool) + + def test_global_error_handler_workflow(self): + """Test the complete global error handler workflow.""" + from madengine.core.errors import handle_error + + # Create and set global handler + mock_console = Mock() + handler = ErrorHandler(console=mock_console, verbose=False) + set_error_handler(handler) + + # Create error + error = ValidationError( + "Global handler test", + context=create_error_context( + operation="global_test", + component="TestGlobalHandler" + ) + ) + + # Use global handle_error function + handle_error(error) + + # Verify it was handled through the global handler + mock_console.print.assert_called_once() + + def test_error_suggestions_and_recovery(self): + """Test error suggestions and recovery information.""" + suggestions = [ + "Check your configuration file", + "Verify network connectivity", + "Try running with --verbose flag" + ] + + error = ConfigurationError( + "Configuration validation failed", + context=create_error_context( + operation="config_validation", + file_path="/path/to/config.json" + ), + suggestions=suggestions + ) + + assert error.suggestions == suggestions + assert error.recoverable is True + assert error.context.file_path == "/path/to/config.json" + + # Test error display includes suggestions + mock_console = Mock() + handler = ErrorHandler(console=mock_console) + handler.handle_error(error) + + # Should have been called to display the error + mock_console.print.assert_called_once() + + def test_nested_error_handling(self): + """Test handling of nested errors with causes.""" + from madengine.core.errors import RuntimeError as MADRuntimeError, OrchestrationError + + # Create a chain of errors + original_error = ConnectionError("Network timeout") + runtime_error = MADRuntimeError("Operation failed", cause=original_error) + final_error = OrchestrationError("Orchestration failed", cause=runtime_error) + + # Test the chain + assert final_error.cause == runtime_error + assert runtime_error.cause == original_error + + # Test handling preserves the chain + mock_console = Mock() + handler = ErrorHandler(console=mock_console, verbose=True) + handler.handle_error(final_error, show_traceback=True) + + # Should display error and potentially traceback + assert mock_console.print.call_count >= 1 + + def test_error_performance(self): + """Test that error handling is performant.""" + import time + + mock_console = Mock() + handler = ErrorHandler(console=mock_console) + + start_time = time.time() + + # Create and handle many errors + for i in range(100): + error = ValidationError( + f"Test error {i}", + context=create_error_context( + operation=f"test_op_{i}", + component="PerformanceTest" + ) + ) + handler.handle_error(error) + + end_time = time.time() + + # Should handle 100 errors in under 1 second + assert end_time - start_time < 1.0 + + # Verify all errors were handled + assert mock_console.print.call_count == 100 + + +class TestErrorSystemBackwardCompatibility: + """Test backward compatibility of the error system.""" + + def test_legacy_exception_handling_still_works(self): + """Test that legacy exception patterns still work.""" + try: + # Simulate old-style exception raising + raise ValueError("Legacy error") + except Exception as e: + # Should be able to handle with new system + mock_console = Mock() + handler = ErrorHandler(console=mock_console) + + context = create_error_context( + operation="legacy_handling", + component="LegacyTest" + ) + + handler.handle_error(e, context=context) + + # Should handle gracefully + mock_console.print.assert_called_once() + + def test_error_system_without_rich(self): + """Test error system fallback when Rich is not available.""" + # This test verifies the system degrades gracefully + # In practice, Rich is a hard dependency, but we test the concept + + with patch('madengine.core.errors.Console', side_effect=ImportError): + # Should still be able to create basic errors + error = ValidationError("Test without Rich") + assert str(error) == "Test without Rich" + assert error.recoverable is True + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/tests/integration/test_gpu_management.py b/tests/integration/test_gpu_management.py new file mode 100644 index 00000000..8bec767c --- /dev/null +++ b/tests/integration/test_gpu_management.py @@ -0,0 +1,590 @@ +"""Test GPU Management (ROCm and NVIDIA). + +This module tests the new GPU tool manager architecture including: +- BaseGPUToolManager abstract class +- ROCmToolManager with 6.4.1 threshold (PR #54) +- NvidiaToolManager basic functionality +- GPU Tool Factory singleton pattern + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import os +import json +import stat +import pytest +import unittest.mock +from unittest.mock import Mock, MagicMock, patch, call, mock_open + +from madengine.utils.gpu_tool_manager import BaseGPUToolManager +from madengine.utils.rocm_tool_manager import ROCmToolManager, ROCM_VERSION_THRESHOLD +from madengine.utils.nvidia_tool_manager import NvidiaToolManager +from madengine.utils.gpu_tool_factory import ( + get_gpu_tool_manager, + clear_manager_cache, + get_cached_managers, +) +from madengine.utils.gpu_validator import GPUVendor +from madengine.core.context import Context +from madengine.core.console import Console + + + + +def is_amd_gpu(): + """Check if system has AMD GPU.""" + try: + import subprocess + result = subprocess.run(['rocm-smi'], capture_output=True, timeout=5) + return result.returncode == 0 + except: + return False + + +# ============================================================================ +# GPU Tool Manager Tests +# ============================================================================ + +class TestBaseGPUToolManager: + """Test the base GPU tool manager abstract class.""" + + + + +class TestROCmToolManager: + """Test the ROCm tool manager with 6.4.1 threshold (PR #54).""" + + def test_get_rocm_version_from_hipconfig(self): + """Test ROCm version detection from hipconfig.""" + manager = ROCmToolManager() + + with patch.object(manager, 'is_tool_available', return_value=True), \ + patch.object(manager, '_execute_shell_command') as mock_exec: + mock_exec.return_value = (True, "6.4.1-12345", "") + + version = manager.get_rocm_version() + + assert version == (6, 4, 1) + # Verify result is cached + assert manager._get_cached_result("rocm_version") == (6, 4, 1) + + def test_get_preferred_smi_tool_6_4_1_and_above(self): + """Test that amd-smi is preferred for ROCm >= 6.4.1.""" + manager = ROCmToolManager() + + with patch.object(manager, 'get_rocm_version', return_value=(6, 4, 1)): + assert manager.get_preferred_smi_tool() == "amd-smi" + + with patch.object(manager, 'get_rocm_version', return_value=(6, 5, 0)): + assert manager.get_preferred_smi_tool() == "amd-smi" + + def test_get_preferred_smi_tool_below_6_4_1(self): + """Test that rocm-smi is preferred for ROCm < 6.4.1.""" + manager = ROCmToolManager() + + with patch.object(manager, 'get_rocm_version', return_value=(6, 4, 0)): + assert manager.get_preferred_smi_tool() == "rocm-smi" + + with patch.object(manager, 'get_rocm_version', return_value=(6, 3, 0)): + assert manager.get_preferred_smi_tool() == "rocm-smi" + + with patch.object(manager, 'get_rocm_version', return_value=(5, 7, 0)): + assert manager.get_preferred_smi_tool() == "rocm-smi" + + def test_get_gpu_count_with_amd_smi(self): + """Test GPU count detection using amd-smi.""" + manager = ROCmToolManager() + + with patch.object(manager, 'get_preferred_smi_tool', return_value="amd-smi"), \ + patch.object(manager, 'execute_command', return_value="8"): + count = manager.get_gpu_count() + + assert count == 8 + # Verify caching + assert manager._get_cached_result("gpu_count") == 8 + + def test_get_gpu_count_with_fallback_to_rocm_smi(self): + """Test GPU count fallback from amd-smi to rocm-smi.""" + manager = ROCmToolManager() + + def mock_execute(command, fallback=None, timeout=30): + # Simulate amd-smi failure, rocm-smi success + if "amd-smi" in command: + raise RuntimeError("amd-smi not found") + return "4" + + with patch.object(manager, 'get_preferred_smi_tool', return_value="amd-smi"), \ + patch.object(manager, 'execute_command', side_effect=mock_execute): + # Should fallback successfully + with pytest.raises(RuntimeError): # Our mock raises, but real impl would fallback + manager.get_gpu_count() + + def test_get_gpu_product_name_with_fallback(self): + """Test GPU product name with rocm-smi fallback (PR #54).""" + manager = ROCmToolManager() + + with patch.object(manager, 'get_preferred_smi_tool', return_value="amd-smi"), \ + patch.object(manager, 'execute_command', return_value="AMD Instinct MI300X"): + product = manager.get_gpu_product_name(gpu_id=0) + + assert product == "AMD Instinct MI300X" + assert manager._get_cached_result("gpu_product_name:0") == "AMD Instinct MI300X" + + def test_get_gpu_architecture(self): + """Test GPU architecture detection via rocminfo.""" + manager = ROCmToolManager() + + with patch.object(manager, '_execute_shell_command') as mock_exec: + mock_exec.return_value = (True, "gfx942", "") + + arch = manager.get_gpu_architecture() + + assert arch == "gfx942" + assert manager._get_cached_result("gpu_architecture") == "gfx942" + + def test_execute_command_with_fallback(self): + """Test command execution with fallback mechanism.""" + manager = ROCmToolManager() + + with patch.object(manager, '_execute_shell_command') as mock_exec: + # First call fails, second succeeds + mock_exec.side_effect = [ + (False, "", "command not found"), + (True, "success", "") + ] + + result = manager.execute_command("primary_cmd", "fallback_cmd") + + assert result == "success" + assert mock_exec.call_count == 2 + + + + +class TestNvidiaToolManager: + """Test the NVIDIA tool manager.""" + + def test_initialization(self): + """Test NVIDIA tool manager initialization.""" + manager = NvidiaToolManager() + assert manager is not None + + def test_get_cuda_version_from_nvcc(self): + """Test CUDA version detection from nvcc.""" + manager = NvidiaToolManager() + + with patch.object(manager, 'is_tool_available', return_value=True), \ + patch.object(manager, '_execute_shell_command') as mock_exec: + mock_exec.return_value = (True, "12.0", "") + + version = manager.get_cuda_version() + + assert version == "12.0" + assert manager._get_cached_result("cuda_version") == "12.0" + + def test_get_driver_version(self): + """Test NVIDIA driver version detection.""" + manager = NvidiaToolManager() + + with patch.object(manager, 'is_tool_available', return_value=True), \ + patch.object(manager, '_execute_shell_command') as mock_exec: + mock_exec.return_value = (True, "525.60.13", "") + + version = manager.get_driver_version() + + assert version == "525.60.13" + + def test_execute_nvidia_smi(self): + """Test nvidia-smi execution.""" + manager = NvidiaToolManager() + + with patch.object(manager, 'is_tool_available', return_value=True), \ + patch.object(manager, 'execute_command', return_value="GPU info"): + result = manager.execute_nvidia_smi("--list-gpus") + + assert result == "GPU info" + + def test_get_gpu_count(self): + """Test NVIDIA GPU count detection.""" + manager = NvidiaToolManager() + + with patch.object(manager, 'execute_nvidia_smi', return_value="8"): + count = manager.get_gpu_count() + + assert count == 8 + + def test_get_gpu_product_name(self): + """Test NVIDIA GPU product name detection.""" + manager = NvidiaToolManager() + + with patch.object(manager, 'execute_nvidia_smi', return_value="NVIDIA H100 80GB HBM3"): + product = manager.get_gpu_product_name(gpu_id=0) + + assert product == "NVIDIA H100 80GB HBM3" + + + + +class TestGPUToolFactory: + """Test the GPU tool factory with singleton pattern.""" + + def setup_method(self): + """Clear factory cache before each test.""" + clear_manager_cache() + + def teardown_method(self): + """Clear factory cache after each test.""" + clear_manager_cache() + + def test_get_amd_manager(self): + """Test getting AMD tool manager.""" + with patch('madengine.utils.gpu_validator.detect_gpu_vendor', return_value=GPUVendor.AMD): + manager = get_gpu_tool_manager(GPUVendor.AMD) + + assert isinstance(manager, ROCmToolManager) + + def test_get_nvidia_manager(self): + """Test getting NVIDIA tool manager.""" + manager = get_gpu_tool_manager(GPUVendor.NVIDIA) + + assert isinstance(manager, NvidiaToolManager) + + def test_singleton_pattern(self): + """Test that factory returns same instance (singleton).""" + manager1 = get_gpu_tool_manager(GPUVendor.AMD) + manager2 = get_gpu_tool_manager(GPUVendor.AMD) + + assert manager1 is manager2 # Same instance + + def test_different_vendors_different_instances(self): + """Test that different vendors get different instances.""" + amd_manager = get_gpu_tool_manager(GPUVendor.AMD) + nvidia_manager = get_gpu_tool_manager(GPUVendor.NVIDIA) + + assert amd_manager is not nvidia_manager + assert isinstance(amd_manager, ROCmToolManager) + assert isinstance(nvidia_manager, NvidiaToolManager) + + def test_auto_detect_vendor(self): + """Test auto-detection of GPU vendor.""" + with patch('madengine.utils.gpu_validator.detect_gpu_vendor', return_value=GPUVendor.AMD): + manager = get_gpu_tool_manager(vendor=None) + + assert isinstance(manager, ROCmToolManager) + + def test_unknown_vendor_raises_error(self): + """Test that unknown vendor raises appropriate error.""" + with pytest.raises(ValueError, match="Unable to detect GPU vendor"): + get_gpu_tool_manager(GPUVendor.UNKNOWN) + + def test_clear_manager_cache(self): + """Test clearing manager cache.""" + manager1 = get_gpu_tool_manager(GPUVendor.AMD) + + clear_manager_cache() + + manager2 = get_gpu_tool_manager(GPUVendor.AMD) + + # After clearing cache, should get new instance + assert manager1 is not manager2 + + def test_get_cached_managers(self): + """Test getting dictionary of cached managers.""" + amd_manager = get_gpu_tool_manager(GPUVendor.AMD) + nvidia_manager = get_gpu_tool_manager(GPUVendor.NVIDIA) + + cached = get_cached_managers() + + assert len(cached) == 2 + assert GPUVendor.AMD in cached + assert GPUVendor.NVIDIA in cached + assert cached[GPUVendor.AMD] is amd_manager + assert cached[GPUVendor.NVIDIA] is nvidia_manager + + + + +class TestToolManagerIntegration: + """Integration tests for tool managers with Context.""" + + def test_context_uses_tool_manager_for_gpu_count(self): + """Test that Context uses tool manager for GPU count.""" + from madengine.core.context import Context + + additional_context = json.dumps({ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU" + }) + + with patch('madengine.core.context.Context.get_gpu_vendor', return_value="AMD"), \ + patch('madengine.core.context.Context._get_tool_manager') as mock_get_manager: + + mock_manager = Mock() + mock_manager.get_gpu_count.return_value = 8 + mock_get_manager.return_value = mock_manager + + context = Context( + additional_context=additional_context, + build_only_mode=True + ) + + # Force initialization of docker_env_vars + context.ctx["docker_env_vars"] = {"MAD_GPU_VENDOR": "AMD"} + + count = context.get_system_ngpus() + + assert count == 8 + mock_manager.get_gpu_count.assert_called_once() + + def test_context_uses_tool_manager_for_product_name(self): + """Test that Context uses tool manager for GPU product name (PR #54).""" + from madengine.core.context import Context + + additional_context = json.dumps({ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU" + }) + + with patch('madengine.core.context.Context._get_tool_manager') as mock_get_manager: + mock_manager = Mock() + mock_manager.get_gpu_product_name.return_value = "AMD Instinct MI300X" + mock_get_manager.return_value = mock_manager + + context = Context( + additional_context=additional_context, + build_only_mode=True + ) + + context.ctx["docker_env_vars"] = {"MAD_GPU_VENDOR": "AMD"} + + product = context.get_system_gpu_product_name() + + assert product == "AMD Instinct MI300X" + mock_manager.get_gpu_product_name.assert_called_once_with(gpu_id=0) + + + + +class TestPR54Compliance: + """Test compliance with PR #54 requirements.""" + + def test_rocm_version_threshold_is_6_4_1(self): + """Test that ROCm version threshold matches PR #54.""" + assert ROCM_VERSION_THRESHOLD == (6, 4, 1), \ + "ROCm version threshold must be 6.4.1 as per PR #54" + + def test_amd_smi_preferred_for_6_4_1_and_above(self): + """Test amd-smi is preferred for ROCm >= 6.4.1 (PR #54).""" + manager = ROCmToolManager() + + test_versions = [ + ((6, 4, 1), "amd-smi"), + ((6, 4, 2), "amd-smi"), + ((6, 5, 0), "amd-smi"), + ((7, 0, 0), "amd-smi"), + ] + + for version, expected_tool in test_versions: + with patch.object(manager, 'get_rocm_version', return_value=version): + tool = manager.get_preferred_smi_tool() + assert tool == expected_tool, \ + f"ROCm {version} should prefer {expected_tool}" + + def test_rocm_smi_used_for_below_6_4_1(self): + """Test rocm-smi is used for ROCm < 6.4.1 (PR #54).""" + manager = ROCmToolManager() + + test_versions = [ + ((6, 4, 0), "rocm-smi"), + ((6, 3, 0), "rocm-smi"), + ((6, 0, 0), "rocm-smi"), + ((5, 7, 0), "rocm-smi"), + ] + + for version, expected_tool in test_versions: + with patch.object(manager, 'get_rocm_version', return_value=version): + tool = manager.get_preferred_smi_tool() + assert tool == expected_tool, \ + f"ROCm {version} should use {expected_tool}" + + def test_gpu_product_name_has_fallback(self): + """Test GPU product name has rocm-smi fallback (PR #54).""" + manager = ROCmToolManager() + + # Verify the method supports fallback by checking it calls execute_command + with patch.object(manager, 'get_preferred_smi_tool', return_value="amd-smi"), \ + patch.object(manager, 'execute_command') as mock_exec: + mock_exec.return_value = "AMD Instinct MI300X" + + product = manager.get_gpu_product_name(0) + + # Verify execute_command was called (which has fallback logic) + mock_exec.assert_called_once() + + # Verify both amd-smi and rocm-smi commands are in the call + call_args = mock_exec.call_args + assert "amd-smi" in str(call_args) or "rocm-smi" in str(call_args) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) + + + + +# ============================================================================ +# GPU RenderD Nodes Tests +# ============================================================================ + +class TestGetGpuRenderDNodesIntegration: + """Integration test suite for the get_gpu_renderD_nodes method using real hardware.""" + + @pytest.mark.skipif(is_amd_gpu(), reason="Test requires non-AMD GPU or no GPU") + def test_returns_none_for_non_amd_gpu(self): + """Test that the function returns None for non-AMD GPUs.""" + context = Context() + + # Should return None for non-AMD GPUs + if context.ctx['docker_env_vars']['MAD_GPU_VENDOR'] != 'AMD': + assert context.ctx['gpu_renderDs'] is None + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_returns_list_for_amd_gpu(self): + """Test that the function returns a list of renderD nodes for AMD GPUs.""" + context = Context() + + # Should return a list for AMD GPUs + assert context.ctx['gpu_renderDs'] is not None + assert isinstance(context.ctx['gpu_renderDs'], list) + + # List should not be empty if there are GPUs + if context.ctx['docker_env_vars']['MAD_SYSTEM_NGPUS'] > 0: + assert len(context.ctx['gpu_renderDs']) > 0 + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_renderD_count_matches_gpu_count(self): + """Test that the number of renderD nodes matches the number of GPUs.""" + context = Context() + + # Get GPU count from context (which uses amd-smi list --csv or rocm-smi as fallback) + # This is more reliable than amd-smi list -e --json which only works on ROCm 6.4+ + expected_gpu_count = context.ctx['docker_env_vars']['MAD_SYSTEM_NGPUS'] + + # Skip test if no GPUs detected + if expected_gpu_count == 0: + pytest.skip("No GPUs detected on system") + + # The number of renderD nodes should match the number of GPUs + assert len(context.ctx['gpu_renderDs']) == expected_gpu_count + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_renderD_values_are_valid(self): + """Test that all renderD values are valid integers.""" + context = Context() + + # All renderD values should be positive integers + for renderD in context.ctx['gpu_renderDs']: + assert isinstance(renderD, int) + assert renderD > 0 + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_renderD_nodes_are_unique(self): + """Test that all renderD nodes are unique.""" + context = Context() + + renderDs = context.ctx['gpu_renderDs'] + # All renderD values should be unique + assert len(renderDs) == len(set(renderDs)) + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_renderD_values_match_kfd_properties(self): + """Test that renderD values match what's in KFD properties.""" + console = Console() + context = Context() + + # Get renderD values from KFD directly + try: + kfd_output = console.sh("grep -r drm_render_minor /sys/devices/virtual/kfd/kfd/topology/nodes") + kfd_lines = [line for line in kfd_output.split("\n") if line.strip()] + # Filter out CPU entries (renderD value 0) + kfd_renderDs = [int(line.split()[-1]) for line in kfd_lines if int(line.split()[-1]) != 0] + except Exception: + pytest.skip("Unable to read KFD properties") + + # The renderD values from context should be a subset of KFD renderDs + for renderD in context.ctx['gpu_renderDs']: + assert renderD in kfd_renderDs, f"renderD {renderD} not found in KFD properties" + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_gpu_ordering_is_consistent(self): + """Test that GPU ordering matches amd-smi GPU IDs.""" + console = Console() + context = Context() + + try: + # Get amd-smi data + amd_smi_output = console.sh("amd-smi list -e --json") + gpu_data = json.loads(amd_smi_output) + + # Sort by GPU ID + sorted_gpus = sorted(gpu_data, key=lambda x: x["gpu"]) + + # The number of GPUs should match + assert len(context.ctx['gpu_renderDs']) == len(sorted_gpus) + + except Exception: + pytest.skip("Unable to verify GPU ordering with amd-smi") + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_renderD_nodes_exist_in_dev(self): + """Test that the renderD nodes actually exist in /dev/dri/.""" + context = Context() + + # Check that each renderD node exists as a device file + for renderD in context.ctx['gpu_renderDs']: + dev_path = f"/dev/dri/renderD{renderD}" + assert os.path.exists(dev_path), f"Device {dev_path} does not exist" + # Should be a character device + assert stat.S_ISCHR(os.stat(dev_path).st_mode), f"{dev_path} is not a character device" + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_no_cpu_entries_in_renderDs(self): + """Test that CPU entries (renderD=0) are not included.""" + context = Context() + + # None of the renderD values should be 0 (CPUs) + for renderD in context.ctx['gpu_renderDs']: + assert renderD != 0, "CPU entry (renderD=0) found in GPU renderD list" + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_context_initialization_succeeds(self): + """Test that Context initialization succeeds with real GPU data.""" + # This should not raise any exceptions + context = Context() + + # Basic sanity checks + assert context.ctx is not None + assert 'gpu_renderDs' in context.ctx + assert 'docker_env_vars' in context.ctx + assert 'MAD_GPU_VENDOR' in context.ctx['docker_env_vars'] + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_renderD_mapping_is_reproducible(self): + """Test that creating multiple Context objects produces the same renderD mapping.""" + context1 = Context() + context2 = Context() + + # The renderD lists should be identical + assert context1.ctx['gpu_renderDs'] == context2.ctx['gpu_renderDs'] + + @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") + def test_renderD_values_are_in_valid_range(self): + """Test that renderD values are in the valid Linux device range.""" + context = Context() + + # renderD values typically start at 128 and go up + # Valid range is 128-255 for render nodes + for renderD in context.ctx['gpu_renderDs']: + assert 128 <= renderD <= 255, f"renderD {renderD} is outside valid range [128, 255]" + + diff --git a/tests/integration/test_multi_gpu_arch.py b/tests/integration/test_multi_gpu_arch.py new file mode 100644 index 00000000..339d3fb9 --- /dev/null +++ b/tests/integration/test_multi_gpu_arch.py @@ -0,0 +1,194 @@ +"""Comprehensive unit tests for multi-GPU architecture support in madengine. + +Covers: +- Multi-arch DockerBuilder logic (image naming, manifest, legacy/override) +- Dockerfile GPU variable parsing/validation +- Target architecture normalization and compatibility +- Run-phase manifest filtering by gpu_architecture + +UPDATED: Now uses BuildOrchestrator instead of deprecated DistributedOrchestrator. + +All tests are logic/unit tests and do not require GPU hardware. +""" +import pytest +from unittest.mock import MagicMock, patch +from madengine.execution.docker_builder import DockerBuilder +from madengine.orchestration.build_orchestrator import BuildOrchestrator +from madengine.orchestration.run_orchestrator import RunOrchestrator + +class TestMultiGPUArch: + def setup_method(self): + self.context = MagicMock() + self.console = MagicMock() + self.builder = DockerBuilder(self.context, self.console) + + # Mock args for BuildOrchestrator (replacement for DistributedOrchestrator) + mock_args = MagicMock() + mock_args.additional_context = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + mock_args.additional_context_file = None + mock_args.live_output = True + mock_args.data_config_file_name = "data.json" + mock_args.tags = [] + mock_args.target_archs = [] + mock_args.force_mirror_local = None + + # Create BuildOrchestrator with mocked args + self.orchestrator = BuildOrchestrator(mock_args) + + # --- DockerBuilder Multi-Arch Logic --- + @patch.object(DockerBuilder, "_get_dockerfiles_for_model") + @patch.object(DockerBuilder, "_check_dockerfile_has_gpu_variables") + @patch.object(DockerBuilder, "build_image") + def test_multi_arch_build_image_naming(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] + # GPU variable present + mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", "build_duration": 1.0} + result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + assert result[0]["docker_image"].endswith("_gfx908") + # GPU variable absent + mock_check_gpu_vars.return_value = (False, "docker/dummy.Dockerfile") + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} + result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + assert not result[0]["docker_image"].endswith("_gfx908") + + @patch.object(DockerBuilder, "_get_dockerfiles_for_model") + @patch.object(DockerBuilder, "_check_dockerfile_has_gpu_variables") + @patch.object(DockerBuilder, "build_image") + def test_multi_arch_manifest_fields(self, mock_build_image, mock_check_gpu_vars, mock_get_dockerfiles): + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] + mock_check_gpu_vars.return_value = (True, "docker/dummy.Dockerfile") + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd_gfx908", "build_duration": 1.0} + result = self.builder._build_model_for_arch(model_info, "gfx908", None, False, None, "", None) + assert result[0]["gpu_architecture"] == "gfx908" + + @patch.object(DockerBuilder, "_get_dockerfiles_for_model") + @patch.object(DockerBuilder, "build_image") + def test_legacy_single_arch_build(self, mock_build_image, mock_get_dockerfiles): + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_get_dockerfiles.return_value = ["docker/dummy.Dockerfile"] + mock_build_image.return_value = {"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0} + result = self.builder._build_model_single_arch(model_info, None, False, None, "", None) + assert result[0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" + + @patch.object(DockerBuilder, "_build_model_single_arch") + def test_additional_context_overrides_target_archs(self, mock_single_arch): + self.context.ctx = {"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx908"}} + model_info = {"name": "dummy", "dockerfile": "docker/dummy.Dockerfile"} + mock_single_arch.return_value = [{"docker_image": "ci-dummy_dummy.ubuntu.amd", "build_duration": 1.0}] + result = self.builder.build_all_models([model_info], target_archs=["gfx908", "gfx90a"]) + assert result["successful_builds"][0]["docker_image"] == "ci-dummy_dummy.ubuntu.amd" + + # --- Dockerfile GPU Variable Parsing/Validation --- + def test_parse_dockerfile_gpu_variables(self): + dockerfile_content = """ + ARG MAD_SYSTEM_GPU_ARCHITECTURE=gfx908 + ENV PYTORCH_ROCM_ARCH=gfx908;gfx90a + ARG GPU_TARGETS=gfx908,gfx942 + ENV GFX_COMPILATION_ARCH=gfx908 + ARG GPU_ARCHS=gfx908;gfx90a;gfx942 + """ + result = self.builder._parse_dockerfile_gpu_variables(dockerfile_content) + assert result["MAD_SYSTEM_GPU_ARCHITECTURE"] == ["gfx908"] + assert result["PYTORCH_ROCM_ARCH"] == ["gfx908", "gfx90a"] + assert result["GPU_TARGETS"] == ["gfx908", "gfx942"] + assert result["GFX_COMPILATION_ARCH"] == ["gfx908"] + assert result["GPU_ARCHS"] == ["gfx908", "gfx90a", "gfx942"] + + def test_parse_dockerfile_gpu_variables_env_delimiter(self): + dockerfile_content = "ENV PYTORCH_ROCM_ARCH = gfx908,gfx90a" + result = self.builder._parse_dockerfile_gpu_variables(dockerfile_content) + assert result["PYTORCH_ROCM_ARCH"] == ["gfx908", "gfx90a"] + + def test_parse_malformed_dockerfile(self): + dockerfile_content = "ENV BAD_LINE\nARG MAD_SYSTEM_GPU_ARCHITECTURE=\nENV PYTORCH_ROCM_ARCH=\n" + result = self.builder._parse_dockerfile_gpu_variables(dockerfile_content) + assert isinstance(result, dict) + + # --- Target Architecture Normalization/Compatibility --- + def test_normalize_architecture_name(self): + cases = { + "gfx908": "gfx908", + "GFX908": "gfx908", + "mi100": "gfx908", + "mi-100": "gfx908", + "mi200": "gfx90a", + "mi-200": "gfx90a", + "mi210": "gfx90a", + "mi250": "gfx90a", + "mi300": "gfx940", + "mi-300": "gfx940", + "mi300a": "gfx940", + "mi300x": "gfx942", + "mi-300x": "gfx942", + "unknown": "unknown", + "": None, + } + for inp, expected in cases.items(): + assert self.builder._normalize_architecture_name(inp) == expected + + def test_is_target_arch_compatible_with_variable(self): + assert self.builder._is_target_arch_compatible_with_variable("MAD_SYSTEM_GPU_ARCHITECTURE", ["gfx908"], "gfx942") + assert self.builder._is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908", "gfx942"], "gfx942") + assert not self.builder._is_target_arch_compatible_with_variable("PYTORCH_ROCM_ARCH", ["gfx908"], "gfx942") + assert self.builder._is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx908") + assert not self.builder._is_target_arch_compatible_with_variable("GFX_COMPILATION_ARCH", ["gfx908"], "gfx942") + assert self.builder._is_target_arch_compatible_with_variable("UNKNOWN_VAR", ["foo"], "bar") + + def test_is_compilation_arch_compatible(self): + assert self.builder._is_compilation_arch_compatible("gfx908", "gfx908") + assert not self.builder._is_compilation_arch_compatible("gfx908", "gfx942") + assert self.builder._is_compilation_arch_compatible("foo", "foo") + + # --- Run-Phase Manifest Filtering --- + def test_filter_images_by_gpu_architecture(self): + """Test image filtering by GPU architecture using RunOrchestrator. + + Note: Current behavior treats images without gpu_vendor as compatible (legacy support). + """ + # Create RunOrchestrator which has _filter_images_by_gpu_architecture + mock_args = MagicMock() + mock_args.additional_context = '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + mock_args.additional_context_file = None + mock_args.tags = [] + mock_args.live_output = True + mock_args.data_config_file_name = "data.json" + mock_args.force_mirror_local = None + + run_orch = RunOrchestrator(mock_args) + + # Test exact match - both images have gpu_vendor set to "AMD" + built_images = { + "img1": {"gpu_architecture": "gfx908", "gpu_vendor": "AMD"}, + "img2": {"gpu_architecture": "gfx90a", "gpu_vendor": "AMD"} + } + filtered = run_orch._filter_images_by_gpu_architecture(built_images, "gfx908") + assert "img1" in filtered and "img2" not in filtered + + # Test legacy image (no gpu_vendor field) - should be included for compatibility + built_images = { + "img1": {"gpu_architecture": "gfx908"}, # No gpu_vendor + "img2": {"gpu_architecture": "gfx90a", "gpu_vendor": "AMD"} + } + filtered = run_orch._filter_images_by_gpu_architecture(built_images, "gfx908") + # Current behavior: legacy images (no gpu_vendor) are treated as compatible + assert "img1" in filtered # Legacy image included + # img2 may or may not be included depending on gpu_vendor matching logic + + # Test no match case with explicit gpu_vendor + built_images = { + "img1": {"gpu_architecture": "gfx90a", "gpu_vendor": "AMD"}, + "img2": {"gpu_architecture": "gfx942", "gpu_vendor": "AMD"} + } + filtered = run_orch._filter_images_by_gpu_architecture(built_images, "gfx908") + assert len(filtered) == 0 + + # Test all matching case with gpu_vendor + built_images = { + "img1": {"gpu_architecture": "gfx908", "gpu_vendor": "AMD"}, + "img2": {"gpu_architecture": "gfx908", "gpu_vendor": "AMD"} + } + filtered = run_orch._filter_images_by_gpu_architecture(built_images, "gfx908") + assert len(filtered) == 2 diff --git a/tests/integration/test_orchestrator_workflows.py b/tests/integration/test_orchestrator_workflows.py new file mode 100644 index 00000000..78d9636b --- /dev/null +++ b/tests/integration/test_orchestrator_workflows.py @@ -0,0 +1,446 @@ +"""Test the orchestration layer modules. + +This module tests the Build and Run orchestrators that coordinate +the build and execution workflows. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in modules +import json +import os +import tempfile +from unittest.mock import MagicMock, mock_open, patch + +# third-party modules +import pytest + +# project modules +from madengine.orchestration.build_orchestrator import BuildOrchestrator +from madengine.orchestration.run_orchestrator import RunOrchestrator +from madengine.core.errors import BuildError, ConfigurationError, DiscoveryError + + +class TestBuildOrchestrator: + """Test the Build Orchestrator module.""" + + @patch("madengine.orchestration.build_orchestrator.Context") + def test_build_orchestrator_initialization(self, mock_context): + """Test orchestrator initialization with minimal args.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + mock_context_instance = MagicMock() + mock_context.return_value = mock_context_instance + + with patch("os.path.exists", return_value=False): + orchestrator = BuildOrchestrator(mock_args) + + assert orchestrator.args == mock_args + assert orchestrator.context == mock_context_instance + assert orchestrator.additional_context == {} + assert orchestrator.credentials is None + + @patch( + "builtins.open", + new_callable=mock_open, + read_data='{"dockerhub": {"username": "test", "password": "pass"}}', + ) + @patch("os.path.exists") + @patch("madengine.orchestration.build_orchestrator.Context") + def test_build_orchestrator_with_credentials( + self, mock_context, mock_exists, mock_file + ): + """Test orchestrator initialization with credentials.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + mock_context_instance = MagicMock() + mock_context.return_value = mock_context_instance + + def exists_side_effect(path): + return path == "credential.json" + + mock_exists.side_effect = exists_side_effect + + orchestrator = BuildOrchestrator(mock_args) + + assert orchestrator.credentials == { + "dockerhub": {"username": "test", "password": "pass"} + } + + @patch.dict( + "os.environ", + { + "MAD_DOCKERHUB_USER": "env_user", + "MAD_DOCKERHUB_PASSWORD": "env_pass", + "MAD_DOCKERHUB_REPO": "env_repo", + }, + ) + @patch("os.path.exists", return_value=False) + @patch("madengine.orchestration.build_orchestrator.Context") + def test_build_orchestrator_env_credentials(self, mock_context, mock_exists): + """Test orchestrator with environment variable credentials.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + mock_context_instance = MagicMock() + mock_context.return_value = mock_context_instance + + orchestrator = BuildOrchestrator(mock_args) + + assert orchestrator.credentials == { + "dockerhub": { + "username": "env_user", + "password": "env_pass", + "repository": "env_repo", + } + } + + @patch("madengine.orchestration.build_orchestrator.DiscoverModels") + @patch("madengine.orchestration.build_orchestrator.DockerBuilder") + @patch("madengine.orchestration.build_orchestrator.Context") + @patch("os.path.exists", return_value=False) + @patch("pathlib.Path.exists", return_value=False) + def test_build_execute_success( + self, + mock_path_exists, + mock_os_exists, + mock_context_class, + mock_docker_builder, + mock_discover_models, + ): + """Test successful build execution.""" + # Setup mocks + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = False + mock_args._separate_phases = True + mock_args.target_archs = [] + + # Mock context + mock_context = MagicMock() + mock_context.ctx = {"docker_build_arg": {"MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a"}} + mock_context_class.return_value = mock_context + + # Mock discover models + mock_discover_instance = MagicMock() + mock_discover_instance.run.return_value = [ + {"name": "model1", "tags": ["test"]}, + {"name": "model2", "tags": ["test"]}, + ] + mock_discover_models.return_value = mock_discover_instance + + # Mock docker builder + mock_builder_instance = MagicMock() + # Match actual docker_builder.py return format (lists, not ints) + mock_builder_instance.build_all_models.return_value = { + "successful_builds": [{"model": "model1"}, {"model": "model2"}], + "failed_builds": [], + } + mock_docker_builder.return_value = mock_builder_instance + + # Execute + orchestrator = BuildOrchestrator(mock_args) + manifest_file = orchestrator.execute(registry="docker.io", clean_cache=False) + + # Assertions + assert manifest_file == "build_manifest.json" + mock_discover_instance.run.assert_called_once() + mock_builder_instance.build_all_models.assert_called_once() + mock_builder_instance.export_build_manifest.assert_called_once() + + @patch("madengine.orchestration.build_orchestrator.DiscoverModels") + @patch("madengine.orchestration.build_orchestrator.Context") + @patch("os.path.exists", return_value=False) + def test_build_execute_no_models_found( + self, mock_os_exists, mock_context_class, mock_discover_models + ): + """Test build execution when no models are discovered.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = False + + mock_context = MagicMock() + mock_context.ctx = {"docker_build_arg": {}} + mock_context_class.return_value = mock_context + + mock_discover_instance = MagicMock() + mock_discover_instance.run.return_value = [] + mock_discover_models.return_value = mock_discover_instance + + orchestrator = BuildOrchestrator(mock_args) + + with pytest.raises(DiscoveryError): + orchestrator.execute() + + @patch("madengine.orchestration.build_orchestrator.DiscoverModels") + @patch("madengine.orchestration.build_orchestrator.DockerBuilder") + @patch("madengine.orchestration.build_orchestrator.Context") + @patch("os.path.exists", return_value=False) + @patch("pathlib.Path.exists", return_value=False) + def test_build_execute_all_failures( + self, + mock_path_exists, + mock_os_exists, + mock_context_class, + mock_docker_builder, + mock_discover_models, + ): + """Test build execution when ALL builds fail - should raise BuildError.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = False + mock_args._separate_phases = True + mock_args.target_archs = [] + + mock_context = MagicMock() + mock_context.ctx = {"docker_build_arg": {}} + mock_context_class.return_value = mock_context + + mock_discover_instance = MagicMock() + mock_discover_instance.run.return_value = [{"name": "model1", "tags": ["test"]}] + mock_discover_models.return_value = mock_discover_instance + + mock_builder_instance = MagicMock() + # All builds failed - should raise BuildError + mock_builder_instance.build_all_models.return_value = { + "successful_builds": [], + "failed_builds": [{"model": "model1", "error": "Build failed"}], + } + mock_docker_builder.return_value = mock_builder_instance + + orchestrator = BuildOrchestrator(mock_args) + + # Should raise BuildError when ALL builds fail + with pytest.raises(BuildError, match="All builds failed"): + orchestrator.execute() + + @patch("madengine.orchestration.build_orchestrator.DiscoverModels") + @patch("madengine.orchestration.build_orchestrator.DockerBuilder") + @patch("madengine.orchestration.build_orchestrator.Context") + @patch("os.path.exists", return_value=False) + @patch("pathlib.Path.exists", return_value=False) + def test_build_execute_partial_failure( + self, + mock_path_exists, + mock_os_exists, + mock_context_class, + mock_docker_builder, + mock_discover_models, + ): + """Test build execution with PARTIAL failures - should save manifest and not raise.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = False + mock_args._separate_phases = True + mock_args.target_archs = [] + + mock_context = MagicMock() + mock_context.ctx = {"docker_build_arg": {}} + mock_context_class.return_value = mock_context + + mock_discover_instance = MagicMock() + mock_discover_instance.run.return_value = [ + {"name": "model1", "tags": ["test"]}, + {"name": "model2", "tags": ["test"]}, + ] + mock_discover_models.return_value = mock_discover_instance + + mock_builder_instance = MagicMock() + # Partial failure: 1 success, 1 failure + mock_builder_instance.build_all_models.return_value = { + "successful_builds": [{"model": "model1", "docker_image": "ci-model1"}], + "failed_builds": [{"model": "model2", "error": "Build failed"}], + } + mock_docker_builder.return_value = mock_builder_instance + + orchestrator = BuildOrchestrator(mock_args) + + # Should NOT raise exception, manifest should be saved + manifest_file = orchestrator.execute() + + # Verify manifest was saved + assert manifest_file == "build_manifest.json" + mock_builder_instance.export_build_manifest.assert_called_once() + + # Verify both successes and failures are in the summary + mock_builder_instance.build_all_models.assert_called_once() + result = mock_builder_instance.build_all_models.return_value + assert len(result["successful_builds"]) == 1 + assert len(result["failed_builds"]) == 1 + + +class TestRunOrchestrator: + """Test the Run Orchestrator module.""" + + @patch("madengine.orchestration.run_orchestrator.Context") + def test_run_orchestrator_initialization(self, mock_context): + """Test orchestrator initialization.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + assert orchestrator.args == mock_args + assert orchestrator.additional_context == {} + assert orchestrator.context is None # Lazy initialization + + def test_run_orchestrator_additional_context_parsing(self): + """Test additional context parsing from JSON string.""" + mock_args = MagicMock() + mock_args.additional_context = '{"deploy": "slurm", "slurm": {"nodes": 4}}' + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + assert orchestrator.additional_context == { + "deploy": "slurm", + "slurm": {"nodes": 4}, + } + + @patch("os.path.exists", return_value=False) + def test_run_execute_no_manifest_no_tags(self, mock_exists): + """Test run execution fails without manifest or tags.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + with pytest.raises(ConfigurationError): + orchestrator.execute(manifest_file=None, tags=None) + + @patch("madengine.orchestration.build_orchestrator.BuildOrchestrator") + def test_run_execute_triggers_build_phase( + self, mock_build_orchestrator + ): + """Test run execution triggers build phase when no manifest exists.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + mock_args.tags = ["test"] + + mock_build_instance = MagicMock() + mock_build_instance.execute.return_value = "build_manifest.json" + mock_build_orchestrator.return_value = mock_build_instance + + # Mock manifest loading + manifest_data = { + "built_images": {"model1": {"name": "model1"}}, + "deployment_config": {"target": "local"}, + } + + orchestrator = RunOrchestrator(mock_args) + + # Mock file operations and execution + with patch("os.path.exists", side_effect=lambda p: p == "build_manifest.json"), \ + patch("builtins.open", mock_open(read_data=json.dumps(manifest_data))), \ + patch.object(orchestrator, "_execute_local", return_value={}) as mock_execute_local: + orchestrator.execute(manifest_file=None, tags=["test"]) + + mock_build_instance.execute.assert_called_once() + mock_execute_local.assert_called_once() + + @patch( + "builtins.open", + new_callable=mock_open, + read_data='{"built_images": {"model1": {"name": "model1"}}}', + ) + @patch("os.path.exists", return_value=True) + def test_run_execute_local(self, mock_exists, mock_file): + """Test run execution in local mode.""" + mock_args = MagicMock() + mock_args.additional_context = '{"deploy": "local"}' + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + with patch.object( + orchestrator, "_execute_local", return_value={"status": "success"} + ) as mock_execute_local: + result = orchestrator.execute(manifest_file="build_manifest.json") + + assert result["status"] == "success" + assert "session_start_row" in result + assert "session_row_count" in result + mock_execute_local.assert_called_once() + + @patch( + "builtins.open", + new_callable=mock_open, + read_data='{"built_images": {"model1": {"name": "model1"}}, "deployment_config": {"slurm": {"partition": "gpu", "nodes": 2}}}', + ) + @patch("os.path.exists", return_value=True) + def test_run_execute_distributed(self, mock_exists, mock_file): + """Test run execution in distributed mode.""" + mock_args = MagicMock() + mock_args.additional_context = '{"deploy": "slurm"}' + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + with patch.object( + orchestrator, + "_execute_distributed", + return_value={"status": "deployed"}, + ) as mock_execute_distributed: + result = orchestrator.execute(manifest_file="build_manifest.json") + + assert result["status"] == "deployed" + assert "session_start_row" in result + assert "session_row_count" in result + mock_execute_distributed.assert_called_once_with("slurm", "build_manifest.json") + + @patch( + "builtins.open", + new_callable=mock_open, + read_data='{"built_images": {"model1": {"name": "model1"}}, "context": {}}', + ) + @patch("os.path.exists", return_value=True) + def test_execute_local_with_mock( + self, mock_exists, mock_file + ): + """Test local execution workflow (mocked).""" + mock_args = MagicMock() + mock_args.additional_context = '{"deploy": "local"}' + mock_args.live_output = False + + orchestrator = RunOrchestrator(mock_args) + + # Mock the _execute_local method to avoid deep integration + with patch.object( + orchestrator, "_execute_local", return_value={"successful_runs": 1} + ) as mock_execute_local: + result = orchestrator.execute(manifest_file="build_manifest.json") + + assert result["successful_runs"] == 1 + mock_execute_local.assert_called_once() + + def test_filter_images_by_gpu_architecture(self): + """Test GPU architecture filtering logic.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + built_images = { + "model1": {"name": "model1", "gpu_architecture": "gfx90a", "gpu_vendor": "AMD"}, + "model2": {"name": "model2", "gpu_architecture": "gfx908", "gpu_vendor": "AMD"}, + "model3": {"name": "model3", "gpu_architecture": ""}, # Legacy - no gpu_vendor + } + + # Filter for gfx90a + compatible = orchestrator._filter_images_by_gpu_architecture( + built_images, "gfx90a" + ) + + assert "model1" in compatible + assert "model2" not in compatible + assert "model3" in compatible # Legacy images without gpu_vendor pass through + diff --git a/tests/integration/test_platform_integration.py b/tests/integration/test_platform_integration.py new file mode 100644 index 00000000..be0fe770 --- /dev/null +++ b/tests/integration/test_platform_integration.py @@ -0,0 +1,548 @@ +""" +Multi-platform integration tests for madengine. + +Tests the complete build and run workflows across AMD GPU, NVIDIA GPU, and CPU platforms. +These tests focus on integration and end-to-end flows rather than isolated unit tests. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import os +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch, mock_open +import pytest + +from madengine.orchestration.build_orchestrator import BuildOrchestrator +from madengine.orchestration.run_orchestrator import RunOrchestrator +from madengine.core.errors import BuildError, ConfigurationError, DiscoveryError + + +# ============================================================================ +# Multi-Platform Build Tests +# ============================================================================ + +class TestMultiPlatformBuild: + """Test build orchestration across different platforms.""" + + @pytest.mark.unit + @pytest.mark.parametrize("platform", ["amd", "nvidia", "cpu"]) + def test_build_initialization_all_platforms( + self, platform, multi_platform_context, mock_build_args + ): + """Test that BuildOrchestrator initializes correctly on all platforms.""" + with patch( + "madengine.orchestration.build_orchestrator.Context", + return_value=multi_platform_context, + ): + with patch("os.path.exists", return_value=False): + orchestrator = BuildOrchestrator(mock_build_args) + + assert orchestrator.args == mock_build_args + assert orchestrator.context == multi_platform_context + assert orchestrator.credentials is None + + @pytest.mark.unit + @pytest.mark.amd + def test_build_amd_gpu_architecture_detection(self, amd_gpu_context, mock_build_args): + """Test AMD GPU architecture is correctly detected and used.""" + with patch( + "madengine.orchestration.build_orchestrator.Context", + return_value=amd_gpu_context, + ): + with patch("os.path.exists", return_value=False): + orchestrator = BuildOrchestrator(mock_build_args) + + assert orchestrator.context.get_gpu_vendor() == "AMD" + assert orchestrator.context.get_system_gpu_architecture() == "gfx90a" + + @pytest.mark.unit + @pytest.mark.nvidia + def test_build_nvidia_gpu_architecture_detection( + self, nvidia_gpu_context, mock_build_args + ): + """Test NVIDIA GPU architecture is correctly detected and used.""" + with patch( + "madengine.orchestration.build_orchestrator.Context", + return_value=nvidia_gpu_context, + ): + with patch("os.path.exists", return_value=False): + orchestrator = BuildOrchestrator(mock_build_args) + + assert orchestrator.context.get_gpu_vendor() == "NVIDIA" + assert orchestrator.context.get_system_gpu_architecture() == "sm_90" + + @pytest.mark.unit + @pytest.mark.cpu + def test_build_cpu_only_mode(self, cpu_context, mock_build_args): + """Test CPU-only build mode works correctly.""" + with patch( + "madengine.orchestration.build_orchestrator.Context", + return_value=cpu_context, + ): + with patch("os.path.exists", return_value=False): + orchestrator = BuildOrchestrator(mock_build_args) + + assert orchestrator.context.get_gpu_vendor() == "NONE" + assert orchestrator.context.get_system_ngpus() == 0 + + +# ============================================================================ +# Error Handling and Resilience Tests +# ============================================================================ + +class TestBuildResilience: + """Test build resilience and error handling.""" + + @pytest.mark.unit + def test_partial_build_failure_saves_manifest( + self, mock_build_args, amd_gpu_context, sample_build_summary_partial + ): + """Test that partial failures still save the manifest with successful builds.""" + with patch( + "madengine.orchestration.build_orchestrator.Context", + return_value=amd_gpu_context, + ): + with patch("os.path.exists", return_value=False): + with patch("pathlib.Path.exists", return_value=False): + with patch( + "madengine.orchestration.build_orchestrator.DiscoverModels" + ) as mock_discover: + with patch( + "madengine.orchestration.build_orchestrator.DockerBuilder" + ) as mock_builder: + # Setup mocks + mock_discover_instance = MagicMock() + mock_discover_instance.run.return_value = [ + {"name": "model1", "tags": ["test"]}, + {"name": "model2", "tags": ["test"]}, + ] + mock_discover.return_value = mock_discover_instance + + mock_builder_instance = MagicMock() + mock_builder_instance.build_all_models.return_value = ( + sample_build_summary_partial + ) + mock_builder.return_value = mock_builder_instance + + # Execute + orchestrator = BuildOrchestrator(mock_build_args) + manifest_file = orchestrator.execute() + + # Verify manifest was saved despite partial failure + assert manifest_file == "build_manifest.json" + mock_builder_instance.export_build_manifest.assert_called_once() + + # Verify successful builds are available + summary = mock_builder_instance.build_all_models.return_value + assert len(summary["successful_builds"]) == 1 + assert len(summary["failed_builds"]) == 1 + + @pytest.mark.unit + def test_all_builds_fail_raises_error( + self, mock_build_args, amd_gpu_context, sample_build_summary_all_failed + ): + """Test that when ALL builds fail, BuildError is raised.""" + with patch( + "madengine.orchestration.build_orchestrator.Context", + return_value=amd_gpu_context, + ): + with patch("os.path.exists", return_value=False): + with patch("pathlib.Path.exists", return_value=False): + with patch( + "madengine.orchestration.build_orchestrator.DiscoverModels" + ) as mock_discover: + with patch( + "madengine.orchestration.build_orchestrator.DockerBuilder" + ) as mock_builder: + # Setup mocks + mock_discover_instance = MagicMock() + mock_discover_instance.run.return_value = [ + {"name": "model1", "tags": ["test"]}, + {"name": "model2", "tags": ["test"]}, + ] + mock_discover.return_value = mock_discover_instance + + mock_builder_instance = MagicMock() + mock_builder_instance.build_all_models.return_value = ( + sample_build_summary_all_failed + ) + mock_builder.return_value = mock_builder_instance + + # Execute and expect error + orchestrator = BuildOrchestrator(mock_build_args) + + with pytest.raises(BuildError, match="All builds failed"): + orchestrator.execute() + + @pytest.mark.unit + def test_multi_model_build_continues_on_single_failure( + self, mock_build_args, amd_gpu_context + ): + """Test that multi-model build continues when one model fails.""" + with patch( + "madengine.orchestration.build_orchestrator.Context", + return_value=amd_gpu_context, + ): + with patch("os.path.exists", return_value=False): + with patch("pathlib.Path.exists", return_value=False): + with patch( + "madengine.orchestration.build_orchestrator.DiscoverModels" + ) as mock_discover: + with patch( + "madengine.orchestration.build_orchestrator.DockerBuilder" + ) as mock_builder: + # Setup mocks + mock_discover_instance = MagicMock() + mock_discover_instance.run.return_value = [ + {"name": "model1", "tags": ["test"]}, + {"name": "model2", "tags": ["test"]}, + {"name": "model3", "tags": ["test"]}, + ] + mock_discover.return_value = mock_discover_instance + + mock_builder_instance = MagicMock() + # 2 successes, 1 failure + mock_builder_instance.build_all_models.return_value = { + "successful_builds": [ + { + "model": "model1", + "docker_image": "ci-model1", + }, + { + "model": "model3", + "docker_image": "ci-model3", + }, + ], + "failed_builds": [ + { + "model": "model2", + "error": "Build failed", + }, + ], + "total_build_time": 20.0, + } + mock_builder.return_value = mock_builder_instance + + # Execute - should not raise + orchestrator = BuildOrchestrator(mock_build_args) + manifest_file = orchestrator.execute() + + # Verify manifest saved and both successes are there + assert manifest_file == "build_manifest.json" + mock_builder_instance.export_build_manifest.assert_called_once() + + +# ============================================================================ +# Multi-Architecture Build Tests +# ============================================================================ + +class TestMultiArchitectureBuild: + """Test multi-architecture build scenarios.""" + + @pytest.mark.unit + @pytest.mark.amd + def test_multi_arch_amd_builds(self, mock_build_args, amd_gpu_context): + """Test building for multiple AMD GPU architectures.""" + mock_build_args.target_archs = ["gfx908", "gfx90a", "gfx942"] + + with patch( + "madengine.orchestration.build_orchestrator.Context", + return_value=amd_gpu_context, + ): + with patch("os.path.exists", return_value=False): + with patch("pathlib.Path.exists", return_value=False): + with patch( + "madengine.orchestration.build_orchestrator.DiscoverModels" + ) as mock_discover: + with patch( + "madengine.orchestration.build_orchestrator.DockerBuilder" + ) as mock_builder: + # Setup mocks + mock_discover_instance = MagicMock() + mock_discover_instance.run.return_value = [ + {"name": "model1", "tags": ["test"]}, + ] + mock_discover.return_value = mock_discover_instance + + mock_builder_instance = MagicMock() + # Build for each architecture + mock_builder_instance.build_all_models.return_value = { + "successful_builds": [ + { + "model": "model1", + "docker_image": "ci-model1_gfx908", + "gpu_architecture": "gfx908", + }, + { + "model": "model1", + "docker_image": "ci-model1_gfx90a", + "gpu_architecture": "gfx90a", + }, + { + "model": "model1", + "docker_image": "ci-model1_gfx942", + "gpu_architecture": "gfx942", + }, + ], + "failed_builds": [], + "total_build_time": 45.0, + } + mock_builder.return_value = mock_builder_instance + + # Execute + orchestrator = BuildOrchestrator(mock_build_args) + manifest_file = orchestrator.execute() + + # Verify all architectures were built + summary = mock_builder_instance.build_all_models.return_value + assert len(summary["successful_builds"]) == 3 + archs = [ + b["gpu_architecture"] + for b in summary["successful_builds"] + ] + assert "gfx908" in archs + assert "gfx90a" in archs + assert "gfx942" in archs + + +# ============================================================================ +# Run Orchestrator Multi-Platform Tests +# ============================================================================ + +class TestMultiPlatformRun: + """Test run orchestration across different platforms.""" + + @pytest.mark.unit + def test_run_with_manifest_local_execution( + self, mock_run_args, amd_gpu_context, temp_manifest_file + ): + """Test local execution from manifest file.""" + mock_run_args.manifest_file = temp_manifest_file + + with patch("os.path.exists", return_value=True): + with patch( + "builtins.open", + mock_open( + read_data=json.dumps( + { + "built_images": {"ci-model1": {"name": "model1"}}, + "deployment_config": {}, + } + ) + ), + ): + orchestrator = RunOrchestrator(mock_run_args) + + with patch.object( + orchestrator, + "_execute_local", + return_value={"successful_runs": 1, "failed_runs": 0}, + ) as mock_execute_local: + result = orchestrator.execute(manifest_file=temp_manifest_file) + + assert result["successful_runs"] == 1 + mock_execute_local.assert_called_once() + + @pytest.mark.unit + def test_run_multi_model_continues_on_failure( + self, mock_run_args, amd_gpu_context, temp_manifest_file + ): + """Test that run continues when one model fails.""" + mock_run_args.manifest_file = temp_manifest_file + + with patch("os.path.exists", return_value=True): + with patch( + "builtins.open", + mock_open( + read_data=json.dumps( + { + "built_images": { + "ci-model1": {"name": "model1"}, + "ci-model2": {"name": "model2"}, + }, + "deployment_config": {}, + } + ) + ), + ): + orchestrator = RunOrchestrator(mock_run_args) + + # Mock execution with 1 success, 1 failure + with patch.object( + orchestrator, + "_execute_local", + return_value={ + "successful_runs": [{"model": "model1"}], + "failed_runs": [{"model": "model2", "error": "Runtime error"}], + "total_runs": 2, + }, + ) as mock_execute_local: + result = orchestrator.execute(manifest_file=temp_manifest_file) + + # Verify both were attempted + assert len(result["successful_runs"]) == 1 + assert len(result["failed_runs"]) == 1 + assert result["total_runs"] == 2 + + +# ============================================================================ +# Integration Tests (Full Flow) +# ============================================================================ + +class TestEndToEndIntegration: + """Integration tests for complete build + run workflows.""" + + @pytest.mark.integration + @pytest.mark.slow + def test_build_then_run_workflow( + self, mock_build_args, mock_run_args, amd_gpu_context, temp_working_dir + ): + """Test complete workflow: build models, then run them.""" + # Phase 1: Build + with patch( + "madengine.orchestration.build_orchestrator.Context", + return_value=amd_gpu_context, + ): + with patch("pathlib.Path.exists", return_value=False): + with patch( + "madengine.orchestration.build_orchestrator.DiscoverModels" + ) as mock_discover: + with patch( + "madengine.orchestration.build_orchestrator.DockerBuilder" + ) as mock_builder: + # Setup build mocks + mock_discover_instance = MagicMock() + mock_discover_instance.run.return_value = [ + {"name": "model1", "tags": ["test"]}, + ] + mock_discover.return_value = mock_discover_instance + + mock_builder_instance = MagicMock() + mock_builder_instance.build_all_models.return_value = { + "successful_builds": [ + { + "model": "model1", + "docker_image": "ci-model1", + }, + ], + "failed_builds": [], + "total_build_time": 10.0, + } + mock_builder.return_value = mock_builder_instance + + # Execute build + build_orchestrator = BuildOrchestrator(mock_build_args) + manifest_file = build_orchestrator.execute() + + assert manifest_file == "build_manifest.json" + mock_builder_instance.export_build_manifest.assert_called_once() + + # Phase 2: Run (using manifest from build) + manifest_data = { + "built_images": {"ci-model1": {"docker_image": "ci-model1"}}, + "built_models": {"ci-model1": {"name": "model1"}}, + "deployment_config": {}, + } + + with patch("os.path.exists", return_value=True): + with patch("builtins.open", mock_open(read_data=json.dumps(manifest_data))): + run_orchestrator = RunOrchestrator(mock_run_args) + + with patch.object( + run_orchestrator, + "_execute_local", + return_value={ + "successful_runs": [{"model": "model1"}], + "failed_runs": [], + "total_runs": 1, + }, + ): + result = run_orchestrator.execute(manifest_file="build_manifest.json") + + assert len(result["successful_runs"]) == 1 + assert len(result["failed_runs"]) == 0 + + +# ============================================================================ +# Platform-Specific Behavior Tests +# ============================================================================ + +class TestPlatformSpecificBehavior: + """Test platform-specific behaviors and edge cases.""" + + @pytest.mark.unit + @pytest.mark.amd + def test_amd_gpu_renderD_node_detection(self, amd_gpu_context, mock_run_args): + """Test AMD GPU renderD node detection.""" + with patch( + "madengine.orchestration.run_orchestrator.Context", + return_value=amd_gpu_context, + ): + orchestrator = RunOrchestrator(mock_run_args) + orchestrator._init_runtime_context() + + # Verify AMD-specific context + assert orchestrator.context.get_gpu_vendor() == "AMD" + assert orchestrator.context.get_gpu_renderD_nodes() == [ + "renderD128", + "renderD129", + ] + + @pytest.mark.unit + @pytest.mark.nvidia + def test_nvidia_gpu_cuda_detection(self, nvidia_gpu_context, mock_run_args): + """Test NVIDIA GPU CUDA detection.""" + with patch( + "madengine.orchestration.run_orchestrator.Context", + return_value=nvidia_gpu_context, + ): + orchestrator = RunOrchestrator(mock_run_args) + orchestrator._init_runtime_context() + + # Verify NVIDIA-specific context + assert orchestrator.context.get_gpu_vendor() == "NVIDIA" + assert orchestrator.context.get_system_cuda_version() == "12.1" + + @pytest.mark.unit + @pytest.mark.cpu + def test_cpu_only_execution(self, cpu_context, mock_run_args, temp_manifest_file): + """Test CPU-only execution without GPU requirements.""" + with patch( + "madengine.orchestration.run_orchestrator.Context", return_value=cpu_context + ): + with patch("os.path.exists", return_value=True): + with patch( + "builtins.open", + mock_open( + read_data=json.dumps( + { + "built_images": {"ci-model1": {"name": "model1"}}, + "deployment_config": {}, + } + ) + ), + ): + orchestrator = RunOrchestrator(mock_run_args) + + # CPU execution should not require GPU detection + with patch.object( + orchestrator, + "_execute_local", + return_value={ + "successful_runs": [{"model": "model1"}], + "failed_runs": [], + }, + ): + result = orchestrator.execute(manifest_file=temp_manifest_file) + + assert len(result["successful_runs"]) == 1 + # Context is initialized during execute, verify CPU mode + if orchestrator.context: + assert orchestrator.context.get_system_ngpus() == 0 + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"]) + diff --git a/tests/test_contexts.py b/tests/test_contexts.py deleted file mode 100644 index 45ba117f..00000000 --- a/tests/test_contexts.py +++ /dev/null @@ -1,306 +0,0 @@ -"""Test the context module. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import sys -import csv -# third-party modules -import pytest -# project modules -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import global_data -from .fixtures.utils import clean_test_temp_files -from .fixtures.utils import get_gpu_nodeid_map -from .fixtures.utils import get_num_gpus -from .fixtures.utils import get_num_cpus -from madengine.core.context import Context - -class TestContexts: - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_dockerfile_picked_on_detected_context_0(self, global_data, clean_test_temp_files): - """ - picks dockerfile based on detected context and only those - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_ctxtest ") - - success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_ctxtest': - if row['status'] == 'SUCCESS' and row['performance'] == '0': - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("model did not pick correct context.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'ctx_test']], indirect=True) - def test_dockerfile_picked_on_detected_context_1(self, global_data, clean_test_temp_files): - """ - picks dockerfile based on detected context and only those - """ - with open(os.path.join(BASE_DIR, 'ctx_test'), 'w') as ctx_test_file: - print("1", file=ctx_test_file) - - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_ctxtest ") - - success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_ctxtest': - if row['status'] == 'SUCCESS' and row['performance'] == '1': - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("model did not pick correct context.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'ctx_test']], indirect=True) - def test_all_dockerfiles_matching_context_executed(self, global_data, clean_test_temp_files): - """ - All dockerfiles matching context is executed - """ - with open(os.path.join(BASE_DIR, 'ctx_test'), 'w') as ctx_test_file: - print("2", file=ctx_test_file) - - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_ctxtest ") - - foundDockerfiles = [] - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_ctxtest': - if row['status'] == 'SUCCESS' and row['performance'] == '2': - foundDockerfiles.append(row['docker_file'].replace(f'{MODEL_DIR}/', '')) - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not ("docker/dummy_ctxtest.ctx2a.ubuntu.amd.Dockerfile" in foundDockerfiles and - "docker/dummy_ctxtest.ctx2b.ubuntu.amd.Dockerfile" in foundDockerfiles ): - pytest.fail("All dockerfiles matching context is not executed. Executed dockerfiles are " + ' '.join(foundDockerfiles)) - - def test_dockerfile_executed_if_contexts_keys_are_not_common(self): - """ - Dockerfile is executed even if all context keys are not common but common keys match - """ - # already tested in test_dockerfile_picked_on_detected_context_0 - pass - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_can_override_context_with_additionalContext_commandline(self, global_data, clean_test_temp_files): - """ - Context can be overridden through additional-context command-line argument - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_ctxtest --additional-context \"{'ctx_test': '1'}\" ") - - success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_ctxtest': - if row['status'] == 'SUCCESS' and row['performance'] == '1': - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("model did not pick correct context.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'ctx.json']], indirect=True) - def test_can_override_context_with_additionalContextFile_commandline(self, global_data, clean_test_temp_files): - """ - Context can be overridden through additional-context-file - """ - with open(os.path.join(BASE_DIR, 'ctx.json'), 'w') as ctx_json_file: - print("{ \"ctx_test\": \"1\" }", file=ctx_json_file) - - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_ctxtest --additional-context-file ctx.json ") - - success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_ctxtest': - if row['status'] == 'SUCCESS' and row['performance'] == '1': - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("model did not pick correct context.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'ctx.json']], indirect=True) - def test_additionalContext_commandline_overrides_additionalContextFile(self, global_data, clean_test_temp_files): - """ - additional-context command-line argument has priority over additional-context-file - """ - with open(os.path.join(BASE_DIR, 'ctx.json'), 'w') as ctx_json_file: - print("{ \"ctx_test\": \"2\" }", file=ctx_json_file) - - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_ctxtest --additional-context-file ctx.json --additional-context \"{'ctx_test': '1'}\" ") - - success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_ctxtest': - if row['status'] == 'SUCCESS' and row['performance'] == '1': - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("model did not pick correct context.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_base_docker_override(self, global_data, clean_test_temp_files): - """ - BASE_DOCKER overrides base docker - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_ctxtest --additional-context \"{'docker_build_arg':{'BASE_DOCKER':'rocm/tensorflow' }}\" ") - - foundBaseDocker = [] - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_ctxtest': - if row['status'] == 'SUCCESS' and row['performance'] == '0': - foundBaseDocker.append(row['base_docker']) - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not "rocm/tensorflow" in foundBaseDocker: - pytest.fail("BASE_DOCKER does not override base docker. Expected: rocm/tensorflow Found:" + foundBaseDocker) - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_docker_image_override(self, global_data, clean_test_temp_files): - """ - Using user-provided image passed in with MAD_CONTAINER_IMAGE - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_ctxtest --additional-context \"{'docker_env_vars':{'ctxtest':'1'},'MAD_CONTAINER_IMAGE':'rocm/tensorflow:latest' }\" ") - - foundLocalImage = None - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_ctxtest': - if row['status'] == 'SUCCESS' and row['performance'] == '1': - foundLocalImage = row['docker_image'] - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not "rocm/tensorflow:latest" in foundLocalImage: - pytest.fail("MAD_CONTAINER_IMAGE does not override docker image. Expected: rocm/tensorflow:latest Found:" + foundLocalImage) - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_docker_env_vars_override(self, global_data, clean_test_temp_files): - """ - docker_env_vars pass environment variables into docker container - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_ctxtest --additional-context \"{'docker_env_vars':{'ctxtest':'1'} }\" ") - - success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_ctxtest': - if row['status'] == 'SUCCESS' and row['performance'] == '1': - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("docker_env_vars did not pass environment variables into docker container.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_docker_mounts_mount_host_paths_in_docker_container(self, global_data, clean_test_temp_files): - """ - docker_mounts mount host paths inside docker containers - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_mountpath --additional-context \"{'docker_env_vars':{'MAD_DATAHOME':'/data'}, 'docker_mounts':{'/data':'/tmp'} }\" ") - - success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy_mountpath': - if row['status'] == 'SUCCESS': - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("docker_mounts did not mount host paths inside docker container.") - - @pytest.mark.skipif(get_num_gpus() < 8, reason="test requires atleast 8 gpus") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html','results_dummy_gpubind.csv']], indirect=True) - def test_docker_gpus(self, global_data, clean_test_temp_files): - """ - docker_gpus binds gpus to docker containers - """ - - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_gpubind --additional-context \"{'docker_gpus':'0,2-4,5-5,7'}\" ") - - gpu_nodeid_map = get_gpu_nodeid_map() - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - gpu_node_ids = [] - for row in csv_reader: - if 'dummy_gpubind' in row['model']: - if row['status'] == 'SUCCESS': - gpu_node_ids.append(row['performance']) - else: - pytest.fail("model in perf_test.csv did not run successfully.") - - # Debug information - print(f"GPU node IDs from performance: {gpu_node_ids}") - print(f"GPU nodeid map: {gpu_nodeid_map}") - mapped_gpus = [gpu_nodeid_map.get(node_id) for node_id in gpu_node_ids] - print(f"Mapped GPUs: {mapped_gpus}") - - # Filter out None values and sort - valid_mapped_gpus = [gpu for gpu in mapped_gpus if gpu is not None] - sorted_gpus = sorted(valid_mapped_gpus) - print(f"Sorted valid GPUs: {sorted_gpus}") - - if sorted_gpus != [0, 2, 3, 4, 5, 7]: - pytest.fail(f"docker_gpus did not bind expected gpus in docker container. Expected: [0, 2, 3, 4, 5, 7], Got: {sorted_gpus}, Raw node IDs: {gpu_node_ids}, Mapping: {gpu_nodeid_map}") - - @pytest.mark.skipif(get_num_cpus() < 64, reason="test requires atleast 64 cpus") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html','results_dummy_cpubind.csv']], indirect=True) - def test_docker_cpus(self, global_data, clean_test_temp_files): - """ - docker_cpus binds cpus to docker containers - """ - - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_cpubind --additional-context \"{'docker_cpus':'14-18,32,44-44,62'}\" ") - - success = False - with open(os.path.join(BASE_DIR, 'perf.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if 'dummy_cpubind' in row['model']: - if row['status'] == 'SUCCESS' and row['performance']=="14-18|32|44|62": - success = True - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("docker_cpus did not bind expected cpus in docker container.") - - def test_gpu_product_name_matches_arch(self): - """ - Check MAD_SYSTEM_GPU_PRODUCT_NAME is not empty and is valid. - - No models run for this test. - """ - - context = Context() - product_name = context.ctx['docker_env_vars']["MAD_SYSTEM_GPU_PRODUCT_NAME"] - - #fail the test if GPU product name is empty - if not product_name or not product_name.strip(): - pytest.fail("GPU product name is empty or just whitespaces") - - product_name = product_name.upper() - - #if product name has AMD or NVIDIA in it then it's a safe bet - #that it was parsed properly - if not ("AMD" in product_name or "NVIDIA" in product_name): - pytest.fail(f"Incorrect product name={product_name!r}") diff --git a/tests/test_custom_timeouts.py b/tests/test_custom_timeouts.py deleted file mode 100644 index 09ba62ea..00000000 --- a/tests/test_custom_timeouts.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Test the timeouts in MADEngine. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -import pytest -import os -import re -import csv -import time - -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import global_data -from .fixtures.utils import clean_test_temp_files -from .fixtures.utils import is_nvidia - -class TestCustomTimeoutsFunctionality: - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_default_model_timeout_2hrs(self, global_data, clean_test_temp_files): - """ - default model timeout is 2 hrs - This test only checks if the timeout is set; it does not actually time the model. - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy") - - regexp = re.compile(r'Setting timeout to ([0-9]*) seconds.') - foundTimeout = None - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundTimeout = match.groups()[0] - if foundTimeout != '7200': - pytest.fail("default model timeout is not 2 hrs (" + foundTimeout + "s).") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_can_override_timeout_in_model(self, global_data, clean_test_temp_files): - """ - timeout can be overridden in model - This test only checks if the timeout is set; it does not actually time the model. - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_timeout") - - regexp = re.compile(r'Setting timeout to ([0-9]*) seconds.') - foundTimeout = None - with open( os.path.join(BASE_DIR, "dummy_timeout_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundTimeout = match.groups()[0] - if foundTimeout != '360': - pytest.fail("timeout in models.json (360s) could not override actual timeout (" + foundTimeout + "s).") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_can_override_timeout_in_commandline(self, global_data, clean_test_temp_files): - """ - timeout command-line argument overrides default timeout - This test only checks if the timeout is set; it does not actually time the model. - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --timeout 120") - - regexp = re.compile(r'Setting timeout to ([0-9]*) seconds.') - foundTimeout = None - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundTimeout = match.groups()[0] - if foundTimeout != '120': - pytest.fail("timeout command-line argument (120s) could not override actual timeout (" + foundTimeout + "s).") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_commandline_timeout_overrides_model_timeout(self, global_data, clean_test_temp_files): - """ - timeout command-line argument overrides model timeout - This test only checks if the timeout is set; it does not actually time the model. - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_timeout --timeout 120") - - regexp = re.compile(r'Setting timeout to ([0-9]*) seconds.') - foundTimeout = None - with open( os.path.join(BASE_DIR, "dummy_timeout_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundTimeout = match.groups()[0] - if foundTimeout != '120': - pytest.fail("timeout in command-line argument (360s) could not override model.json timeout (" + foundTimeout + "s).") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'run_directory']], indirect=True) - def test_timeout_in_commandline_timesout_correctly(self, global_data, clean_test_temp_files): - """ - timeout command-line argument times model out correctly - """ - start_time = time.time() - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_sleep --timeout 60", canFail = True, timeout = 180) - - test_duration = time.time() - start_time - - assert test_duration == pytest.approx(60, 10) - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'run_directory']], indirect=True) - def test_timeout_in_model_timesout_correctly(self, global_data, clean_test_temp_files): - """ - timeout in models.json times model out correctly - """ - start_time = time.time() - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_sleep", canFail = True, timeout = 180) - - test_duration = time.time() - start_time - - assert test_duration == pytest.approx(120, 20) - - diff --git a/tests/test_debugging.py b/tests/test_debugging.py deleted file mode 100644 index 3eda2ba7..00000000 --- a/tests/test_debugging.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Test the debugging in MADEngine. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -import pytest -import os -import re - -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import global_data -from .fixtures.utils import clean_test_temp_files -from .fixtures.utils import is_nvidia - - -class TestDebuggingFunctionality: - """""" - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'run_directory']], indirect=True) - def test_keepAlive_keeps_docker_alive(self, global_data, clean_test_temp_files): - """ - keep-alive command-line argument keeps the docker container alive - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --keep-alive") - output = global_data['console'].sh("docker ps -aqf 'name=container_dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + "'") - - if not output: - pytest.fail("docker container not found after keep-alive argument.") - - global_data['console'].sh("docker container stop --time=1 container_dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") ) - global_data['console'].sh("docker container rm -f container_dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") ) - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'run_directory']], indirect=True) - def test_no_keepAlive_does_not_keep_docker_alive(self, global_data, clean_test_temp_files): - """ - without keep-alive command-line argument, the docker container is not kept alive - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy") - output = global_data['console'].sh("docker ps -aqf 'name=container_dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + "'") - - if output: - global_data['console'].sh("docker container stop --time=1 container_dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") ) - global_data['console'].sh("docker container rm -f container_dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") ) - pytest.fail("docker container found after not specifying keep-alive argument.") - - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'run_directory']], indirect=True) - def test_keepAlive_preserves_model_dir(self, global_data, clean_test_temp_files): - """ - keep-alive command-line argument will keep model directory after run - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --keep-alive") - - global_data['console'].sh("docker container stop --time=1 container_dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") ) - global_data['console'].sh("docker container rm -f container_dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") ) - if not os.path.exists( os.path.join(BASE_DIR, "run_directory")): - pytest.fail("model directory not left over after keep-alive argument.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'run_directory']], indirect=True) - def test_keepModelDir_keeps_model_dir(self, global_data, clean_test_temp_files): - """ - keep-model-dir command-line argument keeps model directory after run - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --keep-model-dir") - - if not os.path.exists( os.path.join(BASE_DIR, "run_directory")): - pytest.fail("model directory not left over after keep-model-dir argument.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'run_directory']], indirect=True) - def test_no_keepModelDir_does_not_keep_model_dir(self, global_data, clean_test_temp_files): - """ - keep-model-dir command-line argument keeps model directory after run - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy") - - if os.path.exists( os.path.join(BASE_DIR, "run_directory")): - pytest.fail("model directory left over after not specifying keep-model-dir (or keep-alive) argument.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'run_directory']], indirect=True) - def test_skipModelRun_does_not_run_model(self, global_data, clean_test_temp_files): - """ - skip-model-run command-line argument does not run model - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --skip-model-run") - - regexp = re.compile(r'performance: [0-9]* samples_per_second') - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - if regexp.search(line): - pytest.fail("skip-model-run argument ran model.") diff --git a/tests/test_discover.py b/tests/test_discover.py deleted file mode 100644 index d0643985..00000000 --- a/tests/test_discover.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Test the tags feature. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" - -# built-in modules -import os -import csv -import pandas as pd - -# third-party modules -import pytest - -# project modules -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import global_data -from .fixtures.utils import clean_test_temp_files - - -class TestDiscover: - """Test the model discovery feature.""" - - @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True - ) - def test_static(self, global_data, clean_test_temp_files): - """ - test a tag from a models.json file - """ - global_data["console"].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy2/model2 ") - - success = False - with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row["model"] == "dummy2/model2" and row["status"] == "SUCCESS": - success = True - if not success: - pytest.fail("dummy2/model2 did not run successfully.") - - @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True - ) - def test_dynamic(self, global_data, clean_test_temp_files): - """ - test a tag from a get_models_json.py file - """ - global_data["console"].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy3/model4 ") - - success = False - with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row["model"] == "dummy3/model4" and row["status"] == "SUCCESS": - success = True - if not success: - pytest.fail("dummy3/model4 did not run successfully.") - - @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True - ) - def test_additional_args(self, global_data, clean_test_temp_files): - """ - passes additional args specified in the command line to the model - """ - global_data["console"].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy2/model2:batch-size=32 ") - - success = False - with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row["model"] == "dummy2/model2" and row["status"] == "SUCCESS" and "--batch-size 32" in row["args"]: - success = True - if not success: - pytest.fail("dummy2/model2:batch-size=32 did not run successfully.") - - @pytest.mark.parametrize( - "clean_test_temp_files", [["perf.csv", "perf.html"]], indirect=True - ) - def test_multiple(self, global_data, clean_test_temp_files): - """ - test multiple tags from top-level models.json, models.json in a script subdir, and get_models_json.py - """ - global_data["console"].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_test_group_1 dummy_test_group_2 dummy_test_group_3 ") - - success = False - with open(os.path.join(BASE_DIR, "perf.csv"), "r") as csv_file: - csv_reader = pd.read_csv(csv_file) - if len(csv_reader) == 5: - if csv_reader["model"].tolist() == [ - "dummy", - "dummy2/model1", - "dummy2/model2", - "dummy3/model3", - "dummy3/model4", - ]: - if csv_reader["status"].tolist() == [ - "SUCCESS", - "SUCCESS", - "SUCCESS", - "SUCCESS", - "SUCCESS", - ]: - success = True - if not success: - pytest.fail("multiple tags did not run successfully.") \ No newline at end of file diff --git a/tests/test_gpu_renderD_nodes.py b/tests/test_gpu_renderD_nodes.py deleted file mode 100644 index ed99b04e..00000000 --- a/tests/test_gpu_renderD_nodes.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Integration tests for get_gpu_renderD_nodes function. - -These tests run against real hardware to validate the function works correctly -with actual GPU information from the system. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import json -import os -import re -import stat -# third-party modules -import pytest -# project modules -from madengine.core.context import Context -from madengine.core.console import Console - - -def is_amd_gpu(): - """Check if the system has AMD GPUs.""" - try: - console = Console() - vendor = console.sh( - 'bash -c \'if [[ -f /opt/rocm/bin/amd-smi ]]; then echo "AMD"; elif [[ -f /usr/local/bin/amd-smi ]]; then echo "AMD"; else echo "OTHER"; fi || true\'' - ) - return vendor.strip() == "AMD" - except Exception: - return False - - -def is_nvidia_gpu(): - """Check if the system has NVIDIA GPUs.""" - try: - console = Console() - result = console.sh('bash -c \'if [[ -f /usr/bin/nvidia-smi ]] && $(/usr/bin/nvidia-smi > /dev/null 2>&1); then echo "NVIDIA"; else echo "OTHER"; fi || true\'') - return result.strip() == "NVIDIA" - except Exception: - return False - - -class TestGetGpuRenderDNodesIntegration: - """Integration test suite for the get_gpu_renderD_nodes method using real hardware.""" - - @pytest.mark.skipif(is_amd_gpu(), reason="Test requires non-AMD GPU or no GPU") - def test_returns_none_for_non_amd_gpu(self): - """Test that the function returns None for non-AMD GPUs.""" - context = Context() - - # Should return None for non-AMD GPUs - if context.ctx['docker_env_vars']['MAD_GPU_VENDOR'] != 'AMD': - assert context.ctx['gpu_renderDs'] is None - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_returns_list_for_amd_gpu(self): - """Test that the function returns a list of renderD nodes for AMD GPUs.""" - context = Context() - - # Should return a list for AMD GPUs - assert context.ctx['gpu_renderDs'] is not None - assert isinstance(context.ctx['gpu_renderDs'], list) - - # List should not be empty if there are GPUs - if context.ctx['docker_env_vars']['MAD_SYSTEM_NGPUS'] > 0: - assert len(context.ctx['gpu_renderDs']) > 0 - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_renderD_count_matches_gpu_count(self): - """Test that the number of renderD nodes matches the number of GPUs.""" - context = Context() - - # Get GPU count from context (which uses amd-smi list --csv or rocm-smi as fallback) - # This is more reliable than amd-smi list -e --json which only works on ROCm 6.4+ - expected_gpu_count = context.ctx['docker_env_vars']['MAD_SYSTEM_NGPUS'] - - # Skip test if no GPUs detected - if expected_gpu_count == 0: - pytest.skip("No GPUs detected on system") - - # The number of renderD nodes should match the number of GPUs - assert len(context.ctx['gpu_renderDs']) == expected_gpu_count - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_renderD_values_are_valid(self): - """Test that all renderD values are valid integers.""" - context = Context() - - # All renderD values should be positive integers - for renderD in context.ctx['gpu_renderDs']: - assert isinstance(renderD, int) - assert renderD > 0 - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_renderD_nodes_are_unique(self): - """Test that all renderD nodes are unique.""" - context = Context() - - renderDs = context.ctx['gpu_renderDs'] - # All renderD values should be unique - assert len(renderDs) == len(set(renderDs)) - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_renderD_values_match_kfd_properties(self): - """Test that renderD values match what's in KFD properties.""" - console = Console() - context = Context() - - # Get renderD values from KFD directly - try: - kfd_output = console.sh("grep -r drm_render_minor /sys/devices/virtual/kfd/kfd/topology/nodes") - kfd_lines = [line for line in kfd_output.split("\n") if line.strip()] - # Filter out CPU entries (renderD value 0) - kfd_renderDs = [int(line.split()[-1]) for line in kfd_lines if int(line.split()[-1]) != 0] - except Exception: - pytest.skip("Unable to read KFD properties") - - # The renderD values from context should be a subset of KFD renderDs - for renderD in context.ctx['gpu_renderDs']: - assert renderD in kfd_renderDs, f"renderD {renderD} not found in KFD properties" - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_gpu_ordering_is_consistent(self): - """Test that GPU ordering matches amd-smi GPU IDs.""" - console = Console() - context = Context() - - try: - # Get amd-smi data - amd_smi_output = console.sh("amd-smi list -e --json") - gpu_data = json.loads(amd_smi_output) - - # Sort by GPU ID - sorted_gpus = sorted(gpu_data, key=lambda x: x["gpu"]) - - # The number of GPUs should match - assert len(context.ctx['gpu_renderDs']) == len(sorted_gpus) - - except Exception: - pytest.skip("Unable to verify GPU ordering with amd-smi") - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_renderD_nodes_exist_in_dev(self): - """Test that the renderD nodes actually exist in /dev/dri/.""" - context = Context() - - # Check that each renderD node exists as a device file - for renderD in context.ctx['gpu_renderDs']: - dev_path = f"/dev/dri/renderD{renderD}" - assert os.path.exists(dev_path), f"Device {dev_path} does not exist" - # Should be a character device - assert stat.S_ISCHR(os.stat(dev_path).st_mode), f"{dev_path} is not a character device" - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_no_cpu_entries_in_renderDs(self): - """Test that CPU entries (renderD=0) are not included.""" - context = Context() - - # None of the renderD values should be 0 (CPUs) - for renderD in context.ctx['gpu_renderDs']: - assert renderD != 0, "CPU entry (renderD=0) found in GPU renderD list" - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_context_initialization_succeeds(self): - """Test that Context initialization succeeds with real GPU data.""" - # This should not raise any exceptions - context = Context() - - # Basic sanity checks - assert context.ctx is not None - assert 'gpu_renderDs' in context.ctx - assert 'docker_env_vars' in context.ctx - assert 'MAD_GPU_VENDOR' in context.ctx['docker_env_vars'] - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_renderD_mapping_is_reproducible(self): - """Test that creating multiple Context objects produces the same renderD mapping.""" - context1 = Context() - context2 = Context() - - # The renderD lists should be identical - assert context1.ctx['gpu_renderDs'] == context2.ctx['gpu_renderDs'] - - @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") - def test_renderD_values_are_in_valid_range(self): - """Test that renderD values are in the valid Linux device range.""" - context = Context() - - # renderD values typically start at 128 and go up - # Valid range is 128-255 for render nodes - for renderD in context.ctx['gpu_renderDs']: - assert 128 <= renderD <= 255, f"renderD {renderD} is outside valid range [128, 255]" diff --git a/tests/test_live_output.py b/tests/test_live_output.py deleted file mode 100644 index 76a0c4f4..00000000 --- a/tests/test_live_output.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Test the functionality of live output in MADEngine. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import re -import pytest -# project modules -from .fixtures.utils import global_data -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import clean_test_temp_files - - -class TestLiveOutputFunctionality: - """Test the live output functionality.""" - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_default_silent_run(self, global_data, clean_test_temp_files): - """ - default run is silent - """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy") - - regexp = re.compile(r'performance: [0-9]* samples_per_second') - if regexp.search(output): - pytest.fail("default run is not silent") - - if "ARG BASE_DOCKER=" in output: - pytest.fail("default run is not silent") - - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_liveOutput_prints_output_to_screen(self, global_data, clean_test_temp_files): - """ - live_output prints output to screen - """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --live-output") - - regexp = re.compile(r'performance: [0-9]* samples_per_second') - if not regexp.search(output): - pytest.fail("default run is silent") - - if "ARG BASE_DOCKER=" not in output: - pytest.fail("default run is silent") diff --git a/tests/test_mad.py b/tests/test_mad.py deleted file mode 100644 index 055eb212..00000000 --- a/tests/test_mad.py +++ /dev/null @@ -1,67 +0,0 @@ -"""Test the mad module. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import sys -import subprocess -import typing -# third-party modules -import pytest -# project modules -from madengine import mad - - -class TestMad: - """Test the mad module. - - test_run_model: run python3 mad.py --help - """ - def test_mad_cli(self): - # Construct the path to the script - script_path = os.path.join(os.path.dirname(__file__), "../src/madengine", "mad.py") - # Run the script with arguments using subprocess.run - result = subprocess.run([sys.executable, script_path, "--help"], stdout=subprocess.PIPE) - print(result.stdout.decode("utf-8")) - assert result.returncode == 0 - - def test_mad_run_cli(self): - # Construct the path to the script - script_path = os.path.join(os.path.dirname(__file__), "../src/madengine", "mad.py") - # Run the script with arguments using subprocess.run - result = subprocess.run([sys.executable, script_path, "run", "--help"], stdout=subprocess.PIPE) - print(result.stdout.decode("utf-8")) - assert result.returncode == 0 - - def test_mad_report_cli(self): - # Construct the path to the script - script_path = os.path.join(os.path.dirname(__file__), "../src/madengine", "mad.py") - # Run the script with arguments using subprocess.run - result = subprocess.run([sys.executable, script_path, "report", "--help"], stdout=subprocess.PIPE) - print(result.stdout.decode("utf-8")) - assert result.returncode == 0 - - def test_mad_database_cli(self): - # Construct the path to the script - script_path = os.path.join(os.path.dirname(__file__), "../src/madengine", "mad.py") - # Run the script with arguments using subprocess.run - result = subprocess.run([sys.executable, script_path, "database", "--help"], stdout=subprocess.PIPE) - print(result.stdout.decode("utf-8")) - assert result.returncode == 0 - - def test_mad_discover_cli(self): - # Construct the path to the script - script_path = os.path.join(os.path.dirname(__file__), "../src/madengine", "mad.py") - # Run the script with arguments using subprocess.run - result = subprocess.run([sys.executable, script_path, "discover", "--help"], stdout=subprocess.PIPE) - print(result.stdout.decode("utf-8")) - assert result.returncode == 0 - - def test_mad_version_cli(self): - # Construct the path to the script - script_path = os.path.join(os.path.dirname(__file__), "../src/madengine", "mad.py") - # Run the script with arguments using subprocess.run - result = subprocess.run([sys.executable, script_path, "--version"], stdout=subprocess.PIPE) - print(result.stdout.decode("utf-8")) - assert result.returncode == 0 diff --git a/tests/test_misc.py b/tests/test_misc.py deleted file mode 100644 index 11a6fa81..00000000 --- a/tests/test_misc.py +++ /dev/null @@ -1,86 +0,0 @@ -"""Test the misc modules. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import sys -import csv -import pandas as pd -# 3rd party modules -import pytest -# project modules -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import global_data -from .fixtures.utils import clean_test_temp_files - - -class TestMiscFunctionality: - - @pytest.mark.parametrize('clean_test_temp_files', [['perf_test.csv', 'perf_test.html']], indirect=True) - def test_output_commandline_argument_writes_csv_correctly(self, global_data, clean_test_temp_files): - """ - output command-line argument writes csv file to specified output path - """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy -o perf_test.csv") - success = False - with open(os.path.join(BASE_DIR, 'perf_test.csv'), 'r') as csv_file: - csv_reader = csv.DictReader(csv_file) - for row in csv_reader: - if row['model'] == 'dummy': - if row['status'] == 'SUCCESS': - success = True - break - else: - pytest.fail("model in perf_test.csv did not run successfully.") - if not success: - pytest.fail("model, dummy, not found in perf_test.csv.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf_test.csv', 'perf_test.html']], indirect=True) - def test_commandline_argument_skip_gpu_arch(self, global_data, clean_test_temp_files): - """ - skip_gpu_arch command-line argument skips GPU architecture check - """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_skip_gpu_arch") - if 'Skipping model' not in output: - pytest.fail("Enable skipping gpu arch for running model is failed.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf_test.csv', 'perf_test.html']], indirect=True) - def test_commandline_argument_disable_skip_gpu_arch_fail(self, global_data, clean_test_temp_files): - """ - skip_gpu_arch command-line argument fails GPU architecture check - """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_skip_gpu_arch --disable-skip-gpu-arch") - # Check if exception with message 'Skipping model' is thrown - if 'Skipping model' in output: - pytest.fail("Disable skipping gpu arch for running model is failed.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf_test.csv', 'perf_test.html']], indirect=True) - def test_output_multi_results(self, global_data, clean_test_temp_files): - """ - test output multiple results - """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_multi") - # Check if multiple results are written to perf_dummy.csv - success = False - # Read the csv file to a dataframe using pandas - multi_df = pd.read_csv(os.path.join(BASE_DIR, 'perf_dummy.csv')) - # Check the number of rows in the dataframe is 4, and columns is 4 - if multi_df.shape == (4, 4): - success = True - if not success: - pytest.fail("The generated multi results is not correct.") - # Check if multiple results from perf_dummy.csv get copied over to perf.csv - perf_df = pd.read_csv(os.path.join(BASE_DIR, 'perf.csv')) - # Get the corresponding rows and columns from perf.csv - perf_df = perf_df[multi_df.columns] - perf_df = perf_df.iloc[-4:, :] - # Drop model columns from both dataframes; these will not match - # if multiple results csv has {model}, then perf csv has {tag_name}_{model} - multi_df = multi_df.drop('model', axis=1) - perf_df = perf_df.drop('model', axis=1) - if all(perf_df.columns == multi_df.columns): - success = True - if not success: - pytest.fail("The columns of the generated multi results do not match perf.csv.") - diff --git a/tests/test_pre_post_scripts.py b/tests/test_pre_post_scripts.py deleted file mode 100644 index 50d64b30..00000000 --- a/tests/test_pre_post_scripts.py +++ /dev/null @@ -1,182 +0,0 @@ -"""Test the scripts for pre and post processing. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import re -import csv -import time -# 3rd party modules -import pytest -# project modules -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import global_data -from .fixtures.utils import clean_test_temp_files -from .fixtures.utils import is_nvidia - - -class TestPrePostScriptsFunctionality: - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_pre_scripts_run_before_model(self, global_data, clean_test_temp_files): - """ - pre_scripts are run in docker container before model execution - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --additional-context \"{ 'pre_scripts':[{'path':'scripts/common/pre_scripts/pre_test.sh'}] }\" ") - - regexp = re.compile(r'Pre-Script test called ([0-9]*)') - foundLine = None - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundLine = match.groups()[0] - if foundLine != '0': - pytest.fail("pre_scripts specification did not run the selected pre-script.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_post_scripts_run_after_model(self, global_data, clean_test_temp_files): - """ - post_scripts are run in docker container after model execution - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --additional-context \"{ 'post_scripts':[{'path':'scripts/common/post_scripts/post_test.sh'}] }\" ") - - regexp = re.compile(r'Post-Script test called ([0-9]*)') - foundLine = None - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundLine = match.groups()[0] - if foundLine != '0': - pytest.fail("post_scripts specification did not run the selected post-script.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_pre_scripts_accept_arguments(self, global_data, clean_test_temp_files): - """ - pre_scripts are run in docker container before model execution and accept arguments - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --additional-context \"{ 'pre_scripts':[{'path':'scripts/common/pre_scripts/pre_test.sh', 'args':'1'}] }\" ") - - regexp = re.compile(r'Pre-Script test called ([0-9]*)') - foundLine = None - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundLine = match.groups()[0] - if foundLine != '1': - pytest.fail("pre_scripts specification did not run the selected pre-script.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_post_scripts_accept_arguments(self, global_data, clean_test_temp_files): - """ - post_scripts are run in docker container after model execution and accept arguments - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --additional-context \"{ 'post_scripts':[{'path':'scripts/common/post_scripts/post_test.sh', 'args':'1'}] }\" ") - - regexp = re.compile(r'Post-Script test called ([0-9]*)') - foundLine = None - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundLine = match.groups()[0] - if foundLine != '1': - pytest.fail("post_scripts specification did not run the selected post-script.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_both_pre_and_post_scripts_run_before_and_after_model(self, global_data, clean_test_temp_files): - """ - post_scripts are run in docker container after model execution - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --additional-context \"{ 'pre_scripts':[{'path':'scripts/common/pre_scripts/pre_test.sh'}], 'post_scripts':[{'path':'scripts/common/post_scripts/post_test.sh'}] }\" ") - - regexp = re.compile(r'Pre-Script test called ([0-9]*)') - foundLine = None - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundLine = match.groups()[0] - if foundLine != '0': - pytest.fail("pre_scripts specification did not run the selected pre-script.") - - regexp = re.compile(r'Post-Script test called ([0-9]*)') - foundLine = None - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundLine = match.groups()[0] - if foundLine != '0': - pytest.fail("post_scripts specification did not run the selected post-script.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_all_pre_scripts_run_in_order(self, global_data, clean_test_temp_files): - """ - all pre_scripts are run in order - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --additional-context \"{ 'pre_scripts':[{'path':'scripts/common/pre_scripts/pre_test.sh', 'args':'1'}, {'path':'scripts/common/pre_scripts/pre_test.sh', 'args':'2'} ] }\" ") - - regexp = re.compile(r'Pre-Script test called ([0-9]*)') - foundLine = None - pre_post_script_count = 0 - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundLine = match.groups()[0] - pre_post_script_count += 1 - if foundLine != str(pre_post_script_count): - pytest.fail("pre_scripts run in order. Did not find " + str(pre_post_script_count) ) - - if foundLine != '2': - pytest.fail("pre_scripts specification did not run the selected pre-script.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_all_post_scripts_run_in_order(self, global_data, clean_test_temp_files): - """ - all post_scripts are run in order - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --additional-context \"{ 'post_scripts':[{'path':'scripts/common/post_scripts/post_test.sh', 'args':'1'}, {'path':'scripts/common/post_scripts/post_test.sh', 'args':'2'} ] }\" ") - - regexp = re.compile(r'Post-Script test called ([0-9]*)') - foundLine = None - pre_post_script_count = 0 - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundLine = match.groups()[0] - pre_post_script_count += 1 - if foundLine != str(pre_post_script_count): - pytest.fail("post_scripts run in order. Did not find " + str(pre_post_script_count) ) - - if foundLine != '2': - pytest.fail("post_scripts specification did not run the selected post-script.") diff --git a/tests/test_profiling.py b/tests/test_profiling.py deleted file mode 100644 index 85aca389..00000000 --- a/tests/test_profiling.py +++ /dev/null @@ -1,217 +0,0 @@ -"""Test the data provider module. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -# built-in modules -import os -import re -import sys -import csv -# third-party modules -import pytest -# project modules -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import global_data -from .fixtures.utils import clean_test_temp_files -from .fixtures.utils import is_nvidia - - -class TestProfilingFunctionality: - - @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'rocprof_output']], indirect=True) - def test_rocprof_profiling_tool_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - # canFail is set to True because rocProf mode is failing the full DLM run; this test will test if the correct output files are generated - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_prof --additional-context \"{ 'tools': [{ 'name': 'rocprof' }] }\" ", canFail=True) - - if not os.path.exists( os.path.join(BASE_DIR, "rocprof_output", "results.csv") ): - pytest.fail("rocprof_output/results.csv not generated with rocprof profiling run.") - - @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'rpd_output']], indirect=True) - def test_rpd_profiling_tool_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - # canFail is set to True because rpd mode is failing the full DLM run; this test will test if the correct output files are generated - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_prof --additional-context \"{ 'tools': [{ 'name': 'rpd' }] }\" ", canFail=True) - - if not os.path.exists( os.path.join(BASE_DIR, "rpd_output", "trace.rpd") ): - pytest.fail("rpd_output/trace.rpd not generated with rpd profiling run.") - - @pytest.mark.skip(reason="Skipping this test for debugging purposes") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'gpu_info_power_profiler_output.csv']], indirect=True) - def test_gpu_info_power_profiling_tool_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_prof --additional-context \"{ 'tools': [{ 'name': 'gpu_info_power_profiler' }] }\" ", canFail=False) - - if not os.path.exists( os.path.join(BASE_DIR, "gpu_info_power_profiler_output.csv") ): - pytest.fail("gpu_info_power_profiler_output.csv not generated with gpu_info_power_profiler run.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'gpu_info_vram_profiler_output.csv']], indirect=True) - def test_gpu_info_vram_profiling_tool_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_prof --additional-context \"{ 'tools': [{ 'name': 'gpu_info_vram_profiler' }] }\" ", canFail=False) - - if not os.path.exists( os.path.join(BASE_DIR, "gpu_info_vram_profiler_output.csv") ): - pytest.fail("gpu_info_vram_profiler_output.csv not generated with gpu_info_vram_profiler run.") - - @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'library_trace.csv']], indirect=True) - def test_rocblas_trace_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_prof --additional-context \"{ 'tools': [{ 'name': 'rocblas_trace' }] }\" ", canFail=False) - - regexp = re.compile(r'rocblas-bench') - foundMatch = None - with open( os.path.join(BASE_DIR, "library_trace.csv" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundMatch = True - if not foundMatch: - pytest.fail("could not detect rocblas-bench in output log file with rocblas trace tool.") - - @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'library_trace.csv']], indirect=True) - def test_tensile_trace_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_prof --additional-context \"{ 'tools': [{ 'name': 'tensile_trace' }] }\" ", canFail=False) - - regexp = re.compile(r'tensile,Cijk') - foundMatch = None - with open( os.path.join(BASE_DIR, "library_trace.csv" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundMatch = True - if not foundMatch: - pytest.fail("could not detect tensile call in output log file with tensile trace tool.") - - @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'library_trace.csv']], indirect=True) - def test_miopen_trace_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_prof --additional-context \"{ 'tools': [{ 'name': 'miopen_trace' }] }\" ", canFail=False) - - regexp = re.compile(r'MIOpenDriver') - foundMatch = None - with open( os.path.join(BASE_DIR, "library_trace.csv" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundMatch = True - if not foundMatch: - pytest.fail("could not detect miopen call in output log file with miopen trace tool.") - - @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_rccl_trace_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_prof_rccl --additional-context \"{ 'tools': [{ 'name': 'rccl_trace' }] }\" ", canFail=False) - - regexp = re.compile(r'NCCL INFO AllReduce:') - foundMatch = None - with open( os.path.join(BASE_DIR, "dummy_prof_rccl_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - foundMatch = True - if not foundMatch: - pytest.fail("could not detect rccl call in output log file with rccl trace tool.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_toolA_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --additional-context \"{ 'tools': [{ 'name': 'test_tools_A' }] }\" ", canFail=False) - - match_str_array = ['^pre_script A$', '^cmd_A$', '^post_script A$'] - - match_str_idx = 0 - regexp = re.compile(match_str_array[match_str_idx]) - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - print("MATCH = ", line ) - match_str_idx = match_str_idx + 1 - if match_str_idx == len(match_str_array): - break - regexp = re.compile(match_str_array[match_str_idx]) - if match_str_idx != len(match_str_array): - print("Matched up to ", match_str_idx) - pytest.fail("all strings were not matched in toolA test.") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_stackable_design_runs_correctly(self, global_data, clean_test_temp_files): - """ - specifying a profiling tool runs respective pre and post scripts - """ - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy --additional-context \"{ 'tools': [{ 'name': 'test_tools_A' }, { 'name': 'test_tools_B' } ] }\" ", canFail=False) - - match_str_array = [ '^pre_script B$', '^pre_script A$', '^cmd_B$', '^cmd_A$', '^post_script A$', '^post_script B$'] - - match_str_idx = 0 - regexp = re.compile(match_str_array[match_str_idx]) - with open( os.path.join(BASE_DIR, "dummy_dummy.ubuntu." + ("amd" if not is_nvidia() else "nvidia") + ".live.log" ), 'r') as f: - while True: - line = f.readline() - if not line: - break - match = regexp.search(line) - if match: - print("MATCH = ", line ) - match_str_idx = match_str_idx + 1 - if match_str_idx == len(match_str_array): - break - regexp = re.compile(match_str_array[match_str_idx]) - if match_str_idx != len(match_str_array): - print("Matched up to ", match_str_idx) - pytest.fail("all strings were not matched in the stacked test using toolA and toolB.") - - - @pytest.mark.skipif(is_nvidia(), reason="test does not run on NVIDIA") - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html', 'rocprof_output']], indirect=True) - def test_can_change_default_behavior_of_profiling_tool_with_additionalContext(self, global_data, clean_test_temp_files): - """ - default behavior of a profiling tool can be changed from additional-context - """ - # canFail is set to True because rocProf is failing; this test will test if the correct output files are generated - global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_prof --additional-context \"{ 'tools': [{ 'name': 'rocprof', 'cmd': 'rocprof --hsa-trace' }] }\" ", canFail=True) - - if not os.path.exists( os.path.join(BASE_DIR, "rocprof_output", "results.hsa_stats.csv") ): - pytest.fail("rocprof_output/results.hsa_stats.csv not generated with rocprof --hsa-trace profiling run.") - - diff --git a/tests/test_tags.py b/tests/test_tags.py deleted file mode 100644 index 39eecaf3..00000000 --- a/tests/test_tags.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" -import pytest -import os -import sys -import json - -from .fixtures.utils import BASE_DIR, MODEL_DIR -from .fixtures.utils import global_data -from .fixtures.utils import clean_test_temp_files - -class TestTagsFunctionality: - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_can_select_model_subset_with_commandline_tag_argument(self, global_data, clean_test_temp_files): - """ - can select subset of models with tag with command-line argument - """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_group_1") - - if "Running model dummy" not in output: - pytest.fail("dummy tag not selected with commandline --tags argument") - - if "Running model dummy2" not in output: - pytest.fail("dummy2 tag not selected with commandline --tags argument") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_all_models_matching_any_tag_selected_with_multiple_tags(self, global_data, clean_test_temp_files): - """ - if multiple tags are specified, all models that match any tag will be selected - """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_group_1 dummy_group_2") - - if "Running model dummy" not in output: - pytest.fail("dummy tag not selected with commandline --tags argument") - - if "Running model dummy2" not in output: - pytest.fail("dummy2 tag not selected with commandline --tags argument") - - if "Running model dummy3" not in output: - pytest.fail("dummy3 tag not selected with commandline --tags argument") - - @pytest.mark.parametrize('clean_test_temp_files', [['perf.csv', 'perf.html']], indirect=True) - def test_model_names_are_automatically_tags(self, global_data, clean_test_temp_files): - """ - Each model name is automatically a tag - """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy") - - if "Running model dummy" not in output: - pytest.fail("dummy tag not selected with commandline --tags argument") - diff --git a/src/madengine/db/__init__.py b/tests/unit/__init__.py similarity index 100% rename from src/madengine/db/__init__.py rename to tests/unit/__init__.py diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py new file mode 100644 index 00000000..b76b81af --- /dev/null +++ b/tests/unit/test_cli.py @@ -0,0 +1,394 @@ +"""Test the CLI module. + +This module tests the modern Typer-based command-line interface functionality +including utilities, validation, and argument processing. + +GPU Hardware Support: +- Tests automatically detect if the machine has GPU hardware +- GPU-dependent tests are skipped on CPU-only machines using @requires_gpu decorator +- Tests use auto-generated additional context appropriate for the current machine +- CPU-only machines default to AMD GPU vendor for build compatibility + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in modules +import json +import os +import sys +import tempfile +import unittest.mock +from pathlib import Path +from unittest.mock import MagicMock, Mock, patch, mock_open + +# third-party modules +import pytest +import typer +from typer.testing import CliRunner + +# project modules +from madengine.cli import ( + app, + setup_logging, + create_args_namespace, + validate_additional_context, + save_summary_with_feedback, + display_results_table, + ExitCode, + VALID_GPU_VENDORS, + VALID_GUEST_OS, + DEFAULT_MANIFEST_FILE, + DEFAULT_PERF_OUTPUT, + DEFAULT_DATA_CONFIG, + DEFAULT_TOOLS_CONFIG, + DEFAULT_TIMEOUT, +) +from tests.fixtures.utils import ( + BASE_DIR, + MODEL_DIR, + has_gpu, + requires_gpu, + generate_additional_context_for_machine, +) + + +# ============================================================================ +# CLI Utilities Tests +# ============================================================================ + +class TestSetupLogging: + """Test the setup_logging function.""" + + @patch("madengine.cli.utils.logging.basicConfig") + def test_setup_logging_verbose(self, mock_basic_config): + """Test logging setup with verbose mode enabled.""" + setup_logging(verbose=True) + + mock_basic_config.assert_called_once() + call_args = mock_basic_config.call_args + assert call_args[1]["level"] == 10 # logging.DEBUG + + @patch("madengine.cli.utils.logging.basicConfig") + def test_setup_logging_normal(self, mock_basic_config): + """Test logging setup with normal mode.""" + setup_logging(verbose=False) + + mock_basic_config.assert_called_once() + call_args = mock_basic_config.call_args + assert call_args[1]["level"] == 20 # logging.INFO + + +class TestCreateArgsNamespace: + """Test the create_args_namespace function.""" + + def test_create_args_namespace_basic(self): + """Test creating args namespace with basic parameters.""" + args = create_args_namespace( + tags=["dummy"], registry="localhost:5000", verbose=True + ) + + assert args.tags == ["dummy"] + assert args.registry == "localhost:5000" + assert args.verbose is True + + def test_create_args_namespace_complex(self): + """Test creating args namespace with complex parameters.""" + args = create_args_namespace( + tags=["model1", "model2"], + additional_context='{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}', + timeout=300, + keep_alive=True, + verbose=False, + ) + + assert args.tags == ["model1", "model2"] + assert args.additional_context == '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + assert args.timeout == 300 + assert args.keep_alive is True + assert args.verbose is False + + +class TestSaveSummaryWithFeedback: + """Test the save_summary_with_feedback function.""" + + def test_save_summary_success(self): + """Test successful summary saving.""" + summary = {"successful_builds": ["model1", "model2"], "failed_builds": []} + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + temp_file = f.name + + try: + with patch("madengine.cli.utils.console") as mock_console: + save_summary_with_feedback(summary, temp_file, "Build") + + # Verify file was written + with open(temp_file, "r") as f: + saved_data = json.load(f) + assert saved_data == summary + + mock_console.print.assert_called() + finally: + os.unlink(temp_file) + + def test_save_summary_io_error(self): + """Test summary saving with IO error.""" + summary = {"successful_builds": ["model1"], "failed_builds": []} + + with patch("madengine.cli.utils.console") as mock_console: + with pytest.raises(typer.Exit) as exc_info: + save_summary_with_feedback(summary, "/invalid/path/file.json", "Build") + + assert exc_info.value.exit_code == ExitCode.FAILURE + mock_console.print.assert_called() + + +class TestDisplayResultsTable: + """Test the display_results_table function.""" + + def test_display_results_table_build_success(self): + """Test displaying build results table with successes.""" + summary = {"successful_builds": ["model1", "model2"], "failed_builds": []} + + with patch("madengine.cli.utils.console") as mock_console: + display_results_table(summary, "Build Results") + + mock_console.print.assert_called() + + def test_display_results_table_build_failures(self): + """Test displaying build results table with failures.""" + summary = { + "successful_builds": ["model1"], + "failed_builds": ["model2", "model3"], + } + + with patch("madengine.cli.utils.console") as mock_console: + display_results_table(summary, "Build Results") + + mock_console.print.assert_called() + + def test_display_results_table_run_results(self): + """Test displaying run results table.""" + summary = { + "successful_runs": [ + {"model": "model1", "status": "success"}, + {"model": "model2", "status": "success"}, + ], + "failed_runs": [{"model": "model3", "status": "failed"}], + } + + with patch("madengine.cli.utils.console") as mock_console: + display_results_table(summary, "Run Results") + + mock_console.print.assert_called() + + +# ============================================================================ +# CLI Validation Tests +# ============================================================================ + +class TestValidateAdditionalContext: + """Test the validate_additional_context function.""" + + def test_validate_additional_context_valid_string(self): + """Test validation with valid additional context from string.""" + # Use auto-generated context for current machine + context = generate_additional_context_for_machine() + context_json = json.dumps(context) + + with patch("madengine.cli.validators.console") as mock_console: + result = validate_additional_context(context_json) + + assert result == context + mock_console.print.assert_called() + + def test_validate_additional_context_valid_file(self): + """Test validation with valid additional context from file.""" + # Use auto-generated context for current machine + context = generate_additional_context_for_machine() + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(context, f) + temp_file = f.name + + try: + with patch("madengine.cli.validators.console") as mock_console: + result = validate_additional_context("{}", temp_file) + + assert result == context + mock_console.print.assert_called() + finally: + os.unlink(temp_file) + + def test_validate_additional_context_invalid_json(self): + """Test validation with invalid JSON.""" + with patch("madengine.cli.validators.console") as mock_console: + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context("invalid json") + + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + mock_console.print.assert_called() + + def test_validate_additional_context_missing_required_fields(self): + """Test validation with missing required fields.""" + with patch("madengine.cli.validators.console") as mock_console: + # Missing gpu_vendor + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context('{"guest_os": "UBUNTU"}') + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + # Missing guest_os + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context('{"gpu_vendor": "AMD"}') + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + def test_validate_additional_context_invalid_values(self): + """Test validation with invalid field values.""" + with patch("madengine.cli.validators.console") as mock_console: + # Invalid gpu_vendor + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context( + '{"gpu_vendor": "INVALID", "guest_os": "UBUNTU"}' + ) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + # Invalid guest_os + with pytest.raises(typer.Exit) as exc_info: + validate_additional_context( + '{"gpu_vendor": "AMD", "guest_os": "INVALID"}' + ) + assert exc_info.value.exit_code == ExitCode.INVALID_ARGS + + +class TestProcessBatchManifest: + """Test the process_batch_manifest function.""" + + def test_process_batch_manifest_valid_mixed_build_new(self): + """Test processing batch manifest with mixed build_new values - core functionality.""" + from madengine.cli.validators import process_batch_manifest + + batch_data = [ + {"model_name": "model1", "build_new": True}, + {"model_name": "model2", "build_new": False}, + {"model_name": "model3", "build_new": True}, + ] + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(batch_data, f) + temp_file = f.name + + try: + result = process_batch_manifest(temp_file) + + # Only models with build_new=True should be in build_tags + assert result["build_tags"] == ["model1", "model3"] + # All models should be in all_tags + assert result["all_tags"] == ["model1", "model2", "model3"] + assert len(result["manifest_data"]) == 3 + finally: + os.unlink(temp_file) + + def test_process_batch_manifest_default_build_new_false(self): + """Test that build_new defaults to false when not specified.""" + from madengine.cli.validators import process_batch_manifest + + batch_data = [ + {"model_name": "model1"}, # No build_new field + {"model_name": "model2", "build_new": True}, + ] + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(batch_data, f) + temp_file = f.name + + try: + result = process_batch_manifest(temp_file) + + # model1 should not be in build_tags (defaults to false) + assert result["build_tags"] == ["model2"] + assert result["all_tags"] == ["model1", "model2"] + finally: + os.unlink(temp_file) + + def test_process_batch_manifest_with_registry_fields(self): + """Test per-model registry override - key feature.""" + from madengine.cli.validators import process_batch_manifest + + batch_data = [ + { + "model_name": "model1", + "build_new": True, + "registry": "docker.io/myorg", + "registry_image": "myorg/model1" + }, + { + "model_name": "model2", + "build_new": True, + "registry": "gcr.io/myproject" + }, + ] + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(batch_data, f) + temp_file = f.name + + try: + result = process_batch_manifest(temp_file) + + # Verify registry metadata is preserved + assert result["manifest_data"][0]["registry"] == "docker.io/myorg" + assert result["manifest_data"][0]["registry_image"] == "myorg/model1" + assert result["manifest_data"][1]["registry"] == "gcr.io/myproject" + finally: + os.unlink(temp_file) + + def test_process_batch_manifest_error_handling(self): + """Test error handling for various invalid inputs.""" + from madengine.cli.validators import process_batch_manifest + + # File not found + with pytest.raises(FileNotFoundError) as exc_info: + process_batch_manifest("non_existent_file.json") + assert "Batch manifest file not found" in str(exc_info.value) + + # Invalid JSON + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + f.write("invalid json content{") + temp_file = f.name + + try: + with pytest.raises(ValueError) as exc_info: + process_batch_manifest(temp_file) + assert "Invalid JSON" in str(exc_info.value) + finally: + os.unlink(temp_file) + + def test_process_batch_manifest_validation(self): + """Test validation rules for batch manifest.""" + from madengine.cli.validators import process_batch_manifest + + # Not a list + batch_data = {"model_name": "model1", "build_new": True} + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(batch_data, f) + temp_file = f.name + + try: + with pytest.raises(ValueError) as exc_info: + process_batch_manifest(temp_file) + assert "must be a list" in str(exc_info.value) + finally: + os.unlink(temp_file) + + # Missing model_name + batch_data = [{"build_new": True}] + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(batch_data, f) + temp_file = f.name + + try: + with pytest.raises(ValueError) as exc_info: + process_batch_manifest(temp_file) + assert "missing required 'model_name' field" in str(exc_info.value) + finally: + os.unlink(temp_file) diff --git a/tests/unit/test_config_loader.py b/tests/unit/test_config_loader.py new file mode 100644 index 00000000..9e155466 --- /dev/null +++ b/tests/unit/test_config_loader.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 +""" +Unit tests for ConfigLoader. + +Tests the configuration loader's ability to: +1. Apply proper defaults for minimal configs +2. Preserve full configs unchanged +3. Handle override behavior correctly +4. Auto-infer deployment types +5. Detect configuration conflicts + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import pytest +from pathlib import Path + +from madengine.deployment.config_loader import ConfigLoader + + +# Helper function to get project root +def get_project_root(): + """Get the project root directory.""" + return Path(__file__).parent.parent.parent + + +# Helper function to check if config file exists +def config_exists(relative_path): + """Check if a config file exists.""" + full_path = get_project_root() / relative_path + return full_path.exists() + + +# Helper function to load config file +def load_config_file(relative_path): + """Load a config file if it exists.""" + full_path = get_project_root() / relative_path + if not full_path.exists(): + pytest.skip(f"Config file not found: {relative_path}") + + with open(full_path) as f: + return json.load(f) + + +class TestConfigLoaderBasics: + """Test basic ConfigLoader functionality.""" + + def test_minimal_single_gpu(self): + """Test minimal single GPU config gets proper defaults.""" + user_config = { + "k8s": { + "gpu_count": 1 + } + } + + result = ConfigLoader.load_k8s_config(user_config) + + # Validate defaults applied + assert result["k8s"]["gpu_count"] == 1 + assert result["k8s"]["memory"] == "16Gi" + assert result["k8s"]["cpu"] == "8" + assert result["k8s"]["namespace"] == "default" + assert result["gpu_vendor"] == "AMD" + assert "OMP_NUM_THREADS" in result["env_vars"] + + def test_minimal_multi_gpu(self): + """Test minimal multi-GPU config gets proper defaults.""" + user_config = { + "k8s": { + "gpu_count": 2 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 2 + } + } + + result = ConfigLoader.load_k8s_config(user_config) + + # Validate multi-GPU defaults + assert result["k8s"]["gpu_count"] == 2 + assert result["k8s"]["memory"] == "64Gi" + assert result["k8s"]["cpu"] == "16" + assert "NCCL_DEBUG" in result["env_vars"] + assert result["env_vars"]["NCCL_DEBUG"] == "WARN" + assert "MIOPEN_FIND_MODE" in result["env_vars"] + assert result["distributed"]["backend"] == "nccl" + + def test_minimal_multi_node(self): + """Test minimal multi-node config gets proper defaults.""" + user_config = { + "k8s": { + "gpu_count": 2 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 2, + "nproc_per_node": 2 + } + } + + result = ConfigLoader.load_k8s_config(user_config) + + # Validate multi-node defaults + assert result["k8s"]["host_ipc"] == True + assert "NCCL_DEBUG_SUBSYS" in result["env_vars"] + assert "NCCL_TIMEOUT" in result["env_vars"] + + def test_nvidia_config(self): + """Test NVIDIA GPU config gets proper defaults.""" + user_config = { + "gpu_vendor": "NVIDIA", + "k8s": { + "gpu_count": 4 + }, + "distributed": { + "launcher": "torchrun", + "nnodes": 1, + "nproc_per_node": 4 + } + } + + result = ConfigLoader.load_k8s_config(user_config) + + # Validate NVIDIA defaults + assert result["k8s"]["gpu_resource_name"] == "nvidia.com/gpu" + assert "NCCL_P2P_DISABLE" in result["env_vars"] + assert result["env_vars"]["OMP_NUM_THREADS"] == "12" + + def test_override_behavior(self): + """Test that user overrides work correctly.""" + user_config = { + "k8s": { + "gpu_count": 1, + "namespace": "custom-namespace", + "memory": "32Gi" # Override default 16Gi + }, + "env_vars": { + "CUSTOM_VAR": "custom_value" + } + } + + result = ConfigLoader.load_k8s_config(user_config) + + # Validate overrides + assert result["k8s"]["namespace"] == "custom-namespace" + assert result["k8s"]["memory"] == "32Gi" # Overridden + assert result["k8s"]["cpu"] == "8" # Still has default + assert "CUSTOM_VAR" in result["env_vars"] + assert "OMP_NUM_THREADS" in result["env_vars"] # Default still there + + +class TestConfigLoaderK8sConfigs: + """Test with actual K8s config files (if they exist).""" + + @pytest.mark.skipif( + not config_exists("examples/k8s-configs/basic/01-native-single-node-single-gpu.json"), + reason="K8s config file not found" + ) + def test_k8s_single_gpu_config(self): + """Test K8s single GPU config file.""" + user_config = load_config_file("examples/k8s-configs/basic/01-native-single-node-single-gpu.json") + result = ConfigLoader.load_k8s_config(user_config) + + # Validate key fields are preserved + assert result["k8s"]["gpu_count"] == 1 + assert "memory" in result["k8s"] + assert "namespace" in result["k8s"] + assert result["gpu_vendor"] in ["AMD", "NVIDIA"] + + @pytest.mark.skipif( + not config_exists("examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json"), + reason="K8s multi-GPU config file not found" + ) + def test_k8s_multi_gpu_config(self): + """Test K8s multi-GPU config file.""" + user_config = load_config_file("examples/k8s-configs/basic/02-torchrun-single-node-multi-gpu.json") + result = ConfigLoader.load_k8s_config(user_config) + + # Validate multi-GPU config + assert result["k8s"]["gpu_count"] >= 2 + assert "distributed" in result + assert result["distributed"]["nnodes"] == 1 + assert result["distributed"]["nproc_per_node"] >= 2 + + +class TestConfigLoaderSlurmConfigs: + """Test with actual SLURM config files (if they exist).""" + + @pytest.mark.skipif( + not config_exists("examples/slurm-configs/basic/01-single-node-single-gpu.json"), + reason="SLURM config file not found" + ) + def test_slurm_single_gpu_config(self): + """Test SLURM single GPU config file.""" + user_config = load_config_file("examples/slurm-configs/basic/01-single-node-single-gpu.json") + result = ConfigLoader.load_slurm_config(user_config) + + # Validate SLURM config structure + assert "slurm" in result + assert result["slurm"]["nodes"] == 1 + assert result["slurm"]["gpus_per_node"] >= 1 + + @pytest.mark.skipif( + not config_exists("examples/slurm-configs/basic/06-vllm-multi-node.json"), + reason="SLURM vLLM multi-node config file not found" + ) + def test_slurm_vllm_multi_node_config(self): + """Test SLURM vLLM multi-node config file.""" + user_config = load_config_file("examples/slurm-configs/basic/06-vllm-multi-node.json") + result = ConfigLoader.load_slurm_config(user_config) + + # Validate multi-node vLLM config + assert "slurm" in result + assert result["slurm"]["nodes"] >= 2 + assert result["slurm"]["gpus_per_node"] >= 1 + assert "distributed" in result + + # Check for new preflight node check parameters + if "enable_node_check" in result["slurm"]: + assert isinstance(result["slurm"]["enable_node_check"], bool) + if "auto_cleanup_nodes" in result["slurm"]: + assert isinstance(result["slurm"]["auto_cleanup_nodes"], bool) + + +class TestConfigLoaderDeploymentType: + """Test deployment type inference and validation.""" + + def test_auto_infer_k8s(self): + """Test k8s deployment type is auto-inferred from k8s field presence.""" + user_config = { + "k8s": { + "gpu_count": 1 + } + } + + result = ConfigLoader.load_config(user_config) + + # Validate k8s config was loaded and defaults applied + assert "k8s" in result + assert result["k8s"]["gpu_count"] == 1 + assert "memory" in result["k8s"] # Default was applied + + def test_auto_infer_slurm(self): + """Test slurm deployment type is auto-inferred from slurm field presence.""" + user_config = { + "slurm": { + "nodes": 1, + "gpus_per_node": 4 + } + } + + result = ConfigLoader.load_config(user_config) + + # Validate slurm config was loaded and defaults applied + assert "slurm" in result + assert result["slurm"]["nodes"] == 1 + assert result["slurm"]["gpus_per_node"] == 4 + + def test_auto_infer_local(self): + """Test local deployment when no k8s/slurm present.""" + user_config = { + "env_vars": {"MY_VAR": "value"} + } + + result = ConfigLoader.load_config(user_config) + + # Validate local config (no k8s or slurm fields) + assert "k8s" not in result or result.get("k8s") == {} + assert "slurm" not in result or result.get("slurm") == {} + assert result["env_vars"]["MY_VAR"] == "value" + + def test_conflict_k8s_and_slurm(self): + """Test error when both k8s and slurm fields present.""" + user_config = { + "k8s": {"gpu_count": 1}, + "slurm": {"nodes": 2} + } + + with pytest.raises(ValueError, match="Both 'k8s' and 'slurm'"): + ConfigLoader.load_config(user_config) + + def test_conflict_explicit_deploy_mismatch(self): + """Test error when explicit deploy field conflicts with config presence.""" + user_config = { + "deploy": "slurm", + "k8s": {"gpu_count": 1} + } + + with pytest.raises(ValueError, match="Conflicting deployment"): + ConfigLoader.load_config(user_config) + + def test_explicit_deploy_matching(self): + """Test that explicit deploy field works when it matches config.""" + user_config = { + "deploy": "k8s", + "k8s": {"gpu_count": 1} + } + + result = ConfigLoader.load_config(user_config) + + # Should work fine since deploy matches k8s presence + # The deploy field may or may not be preserved in result + assert result["k8s"]["gpu_count"] == 1 + assert "memory" in result["k8s"] # Defaults applied + + + +# Run pytest if executed directly +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) + diff --git a/tests/unit/test_context_logic.py b/tests/unit/test_context_logic.py new file mode 100644 index 00000000..0ce6504f --- /dev/null +++ b/tests/unit/test_context_logic.py @@ -0,0 +1,55 @@ +""" +Context logic unit tests. + +Pure unit tests for Context class initialization and logic without external dependencies. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import pytest +from unittest.mock import Mock, patch + +from madengine.core.context import Context + + +@pytest.mark.unit +class TestContextInitialization: + """Test Context object initialization.""" + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_ngpus", return_value=1) + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx90a") + def test_context_initializes_with_defaults(self, mock_arch, mock_ngpus, mock_vendor): + """Context should initialize with system defaults.""" + context = Context() + + assert context.get_gpu_vendor() == "AMD" + assert context.get_system_ngpus() == 1 + assert context.get_system_gpu_architecture() == "gfx90a" + + # REMOVED: test_context_detects_nvidia_gpus and test_context_handles_cpu_only + # These tests require actual GPU detection and are better suited as integration tests. + # Context initialization tests are covered in integration/test_platform_integration.py + + +@pytest.mark.unit +class TestBuildArgGeneration: + """Test Docker build argument generation logic.""" + + @patch.object(Context, "get_gpu_vendor", return_value="AMD") + @patch.object(Context, "get_system_gpu_architecture", return_value="gfx90a") + def test_generates_build_args_for_amd(self, mock_arch, mock_vendor): + """Should generate proper build args for AMD GPUs.""" + context = Context() + context.ctx = { + "docker_build_arg": { + "MAD_GPU_VENDOR": "AMD", + "MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a" + } + } + + assert context.ctx["docker_build_arg"]["MAD_GPU_VENDOR"] == "AMD" + assert context.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] == "gfx90a" + + +# Total: 5 unit tests diff --git a/tests/unit/test_database_mongodb.py b/tests/unit/test_database_mongodb.py new file mode 100644 index 00000000..de64c1e4 --- /dev/null +++ b/tests/unit/test_database_mongodb.py @@ -0,0 +1,583 @@ +""" +Unit tests for MongoDB database operations. + +Tests the refactored database upload functionality including file loading, +type handling, and document transformation. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import json +import os +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch, Mock +import pytest +import pandas as pd + +from madengine.database.mongodb import ( + MongoDBConfig, + UploadOptions, + UploadResult, + FileFormat, + JSONLoader, + CSVLoader, + DocumentTransformer, + detect_file_format, + get_loader, + upload_file_to_mongodb, +) + + +# ============================================================================ +# Fixtures +# ============================================================================ + +@pytest.fixture +def sample_json_data(): + """Sample JSON data with native types.""" + return [ + { + "model": "test_model_1", + "performance": 123.45, + "metric": "tokens/sec", + "status": "SUCCESS", + "configs": { + "batch_size": 32, + "learning_rate": 0.001 + }, + "enabled": True, + "timestamp": "2026-01-07 10:00:00" + }, + { + "model": "test_model_2", + "performance": 234.56, + "metric": "tokens/sec", + "status": "SUCCESS", + "configs": { + "batch_size": 64, + "learning_rate": 0.002 + }, + "enabled": False, + "timestamp": "2026-01-07 10:05:00" + } + ] + + +@pytest.fixture +def temp_json_file(sample_json_data): + """Create a temporary JSON file.""" + with tempfile.NamedTemporaryFile( + mode='w', suffix='.json', delete=False + ) as f: + json.dump(sample_json_data, f) + file_path = f.name + + yield Path(file_path) + + # Cleanup + if os.path.exists(file_path): + os.unlink(file_path) + + +@pytest.fixture +def temp_csv_file(): + """Create a temporary CSV file.""" + with tempfile.NamedTemporaryFile( + mode='w', suffix='.csv', delete=False, newline='' + ) as f: + f.write("model,performance,metric,status,timestamp\n") + f.write("csv_model_1,345.67,tokens/sec,SUCCESS,2026-01-07 11:00:00\n") + f.write("csv_model_2,456.78,tokens/sec,SUCCESS,2026-01-07 11:05:00\n") + file_path = f.name + + yield Path(file_path) + + # Cleanup + if os.path.exists(file_path): + os.unlink(file_path) + + +# ============================================================================ +# Configuration Tests +# ============================================================================ + +@pytest.mark.unit +def test_mongodb_config_defaults(): + """Test MongoDBConfig with default values.""" + config = MongoDBConfig() + + assert config.host == "localhost" + assert config.port == 27017 + assert config.username == "" + assert config.password == "" + assert config.timeout_ms == 5000 + + +@pytest.mark.unit +def test_mongodb_config_from_env(): + """Test MongoDBConfig loading from environment.""" + env_vars = { + "MONGO_HOST": "test-host", + "MONGO_PORT": "27018", + "MONGO_USER": "testuser", + "MONGO_PASSWORD": "testpass", + } + + with patch.dict(os.environ, env_vars, clear=False): + config = MongoDBConfig.from_env() + + assert config.host == "test-host" + assert config.port == 27018 + assert config.username == "testuser" + assert config.password == "testpass" + + +@pytest.mark.unit +def test_mongodb_config_uri_with_auth(): + """Test MongoDB URI generation with authentication.""" + config = MongoDBConfig( + host="example.com", + port=27017, + username="user", + password="pass" + ) + + assert config.uri == "mongodb://user:pass@example.com:27017/admin" + + +@pytest.mark.unit +def test_mongodb_config_uri_without_auth(): + """Test MongoDB URI generation without authentication.""" + config = MongoDBConfig(host="example.com", port=27017) + + assert config.uri == "mongodb://example.com:27017" + + +@pytest.mark.unit +def test_upload_options_defaults(): + """Test UploadOptions default values.""" + options = UploadOptions() + + assert options.unique_fields is None + assert options.upsert is True + assert options.batch_size == 1000 + assert options.ordered is False + assert options.create_indexes is True + assert options.add_metadata is True + assert options.dry_run is False + + +# ============================================================================ +# File Detection Tests +# ============================================================================ + +@pytest.mark.unit +def test_detect_json_format_by_extension(temp_json_file): + """Test JSON format detection by file extension.""" + file_format = detect_file_format(temp_json_file) + assert file_format == FileFormat.JSON + + +@pytest.mark.unit +def test_detect_csv_format_by_extension(temp_csv_file): + """Test CSV format detection by file extension.""" + file_format = detect_file_format(temp_csv_file) + assert file_format == FileFormat.CSV + + +@pytest.mark.unit +def test_detect_json_format_by_content(): + """Test JSON format detection by content when no extension.""" + with tempfile.NamedTemporaryFile( + mode='w', suffix='', delete=False + ) as f: + json.dump({"test": "data"}, f) + file_path = f.name + + try: + file_format = detect_file_format(Path(file_path)) + assert file_format == FileFormat.JSON + finally: + os.unlink(file_path) + + +@pytest.mark.unit +def test_get_loader_json(): + """Test getting JSON loader.""" + loader = get_loader(FileFormat.JSON) + assert isinstance(loader, JSONLoader) + + +@pytest.mark.unit +def test_get_loader_csv(): + """Test getting CSV loader.""" + loader = get_loader(FileFormat.CSV) + assert isinstance(loader, CSVLoader) + + +# ============================================================================ +# JSON Loader Tests +# ============================================================================ + +@pytest.mark.unit +def test_json_loader_load_array(temp_json_file, sample_json_data): + """Test JSONLoader with array format.""" + loader = JSONLoader() + documents = loader.load(temp_json_file) + + assert len(documents) == 2 + assert documents[0]["model"] == "test_model_1" + assert documents[0]["performance"] == 123.45 + assert isinstance(documents[0]["configs"], dict) + assert documents[0]["enabled"] is True + + +@pytest.mark.unit +def test_json_loader_load_single_object(): + """Test JSONLoader with single object format.""" + data = {"model": "test", "value": 42} + + with tempfile.NamedTemporaryFile( + mode='w', suffix='.json', delete=False + ) as f: + json.dump(data, f) + file_path = f.name + + try: + loader = JSONLoader() + documents = loader.load(Path(file_path)) + + assert len(documents) == 1 + assert documents[0]["model"] == "test" + assert documents[0]["value"] == 42 + finally: + os.unlink(file_path) + + +@pytest.mark.unit +def test_json_loader_preserves_types(temp_json_file): + """Test that JSONLoader preserves native types.""" + loader = JSONLoader() + documents = loader.load(temp_json_file) + + doc = documents[0] + assert isinstance(doc["performance"], float) + assert isinstance(doc["configs"], dict) + assert isinstance(doc["enabled"], bool) + assert isinstance(doc["model"], str) + + +@pytest.mark.unit +def test_json_loader_infer_schema(sample_json_data): + """Test JSON schema inference.""" + loader = JSONLoader() + schema = loader.infer_schema(sample_json_data) + + assert schema["model"] == str + assert schema["performance"] == float + assert schema["configs"] == dict + assert schema["enabled"] == bool + + +# ============================================================================ +# CSV Loader Tests +# ============================================================================ + +@pytest.mark.unit +def test_csv_loader_load(temp_csv_file): + """Test CSVLoader basic loading.""" + loader = CSVLoader() + documents = loader.load(temp_csv_file) + + assert len(documents) == 2 + assert documents[0]["model"] == "csv_model_1" + assert documents[1]["model"] == "csv_model_2" + + +@pytest.mark.unit +def test_csv_loader_type_inference(temp_csv_file): + """Test that CSVLoader infers types correctly.""" + loader = CSVLoader() + documents = loader.load(temp_csv_file) + + doc = documents[0] + # Performance should be float, not string + assert isinstance(doc["performance"], (float, int)) + assert doc["performance"] == 345.67 + + +@pytest.mark.unit +def test_csv_loader_json_string_parsing(): + """Test that CSVLoader parses JSON strings in columns.""" + with tempfile.NamedTemporaryFile( + mode='w', suffix='.csv', delete=False, newline='' + ) as f: + f.write('model,configs\n') + f.write('test,"{""lr"": 0.001}"\n') + file_path = f.name + + try: + loader = CSVLoader() + documents = loader.load(Path(file_path)) + + # Should parse JSON string in configs column + assert isinstance(documents[0]["configs"], (dict, str)) + finally: + os.unlink(file_path) + + +@pytest.mark.unit +def test_csv_loader_handles_null_values(): + """Test CSVLoader handles null/missing values.""" + with tempfile.NamedTemporaryFile( + mode='w', suffix='.csv', delete=False, newline='' + ) as f: + f.write('model,value\n') + f.write('test1,42\n') + f.write('test2,\n') # Empty value + file_path = f.name + + try: + loader = CSVLoader() + documents = loader.load(Path(file_path)) + + assert documents[0]["value"] == 42 + assert documents[1]["value"] is None + finally: + os.unlink(file_path) + + +# ============================================================================ +# Document Transformer Tests +# ============================================================================ + +@pytest.mark.unit +def test_document_transformer_adds_metadata(): + """Test that transformer adds metadata fields.""" + options = UploadOptions(add_metadata=True) + transformer = DocumentTransformer(options) + + documents = [{"model": "test", "value": 42}] + transformed = transformer.transform(documents) + + assert "_meta_uploaded_at" in transformed[0] + assert "created_date" in transformed[0] + + +@pytest.mark.unit +def test_document_transformer_preserves_existing_metadata(): + """Test that transformer preserves existing created_date.""" + options = UploadOptions(add_metadata=True) + transformer = DocumentTransformer(options) + + original_date = "2026-01-01 00:00:00" + documents = [{"model": "test", "created_date": original_date}] + transformed = transformer.transform(documents) + + assert transformed[0]["created_date"] == original_date + + +@pytest.mark.unit +def test_document_transformer_infer_unique_fields(): + """Test automatic unique field inference.""" + options = UploadOptions() + transformer = DocumentTransformer(options) + + documents = [ + {"model": "model1", "timestamp": "2026-01-01", "value": 1}, + {"model": "model2", "timestamp": "2026-01-02", "value": 2}, + ] + + unique_fields = transformer.infer_unique_fields(documents) + + assert "model" in unique_fields + + +@pytest.mark.unit +def test_document_transformer_no_metadata_when_disabled(): + """Test that metadata is not added when disabled.""" + options = UploadOptions(add_metadata=False) + transformer = DocumentTransformer(options) + + documents = [{"model": "test", "value": 42}] + transformed = transformer.transform(documents) + + assert "_meta_uploaded_at" not in transformed[0] + + +# ============================================================================ +# Upload Result Tests +# ============================================================================ + +@pytest.mark.unit +def test_upload_result_success_status(): + """Test UploadResult with success status.""" + result = UploadResult( + status="success", + documents_read=10, + documents_processed=10, + documents_inserted=8, + documents_updated=2, + documents_failed=0, + duration_seconds=1.5 + ) + + assert result.status == "success" + assert result.documents_read == 10 + assert result.documents_inserted == 8 + assert result.documents_updated == 2 + + +@pytest.mark.unit +def test_upload_result_with_errors(): + """Test UploadResult with errors.""" + result = UploadResult( + status="partial", + documents_read=10, + documents_processed=8, + documents_inserted=7, + documents_updated=1, + documents_failed=2, + errors=["Error 1", "Error 2"], + duration_seconds=2.0 + ) + + assert result.status == "partial" + assert result.documents_failed == 2 + assert len(result.errors) == 2 + + +# ============================================================================ +# Main Upload Function Tests (Mocked) +# ============================================================================ + +@pytest.mark.unit +def test_upload_file_to_mongodb_json_dry_run(temp_json_file): + """Test uploading JSON file in dry-run mode.""" + config = MongoDBConfig() + options = UploadOptions(dry_run=True) + + result = upload_file_to_mongodb( + file_path=str(temp_json_file), + database_name="test_db", + collection_name="test_collection", + config=config, + options=options + ) + + assert result.status == "success" + assert result.documents_read == 2 + assert result.documents_processed == 0 + assert result.documents_inserted == 0 + + +@pytest.mark.unit +def test_upload_file_to_mongodb_csv_dry_run(temp_csv_file): + """Test uploading CSV file in dry-run mode.""" + config = MongoDBConfig() + options = UploadOptions(dry_run=True) + + result = upload_file_to_mongodb( + file_path=str(temp_csv_file), + database_name="test_db", + collection_name="test_collection", + config=config, + options=options + ) + + assert result.status == "success" + assert result.documents_read == 2 + + +@pytest.mark.unit +def test_upload_file_to_mongodb_auto_detects_unique_fields(temp_json_file): + """Test that upload auto-detects unique fields.""" + config = MongoDBConfig() + options = UploadOptions( + dry_run=True, + unique_fields=None # Should auto-detect + ) + + result = upload_file_to_mongodb( + file_path=str(temp_json_file), + database_name="test_db", + collection_name="test_collection", + config=config, + options=options + ) + + assert result.status == "success" + # Options should have been updated with detected fields + assert options.unique_fields is not None + + +@pytest.mark.unit +def test_upload_file_to_mongodb_file_not_found(): + """Test upload with non-existent file.""" + config = MongoDBConfig() + options = UploadOptions() + + with pytest.raises(FileNotFoundError): + upload_file_to_mongodb( + file_path="/nonexistent/file.json", + database_name="test_db", + collection_name="test_collection", + config=config, + options=options + ) + + +@pytest.mark.unit +def test_upload_file_to_mongodb_with_custom_unique_fields(temp_json_file): + """Test upload with custom unique fields.""" + config = MongoDBConfig() + options = UploadOptions( + dry_run=True, + unique_fields=["model", "timestamp"] + ) + + result = upload_file_to_mongodb( + file_path=str(temp_json_file), + database_name="test_db", + collection_name="test_collection", + config=config, + options=options + ) + + assert result.status == "success" + assert options.unique_fields == ["model", "timestamp"] + + +@pytest.mark.unit +@patch('madengine.database.mongodb.MongoDBUploader') +def test_upload_file_to_mongodb_calls_uploader(mock_uploader_class, temp_json_file): + """Test that upload function properly calls MongoDBUploader.""" + # Setup mock + mock_uploader = MagicMock() + mock_uploader_class.return_value.__enter__.return_value = mock_uploader + mock_uploader.upload.return_value = UploadResult( + status="success", + documents_read=2, + documents_processed=2, + documents_inserted=2, + documents_updated=0, + documents_failed=0, + duration_seconds=0.1 + ) + + config = MongoDBConfig() + options = UploadOptions(dry_run=False) + + result = upload_file_to_mongodb( + file_path=str(temp_json_file), + database_name="test_db", + collection_name="test_collection", + config=config, + options=options + ) + + # Verify uploader was called + mock_uploader.upload.assert_called_once() + assert result.status == "success" + assert result.documents_inserted == 2 diff --git a/tests/unit/test_error_handling.py b/tests/unit/test_error_handling.py new file mode 100644 index 00000000..e4da5af4 --- /dev/null +++ b/tests/unit/test_error_handling.py @@ -0,0 +1,504 @@ +#!/usr/bin/env python3 +""" +Unit tests for madengine unified error handling system. + +Tests the core error handling functionality including error types, +context management, Rich console integration, and error propagation. +""" + +import pytest +import json +import io +import re +from unittest.mock import Mock, patch, MagicMock +from rich.console import Console +from rich.text import Text + +# Add src to path for imports +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) + +from madengine.core.errors import ( + ErrorCategory, + ErrorContext, + MADEngineError, + ValidationError, + ConnectionError, + AuthenticationError, + RuntimeError, + BuildError, + DiscoveryError, + OrchestrationError, + RunnerError, + ConfigurationError, + TimeoutError, + ErrorHandler, + set_error_handler, + get_error_handler, + handle_error, + create_error_context +) + + +class TestErrorContext: + """Test error context data structure.""" + + def test_error_context_creation(self): + """Test basic error context creation.""" + context = ErrorContext( + operation="test_operation", + phase="test_phase", + component="test_component" + ) + + assert context.operation == "test_operation" + assert context.phase == "test_phase" + assert context.component == "test_component" + assert context.model_name is None + assert context.node_id is None + assert context.file_path is None + assert context.additional_info is None + + def test_error_context_full(self): + """Test error context with all fields.""" + additional_info = {"key": "value", "number": 42} + context = ErrorContext( + operation="complex_operation", + phase="execution", + component="TestComponent", + model_name="test_model", + node_id="node-001", + file_path="/path/to/file.json", + additional_info=additional_info + ) + + assert context.operation == "complex_operation" + assert context.phase == "execution" + assert context.component == "TestComponent" + assert context.model_name == "test_model" + assert context.node_id == "node-001" + assert context.file_path == "/path/to/file.json" + assert context.additional_info == additional_info + + def test_create_error_context_function(self): + """Test create_error_context convenience function.""" + context = create_error_context( + operation="test_op", + phase="test_phase", + model_name="test_model" + ) + + assert isinstance(context, ErrorContext) + assert context.operation == "test_op" + assert context.phase == "test_phase" + assert context.model_name == "test_model" + + +class TestMADEngineErrorHierarchy: + """Test madengine error class hierarchy.""" + + def test_base_madengine_error(self): + """Test base madengine error functionality.""" + context = ErrorContext(operation="test") + error = MADEngineError( + message="Test error", + category=ErrorCategory.RUNTIME, + context=context, + recoverable=True, + suggestions=["Try again", "Check logs"] + ) + + assert str(error) == "Test error" + assert error.message == "Test error" + assert error.category == ErrorCategory.RUNTIME + assert error.context == context + assert error.recoverable is True + assert error.suggestions == ["Try again", "Check logs"] + assert error.cause is None + + @pytest.mark.parametrize("error_class,category,recoverable,message", [ + (ValidationError, ErrorCategory.VALIDATION, True, "Invalid input"), + (ConnectionError, ErrorCategory.CONNECTION, True, "Connection failed"), + (BuildError, ErrorCategory.BUILD, False, "Build failed"), + (RunnerError, ErrorCategory.RUNNER, True, "Runner execution failed"), + (AuthenticationError, ErrorCategory.AUTHENTICATION, True, "Auth failed"), + (ConfigurationError, ErrorCategory.CONFIGURATION, True, "Config error"), + ]) + def test_error_types(self, error_class, category, recoverable, message): + """Test all error types with parametrized test.""" + error = error_class(message) + + assert isinstance(error, MADEngineError) + assert error.category == category + assert error.recoverable is recoverable + assert str(error) == message + + def test_error_with_cause(self): + """Test error with underlying cause.""" + original_error = ValueError("Original error") + mad_error = RuntimeError("Runtime failure", cause=original_error) + + assert mad_error.cause == original_error + assert str(mad_error) == "Runtime failure" + + +class TestErrorHandler: + """Test ErrorHandler functionality.""" + + def setup_method(self): + """Set up test fixtures.""" + self.mock_console = Mock(spec=Console) + self.error_handler = ErrorHandler(console=self.mock_console, verbose=False) + + def test_error_handler_creation(self): + """Test ErrorHandler initialization.""" + assert self.error_handler.console == self.mock_console + assert self.error_handler.verbose is False + assert self.error_handler.logger is not None + + def test_handle_madengine_error(self): + """Test handling of madengine structured errors.""" + context = create_error_context( + operation="test_operation", + component="TestComponent", + model_name="test_model" + ) + error = ValidationError( + "Test validation error", + context=context, + suggestions=["Check input", "Verify format"] + ) + + self.error_handler.handle_error(error) + + # Verify console.print was called for the error panel + self.mock_console.print.assert_called() + call_args = self.mock_console.print.call_args[0] + + # Check that a Rich Panel was created + assert len(call_args) > 0 + panel = call_args[0] + assert hasattr(panel, 'title') + assert "Validation Error" in panel.title + + def test_handle_generic_error(self): + """Test handling of generic Python exceptions.""" + error = ValueError("Generic Python error") + context = create_error_context(operation="test_op") + + self.error_handler.handle_error(error, context=context) + + # Verify console.print was called + self.mock_console.print.assert_called() + call_args = self.mock_console.print.call_args[0] + + # Check that a Rich Panel was created + assert len(call_args) > 0 + panel = call_args[0] + assert hasattr(panel, 'title') + assert "ValueError" in panel.title + + def test_handle_error_verbose_mode(self): + """Test error handling in verbose mode.""" + verbose_handler = ErrorHandler(console=self.mock_console, verbose=True) + # Create error with a cause to trigger print_exception + original_error = ValueError("Original error") + error = RuntimeError("Test runtime error", cause=original_error) + + verbose_handler.handle_error(error, show_traceback=True) + + # Verify both print and print_exception were called + assert self.mock_console.print.call_count >= 2 + self.mock_console.print_exception.assert_called() + + def test_error_categorization_display(self): + """Test that different error categories display with correct styling.""" + test_cases = [ + (ValidationError("Validation failed"), "⚠️", "Validation Error"), + (ConnectionError("Connection failed"), "🔌", "Connection Error"), + (BuildError("Build failed"), "🔨", "Build Error"), + (RunnerError("Runner failed"), "🚀", "Runner Error"), + ] + + for error, expected_emoji, expected_title in test_cases: + self.mock_console.reset_mock() + self.error_handler.handle_error(error) + + # Verify console.print was called + self.mock_console.print.assert_called() + call_args = self.mock_console.print.call_args[0] + panel = call_args[0] + + assert expected_emoji in panel.title + assert expected_title in panel.title + + +class TestGlobalErrorHandler: + """Test global error handler functionality.""" + + def test_set_and_get_error_handler(self): + """Test setting and getting global error handler.""" + mock_console = Mock(spec=Console) + handler = ErrorHandler(console=mock_console) + + set_error_handler(handler) + retrieved_handler = get_error_handler() + + assert retrieved_handler == handler + + def test_handle_error_function(self): + """Test global handle_error function.""" + mock_console = Mock(spec=Console) + handler = ErrorHandler(console=mock_console) + set_error_handler(handler) + + error = ValidationError("Test error") + context = create_error_context(operation="test") + + handle_error(error, context=context) + + # Verify the handler was used + mock_console.print.assert_called() + + def test_handle_error_no_global_handler(self): + """Test handle_error function when no global handler is set.""" + # Clear global handler + set_error_handler(None) + + with patch('madengine.core.errors.logging') as mock_logging: + error = ValueError("Test error") + handle_error(error) + + # Should fallback to logging + mock_logging.error.assert_called_once() + + +class TestErrorContextPropagation: + """Test error context propagation through call stack.""" + + def test_context_preservation_through_hierarchy(self): + """Test that context is preserved when creating derived errors.""" + original_context = create_error_context( + operation="original_op", + component="OriginalComponent", + model_name="test_model" + ) + + # Create a base error with context + base_error = MADEngineError( + "Base error", + ErrorCategory.RUNTIME, + context=original_context + ) + + # Create a derived error that should preserve context + derived_error = ValidationError( + "Derived error", + context=original_context, + cause=base_error + ) + + assert derived_error.context == original_context + assert derived_error.cause == base_error + assert derived_error.context.operation == "original_op" + assert derived_error.context.component == "OriginalComponent" + + def test_context_enrichment(self): + """Test adding additional context information.""" + base_context = create_error_context(operation="base_op") + + # Create enriched context + enriched_context = ErrorContext( + operation=base_context.operation, + phase="enriched_phase", + component="EnrichedComponent", + additional_info={"enriched": True} + ) + + error = RuntimeError("Test error", context=enriched_context) + + assert error.context.operation == "base_op" + assert error.context.phase == "enriched_phase" + assert error.context.component == "EnrichedComponent" + assert error.context.additional_info["enriched"] is True + + +class TestErrorRecoveryAndSuggestions: + """Test error recovery indicators and suggestions.""" + + def test_recoverable_errors(self): + """Test that certain error types are marked as recoverable.""" + recoverable_errors = [ + ValidationError("Validation error"), + ConnectionError("Connection error"), + AuthenticationError("Auth error"), + ConfigurationError("Config error"), + TimeoutError("Timeout error"), + ] + + for error in recoverable_errors: + assert error.recoverable is True, f"{type(error).__name__} should be recoverable" + + def test_non_recoverable_errors(self): + """Test that certain error types are marked as non-recoverable.""" + non_recoverable_errors = [ + RuntimeError("Runtime error"), + BuildError("Build error"), + OrchestrationError("Orchestration error"), + ] + + for error in non_recoverable_errors: + assert error.recoverable is False, f"{type(error).__name__} should not be recoverable" + + def test_suggestions_in_errors(self): + """Test that suggestions are properly included in errors.""" + suggestions = ["Check configuration", "Verify credentials", "Try again"] + error = ValidationError( + "Validation failed", + suggestions=suggestions + ) + + assert error.suggestions == suggestions + + # Test handling displays suggestions + mock_console = Mock(spec=Console) + handler = ErrorHandler(console=mock_console) + handler.handle_error(error) + + # Verify console.print was called and suggestions are in output + mock_console.print.assert_called() + + +class TestErrorIntegration: + """Test error handling integration scenarios.""" + + def test_error_serialization_context(self): + """Test that error context can be serialized for logging.""" + context = create_error_context( + operation="test_operation", + phase="test_phase", + component="TestComponent", + model_name="test_model", + additional_info={"key": "value"} + ) + + error = ValidationError("Test error", context=context) + + # Context should be serializable + context_dict = error.context.__dict__ + json_str = json.dumps(context_dict, default=str) + + assert "test_operation" in json_str + assert "test_phase" in json_str + assert "TestComponent" in json_str + assert "test_model" in json_str + + def test_nested_error_handling(self): + """Test handling of nested exceptions.""" + original_error = ConnectionError("Network timeout") + wrapped_error = RuntimeError("Operation failed", cause=original_error) + final_error = OrchestrationError("Orchestration failed", cause=wrapped_error) + + assert final_error.cause == wrapped_error + assert wrapped_error.cause == original_error + + # Test that the handler can display nested error information + mock_console = Mock(spec=Console) + handler = ErrorHandler(console=mock_console) + handler.handle_error(final_error) + + mock_console.print.assert_called() + + +class TestErrorPatternMatching: + """Test error pattern matching for log analysis. + + These tests validate the error pattern fixes for GPT2 training, + ensuring ROCProf logs are correctly excluded while real errors are caught. + """ + + @pytest.fixture + def benign_patterns(self): + """Benign patterns that should be excluded from error detection.""" + return [ + r"^E[0-9]{8}.*generateRocpd\.cpp", + r"^W[0-9]{8}.*simple_timer\.cpp", + r"^W[0-9]{8}.*generateRocpd\.cpp", + r"^E[0-9]{8}.*tool\.cpp", + "Opened result file:", + "SQLite3 generation ::", + r"\[rocprofv3\]", + "rocpd_op:", + "rpd_tracer:", + ] + + @pytest.fixture + def error_patterns(self): + """Error patterns that should be detected in logs.""" + return [ + "OutOfMemoryError", + "HIP out of memory", + "CUDA out of memory", + "RuntimeError:", + "AssertionError:", + "ValueError:", + "SystemExit", + r"failed \(exitcode:", + r"Traceback \(most recent call last\)", + "FAILED", + "Exception:", + "ImportError:", + "ModuleNotFoundError:", + ] + + def test_benign_patterns_match_rocprof_logs(self, benign_patterns): + """Test that benign patterns correctly match ROCProf logs.""" + # Test cases that should be excluded (false positives) + rocprof_messages = [ + "E20251230 16:43:09.797714 140310524069632 generateRocpd.cpp:605] Opened result file: /myworkspace/transformers/banff-cyxtera-s83-5/1004_results.db", + "W20251230 16:43:09.852161 140310524069632 simple_timer.cpp:55] SQLite3 generation :: rocpd_string", + "W20251230 16:43:09.896980 140310524069632 simple_timer.cpp:55] [rocprofv3] output generation :: 0.121982 sec", + "E20251230 16:43:12.684603 140140898293696 tool.cpp:2420] HIP (runtime) version 7.1.0 initialized", + "rocpd_op: 0", + "rpd_tracer: finalized in 50.142105 ms", + ] + + for test_line in rocprof_messages: + matched = any(re.search(pattern, test_line) for pattern in benign_patterns) + assert matched, f"Failed to match ROCProf log: {test_line[:80]}" + + def test_error_patterns_catch_real_errors(self, error_patterns): + """Test that error patterns correctly catch real errors.""" + # Test cases that should be caught (real errors) + real_errors = [ + "RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB", + "ImportError: cannot import name 'AutoModel' from 'transformers'", + "ModuleNotFoundError: No module named 'torch'", + "Traceback (most recent call last):", + "ValueError: invalid literal for int() with base 10: 'abc'", + "AssertionError: Expected shape (2, 3) but got (3, 2)", + "torch.distributed.elastic.multiprocessing.errors.ChildFailedError: FAILED", + ] + + for test_line in real_errors: + matched = any(re.search(pattern, test_line) for pattern in error_patterns) + assert matched, f"Failed to catch error: {test_line[:80]}" + + def test_rocprof_messages_dont_trigger_errors(self, error_patterns): + """Test that ROCProf messages don't trigger error patterns.""" + # ROCProf messages that should NOT trigger errors + rocprof_messages = [ + "E20251230 16:43:09.797714 140310524069632 generateRocpd.cpp:605] Opened result file", + "W20251230 16:43:09.852161 140310524069632 simple_timer.cpp:55] SQLite3 generation", + "rocpd_op: 0", + "rpd_tracer: finalized in 50.142105 ms", + ] + + for test_line in rocprof_messages: + matched = any(re.search(pattern, test_line) for pattern in error_patterns) + assert not matched, f"False positive: {test_line[:80]} matched error pattern" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/tests/unit/test_orchestrator_logic.py b/tests/unit/test_orchestrator_logic.py new file mode 100644 index 00000000..4f0aaa6d --- /dev/null +++ b/tests/unit/test_orchestrator_logic.py @@ -0,0 +1,92 @@ +""" +Orchestrator logic unit tests. + +Pure unit tests for orchestrator initialization and logic without external dependencies. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import pytest +from unittest.mock import MagicMock, mock_open, patch + +from madengine.orchestration.build_orchestrator import BuildOrchestrator +from madengine.orchestration.run_orchestrator import RunOrchestrator +from madengine.core.errors import ConfigurationError + + +@pytest.mark.unit +class TestBuildOrchestratorInit: + """Test Build Orchestrator initialization.""" + + @patch("madengine.orchestration.build_orchestrator.Context") + @patch("os.path.exists", return_value=False) + def test_initializes_with_minimal_args(self, mock_exists, mock_context): + """Should initialize with minimal arguments.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + orchestrator = BuildOrchestrator(mock_args) + + assert orchestrator.args == mock_args + assert orchestrator.additional_context == {} + assert orchestrator.credentials is None + + @patch("madengine.orchestration.build_orchestrator.Context") + @patch("os.path.exists", return_value=False) + def test_parses_additional_context_json(self, mock_exists, mock_context): + """Should parse JSON additional context.""" + mock_args = MagicMock() + mock_args.additional_context = '{"key": "value"}' + mock_args.live_output = True + + orchestrator = BuildOrchestrator(mock_args) + + assert orchestrator.additional_context == {"key": "value"} + + +@pytest.mark.unit +class TestRunOrchestratorInit: + """Test Run Orchestrator initialization.""" + + @patch("madengine.orchestration.run_orchestrator.Context") + def test_initializes_with_args(self, mock_context): + """Should initialize with provided arguments.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + assert orchestrator.args == mock_args + assert orchestrator.additional_context == {} + + def test_parses_deploy_type_from_context(self): + """Should extract deploy type from additional context.""" + mock_args = MagicMock() + mock_args.additional_context = '{"deploy": "slurm"}' + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + assert orchestrator.additional_context["deploy"] == "slurm" + + +@pytest.mark.unit +class TestManifestValidation: + """Test manifest validation logic.""" + + @patch("os.path.exists", return_value=False) + def test_run_without_manifest_or_tags_raises_error(self, mock_exists): + """Should raise ConfigurationError without manifest or tags.""" + mock_args = MagicMock() + mock_args.additional_context = None + mock_args.live_output = True + + orchestrator = RunOrchestrator(mock_args) + + with pytest.raises(ConfigurationError): + orchestrator.execute(manifest_file=None, tags=None) + + +# Total: 5 unit tests diff --git a/tests/unit/test_reporting_superset.py b/tests/unit/test_reporting_superset.py new file mode 100644 index 00000000..a107d3dc --- /dev/null +++ b/tests/unit/test_reporting_superset.py @@ -0,0 +1,777 @@ +"""Unit tests for Performance Superset Reporting. + +Tests the reporting layer's superset functionality including: +1. ConfigParser for loading model configuration files (CSV, JSON, YAML) +2. perf_super.json generation (cumulative) with configs and multi_results +3. perf_entry_super.json generation (latest run) from perf_super.json +4. CSV export from perf_super.json to perf_entry_super.csv and perf_super.csv +5. Handling of complex fields (configs, multi_results) in CSV format + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" +# built-in modules +import os +import json +import tempfile +import shutil +# 3rd party modules +import pytest +import pandas as pd +# project modules +from madengine.utils.config_parser import ConfigParser +from madengine.reporting.update_perf_super import ( + update_perf_super_json, + update_perf_super_csv, + convert_super_json_to_csv, +) + + +class TestConfigParser: + """Test cases for ConfigParser functionality.""" + + @pytest.fixture + def test_dir(self): + """Create temporary directory for tests.""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + @pytest.fixture + def fixtures_dir(self): + """Get path to dummy fixtures directory.""" + return os.path.join( + os.path.dirname(__file__), + '..', + 'fixtures', + 'dummy', + 'scripts', + 'dummy' + ) + + @pytest.fixture + def config_file(self, fixtures_dir): + """Get path to config file.""" + return os.path.join(fixtures_dir, 'configs', 'default.csv') + + def test_config_file_exists(self, config_file): + """Test that the dummy config file exists.""" + assert os.path.exists(config_file), \ + f"Config file should exist at {config_file}" + + def test_config_parser_loads_csv(self, config_file): + """Test that ConfigParser can load the dummy CSV config.""" + parser = ConfigParser() + configs = parser.load_config_file(config_file) + + assert configs is not None, "Configs should not be None" + assert isinstance(configs, list), "Configs should be a list" + assert len(configs) == 3, "Should have 3 config rows" + + # Check first config has expected fields + first_config = configs[0] + assert 'model' in first_config + assert 'benchmark' in first_config + assert 'config_value' in first_config + assert 'batch_size' in first_config + assert 'datatype' in first_config + assert 'max_tokens' in first_config + + # Verify values + assert first_config['model'] == 'dummy/model-1' + assert first_config['benchmark'] == 'throughput' + assert first_config['datatype'] == 'float16' + assert first_config['batch_size'] == 8 + assert first_config['config_value'] == 128 + assert first_config['max_tokens'] == 1024 + + def test_config_parser_from_args(self, fixtures_dir): + """Test parsing config path from args string.""" + parser = ConfigParser(scripts_base_dir=fixtures_dir) + args_string = "--config configs/default.csv" + + config_path = parser.parse_config_from_args( + args_string, + os.path.join(fixtures_dir, 'run.sh') + ) + + assert config_path is not None, "Config path should be found" + assert os.path.exists(config_path), \ + f"Config file should exist at {config_path}" + + def test_config_parser_parse_and_load(self, fixtures_dir): + """Test parse_and_load convenience method.""" + parser = ConfigParser(scripts_base_dir=fixtures_dir) + args_string = "--batch-size 32 --config configs/default.csv" + + configs = parser.parse_and_load(args_string, fixtures_dir) + + assert configs is not None, "Configs should be loaded" + assert isinstance(configs, list), "Configs should be a list" + assert len(configs) == 3, "Should have 3 config rows" + + def test_config_parser_no_config_arg(self, fixtures_dir): + """Test handling when no --config argument is present.""" + parser = ConfigParser(scripts_base_dir=fixtures_dir) + args_string = "--batch-size 32 --epochs 10" + + configs = parser.parse_and_load(args_string, fixtures_dir) + + assert configs is None, "Should return None when no config argument" + + def test_config_parser_match_config_to_result(self, config_file): + """Test matching configs to results.""" + parser = ConfigParser() + configs = parser.load_config_file(config_file) + + # Test matching with model name + result_data = { + 'model': 'dummy/model-1', + 'benchmark': 'throughput' + } + + matched = parser.match_config_to_result(configs, result_data, 'dummy/model-1') + + assert matched is not None, "Should match a config" + assert matched['model'] == 'dummy/model-1' + assert matched['benchmark'] == 'throughput' + + def test_config_parser_json_file(self, test_dir): + """Test loading JSON config file.""" + # Create a JSON config file + json_config = { + "batch_size": 32, + "learning_rate": 0.001, + "epochs": 10 + } + + json_path = os.path.join(test_dir, "config.json") + with open(json_path, 'w') as f: + json.dump(json_config, f) + + parser = ConfigParser() + configs = parser.load_config_file(json_path) + + assert configs is not None, "Configs should be loaded" + assert isinstance(configs, dict), "JSON config should be a dict" + assert configs['batch_size'] == 32 + assert configs['learning_rate'] == 0.001 + + +class TestPerfEntrySuperGeneration: + """Test cases for perf_super.json generation (cumulative).""" + + @pytest.fixture + def test_dir(self): + """Create temporary directory for tests.""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + @pytest.fixture + def fixtures_dir(self): + """Get path to dummy fixtures directory.""" + return os.path.join( + os.path.dirname(__file__), + '..', + 'fixtures', + 'dummy', + 'scripts', + 'dummy' + ) + + def test_perf_entry_super_json_structure(self, test_dir, fixtures_dir): + """Test that perf_super.json has the correct structure.""" + # Create mock data + common_info = { + "pipeline": "dummy_test", + "n_gpus": "1", + "nnodes": "1", + "gpus_per_node": "1", + "training_precision": "", + "args": "--config configs/default.csv", + "tags": "dummies,perf_super_test", + "docker_file": "docker/dummy.Dockerfile", + "base_docker": "rocm/pytorch:latest", + "docker_sha": "abc123", + "docker_image": "test:v1", + "git_commit": "test123", + "machine_name": "test_machine", + "deployment_type": "local", + "launcher": "torchrun", + "gpu_architecture": "test_gpu", + "relative_change": "", + "build_duration": "10", + "test_duration": "20", + "dataname": "", + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + "build_number": "1", + "additional_docker_run_options": "", + } + + # Create common_info.json + common_info_path = os.path.join(test_dir, "common_info.json") + with open(common_info_path, 'w') as f: + json.dump(common_info, f) + + # Create results CSV + results_csv = os.path.join(test_dir, "perf_dummy_super.csv") + with open(results_csv, 'w') as f: + f.write("model,performance,metric,status\n") + f.write("dummy/model-1,1234.56,tokens/s,SUCCESS\n") + f.write("dummy/model-2,2345.67,requests/s,SUCCESS\n") + f.write("dummy/model-3,345.78,ms,SUCCESS\n") + + # Generate perf_super.json (cumulative) + perf_super_path = os.path.join(test_dir, "perf_super.json") + + update_perf_super_json( + perf_super_json=perf_super_path, + multiple_results=results_csv, + common_info=common_info_path, + model_name="dummy_perf_super", + scripts_base_dir=fixtures_dir + ) + + # Verify file was created + assert os.path.exists(perf_super_path), \ + "perf_super.json should be created" + + # Load and verify structure + with open(perf_super_path, 'r') as f: + data = json.load(f) + + assert isinstance(data, list), "Data should be a list" + assert len(data) == 3, "Should have 3 result records" + + # Check first record structure + first_record = data[0] + + # Verify all common fields are present + required_fields = [ + 'model', 'performance', 'metric', 'status', 'pipeline', + 'n_gpus', 'args', 'tags', 'gpu_architecture' + ] + for field in required_fields: + assert field in first_record, f"Field '{field}' should be present" + + # Verify configs field is present + assert 'configs' in first_record, "configs field should be present" + + # Verify configs is not None (config file was found and loaded) + assert first_record['configs'] is not None, \ + "configs should not be None when config file exists" + + # Verify configs has expected structure + configs = first_record['configs'] + assert isinstance(configs, dict), "configs should be a dict" + assert 'model' in configs + assert 'benchmark' in configs + assert 'config_value' in configs + assert 'batch_size' in configs + assert 'datatype' in configs + assert 'max_tokens' in configs + + def test_perf_entry_super_config_matching(self, test_dir, fixtures_dir): + """Test that configs are correctly matched for all results.""" + # Create mock data + common_info = { + "pipeline": "dummy_test", + "n_gpus": "1", + "nnodes": "1", + "gpus_per_node": "1", + "args": "--config configs/default.csv", + "tags": "dummies", + "training_precision": "", + "docker_file": "", + "base_docker": "", + "docker_sha": "", + "docker_image": "", + "git_commit": "", + "machine_name": "", + "deployment_type": "local", + "launcher": "torchrun", + "gpu_architecture": "", + "relative_change": "", + "build_duration": "", + "test_duration": "", + "dataname": "", + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + "build_number": "", + "additional_docker_run_options": "", + } + + common_info_path = os.path.join(test_dir, "common_info_super.json") + with open(common_info_path, 'w') as f: + json.dump(common_info, f) + + # Create results CSV + results_csv = os.path.join(test_dir, "perf_dummy_super.csv") + with open(results_csv, 'w') as f: + f.write("model,performance,metric,benchmark\n") + f.write("dummy/model-1,1234.56,tokens/s,throughput\n") + f.write("dummy/model-2,2345.67,requests/s,serving\n") + f.write("dummy/model-3,345.78,ms,latency\n") + + perf_super_path = os.path.join(test_dir, "perf_super.json") + + update_perf_super_json( + perf_super_json=perf_super_path, + multiple_results=results_csv, + common_info=common_info_path, + model_name="dummy_perf_super", + scripts_base_dir=fixtures_dir + ) + + # Load and verify matching + with open(perf_super_path, 'r') as f: + data = json.load(f) + + # Verify each result has configs + assert len(data) == 3, "Should have 3 results" + + for record in data: + configs = record.get('configs') + assert configs is not None, "Each record should have configs" + assert isinstance(configs, dict), "Configs should be a dict" + + # Verify configs have expected structure (from default.csv) + assert 'model' in configs + assert 'benchmark' in configs + assert 'config_value' in configs + assert 'batch_size' in configs + assert 'datatype' in configs + assert 'max_tokens' in configs + + # Verify configs values are from our config file + assert configs['benchmark'] in ['throughput', 'serving', 'latency'] + assert configs['datatype'] in ['float16', 'float32', 'bfloat16'] + + def test_perf_entry_super_no_config(self, test_dir, fixtures_dir): + """Test handling when no config file is specified.""" + # Create mock data without config + common_info = { + "pipeline": "dummy_test", + "n_gpus": "1", + "nnodes": "1", + "gpus_per_node": "1", + "args": "", # No --config argument + "tags": "dummies", + "training_precision": "", + "docker_file": "", + "base_docker": "", + "docker_sha": "", + "docker_image": "", + "git_commit": "", + "machine_name": "", + "deployment_type": "local", + "launcher": "", + "gpu_architecture": "", + "relative_change": "", + "build_duration": "", + "test_duration": "", + "dataname": "", + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + "build_number": "", + "additional_docker_run_options": "", + } + + common_info_path = os.path.join(test_dir, "common_info_super.json") + with open(common_info_path, 'w') as f: + json.dump(common_info, f) + + # Create results CSV + results_csv = os.path.join(test_dir, "perf_dummy_super.csv") + with open(results_csv, 'w') as f: + f.write("model,performance,metric\n") + f.write("dummy-no-config,1234.56,tokens/s\n") + + perf_super_path = os.path.join(test_dir, "perf_super.json") + + update_perf_super_json( + perf_super_json=perf_super_path, + multiple_results=results_csv, + common_info=common_info_path, + model_name="dummy_no_config", + scripts_base_dir=fixtures_dir + ) + + # Load and verify + with open(perf_super_path, 'r') as f: + data = json.load(f) + + assert len(data) == 1, "Should have 1 result" + + # Verify configs is None when no config file + assert data[0]['configs'] is None, \ + "configs should be None when no config file specified" + + def test_perf_entry_super_multi_results(self, test_dir, fixtures_dir): + """Test handling of multiple result metrics.""" + common_info = { + "pipeline": "dummy_test", + "n_gpus": "8", + "nnodes": "1", + "gpus_per_node": "8", + "args": "", + "tags": "multi_metrics", + "training_precision": "fp16", + "docker_file": "", + "base_docker": "", + "docker_sha": "", + "docker_image": "", + "git_commit": "", + "machine_name": "", + "deployment_type": "local", + "launcher": "vllm", + "gpu_architecture": "gfx90a", + "relative_change": "", + "build_duration": "", + "test_duration": "", + "dataname": "", + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + "build_number": "", + "additional_docker_run_options": "", + } + + common_info_path = os.path.join(test_dir, "common_info_super.json") + with open(common_info_path, 'w') as f: + json.dump(common_info, f) + + # Create results CSV with extra metrics + results_csv = os.path.join(test_dir, "perf_multi_metrics.csv") + with open(results_csv, 'w') as f: + f.write("model,performance,metric,throughput,latency_mean_ms,latency_p50_ms,latency_p90_ms,gpu_memory_used_mb\n") + f.write("model-1,1234.56,tokens/s,1234.56,8.1,7.9,12.3,12288\n") + f.write("model-2,2345.67,requests/s,2345.67,4.3,4.1,6.8,16384\n") + + perf_super_path = os.path.join(test_dir, "perf_super.json") + + update_perf_super_json( + perf_super_json=perf_super_path, + multiple_results=results_csv, + common_info=common_info_path, + model_name="test_multi_metrics", + scripts_base_dir=fixtures_dir + ) + + # Load and verify + with open(perf_super_path, 'r') as f: + data = json.load(f) + + assert len(data) == 2, "Should have 2 results" + + # Check first result has multi_results with extra metrics + first_result = data[0] + assert 'multi_results' in first_result, "Should have multi_results field" + assert first_result['multi_results'] is not None, "multi_results should not be None" + + multi_results = first_result['multi_results'] + assert isinstance(multi_results, dict), "multi_results should be a dict" + + # Verify extra metrics are in multi_results + assert 'throughput' in multi_results + assert 'latency_mean_ms' in multi_results + assert 'latency_p50_ms' in multi_results + assert 'latency_p90_ms' in multi_results + assert 'gpu_memory_used_mb' in multi_results + + # Verify values + assert multi_results['throughput'] == 1234.56 + assert multi_results['latency_mean_ms'] == 8.1 + assert multi_results['gpu_memory_used_mb'] == 12288 + + def test_perf_entry_super_deployment_fields(self, test_dir, fixtures_dir): + """Test that all deployment-related fields are present.""" + common_info = { + "pipeline": "dummy_test", + "n_gpus": "16", # 2 nodes × 8 GPUs + "nnodes": "2", + "gpus_per_node": "8", + "args": "", + "tags": "multi_node", + "training_precision": "fp16", + "docker_file": "", + "base_docker": "", + "docker_sha": "", + "docker_image": "", + "git_commit": "", + "machine_name": "node-1", + "deployment_type": "slurm", + "launcher": "torchrun", + "gpu_architecture": "gfx90a", + "relative_change": "", + "build_duration": "", + "test_duration": "", + "dataname": "", + "data_provider_type": "", + "data_size": "", + "data_download_duration": "", + "build_number": "", + "additional_docker_run_options": "", + } + + common_info_path = os.path.join(test_dir, "common_info_super.json") + with open(common_info_path, 'w') as f: + json.dump(common_info, f) + + # Create results CSV + results_csv = os.path.join(test_dir, "perf_deployment.csv") + with open(results_csv, 'w') as f: + f.write("model,performance,metric\n") + f.write("multi-node-test,5000.0,tokens/s\n") + + perf_super_path = os.path.join(test_dir, "perf_super.json") + + update_perf_super_json( + perf_super_json=perf_super_path, + multiple_results=results_csv, + common_info=common_info_path, + model_name="test_deployment", + scripts_base_dir=fixtures_dir + ) + + # Load and verify + with open(perf_super_path, 'r') as f: + data = json.load(f) + + assert len(data) == 1, "Should have 1 result" + + result = data[0] + + # Verify all deployment fields are present + deployment_fields = { + "n_gpus": "16", + "nnodes": "2", + "gpus_per_node": "8", + "deployment_type": "slurm", + "launcher": "torchrun", + "machine_name": "node-1", + } + + for field, expected_value in deployment_fields.items(): + assert field in result, f"Field '{field}' should be present" + assert result[field] == expected_value, \ + f"Field '{field}' should be '{expected_value}', got '{result[field]}'" + + +class TestPerfSuperCSVGeneration: + """Test cases for CSV generation from perf_super.json.""" + + @pytest.fixture + def test_dir(self): + """Create temporary directory for tests.""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + def test_csv_generation_from_json(self, test_dir): + """Test CSV generation from perf_super.json.""" + # Create a sample perf_super.json + data = [ + { + "model": "test_model_1", + "n_gpus": "8", + "performance": "1234.56", + "metric": "tokens/s", + "status": "SUCCESS", + "configs": {"batch_size": 32, "learning_rate": 0.001}, + "multi_results": {"throughput": 1234.56, "latency_ms": 8.1}, + }, + { + "model": "test_model_2", + "n_gpus": "8", + "performance": "2345.67", + "metric": "requests/s", + "status": "SUCCESS", + "configs": {"batch_size": 64, "learning_rate": 0.002}, + "multi_results": None, + } + ] + + json_path = os.path.join(test_dir, "perf_super.json") + with open(json_path, 'w') as f: + json.dump(data, f) + + # Change to test directory + original_dir = os.getcwd() + os.chdir(test_dir) + + try: + # Generate CSVs + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv" + ) + + # Verify files exist + assert os.path.exists("perf_entry_super.csv"), \ + "perf_entry_super.csv should be created" + assert os.path.exists("perf_super.csv"), \ + "perf_super.csv should be created" + + # Load and verify perf_entry_super.csv (latest entry only) + entry_df = pd.read_csv("perf_entry_super.csv") + assert len(entry_df) == 1, "Should have 1 entry (latest)" + assert entry_df.iloc[0]['model'] == "test_model_2" + + # Load and verify perf_super.csv (all entries) + super_df = pd.read_csv("perf_super.csv") + assert len(super_df) == 2, "Should have 2 entries (all)" + + # Verify configs column is JSON string + assert 'configs' in super_df.columns + first_configs = json.loads(super_df.iloc[0]['configs']) + assert first_configs['batch_size'] == 32 + + # Verify multi_results column + assert 'multi_results' in super_df.columns + first_multi = json.loads(super_df.iloc[0]['multi_results']) + assert first_multi['throughput'] == 1234.56 + + finally: + os.chdir(original_dir) + + def test_csv_handles_none_values(self, test_dir): + """Test that CSV generation handles None values correctly.""" + data = [ + { + "model": "test_model", + "performance": "1234.56", + "metric": "tokens/s", + "configs": None, + "multi_results": None, + } + ] + + json_path = os.path.join(test_dir, "perf_super.json") + with open(json_path, 'w') as f: + json.dump(data, f) + + original_dir = os.getcwd() + os.chdir(test_dir) + + try: + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv" + ) + + # Load CSV + df = pd.read_csv("perf_super.csv") + + # Verify None values are handled + assert pd.isna(df.iloc[0]['configs']) or df.iloc[0]['configs'] == '' + assert pd.isna(df.iloc[0]['multi_results']) or df.iloc[0]['multi_results'] == '' + + finally: + os.chdir(original_dir) + + def test_csv_multiple_entries_in_entry_file(self, test_dir): + """Test that perf_entry_super.csv can contain multiple entries from current run. + + This tests the fix for the issue where perf_entry.csv and perf_entry.json + had 4 entries (for multiple results) but perf_entry_super.csv only had 1. + Now perf_entry_super.csv should contain all entries from the current run. + """ + # Simulate a cumulative JSON with old entries + new entries + data = [ + # Old entry from a previous run + { + "model": "old_model", + "n_gpus": "4", + "performance": "999.99", + "metric": "tokens/s", + "status": "SUCCESS", + "configs": None, + "multi_results": None, + }, + # New entries from current run (4 models from multiple results) + { + "model": "dummy_multi_1", + "n_gpus": "1", + "performance": "1234.56", + "metric": "samples_per_sec", + "status": "SUCCESS", + "configs": None, + "multi_results": {"temperature": 12345}, + }, + { + "model": "dummy_multi_2", + "n_gpus": "1", + "performance": "2345.67", + "metric": "samples_per_sec", + "status": "SUCCESS", + "configs": None, + "multi_results": {"temperature": 23456}, + }, + { + "model": "dummy_multi_3", + "n_gpus": "1", + "performance": "3456.78", + "metric": "samples_per_sec", + "status": "SUCCESS", + "configs": None, + "multi_results": {"temperature": 34567}, + }, + { + "model": "dummy_multi_4", + "n_gpus": "1", + "performance": "4567.89", + "metric": "samples_per_sec", + "status": "SUCCESS", + "configs": None, + "multi_results": {"temperature": 45678}, + } + ] + + json_path = os.path.join(test_dir, "perf_super.json") + with open(json_path, 'w') as f: + json.dump(data, f) + + original_dir = os.getcwd() + os.chdir(test_dir) + + try: + # Generate CSVs with num_entries=4 (simulating 4 entries added in current run) + update_perf_super_csv( + perf_super_json="perf_super.json", + perf_super_csv="perf_super.csv", + num_entries=4 + ) + + # Verify perf_entry_super.csv has ALL 4 entries from current run + entry_df = pd.read_csv("perf_entry_super.csv") + assert len(entry_df) == 4, \ + f"perf_entry_super.csv should have 4 entries, got {len(entry_df)}" + + # Verify the models are the 4 from the current run (not the old one) + models = entry_df['model'].tolist() + expected_models = ['dummy_multi_1', 'dummy_multi_2', 'dummy_multi_3', 'dummy_multi_4'] + assert models == expected_models, \ + f"Expected {expected_models}, got {models}" + + # Verify perf_super.csv has ALL 5 entries (old + new) + super_df = pd.read_csv("perf_super.csv") + assert len(super_df) == 5, \ + f"perf_super.csv should have 5 entries (1 old + 4 new), got {len(super_df)}" + + # Verify all models are in perf_super.csv + all_models = super_df['model'].tolist() + assert 'old_model' in all_models, "Old model should be in perf_super.csv" + assert all(m in all_models for m in expected_models), \ + "All new models should be in perf_super.csv" + + finally: + os.chdir(original_dir) +