From 40394fc9468e536d4d1b430c81d698020f553710 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Mon, 22 Sep 2025 23:23:39 -0400 Subject: [PATCH 1/8] Added the sglang disagg models integration running on SLURM Cluster --- src/madengine/tools/run_models.py | 156 ++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 4f26450d..515b4c96 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -884,6 +884,158 @@ def run_model_impl( # explicitly delete model docker to stop the container, without waiting for the in-built garbage collector del model_docker + def run_model_slurm(self, model_info: typing.Dict) -> bool: + """Run model on SLURM cluster. + + Args: + model_info: The model information. + + Returns: + bool: The status of running model on SLURM cluster. + + Raises: + Exception: An error occurred while running model on SLURM cluster. + """ + print(f"Running model {model_info['name']} on SLURM cluster") + + # Extract SLURM arguments from context + slurm_args = self.context.ctx["slurm_args"] + + # Validate required SLURM arguments + required_args = ["FRAMEWORK", "PREFILL_NODES", "DECODE_NODES", "PARTITION", "TIME"] + for arg in required_args: + if arg not in slurm_args: + raise Exception(f"Missing required SLURM argument: {arg}") + + # Extract model name from model_info args (remove --model prefix) + model_name = "" + if "args" in model_info and model_info["args"]: + args_parts = model_info["args"].split() + if "--model" in args_parts: + model_index = args_parts.index("--model") + if model_index + 1 < len(args_parts): + model_name = args_parts[model_index + 1] + + if not model_name: + raise Exception(f"Could not extract model name from args: {model_info.get('args', '')}") + + print(f"Extracted model name: {model_name}") + + # Set up environment variables for the SLURM script + env_vars = { + "FRAMEWORK": slurm_args["FRAMEWORK"], + "PREFILL_NODES": str(slurm_args["PREFILL_NODES"]), + "DECODE_NODES": str(slurm_args["DECODE_NODES"]), + "PARTITION": slurm_args["PARTITION"], + "TIME": slurm_args["TIME"], + "MODEL_NAME": model_name + } + + # Add DOCKER_IMAGE if provided and not empty + if "DOCKER_IMAGE" in slurm_args and slurm_args["DOCKER_IMAGE"]: + env_vars["DOCKER_IMAGE"] = slurm_args["DOCKER_IMAGE"] + + # Set environment variables + for key, value in env_vars.items(): + os.environ[key] = value + print(f"Setting {key}={value}") + + # Prepare run details for result tracking + run_details = RunDetails() + run_details.model = model_info["name"] + run_details.n_gpus = model_info["n_gpus"] + run_details.training_precision = model_info["training_precision"] + run_details.args = model_info["args"] + run_details.tags = model_info["tags"] + run_details.pipeline = os.environ.get("pipeline") + run_details.machine_name = self.console.sh("hostname") + + try: + # Execute the SLURM script + script_path = model_info.get("scripts", "scripts/sglang_disagg/run.sh") + print(f"Executing SLURM script: {script_path}") + + # Make script executable + self.console.sh(f"chmod +x {script_path}") + + # Run the script with model argument + start_time = time.time() + log_file_path = f"{model_info['name'].replace('/', '_')}_slurm.live.log" + + with open(log_file_path, mode="w", buffering=1) as outlog: + with redirect_stdout(PythonicTee(outlog, self.args.live_output)), redirect_stderr(PythonicTee(outlog, self.args.live_output)): + result = self.console.sh(f"bash {script_path} --model {model_name}", 
timeout=None) + + run_details.test_duration = time.time() - start_time + print(f"SLURM execution duration: {run_details.test_duration} seconds") + + # Extract performance metrics from log + multiple_results = model_info.get("multiple_results") + + if multiple_results: + run_details.performance = multiple_results + # Check if the results file exists and is valid + if os.path.exists(multiple_results): + with open(multiple_results, 'r') as f: + header = f.readline().strip().split(',') + for line in f: + row = line.strip().split(',') + for col in row: + if col == '': + run_details.performance = None + print("Error: Performance metric is empty in multiple results file.") + break + else: + print(f"Warning: Multiple results file {multiple_results} not found") + run_details.performance = None + else: + # Extract performance from log using regex + perf_regex = r".*performance:\s*\([+|-]?[0-9]*[.]?[0-9]*\(e[+|-]?[0-9]+\)?\)\s*.*\s*" + run_details.performance = self.console.sh("cat " + log_file_path + + " | sed -n 's/" + perf_regex + "/\\1/p'") + + metric_regex = r".*performance:\s*[+|-]?[0-9]*[.]?[0-9]*\(e[+|-]?[0-9]+\)?\s*\(\w*\)\s*" + run_details.metric = self.console.sh("cat " + log_file_path + + " | sed -n 's/" + metric_regex + "/\\2/p'") + + # Determine success/failure + run_details.status = 'SUCCESS' if run_details.performance else 'FAILURE' + + # Print performance results + run_details.print_perf() + + # Update CSV results + if multiple_results: + run_details.generate_json("common_info.json", multiple_results=True) + update_perf_csv( + multiple_results=multiple_results, + perf_csv=self.args.output, + model_name=run_details.model, + common_info="common_info.json", + ) + else: + run_details.generate_json("perf_entry.json") + update_perf_csv( + single_result="perf_entry.json", + perf_csv=self.args.output, + ) + + return run_details.status == 'SUCCESS' + + except Exception as e: + print("===== SLURM EXECUTION EXCEPTION =====") + print("Exception: ", e) + traceback.print_exc() + print("=======================================") + + run_details.status = "FAILURE" + run_details.generate_json("perf_entry.json") + update_perf_csv( + exception_result="perf_entry.json", + perf_csv=self.args.output, + ) + return False + def run_model(self, model_info: typing.Dict) -> bool: """Run model on container. @@ -898,6 +1050,10 @@ def run_model(self, model_info: typing.Dict) -> bool: """ print(f"Running model {model_info['name']} with {model_info}") + # Check if SLURM execution is requested + if "slurm_args" in self.context.ctx: + return self.run_model_slurm(model_info) + # set default values if model run fails run_details = RunDetails() From 45ee3766540f2bc40a6ba9c4ff30106fe950f446 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 23 Sep 2025 19:56:57 -0400 Subject: [PATCH 2/8] Fixed the issue running SLURM control node without GPU --- src/madengine/core/context.py | 87 +++++++++++++++++++++++++------ src/madengine/tools/run_models.py | 5 ++ 2 files changed, 76 insertions(+), 16 deletions(-) diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index cb628a69..4153af4f 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -92,22 +92,43 @@ def __init__( else: print("Warning: unknown numa balancing setup ...") - # Keeping gpu_vendor for filterning purposes, if we filter using file names we can get rid of this attribute. 
- self.ctx["gpu_vendor"] = self.get_gpu_vendor() - - # Initialize the docker context - self.ctx["docker_env_vars"] = {} - self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"] - self.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] = self.get_system_ngpus() - self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = self.get_system_gpu_architecture() - self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_PRODUCT_NAME"] = self.get_system_gpu_product_name() - self.ctx['docker_env_vars']['MAD_SYSTEM_HIP_VERSION'] = self.get_system_hip_version() - self.ctx["docker_build_arg"] = { - "MAD_SYSTEM_GPU_ARCHITECTURE": self.get_system_gpu_architecture(), - "MAD_SYSTEM_GPU_PRODUCT_NAME": self.get_system_gpu_product_name() - } - self.ctx["docker_gpus"] = self.get_docker_gpus() - self.ctx["gpu_renderDs"] = self.get_gpu_renderD_nodes() + # Check if SLURM mode is requested before GPU detection + is_slurm_mode = self._is_slurm_mode(additional_context, additional_context_file) + + if is_slurm_mode: + # For SLURM mode, set minimal GPU context to avoid detection on control node + print("SLURM mode detected - skipping GPU detection on control node") + self.ctx["gpu_vendor"] = "AMD" # Default to AMD for SLURM environments + self.ctx["docker_env_vars"] = {} + self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"] + self.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] = "8" # Default value for SLURM + self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = "gfx90a" # Default for SLURM + self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_PRODUCT_NAME"] = "AMD_GPU" # Default value + self.ctx['docker_env_vars']['MAD_SYSTEM_HIP_VERSION'] = "5.0.0" # Default value + self.ctx["docker_build_arg"] = { + "MAD_SYSTEM_GPU_ARCHITECTURE": "gfx90a", + "MAD_SYSTEM_GPU_PRODUCT_NAME": "AMD_GPU" + } + self.ctx["docker_gpus"] = "0,1,2,3,4,5,6,7" # Default GPU list + self.ctx["gpu_renderDs"] = [128, 129, 130, 131, 132, 133, 134, 135] # Default renderD nodes + else: + # Normal mode - detect GPUs + # Keeping gpu_vendor for filterning purposes, if we filter using file names we can get rid of this attribute. + self.ctx["gpu_vendor"] = self.get_gpu_vendor() + + # Initialize the docker context + self.ctx["docker_env_vars"] = {} + self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"] + self.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"] = self.get_system_ngpus() + self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = self.get_system_gpu_architecture() + self.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_PRODUCT_NAME"] = self.get_system_gpu_product_name() + self.ctx['docker_env_vars']['MAD_SYSTEM_HIP_VERSION'] = self.get_system_hip_version() + self.ctx["docker_build_arg"] = { + "MAD_SYSTEM_GPU_ARCHITECTURE": self.get_system_gpu_architecture(), + "MAD_SYSTEM_GPU_PRODUCT_NAME": self.get_system_gpu_product_name() + } + self.ctx["docker_gpus"] = self.get_docker_gpus() + self.ctx["gpu_renderDs"] = self.get_gpu_renderD_nodes() # Default multi-node configuration self.ctx['multi_node_args'] = { @@ -148,6 +169,40 @@ def __init__( # Set multi-node runner after context update self.ctx['docker_env_vars']['MAD_MULTI_NODE_RUNNER'] = self.set_multi_node_runner() + def _is_slurm_mode(self, additional_context: str = None, additional_context_file: str = None) -> bool: + """Check if SLURM mode is requested. + + Args: + additional_context: The additional context string. + additional_context_file: The additional context file. + + Returns: + bool: True if SLURM mode is detected, False otherwise. 
+ """ + import ast + import json + + # Check additional_context_file first + if additional_context_file: + try: + with open(additional_context_file) as f: + context_data = json.load(f) + if 'slurm_args' in context_data: + return True + except Exception: + pass + + # Check additional_context string + if additional_context: + try: + dict_additional_context = ast.literal_eval(additional_context) + if 'slurm_args' in dict_additional_context: + return True + except Exception: + pass + + return False + def get_ctx_test(self) -> str: """Get context test. diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 515b4c96..41481d43 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -199,6 +199,11 @@ def clean_up_docker_container(self, is_cleaned: bool = False) -> None: self.console.sh("docker ps -a || true") self.console.sh("docker kill $(docker ps -q) || true") + # Skip GPU info display in SLURM mode as control node may not have GPUs + if "slurm_args" in self.context.ctx: + print("SLURM mode detected - skipping GPU info display on control node") + return + # get gpu vendor gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] # show gpu info From d77afebb4e9c810116a13bc93078c1f62ce6892e Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 23 Sep 2025 21:01:49 -0400 Subject: [PATCH 3/8] Fixed the run.sh of sglang-disagg --- src/madengine/tools/run_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 41481d43..f102f369 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -963,13 +963,13 @@ def run_model_slurm(self, model_info: typing.Dict) -> bool: # Make script executable self.console.sh(f"chmod +x {script_path}") - # Run the script with model argument + # Run the script without arguments (all parameters loaded from environment) start_time = time.time() log_file_path = f"{model_info['name'].replace('/', '_')}_slurm.live.log" with open(log_file_path, mode="w", buffering=1) as outlog: with redirect_stdout(PythonicTee(outlog, self.args.live_output)), redirect_stderr(PythonicTee(outlog, self.args.live_output)): - result = self.console.sh(f"bash {script_path} --model {model_name}", timeout=None) + result = self.console.sh(f"bash {script_path}", timeout=None) run_details.test_duration = time.time() - start_time print(f"SLURM execution duration: {run_details.test_duration} seconds") From cf5b30550d6b15d91fc520f7b13b4249ba4dd300 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 1 Oct 2025 12:00:51 -0400 Subject: [PATCH 4/8] Updated the docks about SLURM run on multinode --- docs/how-to-run-multi-node.md | 117 +++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 3 deletions(-) diff --git a/docs/how-to-run-multi-node.md b/docs/how-to-run-multi-node.md index 5c84e6cf..f1c5e153 100644 --- a/docs/how-to-run-multi-node.md +++ b/docs/how-to-run-multi-node.md @@ -75,17 +75,128 @@ You can see at the end of these commands, we are pointing DLM/MAD to the shared- **NOTE: The above commands assumes the shared-file system is mounted at `/nfs` in the commands above. 
If this is not the case and a user simply copies/pastes the above commands on two nodes, DLM/MAD will create a folder called `nfs` on each node and copy the data there, which is not desired behavior.** +## SLURM Cluster Integration + +MADEngine now supports running workloads on SLURM clusters, allowing you to leverage job scheduling and resource management for multi-node training and inference. + +### Overview + +When `slurm_args` is provided in the `additional-context`, MADEngine will: +1. Parse the SLURM configuration parameters +2. Submit the job directly to the SLURM cluster using `sbatch` +3. Skip the standard Docker container build and run workflow +4. Execute the model-specific script (e.g., `scripts/sglang_disagg/run.sh`) which handles SLURM job submission + +### SLURM Arguments + +The following arguments can be specified in the `slurm_args` dictionary: + +| Argument | Description | Required | Example | +|----------|-------------|----------|---------| +| `FRAMEWORK` | Framework to use for the job | Yes | `'sglang_disagg'` | +| `PREFILL_NODES` | Number of nodes for prefill phase | Yes | `'2'` | +| `DECODE_NODES` | Number of nodes for decode phase | Yes | `'2'` | +| `PARTITION` | SLURM partition/queue name | Yes | `'amd-rccl'` | +| `TIME` | Maximum job runtime (HH:MM:SS) | Yes | `'12:00:00'` | +| `DOCKER_IMAGE` | Docker image to use (optional) | No | `''` (uses default from run.sh) | + +### Usage Examples + +#### Basic SLURM Job Submission + +To run a model on SLURM with default settings: + +```bash +madengine run --tags sglang_disagg_pd_qwen3-32B \ + --additional-context "{'slurm_args': { + 'FRAMEWORK': 'sglang_disagg', + 'PREFILL_NODES': '2', + 'DECODE_NODES': '2', + 'PARTITION': 'amd-rccl', + 'TIME': '12:00:00', + 'DOCKER_IMAGE': '' + }}" +``` + +#### Custom Docker Image + +To specify a custom Docker image for the SLURM job: + +```bash +madengine run --tags sglang_disagg_pd_qwen3-32B \ + --additional-context "{'slurm_args': { + 'FRAMEWORK': 'sglang_disagg', + 'PREFILL_NODES': '4', + 'DECODE_NODES': '4', + 'PARTITION': 'gpu-high-priority', + 'TIME': '24:00:00', + 'DOCKER_IMAGE': 'myregistry/custom-image:latest' + }}" +``` + +#### Running Different Model Configurations + +For DeepSeek-V2 model: + +```bash +madengine run --tags sglang_disagg_pd_deepseek_v2 \ + --additional-context "{'slurm_args': { + 'FRAMEWORK': 'sglang_disagg', + 'PREFILL_NODES': '8', + 'DECODE_NODES': '8', + 'PARTITION': 'amd-mi300x', + 'TIME': '48:00:00', + 'DOCKER_IMAGE': '' + }}" +``` + +### Model Configuration + +Models configured for SLURM should include the model name in the `args` attribute of `models.json`. For example: + +```json +{ + "name": "sglang_disagg_pd_qwen3-32B", + "args": "--model Qwen/Qwen2.5-32B-Instruct", + "tags": ["sglang_disagg"] +} +``` + +The model name (e.g., `Qwen/Qwen2.5-32B-Instruct`) will be extracted and set as the `MODEL_NAME` environment variable for the SLURM job. + +### Requirements + +To use SLURM integration, ensure the following are available: + +1. **SLURM Cluster Access**: Access to a SLURM cluster with proper credentials +2. **Python Dependencies**: `paramiko` and `scp` for SSH connections (if needed) + ```bash + pip install paramiko scp + ``` +3. **Model Scripts**: Framework-specific scripts (e.g., `scripts/sglang_disagg/run.sh`) that handle SLURM job submission + +### How It Works + +1. **Context Parsing**: MADEngine detects `slurm_args` in the additional context +2. **Model Selection**: Extracts model information from `models.json` based on the provided tags +3. 
**Environment Setup**: Prepares environment variables including `MODEL_NAME`, node counts, partition, etc. +4. **Job Submission**: Executes the framework-specific run script which submits the SLURM job using `sbatch` +5. **Job Monitoring**: The SLURM cluster manages job execution, resource allocation, and scheduling + ## TODO ### RUNNER +- [x] torchrun - [ ] mpirun (requires ansible integration) -### Job Schedulare +### Job Scheduler -- [ ] SLURM +- [x] SLURM (via slurm_args integration) - [ ] Kubernetes ### Design Consideration -- [ ] Having the python model script launched by individual bash scripts can be limiting for multi-node. Perhaps we can explore a full python workflow for multi-node and only the job scheduler uses a bash script like SLURM using sbatch script. +- [x] SLURM integration using sbatch scripts for job submission +- [ ] Full Python workflow for multi-node (without bash script intermediaries) +- [ ] Kubernetes-native job scheduling integration From 72ddaffd8463cadb500fba92f8c0f2ddde4d65d0 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 1 Oct 2025 12:39:18 -0400 Subject: [PATCH 5/8] Updated readme of running multinode --- docs/how-to-run-multi-node.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/docs/how-to-run-multi-node.md b/docs/how-to-run-multi-node.md index f1c5e153..b526bd6b 100644 --- a/docs/how-to-run-multi-node.md +++ b/docs/how-to-run-multi-node.md @@ -77,11 +77,11 @@ You can see at the end of these commands, we are pointing DLM/MAD to the shared- ## SLURM Cluster Integration -MADEngine now supports running workloads on SLURM clusters, allowing you to leverage job scheduling and resource management for multi-node training and inference. +madengine now supports running workloads on SLURM clusters, allowing you to leverage job scheduling and resource management for multi-node training and inference. ### Overview -When `slurm_args` is provided in the `additional-context`, MADEngine will: +When `slurm_args` is provided in the `additional-context`, madengine will: 1. Parse the SLURM configuration parameters 2. Submit the job directly to the SLURM cluster using `sbatch` 3. Skip the standard Docker container build and run workflow @@ -157,27 +157,23 @@ Models configured for SLURM should include the model name in the `args` attribut ```json { "name": "sglang_disagg_pd_qwen3-32B", - "args": "--model Qwen/Qwen2.5-32B-Instruct", + "args": "--model Qwen3-32B", "tags": ["sglang_disagg"] } ``` -The model name (e.g., `Qwen/Qwen2.5-32B-Instruct`) will be extracted and set as the `MODEL_NAME` environment variable for the SLURM job. +The model name (e.g., `Qwen/Qwen3-32B`) will be extracted and set as the `MODEL_NAME` environment variable for the SLURM job. ### Requirements To use SLURM integration, ensure the following are available: 1. **SLURM Cluster Access**: Access to a SLURM cluster with proper credentials -2. **Python Dependencies**: `paramiko` and `scp` for SSH connections (if needed) - ```bash - pip install paramiko scp - ``` -3. **Model Scripts**: Framework-specific scripts (e.g., `scripts/sglang_disagg/run.sh`) that handle SLURM job submission +2. **Model Scripts**: Framework-specific scripts (e.g., `scripts/sglang_disagg/run.sh`) that handle SLURM job submission ### How It Works -1. **Context Parsing**: MADEngine detects `slurm_args` in the additional context +1. **Context Parsing**: madengine detects `slurm_args` in the additional context 2. 
**Model Selection**: Extracts model information from `models.json` based on the provided tags 3. **Environment Setup**: Prepares environment variables including `MODEL_NAME`, node counts, partition, etc. 4. **Job Submission**: Executes the framework-specific run script which submits the SLURM job using `sbatch` From a14dcb64418f97759747ea51d327258ffb5e80ee Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 14 Oct 2025 16:12:04 -0400 Subject: [PATCH 6/8] Updated the flow of run model to detect SLURM tag and environments to improve robustness --- src/madengine/tools/run_models.py | 128 ++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index f102f369..1d5d5d03 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -1041,6 +1041,94 @@ def run_model_slurm(self, model_info: typing.Dict) -> bool: ) return False + def _is_slurm_required_model(self, model_info: typing.Dict) -> bool: + """Check if model requires SLURM cluster execution. + + A model requires SLURM if it has the 'slurm' tag in its tags list. + This indicates the model is designed for distributed multi-node + inference and cannot run on single-node environments. + + Args: + model_info: The model information dictionary containing tags. + + Returns: + bool: True if model has 'slurm' tag, False otherwise. + """ + tags = model_info.get("tags", []) + return "slurm" in tags + + def _is_slurm_environment(self) -> bool: + """Check if current environment is a SLURM cluster. + + This method detects if the code is running on a SLURM cluster by checking + for standard SLURM environment variables that are set by the SLURM workload + manager. This follows SLURM best practices for environment detection. + + SLURM sets various environment variables when jobs are submitted: + - SLURM_JOB_ID: The job ID assigned by SLURM + - SLURM_CLUSTER_NAME: The name of the cluster + - SLURM_JOBID: Alternative job ID variable + - SLURMD_NODENAME: The name of the node running the job + + Returns: + bool: True if running on a SLURM cluster, False otherwise. + """ + # Check for SLURM environment variables that indicate we're on a SLURM cluster + slurm_indicators = [ + 'SLURM_JOB_ID', # Primary indicator - set when job is running + 'SLURM_JOBID', # Alternative job ID variable + 'SLURM_CLUSTER_NAME', # Cluster name + 'SLURMD_NODENAME' # Node name in SLURM + ] + + # If any SLURM environment variable is present, we're on a SLURM cluster + for indicator in slurm_indicators: + if indicator in os.environ: + print(f"SLURM environment detected: {indicator}={os.environ[indicator]}") + return True + + # Also check if slurm_args was explicitly provided in context + # This allows manual override for testing or special configurations + if "slurm_args" in self.context.ctx: + print("SLURM environment detected via slurm_args in context") + return True + + return False + + def _write_skipped_model_result(self, model_info: typing.Dict, status: str) -> None: + """Write a skipped model entry to the performance CSV. + + This method creates a CSV entry for models that were skipped during + execution, allowing for complete tracking and reporting of all models + in the run, including those that couldn't be executed. + + Args: + model_info: The model information dictionary. + status: The skip status (e.g., "SKIPPED_SLURM_REQUIRED"). 
+ """ + run_details = RunDetails() + run_details.model = model_info["name"] + run_details.n_gpus = model_info.get("n_gpus", "-1") + run_details.training_precision = model_info.get("training_precision", "") + run_details.args = model_info.get("args", "") + run_details.tags = model_info.get("tags", []) + run_details.status = status + run_details.pipeline = os.environ.get("pipeline", "") + run_details.machine_name = self.console.sh("hostname") + + # Get GPU architecture from context if available + if "docker_env_vars" in self.context.ctx and "MAD_SYSTEM_GPU_ARCHITECTURE" in self.context.ctx["docker_env_vars"]: + run_details.gpu_architecture = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] + else: + run_details.gpu_architecture = "N/A" + + # Generate JSON and update CSV + run_details.generate_json("perf_entry.json") + update_perf_csv( + single_result="perf_entry.json", + perf_csv=self.args.output, + ) + def run_model(self, model_info: typing.Dict) -> bool: """Run model on container. @@ -1055,6 +1143,46 @@ def run_model(self, model_info: typing.Dict) -> bool: """ print(f"Running model {model_info['name']} with {model_info}") + # Check if model requires SLURM but SLURM is not configured + if self._is_slurm_required_model(model_info): + if not self._is_slurm_environment(): + print(f"") + print(f"=" * 80) + print(f"⚠️ WARNING: Model '{model_info['name']}' requires SLURM cluster execution") + print(f"=" * 80) + print(f"") + print(f"This model is tagged with 'slurm' and is designed for distributed") + print(f"multi-node inference. It cannot run on typical single-node environments.") + print(f"") + print(f"Current environment: Single-node execution (no slurm_args detected)") + print(f"Required environment: SLURM cluster with multi-node configuration") + print(f"") + print(f"⚠️ SKIPPING model execution") + print(f"") + print(f"To run this model on a SLURM cluster, use:") + print(f"") + print(f" madengine run --tags {model_info['name']} \\") + print(f" --additional-context \"{{") + print(f" 'slurm_args': {{") + print(f" 'FRAMEWORK': 'sglang_disagg',") + print(f" 'PREFILL_NODES': '2',") + print(f" 'DECODE_NODES': '2',") + print(f" 'PARTITION': 'gpu-partition',") + print(f" 'TIME': '12:00:00',") + print(f" 'DOCKER_IMAGE': ''") + print(f" }}") + print(f" }}\"") + print(f"") + print(f"For more information, see: docs/how-to-run-multi-node.md") + print(f"=" * 80) + print(f"") + + # Write skip status to CSV for reporting + self._write_skipped_model_result(model_info, "SKIPPED_SLURM_REQUIRED") + + # Return True to not fail the entire run (this is a skip, not a failure) + return True + # Check if SLURM execution is requested if "slurm_args" in self.context.ctx: return self.run_model_slurm(model_info) From 77aa4bf1cb6ff582363d20f337d2b40b6153e714 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 14 Oct 2025 16:30:37 -0400 Subject: [PATCH 7/8] Modified the function of is slurm environment using scontrol show config which should work on SLURM login nodes --- src/madengine/tools/run_models.py | 41 +++++++++++-------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 1d5d5d03..56fedde8 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -1060,39 +1060,26 @@ def _is_slurm_required_model(self, model_info: typing.Dict) -> bool: def _is_slurm_environment(self) -> bool: """Check if current environment is a SLURM cluster. 
- This method detects if the code is running on a SLURM cluster by checking - for standard SLURM environment variables that are set by the SLURM workload - manager. This follows SLURM best practices for environment detection. - - SLURM sets various environment variables when jobs are submitted: - - SLURM_JOB_ID: The job ID assigned by SLURM - - SLURM_CLUSTER_NAME: The name of the cluster - - SLURM_JOBID: Alternative job ID variable - - SLURMD_NODENAME: The name of the node running the job + This method detects if the code is running on a SLURM cluster by attempting + to execute the 'scontrol show config' command, which is a standard SLURM + command available on all SLURM nodes. Returns: bool: True if running on a SLURM cluster, False otherwise. """ - # Check for SLURM environment variables that indicate we're on a SLURM cluster - slurm_indicators = [ - 'SLURM_JOB_ID', # Primary indicator - set when job is running - 'SLURM_JOBID', # Alternative job ID variable - 'SLURM_CLUSTER_NAME', # Cluster name - 'SLURMD_NODENAME' # Node name in SLURM - ] - - # If any SLURM environment variable is present, we're on a SLURM cluster - for indicator in slurm_indicators: - if indicator in os.environ: - print(f"SLURM environment detected: {indicator}={os.environ[indicator]}") + # Try to execute scontrol command to check if SLURM is available + try: + result = self.console.sh("scontrol show config 2>/dev/null | head -n 1", canFail=True) + # If the command succeeds and returns output, we're on a SLURM cluster + if result and len(result.strip()) > 0: + print(f"SLURM environment detected via 'scontrol show config'") + print(f"SLURM config: {result.strip()}") return True + except Exception as e: + # Command failed or not found - not a SLURM environment + pass - # Also check if slurm_args was explicitly provided in context - # This allows manual override for testing or special configurations - if "slurm_args" in self.context.ctx: - print("SLURM environment detected via slurm_args in context") - return True - + print("Not running on a SLURM cluster - single-node environment detected") return False def _write_skipped_model_result(self, model_info: typing.Dict, status: str) -> None: From 6b12b96702b70a8edd9e7cc9d5f87d7ce607a92d Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 14 Oct 2025 17:59:31 -0400 Subject: [PATCH 8/8] Added EXCLUSIVE_MODE logic to handle node exclusive access --- docs/how-to-run-multi-node.md | 36 ++++++++++++++++++++++++------- src/madengine/tools/run_models.py | 13 +++++++++++ 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/docs/how-to-run-multi-node.md b/docs/how-to-run-multi-node.md index b526bd6b..140f18d6 100644 --- a/docs/how-to-run-multi-node.md +++ b/docs/how-to-run-multi-node.md @@ -91,14 +91,15 @@ When `slurm_args` is provided in the `additional-context`, madengine will: The following arguments can be specified in the `slurm_args` dictionary: -| Argument | Description | Required | Example | -|----------|-------------|----------|---------| -| `FRAMEWORK` | Framework to use for the job | Yes | `'sglang_disagg'` | -| `PREFILL_NODES` | Number of nodes for prefill phase | Yes | `'2'` | -| `DECODE_NODES` | Number of nodes for decode phase | Yes | `'2'` | -| `PARTITION` | SLURM partition/queue name | Yes | `'amd-rccl'` | -| `TIME` | Maximum job runtime (HH:MM:SS) | Yes | `'12:00:00'` | -| `DOCKER_IMAGE` | Docker image to use (optional) | No | `''` (uses default from run.sh) | +| Argument | Description | Required | Default | Example | 
+|----------|-------------|----------|---------|---------| +| `FRAMEWORK` | Framework to use for the job | Yes | - | `'sglang_disagg'` | +| `PREFILL_NODES` | Number of nodes for prefill phase | Yes | - | `'2'` | +| `DECODE_NODES` | Number of nodes for decode phase | Yes | - | `'2'` | +| `PARTITION` | SLURM partition/queue name | Yes | - | `'amd-rccl'` | +| `TIME` | Maximum job runtime (HH:MM:SS) | Yes | - | `'12:00:00'` | +| `DOCKER_IMAGE` | Docker image to use | No | `''` | `'myregistry/image:tag'` | +| `EXCLUSIVE_MODE` | Request exclusive node access | No | `True` | `True` or `False` | ### Usage Examples @@ -150,6 +151,25 @@ madengine run --tags sglang_disagg_pd_deepseek_v2 \ }}" ``` +#### Using Exclusive Mode + +By default, `EXCLUSIVE_MODE` is `True`, which requests exclusive access to nodes (recommended for distributed inference). To share nodes with other jobs: + +```bash +madengine run --tags sglang_disagg_pd_qwen3-32B \ + --additional-context "{'slurm_args': { + 'FRAMEWORK': 'sglang_disagg', + 'PREFILL_NODES': '2', + 'DECODE_NODES': '2', + 'PARTITION': 'amd-rccl', + 'TIME': '12:00:00', + 'DOCKER_IMAGE': '', + 'EXCLUSIVE_MODE': False + }}" +``` + +**Note:** Exclusive mode (`--exclusive` in SLURM) is typically recommended for distributed multi-node workloads to ensure consistent performance and avoid interference from other jobs running on the same nodes. + ### Model Configuration Models configured for SLURM should include the model name in the `args` attribute of `models.json`. For example: diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 56fedde8..e1ab7f96 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -939,6 +939,19 @@ def run_model_slurm(self, model_info: typing.Dict) -> bool: # Add DOCKER_IMAGE if provided and not empty if "DOCKER_IMAGE" in slurm_args and slurm_args["DOCKER_IMAGE"]: env_vars["DOCKER_IMAGE"] = slurm_args["DOCKER_IMAGE"] + + # Add EXCLUSIVE_MODE if provided (default to true for multi-node jobs) + exclusive_mode = slurm_args.get("EXCLUSIVE_MODE", True) + if isinstance(exclusive_mode, bool): + env_vars["EXCLUSIVE_MODE"] = "true" if exclusive_mode else "false" + elif isinstance(exclusive_mode, str): + # Handle string values like "true", "false", "True", "False" + env_vars["EXCLUSIVE_MODE"] = exclusive_mode.lower() + else: + # Default to true for safety in multi-node distributed jobs + env_vars["EXCLUSIVE_MODE"] = "true" + + print(f"SLURM exclusive mode: {env_vars['EXCLUSIVE_MODE']}") # Set environment variables for key, value in env_vars.items():