From 3f48d73e3fd76774ac18acbe7a974a35f026db0c Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Fri, 23 Jan 2026 13:43:36 -0800 Subject: [PATCH 01/16] Implement --auto-parallelism --- architectures/centralized/client/src/app.rs | 1 + .../decentralized/solana-client/src/app.rs | 1 + shared/client/src/cli.rs | 4 + shared/client/src/lib.rs | 1 + shared/client/src/parallelism_lookup.rs | 54 ++++++++ shared/client/src/state/init.rs | 122 ++++++++++++------ 6 files changed, 146 insertions(+), 37 deletions(-) create mode 100644 shared/client/src/parallelism_lookup.rs diff --git a/architectures/centralized/client/src/app.rs b/architectures/centralized/client/src/app.rs index 3560e0a7a..4d2d7e873 100644 --- a/architectures/centralized/client/src/app.rs +++ b/architectures/centralized/client/src/app.rs @@ -131,6 +131,7 @@ pub async fn build_app( .await?; let state_options: RunInitConfig = RunInitConfig { + parallelism_auto: p.parallelism_auto, data_parallelism: p.data_parallelism, tensor_parallelism: p.tensor_parallelism, micro_batch_size: p.micro_batch_size, diff --git a/architectures/decentralized/solana-client/src/app.rs b/architectures/decentralized/solana-client/src/app.rs index 36a529bbb..82914752f 100644 --- a/architectures/decentralized/solana-client/src/app.rs +++ b/architectures/decentralized/solana-client/src/app.rs @@ -114,6 +114,7 @@ pub async fn build_app( let state_options: RunInitConfig = RunInitConfig { + parallelism_auto: p.parallelism_auto, data_parallelism: p.data_parallelism, tensor_parallelism: p.tensor_parallelism, micro_batch_size: p.micro_batch_size, diff --git a/shared/client/src/cli.rs b/shared/client/src/cli.rs index 268ea753a..139e273a6 100644 --- a/shared/client/src/cli.rs +++ b/shared/client/src/cli.rs @@ -112,6 +112,10 @@ pub struct TrainArgs { #[clap(long, env, value_parser = parse_trim_quotes)] pub run_id: String, + /// Auto-detect parallelism settings from lookup table based on model and GPU count + #[clap(long, env)] + pub 
parallelism_auto: bool, + #[clap(long, default_value_t = 1, env)] pub data_parallelism: usize, diff --git a/shared/client/src/lib.rs b/shared/client/src/lib.rs index b4ea80576..c9aaaf2aa 100644 --- a/shared/client/src/lib.rs +++ b/shared/client/src/lib.rs @@ -1,6 +1,7 @@ mod cli; mod client; mod fetch_data; +pub mod parallelism_lookup; mod protocol; mod state; mod tui; diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs new file mode 100644 index 000000000..2186c8804 --- /dev/null +++ b/shared/client/src/parallelism_lookup.rs @@ -0,0 +1,54 @@ +use anyhow::{Result, bail}; +use serde::Deserialize; +use std::collections::HashMap; +use std::process::Command; + +const PARALLELISM_DATA: &str = include_str!("parallelism_data.json"); + +#[derive(Debug, Clone, Copy, Deserialize)] +pub struct ParallelismConfig { + pub dp: usize, + pub tp: usize, + pub micro_batch_size: usize, +} + +type Table = HashMap>; + +pub fn get_num_gpus() -> Result { + let output = Command::new("nvidia-smi") + .args(["--query-gpu=name", "--format=csv,noheader"]) + .output()?; + + if !output.status.success() { + bail!("nvidia-smi failed"); + } + + let count = String::from_utf8_lossy(&output.stdout) + .lines() + .filter(|s| !s.is_empty()) + .count(); + + if count == 0 { + bail!("No GPUs detected"); + } + + Ok(count) +} + +pub fn lookup(model_repo_id: &str, num_gpus: usize) -> Result { + let table: Table = serde_json::from_str(PARALLELISM_DATA)?; + + let gpu_configs = table + .get(model_repo_id) + .ok_or_else(|| anyhow::anyhow!("Model '{}' not in parallelism table", model_repo_id))?; + + let config = gpu_configs.get(&num_gpus.to_string()).ok_or_else(|| { + anyhow::anyhow!( + "No config for {} GPUs with model '{}'", + num_gpus, + model_repo_id + ) + })?; + + Ok(*config) +} diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index f7326f382..406842d63 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ 
-1,4 +1,4 @@ -use crate::{WandBInfo, fetch_data::DataFetcher}; +use crate::{WandBInfo, fetch_data::DataFetcher, parallelism_lookup}; use psyche_coordinator::{ Coordinator, HealthChecks, model::{self, HttpLLMTrainingDataLocation, LLMTrainingDataLocation}, @@ -50,6 +50,8 @@ pub struct RunInitConfig { pub device: Devices, pub hub_read_token: Option, pub hub_max_concurrent_downloads: usize, + /// If true, auto-detect parallelism from lookup table (overrides dp/tp/micro_batch_size) + pub parallelism_auto: bool, pub data_parallelism: usize, pub tensor_parallelism: usize, pub micro_batch_size: usize, @@ -115,6 +117,9 @@ pub enum InitRunError { #[error("Unsupported architecture: {0}")] UnsupportedArchitecture(String), + #[error("Parallelism auto-detection failed: {0}")] + ParallelismLookupFailed(#[from] anyhow::Error), + #[cfg(feature = "python")] #[error("Python distributed error: {0}")] PythonDistributedError(#[from] psyche_modeling::PythonDistributedCausalLMError), @@ -195,6 +200,59 @@ impl RunInitConfigAndIO = match &llm.checkpoint { + model::Checkpoint::Hub(hub_repo) | model::Checkpoint::P2P(hub_repo) => { + Some((&hub_repo.repo_id).into()) + } + _ => None, + }; + + if let Some(repo_id) = model_repo_id { + let num_gpus = parallelism_lookup::get_num_gpus()?; + let config = parallelism_lookup::lookup(&repo_id, num_gpus)?; + (config.dp, config.tp, config.micro_batch_size) + } else { + (1, 1, 1) + } + } else { + ( + init_config.data_parallelism, + init_config.tensor_parallelism, + init_config.micro_batch_size, + ) + }; + + #[cfg(not(feature = "parallelism"))] + let (data_parallelism, tensor_parallelism, micro_batch_size) = ( + init_config.data_parallelism, + init_config.tensor_parallelism, + init_config.micro_batch_size, + ); + + info!( + "Parallelism: dp={}, tp={}, micro_batch_size={}", + data_parallelism, tensor_parallelism, micro_batch_size + ); + let hub_read_token = init_config.hub_read_token.clone(); let hub_max_concurrent_downloads = 
init_config.hub_max_concurrent_downloads; let data_future = async { @@ -277,7 +335,7 @@ impl RunInitConfigAndIO RunInitConfigAndIO 1 + if data_parallelism > 1 && llm.architecture == model::LLMArchitecture::HfAuto { 1 } else { - init_config.data_parallelism + data_parallelism }, ); @@ -467,8 +525,8 @@ impl RunInitConfigAndIO { #[cfg(feature = "python")] { - let dp = init_config.data_parallelism; - let tp = init_config.tensor_parallelism; + let dp = data_parallelism; + let tp = tensor_parallelism; tokio::task::spawn_blocking(move || { if tp != 1 || dp != 1 { @@ -520,31 +578,25 @@ impl RunInitConfigAndIO { let mut futures: Vec< JoinHandle, ModelLoadError>>, - > = Vec::with_capacity( - init_config.data_parallelism * init_config.tensor_parallelism, - ); + > = Vec::with_capacity(data_parallelism * tensor_parallelism); let devices = init_config.device.clone(); - for dp in 0..init_config.data_parallelism { + for dp in 0..data_parallelism { let communicator_id: Option = - match init_config.tensor_parallelism { + match tensor_parallelism { 0 | 1 => None, #[cfg(feature = "parallelism")] _ => Some(tch::CStore::new().into()), #[cfg(not(feature = "parallelism"))] _ => unimplemented!(), }; - for tp in 0..init_config.tensor_parallelism { + for tp in 0..tensor_parallelism { let tensor_parallelism_world = communicator_id.as_ref().map(|communicator_id| { - ( - communicator_id.clone(), - tp, - init_config.tensor_parallelism, - ) + (communicator_id.clone(), tp, tensor_parallelism) }); let source = source.clone(); - let rank = dp * init_config.tensor_parallelism + tp; + let rank = dp * tensor_parallelism + tp; let devices = devices.clone(); let device = devices.device_for_rank(rank); futures.push(tokio::task::spawn_blocking(move || { @@ -604,9 +656,9 @@ impl RunInitConfigAndIO RunInitConfigAndIO::new(data_provider, init_config.data_parallelism * 2); + let data_fetcher = DataFetcher::::new(data_provider, data_parallelism * 2); let trainers: Vec = match models { 
RawLoadedModelType::ParallelNativeModels(models) => { @@ -690,26 +741,24 @@ impl RunInitConfigAndIO)>> = - if init_config.data_parallelism > 1 { + if data_parallelism > 1 { #[cfg(feature = "parallelism")] { Some( - (0..init_config.tensor_parallelism) + (0..tensor_parallelism) .map(|_| { ( tch::CStore::new().into(), - Arc::new(CancellableBarrier::new( - init_config.tensor_parallelism, - )) + Arc::new(CancellableBarrier::new(tensor_parallelism)) as Arc, ) }) @@ -736,13 +785,12 @@ impl RunInitConfigAndIO; + let barrier = Arc::new(CancellableBarrier::new(tensor_parallelism)) + as Arc; LocalTrainer::new( ParallelModels { models, @@ -751,7 +799,7 @@ impl RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO Date: Fri, 23 Jan 2026 13:43:51 -0800 Subject: [PATCH 02/16] parallelism_data.json --- shared/client/src/parallelism_data.json | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 shared/client/src/parallelism_data.json diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json new file mode 100644 index 000000000..439fd5523 --- /dev/null +++ b/shared/client/src/parallelism_data.json @@ -0,0 +1,17 @@ +{ + "NousResearch/Meta-Llama-3.1-8B": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + }, + "TinyLlama/TinyLlama-1.1B-Chat-v0.4": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 8 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } + }, + "deepseek-ai/DeepSeek-V2-Lite": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 2 }, + "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } + }, + "NousResearch/Hermes-4-70B": { + "8": { "dp": 1, "tp": 8, "micro_batch_size": 1 } + } +} From b541cc545eb14256c8fa2e7875a85b3f3a11a0dd Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Fri, 23 Jan 2026 13:59:20 -0800 Subject: [PATCH 03/16] simplify code --- shared/client/src/parallelism_lookup.rs | 34 ++++++++++--------------- shared/client/src/state/init.rs | 23 
++++++++--------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 2186c8804..493ee08f5 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -1,7 +1,8 @@ -use anyhow::{Result, bail}; +use anyhow::Result; use serde::Deserialize; use std::collections::HashMap; use std::process::Command; +use tracing::info; const PARALLELISM_DATA: &str = include_str!("parallelism_data.json"); @@ -14,28 +15,21 @@ pub struct ParallelismConfig { type Table = HashMap>; -pub fn get_num_gpus() -> Result { - let output = Command::new("nvidia-smi") +fn get_gpu_name() -> String { + Command::new("nvidia-smi") .args(["--query-gpu=name", "--format=csv,noheader"]) - .output()?; - - if !output.status.success() { - bail!("nvidia-smi failed"); - } - - let count = String::from_utf8_lossy(&output.stdout) - .lines() - .filter(|s| !s.is_empty()) - .count(); - - if count == 0 { - bail!("No GPUs detected"); - } - - Ok(count) + .output() + .ok() + .and_then(|o| String::from_utf8(o.stdout).ok()) + .and_then(|s| s.lines().next().map(|l| l.trim().to_string())) + .unwrap_or_else(|| "unknown".to_string()) } -pub fn lookup(model_repo_id: &str, num_gpus: usize) -> Result { +pub fn lookup(model_repo_id: &str) -> Result { + let num_gpus = tch::Cuda::device_count(); + let gpu_name = get_gpu_name(); + info!("Detected {} GPU(s): {}", num_gpus, gpu_name); + let table: Table = serde_json::from_str(PARALLELISM_DATA)?; let gpu_configs = table diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 406842d63..646d058d1 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -213,26 +213,25 @@ impl RunInitConfigAndIO = match &llm.checkpoint { + let model_repo_id: String = match &llm.checkpoint { model::Checkpoint::Hub(hub_repo) | model::Checkpoint::P2P(hub_repo) => { - Some((&hub_repo.repo_id).into()) + 
(&hub_repo.repo_id).into() + } + _ => { + return Err(InitRunError::ParallelismLookupFailed(anyhow::anyhow!( + "--parallelism-auto requires a Hub or P2P checkpoint" + ))); } - _ => None, }; - if let Some(repo_id) = model_repo_id { - let num_gpus = parallelism_lookup::get_num_gpus()?; - let config = parallelism_lookup::lookup(&repo_id, num_gpus)?; - (config.dp, config.tp, config.micro_batch_size) - } else { - (1, 1, 1) - } + let config = parallelism_lookup::lookup(&model_repo_id)?; + (config.dp, config.tp, config.micro_batch_size) } else { ( init_config.data_parallelism, From 3af705e25a868750c140290886d201521b1dc8b4 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Fri, 23 Jan 2026 14:12:10 -0800 Subject: [PATCH 04/16] clippy --- shared/client/src/state/init.rs | 61 +++++++++++++++++---------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 646d058d1..dbd1d5e1b 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -1,4 +1,6 @@ -use crate::{WandBInfo, fetch_data::DataFetcher, parallelism_lookup}; +#[cfg(feature = "parallelism")] +use crate::parallelism_lookup; +use crate::{WandBInfo, fetch_data::DataFetcher}; use psyche_coordinator::{ Coordinator, HealthChecks, model::{self, HttpLLMTrainingDataLocation, LLMTrainingDataLocation}, @@ -209,36 +211,37 @@ impl RunInitConfigAndIO { - (&hub_repo.repo_id).into() - } - _ => { - return Err(InitRunError::ParallelismLookupFailed(anyhow::anyhow!( - "--parallelism-auto requires a Hub or P2P checkpoint" - ))); + let (data_parallelism, tensor_parallelism, micro_batch_size) = + if init_config.parallelism_auto { + if init_config.data_parallelism != 1 + || init_config.tensor_parallelism != 1 + || init_config.micro_batch_size != 1 + { + tracing::warn!( + "--parallelism-auto is set, ignoring manual dp/tp/micro_batch_size values" + ); } - }; - let config = parallelism_lookup::lookup(&model_repo_id)?; - (config.dp, 
config.tp, config.micro_batch_size) - } else { - ( - init_config.data_parallelism, - init_config.tensor_parallelism, - init_config.micro_batch_size, - ) - }; + let model_repo_id: String = match &llm.checkpoint { + model::Checkpoint::Hub(hub_repo) | model::Checkpoint::P2P(hub_repo) => { + (&hub_repo.repo_id).into() + } + _ => { + return Err(InitRunError::ParallelismLookupFailed(anyhow::anyhow!( + "--parallelism-auto requires a Hub or P2P checkpoint" + ))); + } + }; + + let config = parallelism_lookup::lookup(&model_repo_id)?; + (config.dp, config.tp, config.micro_batch_size) + } else { + ( + init_config.data_parallelism, + init_config.tensor_parallelism, + init_config.micro_batch_size, + ) + }; #[cfg(not(feature = "parallelism"))] let (data_parallelism, tensor_parallelism, micro_batch_size) = ( From 27f4ed5436dc03fd849b6bfe186b5a2d80a1a3d6 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 07:55:35 -0800 Subject: [PATCH 05/16] add parallelism_data.json to Garnix --- nix/lib.nix | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nix/lib.nix b/nix/lib.nix index 961d212ad..8bfe63aef 100644 --- a/nix/lib.nix +++ b/nix/lib.nix @@ -20,7 +20,8 @@ let || (builtins.match ".*tests/fixtures/.*$" path != null) || (builtins.match ".*.config/.*$" path != null) || (builtins.match ".*local-dev-keypair.json$" path != null) - || (builtins.match ".*shared/client/src/state/prompt_texts/index\\.json$" path != null); + || (builtins.match ".*shared/client/src/state/prompt_texts/index\\.json$" path != null) + || (builtins.match ".*shared/client/src/parallelism_data\\.json$" path != null); src = lib.cleanSourceWith { src = ../.; From a8d53301a731b4a0144eb5710a99360af23ab90a Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 08:56:02 -0800 Subject: [PATCH 06/16] add hardware type to json --- shared/client/src/parallelism_data.json | 31 +++++++++++++++++----- shared/client/src/parallelism_lookup.rs | 34 +++++++++++++++++++------ 2 files 
changed, 50 insertions(+), 15 deletions(-) diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json index 439fd5523..53cf4d2f3 100644 --- a/shared/client/src/parallelism_data.json +++ b/shared/client/src/parallelism_data.json @@ -1,17 +1,34 @@ { + "_doc (HuggingFace repo)": { + "gpu type from nvidia-smi": { + "numember of gpus": { + "dp": "data parallelism", + "tp": "tensor parallelism", + "micro_batch_size": "micro batch size" + } + } + }, "NousResearch/Meta-Llama-3.1-8B": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + } }, "TinyLlama/TinyLlama-1.1B-Chat-v0.4": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 8 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 8 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } + } }, "deepseek-ai/DeepSeek-V2-Lite": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 2 }, - "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 2 }, + "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } + } }, "NousResearch/Hermes-4-70B": { - "8": { "dp": 1, "tp": 8, "micro_batch_size": 1 } + "H100": { + "8": { "dp": 1, "tp": 8, "micro_batch_size": 1 } + } } } diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 493ee08f5..c9f600bfd 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -13,22 +13,31 @@ pub struct ParallelismConfig { pub micro_batch_size: usize, } -type Table = HashMap>; +type Table = HashMap>>; -fn get_gpu_name() -> String { - Command::new("nvidia-smi") +fn get_gpu_type() -> String { + let raw = Command::new("nvidia-smi") .args(["--query-gpu=name", "--format=csv,noheader"]) .output() .ok() .and_then(|o| 
String::from_utf8(o.stdout).ok()) .and_then(|s| s.lines().next().map(|l| l.trim().to_string())) - .unwrap_or_else(|| "unknown".to_string()) + .unwrap_or_default(); + + // Normalize GPU name to match table keys + if raw.to_uppercase().contains("H200") { + "H200".to_string() + } else if raw.to_uppercase().contains("H100") { + "H100".to_string() + } else { + raw + } } pub fn lookup(model_repo_id: &str) -> Result { let num_gpus = tch::Cuda::device_count(); - let gpu_name = get_gpu_name(); - info!("Detected {} GPU(s): {}", num_gpus, gpu_name); + let gpu_type = get_gpu_type(); + info!("Detected {} x {} GPU(s)", num_gpus, gpu_type); let table: Table = serde_json::from_str(PARALLELISM_DATA)?; @@ -36,10 +45,19 @@ pub fn lookup(model_repo_id: &str) -> Result { .get(model_repo_id) .ok_or_else(|| anyhow::anyhow!("Model '{}' not in parallelism table", model_repo_id))?; - let config = gpu_configs.get(&num_gpus.to_string()).ok_or_else(|| { + let num_gpu_configs = gpu_configs.get(&gpu_type).ok_or_else(|| { + anyhow::anyhow!( + "GPU '{}' not in parallelism table for model '{}'", + gpu_type, + model_repo_id + ) + })?; + + let config = num_gpu_configs.get(&num_gpus.to_string()).ok_or_else(|| { anyhow::anyhow!( - "No config for {} GPUs with model '{}'", + "No config for {} x {} with model '{}'", num_gpus, + gpu_type, model_repo_id ) })?; From f265de968cc986b8f52ad5497651821f4de41c9d Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 12:27:17 -0800 Subject: [PATCH 07/16] update .json --- scripts/train-solana-test.sh | 0 shared/client/src/parallelism_data.json | 20 +++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) mode change 100755 => 100644 scripts/train-solana-test.sh diff --git a/scripts/train-solana-test.sh b/scripts/train-solana-test.sh old mode 100755 new mode 100644 diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json index 53cf4d2f3..5386783a2 100644 --- a/shared/client/src/parallelism_data.json +++ 
b/shared/client/src/parallelism_data.json @@ -2,16 +2,16 @@ "_doc (HuggingFace repo)": { "gpu type from nvidia-smi": { "numember of gpus": { - "dp": "data parallelism", - "tp": "tensor parallelism", - "micro_batch_size": "micro batch size" + "dp": 0, + "tp": 0, + "micro_batch_size": 0 } } }, - "NousResearch/Meta-Llama-3.1-8B": { + "emozilla/llama2-20m-init": { "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, + "8": { "dp": 1, "tp": 1, "micro_batch_size": 1 } } }, "TinyLlama/TinyLlama-1.1B-Chat-v0.4": { @@ -20,9 +20,15 @@ "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } } }, + "NousResearch/Meta-Llama-3.1-8B": { + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + } + }, "deepseek-ai/DeepSeek-V2-Lite": { "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 2 }, + "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } } }, From 1d3aafd5532a6adb8b1f9fe6515e43f50d1a7eda Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 12:27:36 -0800 Subject: [PATCH 08/16] Fallback: read from /proc/driver/nvidia --- shared/client/src/parallelism_lookup.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index c9f600bfd..131cc619b 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -16,12 +16,31 @@ pub struct ParallelismConfig { type Table = HashMap>>; fn get_gpu_type() -> String { + // Try nvidia-smi first let raw = Command::new("nvidia-smi") .args(["--query-gpu=name", "--format=csv,noheader"]) .output() .ok() .and_then(|o| String::from_utf8(o.stdout).ok()) .and_then(|s| s.lines().next().map(|l| l.trim().to_string())) + .filter(|s| !s.is_empty()) + // Fallback: read from /proc/driver/nvidia 
(works in containers without nvidia-smi) + .or_else(|| { + std::fs::read_dir("/proc/driver/nvidia/gpus") + .ok()? + .filter_map(|e| e.ok()) + .next() + .and_then(|entry| { + let info_path = entry.path().join("information"); + std::fs::read_to_string(info_path).ok() + }) + .and_then(|content| { + content + .lines() + .find(|line| line.starts_with("Model:")) + .map(|line| line.trim_start_matches("Model:").trim().to_string()) + }) + }) .unwrap_or_default(); // Normalize GPU name to match table keys From 50762e5c20b54bd66400a05be83ba8a3a833399a Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 12:34:32 -0800 Subject: [PATCH 09/16] restore scripts/train-solana-test.sh --- scripts/train-solana-test.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/train-solana-test.sh diff --git a/scripts/train-solana-test.sh b/scripts/train-solana-test.sh old mode 100644 new mode 100755 From f6c1c7a9538fa46654a68fc65b009e74eb7c06c6 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 13:08:54 -0800 Subject: [PATCH 10/16] update documentation --- psyche-book/src/enduser/create-run.md | 21 ++++++++++++++++++++- psyche-book/src/enduser/join-run.md | 10 ++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index 7a9354ca1..542d58837 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -86,11 +86,30 @@ run-manager create-run \ At this point, your run has been successfully created. +### Updating the parallelism lookup table (recommended) + +If your run uses a model that is not already in the [parallelism lookup table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json), it's recommended to add the optimal parallelism configuration for your model and target GPU hardware. 
This allows clients to use `PARALLELISM_AUTO=true` for automatic configuration. + +The table maps model (HuggingFace repo ID) → GPU type → number of GPUs → parallelism settings: + +```json +{ + "your-org/your-model": { + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 4, "tp": 2, "micro_batch_size": 4 } + } + } +} +``` + +Consider opening a PR to add your model's configuration so other clients can benefit from it. + ### Initializing configuration Initially, the run will not have any configuration defined and will remain paused, so no clients can join yet. -To set the run configuration, you’ll need to provide mostly the same parameters as when creating the run, along with the path to a `config.toml` file that follows the [run config schema](./run-config.md). +To set the run configuration, you'll need to provide mostly the same parameters as when creating the run, along with the path to a `config.toml` file that follows the [run config schema](./run-config.md). ```bash run-manager update-config \ diff --git a/psyche-book/src/enduser/join-run.md b/psyche-book/src/enduser/join-run.md index b79678bcd..9ad7492c7 100644 --- a/psyche-book/src/enduser/join-run.md +++ b/psyche-book/src/enduser/join-run.md @@ -93,19 +93,29 @@ though you might need to. **`NVIDIA_DRIVER_CAPABILITIES`** - An environment variable that the NVIDIA Container Toolkit uses to determine which compute capabilities should be provided to your container. It is recommended to set it to 'all', e.g. `NVIDIA_DRIVER_CAPABILITIES=all`. +**`PARALLELISM_AUTO`** - Set to `true` to automatically detect optimal parallelism settings based on the model and your GPU hardware. 
+ +- When enabled, the client will look up the best `DATA_PARALLELISM`, `TENSOR_PARALLELISM`, and `MICRO_BATCH_SIZE` values from a [built-in configuration table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json) +- Your model and GPU hardware combination must be present in the table +- This is the recommended option for most users +- If set, manual parallelism settings below will be ignored + **`DATA_PARALLELISM`** - Number of GPUs to distribute training data across. - If you have multiple GPUs, you can set this to 2, 4, etc. to speed up training - If you have 1 GPU, set this to `1` +- Ignored if `PARALLELISM_AUTO=true` **`TENSOR_PARALLELISM`** - Number of GPUs to distribute the model across, this lets you train a model you can't fit on one single GPU. - If you have 1 GPU, set this to `1` - If your have `n` GPUs you can distribute the model across all of them by setting it to `n`. +- Ignored if `PARALLELISM_AUTO=true` **`MICRO_BATCH_SIZE`** - Number of samples processed per GPU per training step - Set as high as your GPU memory allows +- Ignored if `PARALLELISM_AUTO=true` **`AUTHORIZER`** - The Solana address that authorized your wallet to join this run From 42fd01256d12dc5a025307ae72a2484ee920760f Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 18:36:31 -0300 Subject: [PATCH 11/16] Change micro_batch_size for Meta-Llama-3.1 to 1 --- shared/client/src/parallelism_data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json index 5386783a2..234146aae 100644 --- a/shared/client/src/parallelism_data.json +++ b/shared/client/src/parallelism_data.json @@ -22,7 +22,7 @@ }, "NousResearch/Meta-Llama-3.1-8B": { "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } } }, From 
1695b9305ecbc3f595adc9d742f15c64a9968336 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Tue, 27 Jan 2026 10:07:27 -0800 Subject: [PATCH 12/16] nit --- shared/client/src/parallelism_lookup.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 131cc619b..5968f33a0 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -17,7 +17,7 @@ type Table = HashMap> fn get_gpu_type() -> String { // Try nvidia-smi first - let raw = Command::new("nvidia-smi") + let raw_gpu_name = Command::new("nvidia-smi") .args(["--query-gpu=name", "--format=csv,noheader"]) .output() .ok() @@ -44,12 +44,12 @@ fn get_gpu_type() -> String { .unwrap_or_default(); // Normalize GPU name to match table keys - if raw.to_uppercase().contains("H200") { + if raw_gpu_name.to_uppercase().contains("H200") { "H200".to_string() - } else if raw.to_uppercase().contains("H100") { + } else if raw_gpu_name.to_uppercase().contains("H100") { "H100".to_string() } else { - raw + raw_gpu_name } } From 51757239add95850aa30d52bafdc3ec64030336f Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Tue, 27 Jan 2026 12:04:36 -0800 Subject: [PATCH 13/16] look data-parallelism.json in HF repo --- psyche-book/src/enduser/create-run.md | 22 ++++++- shared/client/src/parallelism_lookup.rs | 76 +++++++++++++++++++------ 2 files changed, 77 insertions(+), 21 deletions(-) diff --git a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index 542d58837..bc9e31188 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -86,11 +86,13 @@ run-manager create-run \ At this point, your run has been successfully created. 
-### Updating the parallelism lookup table (recommended) +### Adding parallelism configuration (recommended) If your run uses a model that is not already in the [parallelism lookup table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json), it's recommended to add the optimal parallelism configuration for your model and target GPU hardware. This allows clients to use `PARALLELISM_AUTO=true` for automatic configuration. -The table maps model (HuggingFace repo ID) → GPU type → number of GPUs → parallelism settings: +#### Option 1: Add to your model's HuggingFace repo (preferred) + +Add a `parallelism_data.json` file directly to your model's HuggingFace repository. The client will automatically fetch this configuration at runtime - no Psyche rebuild required. ```json { @@ -98,12 +100,26 @@ The table maps model (HuggingFace repo ID) → GPU type → number of GPUs → p "H100": { "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, "8": { "dp": 4, "tp": 2, "micro_batch_size": 4 } + }, + "H200": { + "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } } } } ``` -Consider opening a PR to add your model's configuration so other clients can benefit from it. +This is the preferred approach because: + +- No PR to Psyche required +- No Docker image rebuild needed +- Model creators manage their own configuration +- Changes take effect immediately + +#### Option 2: Add to Psyche's compiled table + +Alternatively, open a PR to add your model to the [compiled parallelism table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json). This is useful for widely-used models that should have default configurations. The format is the same as above. + +**Lookup order**: The client first tries to fetch config from the model's HuggingFace repo, then falls back to the compiled table with a warning if not found. 
### Initializing configuration diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 5968f33a0..80b0ae770 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -1,10 +1,12 @@ use anyhow::Result; +use hf_hub::{Repo, RepoType}; use serde::Deserialize; use std::collections::HashMap; use std::process::Command; -use tracing::info; +use tracing::{info, warn}; const PARALLELISM_DATA: &str = include_str!("parallelism_data.json"); +const REMOTE_CONFIG_FILENAME: &str = "parallelism_data.json"; #[derive(Debug, Clone, Copy, Deserialize)] pub struct ParallelismConfig { @@ -13,6 +15,7 @@ pub struct ParallelismConfig { pub micro_batch_size: usize, } +// Table format: model -> gpu_type -> num_gpus -> config type Table = HashMap>>; fn get_gpu_type() -> String { @@ -53,33 +56,70 @@ fn get_gpu_type() -> String { } } +/// Try to load parallelism config from the model's HuggingFace repo +fn load_from_model_repo(model_repo_id: &str) -> Option { + let token = std::env::var("HF_TOKEN").ok(); + + let api = hf_hub::api::sync::ApiBuilder::new() + .with_token(token) + .build() + .ok()? + .repo(Repo::new(model_repo_id.to_string(), RepoType::Model)); + + let path = api.get(REMOTE_CONFIG_FILENAME).ok()?; + let content = std::fs::read_to_string(path).ok()?; + serde_json::from_str(&content).ok() +} + +/// Lookup config in a table +fn lookup_in_table( + table: &Table, + model_repo_id: &str, + gpu_type: &str, + num_gpus: usize, +) -> Option { + table + .get(model_repo_id) + .and_then(|g| g.get(gpu_type)) + .and_then(|n| n.get(&num_gpus.to_string())) + .copied() +} + +/// Load the compiled parallelism table +fn load_compiled_table() -> Result
{ + serde_json::from_str(PARALLELISM_DATA) + .map_err(|e| anyhow::anyhow!("Failed to parse compiled parallelism data: {}", e)) +} + pub fn lookup(model_repo_id: &str) -> Result { - let num_gpus = tch::Cuda::device_count(); + let num_gpus = tch::Cuda::device_count() as usize; let gpu_type = get_gpu_type(); info!("Detected {} x {} GPU(s)", num_gpus, gpu_type); - let table: Table = serde_json::from_str(PARALLELISM_DATA)?; - - let gpu_configs = table - .get(model_repo_id) - .ok_or_else(|| anyhow::anyhow!("Model '{}' not in parallelism table", model_repo_id))?; + // Try model's own config first + if let Some(table) = load_from_model_repo(model_repo_id) { + if let Some(config) = lookup_in_table(&table, model_repo_id, &gpu_type, num_gpus) { + info!( + "Using parallelism config from model repo '{}'", + model_repo_id + ); + return Ok(config); + } + } - let num_gpu_configs = gpu_configs.get(&gpu_type).ok_or_else(|| { - anyhow::anyhow!( - "GPU '{}' not in parallelism table for model '{}'", - gpu_type, - model_repo_id - ) - })?; + // Fall back to compiled table + warn!( + "No parallelism config found in model repo '{}', using compiled defaults", + model_repo_id + ); - let config = num_gpu_configs.get(&num_gpus.to_string()).ok_or_else(|| { + let table = load_compiled_table()?; + lookup_in_table(&table, model_repo_id, &gpu_type, num_gpus).ok_or_else(|| { anyhow::anyhow!( "No config for {} x {} with model '{}'", num_gpus, gpu_type, model_repo_id ) - })?; - - Ok(*config) + }) } From 8f6d14e758d4a09b22ba4b33185c4cd6e4529c0a Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Wed, 28 Jan 2026 10:39:41 -0800 Subject: [PATCH 14/16] change json format --- psyche-book/src/enduser/create-run.md | 39 ++++++-------- shared/client/src/parallelism_data.json | 40 --------------- shared/client/src/parallelism_lookup.rs | 67 +++++++++---------------- 3 files changed, 38 insertions(+), 108 deletions(-) delete mode 100644 shared/client/src/parallelism_data.json diff --git 
a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index bc9e31188..6aaebb5d4 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -86,40 +86,31 @@ run-manager create-run \ At this point, your run has been successfully created. -### Adding parallelism configuration (recommended) +### Adding parallelism configuration (required for --parallelism-auto) -If your run uses a model that is not already in the [parallelism lookup table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json), it's recommended to add the optimal parallelism configuration for your model and target GPU hardware. This allows clients to use `PARALLELISM_AUTO=true` for automatic configuration. - -#### Option 1: Add to your model's HuggingFace repo (preferred) - -Add a `parallelism_data.json` file directly to your model's HuggingFace repository. The client will automatically fetch this configuration at runtime - no Psyche rebuild required. +If you want clients to use `PARALLELISM_AUTO=true` for automatic configuration, you must add a `parallelism_data.json` file to your model's HuggingFace repository. 
```json { - "your-org/your-model": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, - "8": { "dp": 4, "tp": 2, "micro_batch_size": 4 } - }, - "H200": { - "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } - } + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 4, "tp": 2, "micro_batch_size": 4 } + }, + "H200": { + "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } } } ``` -This is the preferred approach because: - -- No PR to Psyche required -- No Docker image rebuild needed -- Model creators manage their own configuration -- Changes take effect immediately - -#### Option 2: Add to Psyche's compiled table +Format: `gpu_type` → `num_gpus` → config -Alternatively, open a PR to add your model to the [compiled parallelism table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json). This is useful for widely-used models that should have default configurations. The format is the same as above. +- **gpu_type**: GPU model name (e.g., "H100", "H200") +- **num_gpus**: Number of GPUs available (e.g., "1", "8") +- **dp**: Data parallelism +- **tp**: Tensor parallelism +- **micro_batch_size**: Micro batch size per GPU -**Lookup order**: The client first tries to fetch config from the model's HuggingFace repo, then falls back to the compiled table with a warning if not found. +The config is shared via P2P when clients join a run. 
### Initializing configuration diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json deleted file mode 100644 index 234146aae..000000000 --- a/shared/client/src/parallelism_data.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "_doc (HuggingFace repo)": { - "gpu type from nvidia-smi": { - "numember of gpus": { - "dp": 0, - "tp": 0, - "micro_batch_size": 0 - } - } - }, - "emozilla/llama2-20m-init": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, - "8": { "dp": 1, "tp": 1, "micro_batch_size": 1 } - } - }, - "TinyLlama/TinyLlama-1.1B-Chat-v0.4": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 8 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } - } - }, - "NousResearch/Meta-Llama-3.1-8B": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } - } - }, - "deepseek-ai/DeepSeek-V2-Lite": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, - "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } - } - }, - "NousResearch/Hermes-4-70B": { - "H100": { - "8": { "dp": 1, "tp": 8, "micro_batch_size": 1 } - } - } -} diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 80b0ae770..30b3a02b0 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -3,9 +3,8 @@ use hf_hub::{Repo, RepoType}; use serde::Deserialize; use std::collections::HashMap; use std::process::Command; -use tracing::{info, warn}; +use tracing::info; -const PARALLELISM_DATA: &str = include_str!("parallelism_data.json"); const REMOTE_CONFIG_FILENAME: &str = "parallelism_data.json"; #[derive(Debug, Clone, Copy, Deserialize)] @@ -15,8 +14,8 @@ pub struct ParallelismConfig { pub micro_batch_size: usize, } -// Table format: model -> gpu_type -> num_gpus -> config -type Table = HashMap>>; +// Table format: gpu_type -> num_gpus -> config +type Table = HashMap>; fn get_gpu_type() -> String { // 
Try nvidia-smi first @@ -56,8 +55,8 @@ fn get_gpu_type() -> String { } } -/// Try to load parallelism config from the model's HuggingFace repo -fn load_from_model_repo(model_repo_id: &str) -> Option<Table>
{ +/// Try to load parallelism config JSON from the model's HuggingFace repo +fn load_json_from_model_repo(model_repo_id: &str) -> Option { let token = std::env::var("HF_TOKEN").ok(); let api = hf_hub::api::sync::ApiBuilder::new() @@ -67,59 +66,39 @@ fn load_from_model_repo(model_repo_id: &str) -> Option
{ .repo(Repo::new(model_repo_id.to_string(), RepoType::Model)); let path = api.get(REMOTE_CONFIG_FILENAME).ok()?; - let content = std::fs::read_to_string(path).ok()?; - serde_json::from_str(&content).ok() + std::fs::read_to_string(path).ok() } /// Lookup config in a table -fn lookup_in_table( - table: &Table, - model_repo_id: &str, - gpu_type: &str, - num_gpus: usize, -) -> Option<ParallelismConfig> { +fn lookup_in_table(table: &Table, gpu_type: &str, num_gpus: usize) -> Option<ParallelismConfig> { table - .get(model_repo_id) - .and_then(|g| g.get(gpu_type)) + .get(gpu_type) .and_then(|n| n.get(&num_gpus.to_string())) .copied() } -/// Load the compiled parallelism table -fn load_compiled_table() -> Result<Table>
{ - serde_json::from_str(PARALLELISM_DATA) - .map_err(|e| anyhow::anyhow!("Failed to parse compiled parallelism data: {}", e)) -} - +/// Lookup parallelism config from the model's HuggingFace repo pub fn lookup(model_repo_id: &str) -> Result { let num_gpus = tch::Cuda::device_count() as usize; let gpu_type = get_gpu_type(); info!("Detected {} x {} GPU(s)", num_gpus, gpu_type); - // Try model's own config first - if let Some(table) = load_from_model_repo(model_repo_id) { - if let Some(config) = lookup_in_table(&table, model_repo_id, &gpu_type, num_gpus) { - info!( - "Using parallelism config from model repo '{}'", - model_repo_id - ); - return Ok(config); - } - } + let raw_json = load_json_from_model_repo(model_repo_id).ok_or_else(|| { + anyhow::anyhow!( + "No parallelism_data.json found in model repo '{}'. \ + Add this file to use --parallelism-auto", + model_repo_id + ) + })?; + + let table: Table = serde_json::from_str(&raw_json) + .map_err(|e| anyhow::anyhow!("Failed to parse parallelism_data.json: {}", e))?; - // Fall back to compiled table - warn!( - "No parallelism config found in model repo '{}', using compiled defaults", + info!( + "Using parallelism config from model repo '{}'", model_repo_id ); - let table = load_compiled_table()?; - lookup_in_table(&table, model_repo_id, &gpu_type, num_gpus).ok_or_else(|| { - anyhow::anyhow!( - "No config for {} x {} with model '{}'", - num_gpus, - gpu_type, - model_repo_id - ) - }) + lookup_in_table(&table, &gpu_type, num_gpus) + .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", num_gpus, gpu_type)) } From 6644c67a77d7987aac2947baa54eb197b3cf9022 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Wed, 28 Jan 2026 12:47:33 -0800 Subject: [PATCH 15/16] nvml_wrapper --- Cargo.lock | 1 + shared/client/Cargo.toml | 1 + shared/client/src/parallelism_lookup.rs | 83 ++++++++++++++----------- 3 files changed, 47 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24ee94226..6eb0ed0cd 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -6946,6 +6946,7 @@ dependencies = [ "iroh", "iroh-blobs", "lazy_static", + "nvml-wrapper", "postcard", "psyche-coordinator", "psyche-core", diff --git a/shared/client/Cargo.toml b/shared/client/Cargo.toml index 5f7159d81..faf508a4a 100644 --- a/shared/client/Cargo.toml +++ b/shared/client/Cargo.toml @@ -36,6 +36,7 @@ clap.workspace = true sysinfo = "0.32.0" iroh.workspace = true iroh-blobs.workspace = true +nvml-wrapper = "0.11.0" [features] parallelism = ["psyche-modeling/parallelism"] diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 30b3a02b0..6ea6f9e26 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -1,8 +1,8 @@ use anyhow::Result; use hf_hub::{Repo, RepoType}; +use nvml_wrapper::Nvml; use serde::Deserialize; use std::collections::HashMap; -use std::process::Command; use tracing::info; const REMOTE_CONFIG_FILENAME: &str = "parallelism_data.json"; @@ -17,41 +17,48 @@ pub struct ParallelismConfig { // Table format: gpu_type -> num_gpus -> config type Table = HashMap>; -fn get_gpu_type() -> String { - // Try nvidia-smi first - let raw_gpu_name = Command::new("nvidia-smi") - .args(["--query-gpu=name", "--format=csv,noheader"]) - .output() - .ok() - .and_then(|o| String::from_utf8(o.stdout).ok()) - .and_then(|s| s.lines().next().map(|l| l.trim().to_string())) - .filter(|s| !s.is_empty()) - // Fallback: read from /proc/driver/nvidia (works in containers without nvidia-smi) - .or_else(|| { - std::fs::read_dir("/proc/driver/nvidia/gpus") - .ok()? 
- .filter_map(|e| e.ok()) - .next() - .and_then(|entry| { - let info_path = entry.path().join("information"); - std::fs::read_to_string(info_path).ok() - }) - .and_then(|content| { - content - .lines() - .find(|line| line.starts_with("Model:")) - .map(|line| line.trim_start_matches("Model:").trim().to_string()) - }) - }) - .unwrap_or_default(); - - // Normalize GPU name to match table keys - if raw_gpu_name.to_uppercase().contains("H200") { +#[derive(Debug)] +struct GpuInfo { + name: String, + device_count: u32, +} + +fn get_gpu_info() -> Result { + let nvml = Nvml::init()?; + let device_count = nvml.device_count()?; + + if device_count == 0 { + anyhow::bail!("No GPUs found!"); + } + + let mut gpu_names = Vec::new(); + for i in 0..device_count { + let device = nvml.device_by_index(i)?; + gpu_names.push(device.name()?); + } + + let first_name = &gpu_names[0]; + if !gpu_names.iter().all(|name| name == first_name) { + anyhow::bail!( + "All GPUs must be of the same type, but we have mismatching names: {:?}", + gpu_names + ); + } + + Ok(GpuInfo { + name: gpu_names.pop().unwrap(), + device_count, + }) +} + +fn normalize_gpu_name(raw_name: &str) -> String { + let upper = raw_name.to_uppercase(); + if upper.contains("H200") { "H200".to_string() - } else if raw_gpu_name.to_uppercase().contains("H100") { + } else if upper.contains("H100") { "H100".to_string() } else { - raw_gpu_name + raw_name.to_string() } } @@ -79,9 +86,9 @@ fn lookup_in_table(table: &Table, gpu_type: &str, num_gpus: usize) -> Option Result { - let num_gpus = tch::Cuda::device_count() as usize; - let gpu_type = get_gpu_type(); - info!("Detected {} x {} GPU(s)", num_gpus, gpu_type); + let gpu_info = get_gpu_info()?; + let gpu_type = normalize_gpu_name(&gpu_info.name); + info!("Detected {} x {} GPU(s)", gpu_info.device_count, gpu_type); let raw_json = load_json_from_model_repo(model_repo_id).ok_or_else(|| { anyhow::anyhow!( @@ -99,6 +106,6 @@ pub fn lookup(model_repo_id: &str) -> Result { model_repo_id ); - 
lookup_in_table(&table, &gpu_type, num_gpus) - .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", num_gpus, gpu_type)) + lookup_in_table(&table, &gpu_type, gpu_info.device_count as usize) + .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", gpu_info.device_count, gpu_type)) } From fdf4f0ddb4c2f2f35412968c3277be6ed66a8e97 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Thu, 29 Jan 2026 09:11:01 -0800 Subject: [PATCH 16/16] Use tch for GPU count (respects CUDA_VISIBLE_DEVICES) --- shared/client/src/parallelism_lookup.rs | 50 +++++++++------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 6ea6f9e26..d6f18035a 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -17,38 +17,11 @@ pub struct ParallelismConfig { // Table format: gpu_type -> num_gpus -> config type Table = HashMap<String, HashMap<String, ParallelismConfig>>; -#[derive(Debug)] -struct GpuInfo { - name: String, - device_count: u32, -} - -fn get_gpu_info() -> Result<GpuInfo> { +/// Get GPU type from NVML (reads first visible GPU) +fn get_gpu_type_from_nvml() -> Result<String> { let nvml = Nvml::init()?; - let device_count = nvml.device_count()?; - - if device_count == 0 { - anyhow::bail!("No GPUs found!"); - } - - let mut gpu_names = Vec::new(); - for i in 0..device_count { - let device = nvml.device_by_index(i)?; - gpu_names.push(device.name()?); - } - - let first_name = &gpu_names[0]; - if !gpu_names.iter().all(|name| name == first_name) { - anyhow::bail!( - "All GPUs must be of the same type, but we have mismatching names: {:?}", - gpu_names - ); - } - - Ok(GpuInfo { - name: gpu_names.pop().unwrap(), - device_count, - }) + let device = nvml.device_by_index(0)?; + Ok(device.name()?)
} fn normalize_gpu_name(raw_name: &str) -> String { @@ -86,9 +59,14 @@ fn lookup_in_table(table: &Table, gpu_type: &str, num_gpus: usize) -> Option Result { - let gpu_info = get_gpu_info()?; - let gpu_type = normalize_gpu_name(&gpu_info.name); - info!("Detected {} x {} GPU(s)", gpu_info.device_count, gpu_type); + let device_count = tch::Cuda::device_count() as usize; + if device_count == 0 { + anyhow::bail!("No GPUs found!"); + } + + // Use NVML for GPU type detection + let gpu_type = normalize_gpu_name(&get_gpu_type_from_nvml()?); + info!("Detected {} x {} GPU(s)", device_count, gpu_type); let raw_json = load_json_from_model_repo(model_repo_id).ok_or_else(|| { anyhow::anyhow!( @@ -106,6 +84,6 @@ pub fn lookup(model_repo_id: &str) -> Result { model_repo_id ); - lookup_in_table(&table, &gpu_type, gpu_info.device_count as usize) - .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", gpu_info.device_count, gpu_type)) + lookup_in_table(&table, &gpu_type, device_count) + .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", device_count, gpu_type)) }