From 3f48d73e3fd76774ac18acbe7a974a35f026db0c Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Fri, 23 Jan 2026 13:43:36 -0800 Subject: [PATCH 01/16] Implement --auto-parallelism --- architectures/centralized/client/src/app.rs | 1 + .../decentralized/solana-client/src/app.rs | 1 + shared/client/src/cli.rs | 4 + shared/client/src/lib.rs | 1 + shared/client/src/parallelism_lookup.rs | 54 ++++++++ shared/client/src/state/init.rs | 122 ++++++++++++------ 6 files changed, 146 insertions(+), 37 deletions(-) create mode 100644 shared/client/src/parallelism_lookup.rs diff --git a/architectures/centralized/client/src/app.rs b/architectures/centralized/client/src/app.rs index 3560e0a7a..4d2d7e873 100644 --- a/architectures/centralized/client/src/app.rs +++ b/architectures/centralized/client/src/app.rs @@ -131,6 +131,7 @@ pub async fn build_app( .await?; let state_options: RunInitConfig = RunInitConfig { + parallelism_auto: p.parallelism_auto, data_parallelism: p.data_parallelism, tensor_parallelism: p.tensor_parallelism, micro_batch_size: p.micro_batch_size, diff --git a/architectures/decentralized/solana-client/src/app.rs b/architectures/decentralized/solana-client/src/app.rs index 36a529bbb..82914752f 100644 --- a/architectures/decentralized/solana-client/src/app.rs +++ b/architectures/decentralized/solana-client/src/app.rs @@ -114,6 +114,7 @@ pub async fn build_app( let state_options: RunInitConfig = RunInitConfig { + parallelism_auto: p.parallelism_auto, data_parallelism: p.data_parallelism, tensor_parallelism: p.tensor_parallelism, micro_batch_size: p.micro_batch_size, diff --git a/shared/client/src/cli.rs b/shared/client/src/cli.rs index 268ea753a..139e273a6 100644 --- a/shared/client/src/cli.rs +++ b/shared/client/src/cli.rs @@ -112,6 +112,10 @@ pub struct TrainArgs { #[clap(long, env, value_parser = parse_trim_quotes)] pub run_id: String, + /// Auto-detect parallelism settings from lookup table based on model and GPU count + #[clap(long, env)] + pub 
parallelism_auto: bool, + #[clap(long, default_value_t = 1, env)] pub data_parallelism: usize, diff --git a/shared/client/src/lib.rs b/shared/client/src/lib.rs index b4ea80576..c9aaaf2aa 100644 --- a/shared/client/src/lib.rs +++ b/shared/client/src/lib.rs @@ -1,6 +1,7 @@ mod cli; mod client; mod fetch_data; +pub mod parallelism_lookup; mod protocol; mod state; mod tui; diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs new file mode 100644 index 000000000..2186c8804 --- /dev/null +++ b/shared/client/src/parallelism_lookup.rs @@ -0,0 +1,54 @@ +use anyhow::{Result, bail}; +use serde::Deserialize; +use std::collections::HashMap; +use std::process::Command; + +const PARALLELISM_DATA: &str = include_str!("parallelism_data.json"); + +#[derive(Debug, Clone, Copy, Deserialize)] +pub struct ParallelismConfig { + pub dp: usize, + pub tp: usize, + pub micro_batch_size: usize, +} + +type Table = HashMap>; + +pub fn get_num_gpus() -> Result { + let output = Command::new("nvidia-smi") + .args(["--query-gpu=name", "--format=csv,noheader"]) + .output()?; + + if !output.status.success() { + bail!("nvidia-smi failed"); + } + + let count = String::from_utf8_lossy(&output.stdout) + .lines() + .filter(|s| !s.is_empty()) + .count(); + + if count == 0 { + bail!("No GPUs detected"); + } + + Ok(count) +} + +pub fn lookup(model_repo_id: &str, num_gpus: usize) -> Result { + let table: Table = serde_json::from_str(PARALLELISM_DATA)?; + + let gpu_configs = table + .get(model_repo_id) + .ok_or_else(|| anyhow::anyhow!("Model '{}' not in parallelism table", model_repo_id))?; + + let config = gpu_configs.get(&num_gpus.to_string()).ok_or_else(|| { + anyhow::anyhow!( + "No config for {} GPUs with model '{}'", + num_gpus, + model_repo_id + ) + })?; + + Ok(*config) +} diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index f7326f382..406842d63 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ 
-1,4 +1,4 @@ -use crate::{WandBInfo, fetch_data::DataFetcher}; +use crate::{WandBInfo, fetch_data::DataFetcher, parallelism_lookup}; use psyche_coordinator::{ Coordinator, HealthChecks, model::{self, HttpLLMTrainingDataLocation, LLMTrainingDataLocation}, @@ -50,6 +50,8 @@ pub struct RunInitConfig { pub device: Devices, pub hub_read_token: Option, pub hub_max_concurrent_downloads: usize, + /// If true, auto-detect parallelism from lookup table (overrides dp/tp/micro_batch_size) + pub parallelism_auto: bool, pub data_parallelism: usize, pub tensor_parallelism: usize, pub micro_batch_size: usize, @@ -115,6 +117,9 @@ pub enum InitRunError { #[error("Unsupported architecture: {0}")] UnsupportedArchitecture(String), + #[error("Parallelism auto-detection failed: {0}")] + ParallelismLookupFailed(#[from] anyhow::Error), + #[cfg(feature = "python")] #[error("Python distributed error: {0}")] PythonDistributedError(#[from] psyche_modeling::PythonDistributedCausalLMError), @@ -195,6 +200,59 @@ impl RunInitConfigAndIO = match &llm.checkpoint { + model::Checkpoint::Hub(hub_repo) | model::Checkpoint::P2P(hub_repo) => { + Some((&hub_repo.repo_id).into()) + } + _ => None, + }; + + if let Some(repo_id) = model_repo_id { + let num_gpus = parallelism_lookup::get_num_gpus()?; + let config = parallelism_lookup::lookup(&repo_id, num_gpus)?; + (config.dp, config.tp, config.micro_batch_size) + } else { + (1, 1, 1) + } + } else { + ( + init_config.data_parallelism, + init_config.tensor_parallelism, + init_config.micro_batch_size, + ) + }; + + #[cfg(not(feature = "parallelism"))] + let (data_parallelism, tensor_parallelism, micro_batch_size) = ( + init_config.data_parallelism, + init_config.tensor_parallelism, + init_config.micro_batch_size, + ); + + info!( + "Parallelism: dp={}, tp={}, micro_batch_size={}", + data_parallelism, tensor_parallelism, micro_batch_size + ); + let hub_read_token = init_config.hub_read_token.clone(); let hub_max_concurrent_downloads = 
init_config.hub_max_concurrent_downloads; let data_future = async { @@ -277,7 +335,7 @@ impl RunInitConfigAndIO RunInitConfigAndIO 1 + if data_parallelism > 1 && llm.architecture == model::LLMArchitecture::HfAuto { 1 } else { - init_config.data_parallelism + data_parallelism }, ); @@ -467,8 +525,8 @@ impl RunInitConfigAndIO { #[cfg(feature = "python")] { - let dp = init_config.data_parallelism; - let tp = init_config.tensor_parallelism; + let dp = data_parallelism; + let tp = tensor_parallelism; tokio::task::spawn_blocking(move || { if tp != 1 || dp != 1 { @@ -520,31 +578,25 @@ impl RunInitConfigAndIO { let mut futures: Vec< JoinHandle, ModelLoadError>>, - > = Vec::with_capacity( - init_config.data_parallelism * init_config.tensor_parallelism, - ); + > = Vec::with_capacity(data_parallelism * tensor_parallelism); let devices = init_config.device.clone(); - for dp in 0..init_config.data_parallelism { + for dp in 0..data_parallelism { let communicator_id: Option = - match init_config.tensor_parallelism { + match tensor_parallelism { 0 | 1 => None, #[cfg(feature = "parallelism")] _ => Some(tch::CStore::new().into()), #[cfg(not(feature = "parallelism"))] _ => unimplemented!(), }; - for tp in 0..init_config.tensor_parallelism { + for tp in 0..tensor_parallelism { let tensor_parallelism_world = communicator_id.as_ref().map(|communicator_id| { - ( - communicator_id.clone(), - tp, - init_config.tensor_parallelism, - ) + (communicator_id.clone(), tp, tensor_parallelism) }); let source = source.clone(); - let rank = dp * init_config.tensor_parallelism + tp; + let rank = dp * tensor_parallelism + tp; let devices = devices.clone(); let device = devices.device_for_rank(rank); futures.push(tokio::task::spawn_blocking(move || { @@ -604,9 +656,9 @@ impl RunInitConfigAndIO RunInitConfigAndIO::new(data_provider, init_config.data_parallelism * 2); + let data_fetcher = DataFetcher::::new(data_provider, data_parallelism * 2); let trainers: Vec = match models { 
RawLoadedModelType::ParallelNativeModels(models) => { @@ -690,26 +741,24 @@ impl RunInitConfigAndIO)>> = - if init_config.data_parallelism > 1 { + if data_parallelism > 1 { #[cfg(feature = "parallelism")] { Some( - (0..init_config.tensor_parallelism) + (0..tensor_parallelism) .map(|_| { ( tch::CStore::new().into(), - Arc::new(CancellableBarrier::new( - init_config.tensor_parallelism, - )) + Arc::new(CancellableBarrier::new(tensor_parallelism)) as Arc, ) }) @@ -736,13 +785,12 @@ impl RunInitConfigAndIO; + let barrier = Arc::new(CancellableBarrier::new(tensor_parallelism)) + as Arc; LocalTrainer::new( ParallelModels { models, @@ -751,7 +799,7 @@ impl RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO Date: Fri, 23 Jan 2026 13:43:51 -0800 Subject: [PATCH 02/16] parallelism_data.json --- shared/client/src/parallelism_data.json | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 shared/client/src/parallelism_data.json diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json new file mode 100644 index 000000000..439fd5523 --- /dev/null +++ b/shared/client/src/parallelism_data.json @@ -0,0 +1,17 @@ +{ + "NousResearch/Meta-Llama-3.1-8B": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + }, + "TinyLlama/TinyLlama-1.1B-Chat-v0.4": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 8 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } + }, + "deepseek-ai/DeepSeek-V2-Lite": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 2 }, + "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } + }, + "NousResearch/Hermes-4-70B": { + "8": { "dp": 1, "tp": 8, "micro_batch_size": 1 } + } +} From b541cc545eb14256c8fa2e7875a85b3f3a11a0dd Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Fri, 23 Jan 2026 13:59:20 -0800 Subject: [PATCH 03/16] simplify code --- shared/client/src/parallelism_lookup.rs | 34 ++++++++++--------------- shared/client/src/state/init.rs | 23 
++++++++--------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 2186c8804..493ee08f5 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -1,7 +1,8 @@ -use anyhow::{Result, bail}; +use anyhow::Result; use serde::Deserialize; use std::collections::HashMap; use std::process::Command; +use tracing::info; const PARALLELISM_DATA: &str = include_str!("parallelism_data.json"); @@ -14,28 +15,21 @@ pub struct ParallelismConfig { type Table = HashMap>; -pub fn get_num_gpus() -> Result { - let output = Command::new("nvidia-smi") +fn get_gpu_name() -> String { + Command::new("nvidia-smi") .args(["--query-gpu=name", "--format=csv,noheader"]) - .output()?; - - if !output.status.success() { - bail!("nvidia-smi failed"); - } - - let count = String::from_utf8_lossy(&output.stdout) - .lines() - .filter(|s| !s.is_empty()) - .count(); - - if count == 0 { - bail!("No GPUs detected"); - } - - Ok(count) + .output() + .ok() + .and_then(|o| String::from_utf8(o.stdout).ok()) + .and_then(|s| s.lines().next().map(|l| l.trim().to_string())) + .unwrap_or_else(|| "unknown".to_string()) } -pub fn lookup(model_repo_id: &str, num_gpus: usize) -> Result { +pub fn lookup(model_repo_id: &str) -> Result { + let num_gpus = tch::Cuda::device_count(); + let gpu_name = get_gpu_name(); + info!("Detected {} GPU(s): {}", num_gpus, gpu_name); + let table: Table = serde_json::from_str(PARALLELISM_DATA)?; let gpu_configs = table diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 406842d63..646d058d1 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -213,26 +213,25 @@ impl RunInitConfigAndIO = match &llm.checkpoint { + let model_repo_id: String = match &llm.checkpoint { model::Checkpoint::Hub(hub_repo) | model::Checkpoint::P2P(hub_repo) => { - Some((&hub_repo.repo_id).into()) + 
(&hub_repo.repo_id).into() + } + _ => { + return Err(InitRunError::ParallelismLookupFailed(anyhow::anyhow!( + "--parallelism-auto requires a Hub or P2P checkpoint" + ))); } - _ => None, }; - if let Some(repo_id) = model_repo_id { - let num_gpus = parallelism_lookup::get_num_gpus()?; - let config = parallelism_lookup::lookup(&repo_id, num_gpus)?; - (config.dp, config.tp, config.micro_batch_size) - } else { - (1, 1, 1) - } + let config = parallelism_lookup::lookup(&model_repo_id)?; + (config.dp, config.tp, config.micro_batch_size) } else { ( init_config.data_parallelism, From 3af705e25a868750c140290886d201521b1dc8b4 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Fri, 23 Jan 2026 14:12:10 -0800 Subject: [PATCH 04/16] clippy --- shared/client/src/state/init.rs | 61 +++++++++++++++++---------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 646d058d1..dbd1d5e1b 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -1,4 +1,6 @@ -use crate::{WandBInfo, fetch_data::DataFetcher, parallelism_lookup}; +#[cfg(feature = "parallelism")] +use crate::parallelism_lookup; +use crate::{WandBInfo, fetch_data::DataFetcher}; use psyche_coordinator::{ Coordinator, HealthChecks, model::{self, HttpLLMTrainingDataLocation, LLMTrainingDataLocation}, @@ -209,36 +211,37 @@ impl RunInitConfigAndIO { - (&hub_repo.repo_id).into() - } - _ => { - return Err(InitRunError::ParallelismLookupFailed(anyhow::anyhow!( - "--parallelism-auto requires a Hub or P2P checkpoint" - ))); + let (data_parallelism, tensor_parallelism, micro_batch_size) = + if init_config.parallelism_auto { + if init_config.data_parallelism != 1 + || init_config.tensor_parallelism != 1 + || init_config.micro_batch_size != 1 + { + tracing::warn!( + "--parallelism-auto is set, ignoring manual dp/tp/micro_batch_size values" + ); } - }; - let config = parallelism_lookup::lookup(&model_repo_id)?; - (config.dp, 
config.tp, config.micro_batch_size) - } else { - ( - init_config.data_parallelism, - init_config.tensor_parallelism, - init_config.micro_batch_size, - ) - }; + let model_repo_id: String = match &llm.checkpoint { + model::Checkpoint::Hub(hub_repo) | model::Checkpoint::P2P(hub_repo) => { + (&hub_repo.repo_id).into() + } + _ => { + return Err(InitRunError::ParallelismLookupFailed(anyhow::anyhow!( + "--parallelism-auto requires a Hub or P2P checkpoint" + ))); + } + }; + + let config = parallelism_lookup::lookup(&model_repo_id)?; + (config.dp, config.tp, config.micro_batch_size) + } else { + ( + init_config.data_parallelism, + init_config.tensor_parallelism, + init_config.micro_batch_size, + ) + }; #[cfg(not(feature = "parallelism"))] let (data_parallelism, tensor_parallelism, micro_batch_size) = ( From 27f4ed5436dc03fd849b6bfe186b5a2d80a1a3d6 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 07:55:35 -0800 Subject: [PATCH 05/16] add parallelism_data.json to Garnix --- nix/lib.nix | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nix/lib.nix b/nix/lib.nix index 961d212ad..8bfe63aef 100644 --- a/nix/lib.nix +++ b/nix/lib.nix @@ -20,7 +20,8 @@ let || (builtins.match ".*tests/fixtures/.*$" path != null) || (builtins.match ".*.config/.*$" path != null) || (builtins.match ".*local-dev-keypair.json$" path != null) - || (builtins.match ".*shared/client/src/state/prompt_texts/index\\.json$" path != null); + || (builtins.match ".*shared/client/src/state/prompt_texts/index\\.json$" path != null) + || (builtins.match ".*shared/client/src/parallelism_data\\.json$" path != null); src = lib.cleanSourceWith { src = ../.; From a8d53301a731b4a0144eb5710a99360af23ab90a Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 08:56:02 -0800 Subject: [PATCH 06/16] add hardware type to json --- shared/client/src/parallelism_data.json | 31 +++++++++++++++++----- shared/client/src/parallelism_lookup.rs | 34 +++++++++++++++++++------ 2 files 
changed, 50 insertions(+), 15 deletions(-) diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json index 439fd5523..53cf4d2f3 100644 --- a/shared/client/src/parallelism_data.json +++ b/shared/client/src/parallelism_data.json @@ -1,17 +1,34 @@ { + "_doc (HuggingFace repo)": { + "gpu type from nvidia-smi": { + "numember of gpus": { + "dp": "data parallelism", + "tp": "tensor parallelism", + "micro_batch_size": "micro batch size" + } + } + }, "NousResearch/Meta-Llama-3.1-8B": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + } }, "TinyLlama/TinyLlama-1.1B-Chat-v0.4": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 8 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 8 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } + } }, "deepseek-ai/DeepSeek-V2-Lite": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 2 }, - "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 2 }, + "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } + } }, "NousResearch/Hermes-4-70B": { - "8": { "dp": 1, "tp": 8, "micro_batch_size": 1 } + "H100": { + "8": { "dp": 1, "tp": 8, "micro_batch_size": 1 } + } } } diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 493ee08f5..c9f600bfd 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -13,22 +13,31 @@ pub struct ParallelismConfig { pub micro_batch_size: usize, } -type Table = HashMap>; +type Table = HashMap>>; -fn get_gpu_name() -> String { - Command::new("nvidia-smi") +fn get_gpu_type() -> String { + let raw = Command::new("nvidia-smi") .args(["--query-gpu=name", "--format=csv,noheader"]) .output() .ok() .and_then(|o| 
String::from_utf8(o.stdout).ok()) .and_then(|s| s.lines().next().map(|l| l.trim().to_string())) - .unwrap_or_else(|| "unknown".to_string()) + .unwrap_or_default(); + + // Normalize GPU name to match table keys + if raw.to_uppercase().contains("H200") { + "H200".to_string() + } else if raw.to_uppercase().contains("H100") { + "H100".to_string() + } else { + raw + } } pub fn lookup(model_repo_id: &str) -> Result { let num_gpus = tch::Cuda::device_count(); - let gpu_name = get_gpu_name(); - info!("Detected {} GPU(s): {}", num_gpus, gpu_name); + let gpu_type = get_gpu_type(); + info!("Detected {} x {} GPU(s)", num_gpus, gpu_type); let table: Table = serde_json::from_str(PARALLELISM_DATA)?; @@ -36,10 +45,19 @@ pub fn lookup(model_repo_id: &str) -> Result { .get(model_repo_id) .ok_or_else(|| anyhow::anyhow!("Model '{}' not in parallelism table", model_repo_id))?; - let config = gpu_configs.get(&num_gpus.to_string()).ok_or_else(|| { + let num_gpu_configs = gpu_configs.get(&gpu_type).ok_or_else(|| { + anyhow::anyhow!( + "GPU '{}' not in parallelism table for model '{}'", + gpu_type, + model_repo_id + ) + })?; + + let config = num_gpu_configs.get(&num_gpus.to_string()).ok_or_else(|| { anyhow::anyhow!( - "No config for {} GPUs with model '{}'", + "No config for {} x {} with model '{}'", num_gpus, + gpu_type, model_repo_id ) })?; From f265de968cc986b8f52ad5497651821f4de41c9d Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 12:27:17 -0800 Subject: [PATCH 07/16] update .json --- scripts/train-solana-test.sh | 0 shared/client/src/parallelism_data.json | 20 +++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) mode change 100755 => 100644 scripts/train-solana-test.sh diff --git a/scripts/train-solana-test.sh b/scripts/train-solana-test.sh old mode 100755 new mode 100644 diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json index 53cf4d2f3..5386783a2 100644 --- a/shared/client/src/parallelism_data.json +++ 
b/shared/client/src/parallelism_data.json @@ -2,16 +2,16 @@ "_doc (HuggingFace repo)": { "gpu type from nvidia-smi": { "numember of gpus": { - "dp": "data parallelism", - "tp": "tensor parallelism", - "micro_batch_size": "micro batch size" + "dp": 0, + "tp": 0, + "micro_batch_size": 0 } } }, - "NousResearch/Meta-Llama-3.1-8B": { + "emozilla/llama2-20m-init": { "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, + "8": { "dp": 1, "tp": 1, "micro_batch_size": 1 } } }, "TinyLlama/TinyLlama-1.1B-Chat-v0.4": { @@ -20,9 +20,15 @@ "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } } }, + "NousResearch/Meta-Llama-3.1-8B": { + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } + } + }, "deepseek-ai/DeepSeek-V2-Lite": { "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 2 }, + "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } } }, From 1d3aafd5532a6adb8b1f9fe6515e43f50d1a7eda Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 12:27:36 -0800 Subject: [PATCH 08/16] Fallback: read from /proc/driver/nvidia --- shared/client/src/parallelism_lookup.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index c9f600bfd..131cc619b 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -16,12 +16,31 @@ pub struct ParallelismConfig { type Table = HashMap>>; fn get_gpu_type() -> String { + // Try nvidia-smi first let raw = Command::new("nvidia-smi") .args(["--query-gpu=name", "--format=csv,noheader"]) .output() .ok() .and_then(|o| String::from_utf8(o.stdout).ok()) .and_then(|s| s.lines().next().map(|l| l.trim().to_string())) + .filter(|s| !s.is_empty()) + // Fallback: read from /proc/driver/nvidia 
(works in containers without nvidia-smi) + .or_else(|| { + std::fs::read_dir("/proc/driver/nvidia/gpus") + .ok()? + .filter_map(|e| e.ok()) + .next() + .and_then(|entry| { + let info_path = entry.path().join("information"); + std::fs::read_to_string(info_path).ok() + }) + .and_then(|content| { + content + .lines() + .find(|line| line.starts_with("Model:")) + .map(|line| line.trim_start_matches("Model:").trim().to_string()) + }) + }) .unwrap_or_default(); // Normalize GPU name to match table keys From 50762e5c20b54bd66400a05be83ba8a3a833399a Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 12:34:32 -0800 Subject: [PATCH 09/16] restore scripts/train-solana-test.sh --- scripts/train-solana-test.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/train-solana-test.sh diff --git a/scripts/train-solana-test.sh b/scripts/train-solana-test.sh old mode 100644 new mode 100755 From f6c1c7a9538fa46654a68fc65b009e74eb7c06c6 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 13:08:54 -0800 Subject: [PATCH 10/16] update documentation --- psyche-book/src/enduser/create-run.md | 21 ++++++++++++++++++++- psyche-book/src/enduser/join-run.md | 10 ++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index 7a9354ca1..542d58837 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -86,11 +86,30 @@ run-manager create-run \ At this point, your run has been successfully created. +### Updating the parallelism lookup table (recommended) + +If your run uses a model that is not already in the [parallelism lookup table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json), it's recommended to add the optimal parallelism configuration for your model and target GPU hardware. 
This allows clients to use `PARALLELISM_AUTO=true` for automatic configuration. + +The table maps model (HuggingFace repo ID) → GPU type → number of GPUs → parallelism settings: + +```json +{ + "your-org/your-model": { + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 4, "tp": 2, "micro_batch_size": 4 } + } + } +} +``` + +Consider opening a PR to add your model's configuration so other clients can benefit from it. + ### Initializing configuration Initially, the run will not have any configuration defined and will remain paused, so no clients can join yet. -To set the run configuration, you’ll need to provide mostly the same parameters as when creating the run, along with the path to a `config.toml` file that follows the [run config schema](./run-config.md). +To set the run configuration, you'll need to provide mostly the same parameters as when creating the run, along with the path to a `config.toml` file that follows the [run config schema](./run-config.md). ```bash run-manager update-config \ diff --git a/psyche-book/src/enduser/join-run.md b/psyche-book/src/enduser/join-run.md index b79678bcd..9ad7492c7 100644 --- a/psyche-book/src/enduser/join-run.md +++ b/psyche-book/src/enduser/join-run.md @@ -93,19 +93,29 @@ though you might need to. **`NVIDIA_DRIVER_CAPABILITIES`** - An environment variable that the NVIDIA Container Toolkit uses to determine which compute capabilities should be provided to your container. It is recommended to set it to 'all', e.g. `NVIDIA_DRIVER_CAPABILITIES=all`. +**`PARALLELISM_AUTO`** - Set to `true` to automatically detect optimal parallelism settings based on the model and your GPU hardware. 
+ +- When enabled, the client will look up the best `DATA_PARALLELISM`, `TENSOR_PARALLELISM`, and `MICRO_BATCH_SIZE` values from a [built-in configuration table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json) +- Your model and GPU hardware combination must be present in the table +- This is the recommended option for most users +- If set, manual parallelism settings below will be ignored + **`DATA_PARALLELISM`** - Number of GPUs to distribute training data across. - If you have multiple GPUs, you can set this to 2, 4, etc. to speed up training - If you have 1 GPU, set this to `1` +- Ignored if `PARALLELISM_AUTO=true` **`TENSOR_PARALLELISM`** - Number of GPUs to distribute the model across, this lets you train a model you can't fit on one single GPU. - If you have 1 GPU, set this to `1` - If your have `n` GPUs you can distribute the model across all of them by setting it to `n`. +- Ignored if `PARALLELISM_AUTO=true` **`MICRO_BATCH_SIZE`** - Number of samples processed per GPU per training step - Set as high as your GPU memory allows +- Ignored if `PARALLELISM_AUTO=true` **`AUTHORIZER`** - The Solana address that authorized your wallet to join this run From 42fd01256d12dc5a025307ae72a2484ee920760f Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Mon, 26 Jan 2026 18:36:31 -0300 Subject: [PATCH 11/16] Change micro_batch_size for Meta-Llama-3.1 to 1 --- shared/client/src/parallelism_data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json index 5386783a2..234146aae 100644 --- a/shared/client/src/parallelism_data.json +++ b/shared/client/src/parallelism_data.json @@ -22,7 +22,7 @@ }, "NousResearch/Meta-Llama-3.1-8B": { "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } } }, From 
1695b9305ecbc3f595adc9d742f15c64a9968336 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Tue, 27 Jan 2026 10:07:27 -0800 Subject: [PATCH 12/16] nit --- shared/client/src/parallelism_lookup.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 131cc619b..5968f33a0 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -17,7 +17,7 @@ type Table = HashMap> fn get_gpu_type() -> String { // Try nvidia-smi first - let raw = Command::new("nvidia-smi") + let raw_gpu_name = Command::new("nvidia-smi") .args(["--query-gpu=name", "--format=csv,noheader"]) .output() .ok() @@ -44,12 +44,12 @@ fn get_gpu_type() -> String { .unwrap_or_default(); // Normalize GPU name to match table keys - if raw.to_uppercase().contains("H200") { + if raw_gpu_name.to_uppercase().contains("H200") { "H200".to_string() - } else if raw.to_uppercase().contains("H100") { + } else if raw_gpu_name.to_uppercase().contains("H100") { "H100".to_string() } else { - raw + raw_gpu_name } } From 51757239add95850aa30d52bafdc3ec64030336f Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Tue, 27 Jan 2026 12:04:36 -0800 Subject: [PATCH 13/16] look data-parallelism.json in HF repo --- psyche-book/src/enduser/create-run.md | 22 ++++++- shared/client/src/parallelism_lookup.rs | 76 +++++++++++++++++++------ 2 files changed, 77 insertions(+), 21 deletions(-) diff --git a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index 542d58837..bc9e31188 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -86,11 +86,13 @@ run-manager create-run \ At this point, your run has been successfully created. 
-### Updating the parallelism lookup table (recommended) +### Adding parallelism configuration (recommended) If your run uses a model that is not already in the [parallelism lookup table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json), it's recommended to add the optimal parallelism configuration for your model and target GPU hardware. This allows clients to use `PARALLELISM_AUTO=true` for automatic configuration. -The table maps model (HuggingFace repo ID) → GPU type → number of GPUs → parallelism settings: +#### Option 1: Add to your model's HuggingFace repo (preferred) + +Add a `parallelism_data.json` file directly to your model's HuggingFace repository. The client will automatically fetch this configuration at runtime - no Psyche rebuild required. ```json { @@ -98,12 +100,26 @@ The table maps model (HuggingFace repo ID) → GPU type → number of GPUs → p "H100": { "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, "8": { "dp": 4, "tp": 2, "micro_batch_size": 4 } + }, + "H200": { + "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } } } } ``` -Consider opening a PR to add your model's configuration so other clients can benefit from it. +This is the preferred approach because: + +- No PR to Psyche required +- No Docker image rebuild needed +- Model creators manage their own configuration +- Changes take effect immediately + +#### Option 2: Add to Psyche's compiled table + +Alternatively, open a PR to add your model to the [compiled parallelism table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json). This is useful for widely-used models that should have default configurations. The format is the same as above. + +**Lookup order**: The client first tries to fetch config from the model's HuggingFace repo, then falls back to the compiled table with a warning if not found. 
### Initializing configuration diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 5968f33a0..80b0ae770 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -1,10 +1,12 @@ use anyhow::Result; +use hf_hub::{Repo, RepoType}; use serde::Deserialize; use std::collections::HashMap; use std::process::Command; -use tracing::info; +use tracing::{info, warn}; const PARALLELISM_DATA: &str = include_str!("parallelism_data.json"); +const REMOTE_CONFIG_FILENAME: &str = "parallelism_data.json"; #[derive(Debug, Clone, Copy, Deserialize)] pub struct ParallelismConfig { @@ -13,6 +15,7 @@ pub struct ParallelismConfig { pub micro_batch_size: usize, } +// Table format: model -> gpu_type -> num_gpus -> config type Table = HashMap>>; fn get_gpu_type() -> String { @@ -53,33 +56,70 @@ fn get_gpu_type() -> String { } } +/// Try to load parallelism config from the model's HuggingFace repo +fn load_from_model_repo(model_repo_id: &str) -> Option { + let token = std::env::var("HF_TOKEN").ok(); + + let api = hf_hub::api::sync::ApiBuilder::new() + .with_token(token) + .build() + .ok()? + .repo(Repo::new(model_repo_id.to_string(), RepoType::Model)); + + let path = api.get(REMOTE_CONFIG_FILENAME).ok()?; + let content = std::fs::read_to_string(path).ok()?; + serde_json::from_str(&content).ok() +} + +/// Lookup config in a table +fn lookup_in_table( + table: &Table, + model_repo_id: &str, + gpu_type: &str, + num_gpus: usize, +) -> Option { + table + .get(model_repo_id) + .and_then(|g| g.get(gpu_type)) + .and_then(|n| n.get(&num_gpus.to_string())) + .copied() +} + +/// Load the compiled parallelism table +fn load_compiled_table() -> Result
{ + serde_json::from_str(PARALLELISM_DATA) + .map_err(|e| anyhow::anyhow!("Failed to parse compiled parallelism data: {}", e)) +} + pub fn lookup(model_repo_id: &str) -> Result { - let num_gpus = tch::Cuda::device_count(); + let num_gpus = tch::Cuda::device_count() as usize; let gpu_type = get_gpu_type(); info!("Detected {} x {} GPU(s)", num_gpus, gpu_type); - let table: Table = serde_json::from_str(PARALLELISM_DATA)?; - - let gpu_configs = table - .get(model_repo_id) - .ok_or_else(|| anyhow::anyhow!("Model '{}' not in parallelism table", model_repo_id))?; + // Try model's own config first + if let Some(table) = load_from_model_repo(model_repo_id) { + if let Some(config) = lookup_in_table(&table, model_repo_id, &gpu_type, num_gpus) { + info!( + "Using parallelism config from model repo '{}'", + model_repo_id + ); + return Ok(config); + } + } - let num_gpu_configs = gpu_configs.get(&gpu_type).ok_or_else(|| { - anyhow::anyhow!( - "GPU '{}' not in parallelism table for model '{}'", - gpu_type, - model_repo_id - ) - })?; + // Fall back to compiled table + warn!( + "No parallelism config found in model repo '{}', using compiled defaults", + model_repo_id + ); - let config = num_gpu_configs.get(&num_gpus.to_string()).ok_or_else(|| { + let table = load_compiled_table()?; + lookup_in_table(&table, model_repo_id, &gpu_type, num_gpus).ok_or_else(|| { anyhow::anyhow!( "No config for {} x {} with model '{}'", num_gpus, gpu_type, model_repo_id ) - })?; - - Ok(*config) + }) } From 8f6d14e758d4a09b22ba4b33185c4cd6e4529c0a Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Wed, 28 Jan 2026 10:39:41 -0800 Subject: [PATCH 14/16] change json format --- psyche-book/src/enduser/create-run.md | 39 ++++++-------- shared/client/src/parallelism_data.json | 40 --------------- shared/client/src/parallelism_lookup.rs | 67 +++++++++---------------- 3 files changed, 38 insertions(+), 108 deletions(-) delete mode 100644 shared/client/src/parallelism_data.json diff --git 
a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index bc9e31188..6aaebb5d4 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -86,40 +86,31 @@ run-manager create-run \ At this point, your run has been successfully created. -### Adding parallelism configuration (recommended) +### Adding parallelism configuration (required for --parallelism-auto) -If your run uses a model that is not already in the [parallelism lookup table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json), it's recommended to add the optimal parallelism configuration for your model and target GPU hardware. This allows clients to use `PARALLELISM_AUTO=true` for automatic configuration. - -#### Option 1: Add to your model's HuggingFace repo (preferred) - -Add a `parallelism_data.json` file directly to your model's HuggingFace repository. The client will automatically fetch this configuration at runtime - no Psyche rebuild required. +If you want clients to use `PARALLELISM_AUTO=true` for automatic configuration, you must add a `parallelism_data.json` file to your model's HuggingFace repository. 
```json { - "your-org/your-model": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, - "8": { "dp": 4, "tp": 2, "micro_batch_size": 4 } - }, - "H200": { - "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } - } + "H100": { + "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 }, + "8": { "dp": 4, "tp": 2, "micro_batch_size": 4 } + }, + "H200": { + "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } } } ``` -This is the preferred approach because: - -- No PR to Psyche required -- No Docker image rebuild needed -- Model creators manage their own configuration -- Changes take effect immediately - -#### Option 2: Add to Psyche's compiled table +Format: `gpu_type` → `num_gpus` → config -Alternatively, open a PR to add your model to the [compiled parallelism table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json). This is useful for widely-used models that should have default configurations. The format is the same as above. +- **gpu_type**: GPU model name (e.g., "H100", "H200") +- **num_gpus**: Number of GPUs available (e.g., "1", "8") +- **dp**: Data parallelism +- **tp**: Tensor parallelism +- **micro_batch_size**: Micro batch size per GPU -**Lookup order**: The client first tries to fetch config from the model's HuggingFace repo, then falls back to the compiled table with a warning if not found. +The config is shared via P2P when clients join a run. 
### Initializing configuration diff --git a/shared/client/src/parallelism_data.json b/shared/client/src/parallelism_data.json deleted file mode 100644 index 234146aae..000000000 --- a/shared/client/src/parallelism_data.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "_doc (HuggingFace repo)": { - "gpu type from nvidia-smi": { - "numember of gpus": { - "dp": 0, - "tp": 0, - "micro_batch_size": 0 - } - } - }, - "emozilla/llama2-20m-init": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, - "8": { "dp": 1, "tp": 1, "micro_batch_size": 1 } - } - }, - "TinyLlama/TinyLlama-1.1B-Chat-v0.4": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 8 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 } - } - }, - "NousResearch/Meta-Llama-3.1-8B": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, - "8": { "dp": 8, "tp": 1, "micro_batch_size": 4 } - } - }, - "deepseek-ai/DeepSeek-V2-Lite": { - "H100": { - "1": { "dp": 1, "tp": 1, "micro_batch_size": 1 }, - "8": { "dp": 4, "tp": 2, "micro_batch_size": 2 } - } - }, - "NousResearch/Hermes-4-70B": { - "H100": { - "8": { "dp": 1, "tp": 8, "micro_batch_size": 1 } - } - } -} diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 80b0ae770..30b3a02b0 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -3,9 +3,8 @@ use hf_hub::{Repo, RepoType}; use serde::Deserialize; use std::collections::HashMap; use std::process::Command; -use tracing::{info, warn}; +use tracing::info; -const PARALLELISM_DATA: &str = include_str!("parallelism_data.json"); const REMOTE_CONFIG_FILENAME: &str = "parallelism_data.json"; #[derive(Debug, Clone, Copy, Deserialize)] @@ -15,8 +14,8 @@ pub struct ParallelismConfig { pub micro_batch_size: usize, } -// Table format: model -> gpu_type -> num_gpus -> config -type Table = HashMap>>; +// Table format: gpu_type -> num_gpus -> config +type Table = HashMap>; fn get_gpu_type() -> String { // 
Try nvidia-smi first @@ -56,8 +55,8 @@ fn get_gpu_type() -> String { } } -/// Try to load parallelism config from the model's HuggingFace repo -fn load_from_model_repo(model_repo_id: &str) -> Option<Table>
{ +/// Try to load parallelism config JSON from the model's HuggingFace repo +fn load_json_from_model_repo(model_repo_id: &str) -> Option { let token = std::env::var("HF_TOKEN").ok(); let api = hf_hub::api::sync::ApiBuilder::new() @@ -67,59 +66,39 @@ fn load_from_model_repo(model_repo_id: &str) -> Option
{ .repo(Repo::new(model_repo_id.to_string(), RepoType::Model)); let path = api.get(REMOTE_CONFIG_FILENAME).ok()?; - let content = std::fs::read_to_string(path).ok()?; - serde_json::from_str(&content).ok() + std::fs::read_to_string(path).ok() } /// Lookup config in a table -fn lookup_in_table( - table: &Table, - model_repo_id: &str, - gpu_type: &str, - num_gpus: usize, -) -> Option<ParallelismConfig> { +fn lookup_in_table(table: &Table, gpu_type: &str, num_gpus: usize) -> Option<ParallelismConfig> { table - .get(model_repo_id) - .and_then(|g| g.get(gpu_type)) + .get(gpu_type) .and_then(|n| n.get(&num_gpus.to_string())) .copied() } -/// Load the compiled parallelism table -fn load_compiled_table() -> Result<Table>
{ - serde_json::from_str(PARALLELISM_DATA) - .map_err(|e| anyhow::anyhow!("Failed to parse compiled parallelism data: {}", e)) -} - +/// Lookup parallelism config from the model's HuggingFace repo pub fn lookup(model_repo_id: &str) -> Result { let num_gpus = tch::Cuda::device_count() as usize; let gpu_type = get_gpu_type(); info!("Detected {} x {} GPU(s)", num_gpus, gpu_type); - // Try model's own config first - if let Some(table) = load_from_model_repo(model_repo_id) { - if let Some(config) = lookup_in_table(&table, model_repo_id, &gpu_type, num_gpus) { - info!( - "Using parallelism config from model repo '{}'", - model_repo_id - ); - return Ok(config); - } - } + let raw_json = load_json_from_model_repo(model_repo_id).ok_or_else(|| { + anyhow::anyhow!( + "No parallelism_data.json found in model repo '{}'. \ + Add this file to use --parallelism-auto", + model_repo_id + ) + })?; + + let table: Table = serde_json::from_str(&raw_json) + .map_err(|e| anyhow::anyhow!("Failed to parse parallelism_data.json: {}", e))?; - // Fall back to compiled table - warn!( - "No parallelism config found in model repo '{}', using compiled defaults", + info!( + "Using parallelism config from model repo '{}'", model_repo_id ); - let table = load_compiled_table()?; - lookup_in_table(&table, model_repo_id, &gpu_type, num_gpus).ok_or_else(|| { - anyhow::anyhow!( - "No config for {} x {} with model '{}'", - num_gpus, - gpu_type, - model_repo_id - ) - }) + lookup_in_table(&table, &gpu_type, num_gpus) + .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", num_gpus, gpu_type)) } From 6644c67a77d7987aac2947baa54eb197b3cf9022 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Wed, 28 Jan 2026 12:47:33 -0800 Subject: [PATCH 15/16] nvml_wrapper --- Cargo.lock | 1 + shared/client/Cargo.toml | 1 + shared/client/src/parallelism_lookup.rs | 83 ++++++++++++++----------- 3 files changed, 47 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24ee94226..6eb0ed0cd 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -6946,6 +6946,7 @@ dependencies = [ "iroh", "iroh-blobs", "lazy_static", + "nvml-wrapper", "postcard", "psyche-coordinator", "psyche-core", diff --git a/shared/client/Cargo.toml b/shared/client/Cargo.toml index 5f7159d81..faf508a4a 100644 --- a/shared/client/Cargo.toml +++ b/shared/client/Cargo.toml @@ -36,6 +36,7 @@ clap.workspace = true sysinfo = "0.32.0" iroh.workspace = true iroh-blobs.workspace = true +nvml-wrapper = "0.11.0" [features] parallelism = ["psyche-modeling/parallelism"] diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 30b3a02b0..6ea6f9e26 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -1,8 +1,8 @@ use anyhow::Result; use hf_hub::{Repo, RepoType}; +use nvml_wrapper::Nvml; use serde::Deserialize; use std::collections::HashMap; -use std::process::Command; use tracing::info; const REMOTE_CONFIG_FILENAME: &str = "parallelism_data.json"; @@ -17,41 +17,48 @@ pub struct ParallelismConfig { // Table format: gpu_type -> num_gpus -> config type Table = HashMap>; -fn get_gpu_type() -> String { - // Try nvidia-smi first - let raw_gpu_name = Command::new("nvidia-smi") - .args(["--query-gpu=name", "--format=csv,noheader"]) - .output() - .ok() - .and_then(|o| String::from_utf8(o.stdout).ok()) - .and_then(|s| s.lines().next().map(|l| l.trim().to_string())) - .filter(|s| !s.is_empty()) - // Fallback: read from /proc/driver/nvidia (works in containers without nvidia-smi) - .or_else(|| { - std::fs::read_dir("/proc/driver/nvidia/gpus") - .ok()? 
- .filter_map(|e| e.ok()) - .next() - .and_then(|entry| { - let info_path = entry.path().join("information"); - std::fs::read_to_string(info_path).ok() - }) - .and_then(|content| { - content - .lines() - .find(|line| line.starts_with("Model:")) - .map(|line| line.trim_start_matches("Model:").trim().to_string()) - }) - }) - .unwrap_or_default(); - - // Normalize GPU name to match table keys - if raw_gpu_name.to_uppercase().contains("H200") { +#[derive(Debug)] +struct GpuInfo { + name: String, + device_count: u32, +} + +fn get_gpu_info() -> Result { + let nvml = Nvml::init()?; + let device_count = nvml.device_count()?; + + if device_count == 0 { + anyhow::bail!("No GPUs found!"); + } + + let mut gpu_names = Vec::new(); + for i in 0..device_count { + let device = nvml.device_by_index(i)?; + gpu_names.push(device.name()?); + } + + let first_name = &gpu_names[0]; + if !gpu_names.iter().all(|name| name == first_name) { + anyhow::bail!( + "All GPUs must be of the same type, but we have mismatching names: {:?}", + gpu_names + ); + } + + Ok(GpuInfo { + name: gpu_names.pop().unwrap(), + device_count, + }) +} + +fn normalize_gpu_name(raw_name: &str) -> String { + let upper = raw_name.to_uppercase(); + if upper.contains("H200") { "H200".to_string() - } else if raw_gpu_name.to_uppercase().contains("H100") { + } else if upper.contains("H100") { "H100".to_string() } else { - raw_gpu_name + raw_name.to_string() } } @@ -79,9 +86,9 @@ fn lookup_in_table(table: &Table, gpu_type: &str, num_gpus: usize) -> Option Result { - let num_gpus = tch::Cuda::device_count() as usize; - let gpu_type = get_gpu_type(); - info!("Detected {} x {} GPU(s)", num_gpus, gpu_type); + let gpu_info = get_gpu_info()?; + let gpu_type = normalize_gpu_name(&gpu_info.name); + info!("Detected {} x {} GPU(s)", gpu_info.device_count, gpu_type); let raw_json = load_json_from_model_repo(model_repo_id).ok_or_else(|| { anyhow::anyhow!( @@ -99,6 +106,6 @@ pub fn lookup(model_repo_id: &str) -> Result { model_repo_id ); - 
lookup_in_table(&table, &gpu_type, num_gpus) - .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", num_gpus, gpu_type)) + lookup_in_table(&table, &gpu_type, gpu_info.device_count as usize) + .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", gpu_info.device_count, gpu_type)) } From fdf4f0ddb4c2f2f35412968c3277be6ed66a8e97 Mon Sep 17 00:00:00 2001 From: Pedro Fontana Date: Thu, 29 Jan 2026 09:11:01 -0800 Subject: [PATCH 16/16] Use tch for GPU count (respects CUDA_VISIBLE_DEVICES) --- shared/client/src/parallelism_lookup.rs | 50 +++++++++------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/shared/client/src/parallelism_lookup.rs b/shared/client/src/parallelism_lookup.rs index 6ea6f9e26..d6f18035a 100644 --- a/shared/client/src/parallelism_lookup.rs +++ b/shared/client/src/parallelism_lookup.rs @@ -17,38 +17,11 @@ pub struct ParallelismConfig { // Table format: gpu_type -> num_gpus -> config type Table = HashMap<String, HashMap<String, ParallelismConfig>>; -#[derive(Debug)] -struct GpuInfo { - name: String, - device_count: u32, -} - -fn get_gpu_info() -> Result<GpuInfo> { +/// Get GPU type from NVML (reads first visible GPU) +fn get_gpu_type_from_nvml() -> Result<String> { let nvml = Nvml::init()?; - let device_count = nvml.device_count()?; - - if device_count == 0 { - anyhow::bail!("No GPUs found!"); - } - - let mut gpu_names = Vec::new(); - for i in 0..device_count { - let device = nvml.device_by_index(i)?; - gpu_names.push(device.name()?); - } - - let first_name = &gpu_names[0]; - if !gpu_names.iter().all(|name| name == first_name) { - anyhow::bail!( - "All GPUs must be of the same type, but we have mismatching names: {:?}", - gpu_names - ); - } - - Ok(GpuInfo { - name: gpu_names.pop().unwrap(), - device_count, - }) + let device = nvml.device_by_index(0)?; + Ok(device.name()?)
} fn normalize_gpu_name(raw_name: &str) -> String { @@ -86,9 +59,14 @@ fn lookup_in_table(table: &Table, gpu_type: &str, num_gpus: usize) -> Option Result { - let gpu_info = get_gpu_info()?; - let gpu_type = normalize_gpu_name(&gpu_info.name); - info!("Detected {} x {} GPU(s)", gpu_info.device_count, gpu_type); + let device_count = tch::Cuda::device_count() as usize; + if device_count == 0 { + anyhow::bail!("No GPUs found!"); + } + + // Use NVML for GPU type detection + let gpu_type = normalize_gpu_name(&get_gpu_type_from_nvml()?); + info!("Detected {} x {} GPU(s)", device_count, gpu_type); let raw_json = load_json_from_model_repo(model_repo_id).ok_or_else(|| { anyhow::anyhow!( @@ -106,6 +84,6 @@ pub fn lookup(model_repo_id: &str) -> Result { model_repo_id ); - lookup_in_table(&table, &gpu_type, gpu_info.device_count as usize) - .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", gpu_info.device_count, gpu_type)) + lookup_in_table(&table, &gpu_type, device_count) + .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", device_count, gpu_type)) }