1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions architectures/centralized/client/src/app.rs
@@ -131,6 +131,7 @@ pub async fn build_app(
        .await?;

    let state_options: RunInitConfig<ClientId, ClientId> = RunInitConfig {
        parallelism_auto: p.parallelism_auto,
        data_parallelism: p.data_parallelism,
        tensor_parallelism: p.tensor_parallelism,
        micro_batch_size: p.micro_batch_size,
1 change: 1 addition & 0 deletions architectures/decentralized/solana-client/src/app.rs
@@ -114,6 +114,7 @@ pub async fn build_app(

    let state_options: RunInitConfig<psyche_solana_coordinator::ClientId, NetworkIdentity> =
        RunInitConfig {
            parallelism_auto: p.parallelism_auto,
            data_parallelism: p.data_parallelism,
            tensor_parallelism: p.tensor_parallelism,
            micro_batch_size: p.micro_batch_size,
3 changes: 2 additions & 1 deletion nix/lib.nix
@@ -20,7 +20,8 @@ let
    || (builtins.match ".*tests/fixtures/.*$" path != null)
    || (builtins.match ".*.config/.*$" path != null)
    || (builtins.match ".*local-dev-keypair.json$" path != null)
    || (builtins.match ".*shared/client/src/state/prompt_texts/index\\.json$" path != null);
    || (builtins.match ".*shared/client/src/state/prompt_texts/index\\.json$" path != null)
    || (builtins.match ".*shared/client/src/parallelism_data\\.json$" path != null);

  src = lib.cleanSourceWith {
    src = ../.;
28 changes: 27 additions & 1 deletion psyche-book/src/enduser/create-run.md
@@ -86,11 +86,37 @@ run-manager create-run \

At this point, your run has been successfully created.

### Adding parallelism configuration (required for `--parallelism-auto`)

If you want clients to use `PARALLELISM_AUTO=true` for automatic configuration, you must add a `parallelism_data.json` file to your model's HuggingFace repository, for example:

```json
{
  "H100": {
    "1": { "dp": 1, "tp": 1, "micro_batch_size": 4 },
    "8": { "dp": 4, "tp": 2, "micro_batch_size": 4 }
  },
  "H200": {
    "8": { "dp": 8, "tp": 1, "micro_batch_size": 8 }
  }
}
```

Format: `gpu_type` → `num_gpus` → config

- **gpu_type**: GPU model name (e.g., "H100", "H200")
- **num_gpus**: Number of GPUs available (e.g., "1", "8")
- **dp**: Data parallelism
- **tp**: Tensor parallelism
- **micro_batch_size**: Micro batch size per GPU

The config is shared via P2P when clients join a run.
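
For reference, the client deserializes this file into a nested map keyed first by GPU type and then by GPU count (the `Table` type in `shared/client/src/parallelism_lookup.rs` below). A minimal standalone sketch of the same lookup, assuming the example above is saved locally as `parallelism_data.json` and that `serde`, `serde_json`, and `anyhow` are available:

```rust
use serde::Deserialize;
use std::collections::HashMap;

#[derive(Debug, Clone, Copy, Deserialize)]
struct ParallelismConfig {
    dp: usize,
    tp: usize,
    micro_batch_size: usize,
}

// gpu_type -> num_gpus (string key, matching the JSON) -> config
type Table = HashMap<String, HashMap<String, ParallelismConfig>>;

fn main() -> anyhow::Result<()> {
    let raw = std::fs::read_to_string("parallelism_data.json")?;
    let table: Table = serde_json::from_str(&raw)?;

    // e.g. a node with 8 H100s
    let config = table
        .get("H100")
        .and_then(|by_count| by_count.get("8"))
        .copied()
        .ok_or_else(|| anyhow::anyhow!("no entry for 8 x H100"))?;

    println!(
        "dp={} tp={} micro_batch_size={}",
        config.dp, config.tp, config.micro_batch_size
    );
    Ok(())
}
```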

### Initializing configuration

Initially, the run will not have any configuration defined and will remain paused, so no clients can join yet.

To set the run configuration, youll need to provide mostly the same parameters as when creating the run, along with the path to a `config.toml` file that follows the [run config schema](./run-config.md).
To set the run configuration, you'll need to provide mostly the same parameters as when creating the run, along with the path to a `config.toml` file that follows the [run config schema](./run-config.md).

```bash
run-manager update-config \
10 changes: 10 additions & 0 deletions psyche-book/src/enduser/join-run.md
@@ -93,19 +93,29 @@ though you might need to.

**`NVIDIA_DRIVER_CAPABILITIES`** - An environment variable that the NVIDIA Container Toolkit uses to determine which compute capabilities should be provided to your container. It is recommended to set it to 'all', e.g. `NVIDIA_DRIVER_CAPABILITIES=all`.

**`PARALLELISM_AUTO`** - Set to `true` to automatically detect optimal parallelism settings based on the model and your GPU hardware.

- When enabled, the client will look up the best `DATA_PARALLELISM`, `TENSOR_PARALLELISM`, and `MICRO_BATCH_SIZE` values from a [built-in configuration table](https://github.com/PsycheFoundation/psyche/blob/main/shared/client/src/parallelism_data.json)
- Your model and GPU hardware combination must be present in the table
- This is the recommended option for most users
- If set, manual parallelism settings below will be ignored

**`DATA_PARALLELISM`** - Number of GPUs to distribute training data across.

- If you have multiple GPUs, you can set this to 2, 4, etc. to speed up training
- If you have 1 GPU, set this to `1`
- Ignored if `PARALLELISM_AUTO=true`

**`TENSOR_PARALLELISM`** - Number of GPUs to distribute the model across; this lets you train a model that can't fit on a single GPU.

- If you have 1 GPU, set this to `1`
- If you have `n` GPUs, you can distribute the model across all of them by setting it to `n`.
- Ignored if `PARALLELISM_AUTO=true`

**`MICRO_BATCH_SIZE`** - Number of samples processed per GPU per training step.

- Set as high as your GPU memory allows
- Ignored if `PARALLELISM_AUTO=true`
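
Taken together, the three manual settings above only apply when `PARALLELISM_AUTO` is unset or `false`. A minimal sketch of that precedence (hypothetical helper and types; the real selection happens inside the client's run-initialization code, which is not shown here):

```rust
/// Resolved parallelism settings (illustration only).
struct Parallelism {
    dp: usize,
    tp: usize,
    micro_batch_size: usize,
}

/// If auto mode is on, the manual values are ignored and the lookup-table
/// result is used; otherwise the manual values are taken as-is.
fn resolve_parallelism(
    parallelism_auto: bool,
    data_parallelism: usize,
    tensor_parallelism: usize,
    micro_batch_size: usize,
    auto_lookup: impl Fn() -> anyhow::Result<Parallelism>,
) -> anyhow::Result<Parallelism> {
    if parallelism_auto {
        auto_lookup()
    } else {
        Ok(Parallelism {
            dp: data_parallelism,
            tp: tensor_parallelism,
            micro_batch_size,
        })
    }
}
```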

**`AUTHORIZER`** - The Solana address that authorized your wallet to join this run

1 change: 1 addition & 0 deletions shared/client/Cargo.toml
@@ -36,6 +36,7 @@ clap.workspace = true
sysinfo = "0.32.0"
iroh.workspace = true
iroh-blobs.workspace = true
nvml-wrapper = "0.11.0"

[features]
parallelism = ["psyche-modeling/parallelism"]
4 changes: 4 additions & 0 deletions shared/client/src/cli.rs
@@ -112,6 +112,10 @@ pub struct TrainArgs {
    #[clap(long, env, value_parser = parse_trim_quotes)]
    pub run_id: String,

    /// Auto-detect parallelism settings from lookup table based on model and GPU count
    #[clap(long, env)]
    pub parallelism_auto: bool,

    #[clap(long, default_value_t = 1, env)]
    pub data_parallelism: usize,

1 change: 1 addition & 0 deletions shared/client/src/lib.rs
@@ -1,6 +1,7 @@
mod cli;
mod client;
mod fetch_data;
pub mod parallelism_lookup;
mod protocol;
mod state;
mod tui;
89 changes: 89 additions & 0 deletions shared/client/src/parallelism_lookup.rs
@@ -0,0 +1,89 @@
use anyhow::Result;
use hf_hub::{Repo, RepoType};
use nvml_wrapper::Nvml;
use serde::Deserialize;
use std::collections::HashMap;
use tracing::info;

const REMOTE_CONFIG_FILENAME: &str = "parallelism_data.json";

#[derive(Debug, Clone, Copy, Deserialize)]
pub struct ParallelismConfig {
    pub dp: usize,
    pub tp: usize,
    pub micro_batch_size: usize,
}

// Table format: gpu_type -> num_gpus -> config
type Table = HashMap<String, HashMap<String, ParallelismConfig>>;

/// Get GPU type from NVML (reads first visible GPU)
fn get_gpu_type_from_nvml() -> Result<String> {
    let nvml = Nvml::init()?;
    let device = nvml.device_by_index(0)?;
    Ok(device.name()?)
}

fn normalize_gpu_name(raw_name: &str) -> String {
    let upper = raw_name.to_uppercase();
    if upper.contains("H200") {
        "H200".to_string()
    } else if upper.contains("H100") {
        "H100".to_string()
    } else {
        raw_name.to_string()
    }
}

/// Try to load parallelism config JSON from the model's HuggingFace repo
fn load_json_from_model_repo(model_repo_id: &str) -> Option<String> {
    let token = std::env::var("HF_TOKEN").ok();

    let api = hf_hub::api::sync::ApiBuilder::new()
        .with_token(token)
        .build()
        .ok()?
        .repo(Repo::new(model_repo_id.to_string(), RepoType::Model));

    let path = api.get(REMOTE_CONFIG_FILENAME).ok()?;
    std::fs::read_to_string(path).ok()
}

/// Lookup config in a table
fn lookup_in_table(table: &Table, gpu_type: &str, num_gpus: usize) -> Option<ParallelismConfig> {
    table
        .get(gpu_type)
        .and_then(|n| n.get(&num_gpus.to_string()))
        .copied()
}

/// Lookup parallelism config from the model's HuggingFace repo
pub fn lookup(model_repo_id: &str) -> Result<ParallelismConfig> {
    let device_count = tch::Cuda::device_count() as usize;
    if device_count == 0 {
        anyhow::bail!("No GPUs found!");
    }

    // Use NVML for GPU type detection
    let gpu_type = normalize_gpu_name(&get_gpu_type_from_nvml()?);
    info!("Detected {} x {} GPU(s)", device_count, gpu_type);

    let raw_json = load_json_from_model_repo(model_repo_id).ok_or_else(|| {
        anyhow::anyhow!(
            "No parallelism_data.json found in model repo '{}'. \
            Add this file to use --parallelism-auto",
            model_repo_id
        )
    })?;

    let table: Table = serde_json::from_str(&raw_json)
        .map_err(|e| anyhow::anyhow!("Failed to parse parallelism_data.json: {}", e))?;

    info!(
        "Using parallelism config from model repo '{}'",
        model_repo_id
    );

    lookup_in_table(&table, &gpu_type, device_count)
        .ok_or_else(|| anyhow::anyhow!("No config for {} x {}", device_count, gpu_type))
}
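
For context, a hypothetical call site (not part of this diff) showing how the new module's public entry point could be used from elsewhere in the client crate; the placeholder repo id and error handling are illustrative only:

```rust
// Inside the same crate (lib.rs exposes `pub mod parallelism_lookup`).
fn apply_auto_parallelism() {
    // `lookup` detects local GPUs via NVML/tch, fetches parallelism_data.json
    // from the given HuggingFace model repo, and returns the matching entry.
    match crate::parallelism_lookup::lookup("your-org/your-model") {
        Ok(cfg) => println!(
            "auto parallelism: dp={} tp={} micro_batch_size={}",
            cfg.dp, cfg.tp, cfg.micro_batch_size
        ),
        // Fails if no GPU is visible, the repo has no parallelism_data.json,
        // or there is no entry for the detected GPU type and count.
        Err(e) => eprintln!("auto parallelism unavailable: {e}"),
    }
}
```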