From c51fb1cf6db13cb5302b586280fa17fcc38e4b25 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 30 Apr 2025 17:27:41 -0300 Subject: [PATCH 01/33] use several data providers and cycle between them on error --- shared/client/src/fetch_data.rs | 89 +++++++++++++++++++++++--------- shared/client/src/state/init.rs | 5 +- shared/client/src/state/train.rs | 2 + 3 files changed, 70 insertions(+), 26 deletions(-) diff --git a/shared/client/src/fetch_data.rs b/shared/client/src/fetch_data.rs index 49ca578e9..e0dc0f184 100644 --- a/shared/client/src/fetch_data.rs +++ b/shared/client/src/fetch_data.rs @@ -14,7 +14,7 @@ use tokio::{ task::JoinHandle, time::sleep, }; -use tracing::{debug, error, trace, trace_span, warn, Instrument}; +use tracing::{debug, error, trace, trace_span, warn, info, Instrument}; pub type BatchStep = u32; pub type BatchIdSet = HashSet; @@ -23,16 +23,19 @@ const MAX_RETRIES: u32 = 7; const BASE_DELAY_MS: u64 = 2000; pub struct DataFetcher { - data_provider: Arc>>, + data_providers: Vec>>>, active_fetch_task: Option<(BatchStep, JoinHandle<()>)>, buffer_size: usize, _phantom: PhantomData, } impl DataFetcher { - pub fn new(data_provider: DataProvider, buffer_size: usize) -> Self { + pub fn new(data_providers: Vec>, buffer_size: usize) -> Self { Self { - data_provider: Arc::new(Mutex::new(data_provider)), + data_providers: data_providers + .into_iter() // Use into_iter to consume the input vector + .map(|dp| Arc::new(Mutex::new(dp))) // No need for clone here + .collect(), active_fetch_task: None, buffer_size, _phantom: Default::default(), @@ -69,7 +72,7 @@ impl DataFetcher { step, tokio::spawn({ trace!("New fetch task for step {step} has been spawned"); - let data_provider = self.data_provider.clone(); // only one of these tasks will acquire the lock at once. once one dies, the lock is released for sure. 
+ let data_providers = self.data_providers.clone(); // Clone the Arc vector for the async task async move { loop { @@ -78,32 +81,68 @@ impl DataFetcher { Some(assigned) => assigned, None => { // out of assigned data! + debug!("No more assigned batch IDs for step {step}."); return; } } }; - let mut retry_count = 0; - let batch = loop { - match data_provider.lock().await.get_samples(batch_id).await { - Ok(batch) => break batch, - Err(err) if retry_count < MAX_RETRIES => { - retry_count += 1; - let delay_ms = BASE_DELAY_MS * (retry_count as u64 - 1); - warn!( - "Data fetch error (attempt {}/{}): \"{}\". Retrying in {}ms", - retry_count, MAX_RETRIES, err, delay_ms - ); - sleep(Duration::from_millis(delay_ms)).await; - continue; - } - Err(err) => { - error!("Data fetch error: {err:#}"); - return; + let mut batch_option = None; + for (provider_idx, data_provider) in data_providers.iter().enumerate() { + info!(batch_id = %batch_id, provider_idx, "Attempting fetch with provider {}", provider_idx); + let mut retry_count = 0; + loop { + match data_provider.lock().await.get_samples(batch_id).await { + Ok(batch) => { + info!(batch_id = %batch_id, provider_idx, "Successfully fetched batch with provider {}", provider_idx); + batch_option = Some(batch); + break; // Break retry loop, batch found + }, + Err(err) if retry_count < MAX_RETRIES => { + retry_count += 1; + // Use exponential backoff with full jitter + let delay_ms = BASE_DELAY_MS * 2u64.pow(retry_count - 1); + let jitter = rand::random::() % delay_ms; + let final_delay = Duration::from_millis(delay_ms / 2 + jitter); // Example: Full Jitter + + warn!( + batch_id = %batch_id, + provider_idx, + attempt = retry_count, + max_retries = MAX_RETRIES, + error = %err, + delay_ms = final_delay.as_millis(), + "Data fetch error with provider {}. 
Retrying in {}ms", + provider_idx, final_delay.as_millis() + ); + sleep(final_delay).await; + continue; // Continue retry loop + } + Err(err) => { + error!(batch_id = %batch_id, provider_idx, error = %err, "Data fetch failed permanently for provider {}", provider_idx); + break; // Break retry loop, provider failed permanently for this batch + } } } + if batch_option.is_some() { + break; // Break provider loop, batch found + } + // If batch_option is None here, it means the current provider failed permanently for this batch_id + warn!(batch_id = %batch_id, provider_idx, "Provider {} failed, trying next.", provider_idx); + } + + // After trying all providers + let batch = match batch_option { + Some(b) => b, + None => { + error!(batch_id = %batch_id, "Failed to fetch batch after trying all data providers."); + // Decide how to handle this: skip the batch and continue, or stop the task? + // For now, let's skip this batch and try the next assigned ID. + continue; // Continue the outer loop to get the next batch_id + } }; + if tx_next_sample .send(Batch { id: batch_id, @@ -112,12 +151,12 @@ impl DataFetcher { .await .is_err() { - debug!("Data loop finished"); - return; + debug!("Data loop finished because receiver dropped (step {step})."); + return; // Receiver is gone, stop the task } } } - .instrument(trace_span!("fetch_data")) + .instrument(trace_span!("fetch_data", step = step)) // Add step to span }), )); diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 1fea00be5..136be83e6 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -485,7 +485,10 @@ impl RunInitConfigAndIO::new(data_provider, init_config.data_parallelism * 2); + DataFetcher::::new(vec![ + DataProvider::Dummy(DummyDataProvider::new(TokenSize::TwoBytes, 2, 10)), + data_provider, + ], init_config.data_parallelism * 2); let data_parallel: Option, Arc)>> = if init_config.data_parallelism > 1 { diff --git a/shared/client/src/state/train.rs 
b/shared/client/src/state/train.rs index 6fd017aa8..a610aee45 100644 --- a/shared/client/src/state/train.rs +++ b/shared/client/src/state/train.rs @@ -286,6 +286,8 @@ impl TrainingStepMetadata }; let finished = finished.clone(); + + info!("fetching data!!!"); let TrainingDataForStep { step, mut next_sample, From 049c54294c4fe47ed15efa104194d1cb1f3c636e Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 30 Apr 2025 17:37:51 -0300 Subject: [PATCH 02/33] save latest working data provider to use next time --- shared/client/src/fetch_data.rs | 49 +++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/shared/client/src/fetch_data.rs b/shared/client/src/fetch_data.rs index e0dc0f184..e2b6f27f9 100644 --- a/shared/client/src/fetch_data.rs +++ b/shared/client/src/fetch_data.rs @@ -14,7 +14,7 @@ use tokio::{ task::JoinHandle, time::sleep, }; -use tracing::{debug, error, trace, trace_span, warn, info, Instrument}; +use tracing::{debug, error, info, trace, trace_span, warn, Instrument}; pub type BatchStep = u32; pub type BatchIdSet = HashSet; @@ -26,18 +26,21 @@ pub struct DataFetcher { data_providers: Vec>>>, active_fetch_task: Option<(BatchStep, JoinHandle<()>)>, buffer_size: usize, + last_successful_provider_idx: Arc>, // Store the index of the last successful provider _phantom: PhantomData, } impl DataFetcher { pub fn new(data_providers: Vec>, buffer_size: usize) -> Self { + assert!(!data_providers.is_empty(), "Must provide at least one data provider"); Self { data_providers: data_providers - .into_iter() // Use into_iter to consume the input vector - .map(|dp| Arc::new(Mutex::new(dp))) // No need for clone here + .into_iter() + .map(|dp| Arc::new(Mutex::new(dp))) .collect(), active_fetch_task: None, buffer_size, + last_successful_provider_idx: Arc::new(Mutex::new(0)), // Start with the first provider _phantom: Default::default(), } } @@ -72,15 +75,21 @@ impl DataFetcher { step, tokio::spawn({ trace!("New fetch task for 
step {step} has been spawned"); - let data_providers = self.data_providers.clone(); // Clone the Arc vector for the async task + let data_providers = self.data_providers.clone(); + let last_successful_provider_idx = self.last_successful_provider_idx.clone(); // Clone Arc for the task async move { + let num_providers = data_providers.len(); + if num_providers == 0 { + error!("No data providers configured."); + return; + } + loop { let batch_id = { match assigned_batch_ids.pop() { Some(assigned) => assigned, None => { - // out of assigned data! debug!("No more assigned batch IDs for step {step}."); return; } @@ -88,7 +97,13 @@ impl DataFetcher { }; let mut batch_option = None; - for (provider_idx, data_provider) in data_providers.iter().enumerate() { + let start_idx = *last_successful_provider_idx.lock().await; // Read the last successful index + + // Iterate through providers, starting from the last successful one and wrapping around + for i in 0..num_providers { + let provider_idx = (start_idx + i) % num_providers; + let data_provider = &data_providers[provider_idx]; + info!(batch_id = %batch_id, provider_idx, "Attempting fetch with provider {}", provider_idx); let mut retry_count = 0; loop { @@ -96,6 +111,8 @@ impl DataFetcher { Ok(batch) => { info!(batch_id = %batch_id, provider_idx, "Successfully fetched batch with provider {}", provider_idx); batch_option = Some(batch); + // Update the last successful index + *last_successful_provider_idx.lock().await = provider_idx; break; // Break retry loop, batch found }, Err(err) if retry_count < MAX_RETRIES => { @@ -103,7 +120,7 @@ impl DataFetcher { // Use exponential backoff with full jitter let delay_ms = BASE_DELAY_MS * 2u64.pow(retry_count - 1); let jitter = rand::random::() % delay_ms; - let final_delay = Duration::from_millis(delay_ms / 2 + jitter); // Example: Full Jitter + let final_delay = Duration::from_millis(delay_ms / 2 + jitter); warn!( batch_id = %batch_id, @@ -123,26 +140,24 @@ impl DataFetcher { break; 
// Break retry loop, provider failed permanently for this batch } } - } + } // End retry loop + if batch_option.is_some() { - break; // Break provider loop, batch found + break; // Break provider loop (for i in 0..num_providers), batch found } // If batch_option is None here, it means the current provider failed permanently for this batch_id - warn!(batch_id = %batch_id, provider_idx, "Provider {} failed, trying next.", provider_idx); - } + warn!(batch_id = %batch_id, provider_idx, "Provider {} failed permanently for this batch, trying next.", provider_idx); + } // End provider loop // After trying all providers let batch = match batch_option { Some(b) => b, None => { error!(batch_id = %batch_id, "Failed to fetch batch after trying all data providers."); - // Decide how to handle this: skip the batch and continue, or stop the task? - // For now, let's skip this batch and try the next assigned ID. - continue; // Continue the outer loop to get the next batch_id + continue; // Skip this batch and try the next assigned ID } }; - if tx_next_sample .send(Batch { id: batch_id, @@ -154,9 +169,9 @@ impl DataFetcher { debug!("Data loop finished because receiver dropped (step {step})."); return; // Receiver is gone, stop the task } - } + } // End main loop } - .instrument(trace_span!("fetch_data", step = step)) // Add step to span + .instrument(trace_span!("fetch_data", step = step)) }), )); From e4cde0c9e07fc0f3b9026b6fc0287b353921c762 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 30 Apr 2025 18:37:00 -0300 Subject: [PATCH 03/33] allow defining backup data providers in config --- architectures/centralized/server/src/app.rs | 4 +- config/solana-test/light-config.toml | 15 ++-- shared/client/src/state/init.rs | 80 +++++++++++---------- shared/coordinator/src/model.rs | 48 +++++++------ 4 files changed, 75 insertions(+), 72 deletions(-) diff --git a/architectures/centralized/server/src/app.rs b/architectures/centralized/server/src/app.rs index 2260ca052..eafaf3bbd 
100644 --- a/architectures/centralized/server/src/app.rs +++ b/architectures/centralized/server/src/app.rs @@ -179,11 +179,13 @@ impl App { let training_data_server = match &coordinator.model { Model::LLM(LLM { - data_location, + data_locations, data_type, checkpoint, .. }) => { + // If you fail with the 1st data provider at the start don't even bother trying with the fallbacks + let data_location = &data_locations[0]; if let LLMTrainingDataType::Finetuning = data_type { panic!("Finetuning is not supported yet.") } diff --git a/config/solana-test/light-config.toml b/config/solana-test/light-config.toml index ad0a7adc1..038337efd 100644 --- a/config/solana-test/light-config.toml +++ b/config/solana-test/light-config.toml @@ -19,16 +19,11 @@ data_type = "Pretraining" max_seq_len = 2048 cold_start_warmup_steps = 0 -[model.LLM.checkpoint.Hub] -repo_id = "emozilla/llama2-20m-init" - -[model.LLM.data_location.Http] -token_size_in_bytes = "TwoBytes" -shuffle = "DontShuffle" - -[model.LLM.data_location.Http.location.Gcp] -bucket_name = "nous-pretraining-public-us" -filter_directory = "fineweb-edu-tokenized-llama2" +checkpoint = { Hub = { repo_id = "emozilla/llama2-20m-init" } } +data_locations = [ + { Dummy = {} }, + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } +] [model.LLM.lr_schedule.Cosine] base_lr = 4.0e-4 diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 136be83e6..2a0a9e0d7 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -153,42 +153,47 @@ impl RunInitConfigAndIO DataProvider::Server( - DataProviderTcpClient::connect( - (&data_server).into(), - init_config.network_identity, - init_config.private_key, - ) - .await?, - ), - LLMTrainingDataLocation::Local(_) => todo!(), - LLMTrainingDataLocation::Dummy => { - 
DataProvider::Dummy(DummyDataProvider::new(TokenSize::TwoBytes, 2048, u64::MAX)) - } - LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { - location, - token_size_in_bytes, - shuffle, - }) => { - let file_urls = FileURLs::from_location(&location).await?; - DataProvider::Http(HttpDataProvider::new( - file_urls, + debug!("Setting up data providers from {:?}", llm.data_locations); + let mut data_providers = Vec::new(); + + for data_location in llm.data_locations.iter() { + let provider = match data_location { + LLMTrainingDataLocation::Server(data_server) => DataProvider::Server( + DataProviderTcpClient::connect( + data_server.into(), + init_config.network_identity.clone(), + init_config.private_key.clone(), + ) + .await?, + ), + LLMTrainingDataLocation::Local(_) => todo!(), + LLMTrainingDataLocation::Dummy => { + DataProvider::Dummy(DummyDataProvider::new(TokenSize::TwoBytes, 2048, u64::MAX)) + } + LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { + location, token_size_in_bytes, - llm.max_seq_len, shuffle, - )?) - } - LLMTrainingDataLocation::WeightedHttp(config_url) => DataProvider::WeightedHttp( - WeightedDataProvider::::from_config_url( - &String::from(&config_url), - llm.max_seq_len, - ) - .await?, - ), - }; - Ok(data_provider) + }) => { + let file_urls = FileURLs::from_location(&location).await?; + DataProvider::Http(HttpDataProvider::new( + file_urls, + *token_size_in_bytes, + llm.max_seq_len, + *shuffle, + )?) 
+ } + LLMTrainingDataLocation::WeightedHttp(config_url) => DataProvider::WeightedHttp( + WeightedDataProvider::::from_config_url( + &String::from(config_url), + llm.max_seq_len, + ) + .await?, + ), + }; + data_providers.push(provider); + } + Ok(data_providers) }; let model_future: JoinHandle> = match &llm.architecture @@ -482,13 +487,10 @@ impl RunInitConfigAndIO::new(vec![ - DataProvider::Dummy(DummyDataProvider::new(TokenSize::TwoBytes, 2, 10)), - data_provider, - ], init_config.data_parallelism * 2); + DataFetcher::::new(data_providers, init_config.data_parallelism * 2); let data_parallel: Option, Arc)>> = if init_config.data_parallelism > 1 { diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index d19dff2d7..cf2351619 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -181,17 +181,18 @@ pub struct LLM { pub architecture: LLMArchitecture, pub checkpoint: Checkpoint, pub data_type: LLMTrainingDataType, - pub data_location: LLMTrainingDataLocation, + pub data_locations: FixedVec, pub lr_schedule: LearningRateSchedule, pub optimizer: OptimizerDefinition, } impl LLM { pub fn dummy() -> Self { + let data_locations: FixedVec = FixedVec::new(); Self { architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), - data_location: LLMTrainingDataLocation::default(), + data_locations, data_type: LLMTrainingDataType::Pretraining, lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), max_seq_len: 2048, @@ -269,27 +270,30 @@ impl Model { return false; } - let bad_data_location = match llm.data_location { - LLMTrainingDataLocation::Dummy => false, - LLMTrainingDataLocation::Server(url) => url.is_empty(), - LLMTrainingDataLocation::Local(_) => false, - LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { - location, .. 
- }) => match location { - HttpTrainingDataLocation::SingleUrl(url) => url.is_empty(), - HttpTrainingDataLocation::NumberedFiles { - url_template, - num_files, - .. - } => url_template.is_empty() || num_files == 0, - HttpTrainingDataLocation::Gcp { bucket_name, .. } => bucket_name.is_empty(), - }, - LLMTrainingDataLocation::WeightedHttp(url) => url.is_empty(), - }; - if bad_data_location { - msg!("model check failed: bad LLM training data location."); - return false; + for data_location in llm.data_locations.iter() { + let bad_data_location = match data_location { + LLMTrainingDataLocation::Dummy => false, + LLMTrainingDataLocation::Server(url) => url.is_empty(), + LLMTrainingDataLocation::Local(_) => false, + LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { + location, .. + }) => match location { + HttpTrainingDataLocation::SingleUrl(url) => url.is_empty(), + HttpTrainingDataLocation::NumberedFiles { + url_template, + num_files, + .. + } => url_template.is_empty() || *num_files == 0, + HttpTrainingDataLocation::Gcp { bucket_name, .. } => bucket_name.is_empty(), + }, + LLMTrainingDataLocation::WeightedHttp(url) => url.is_empty(), + }; + if bad_data_location { + msg!("model check failed: bad LLM training data location."); + return false; + } } + let bad_checkpoint = match llm.checkpoint { Checkpoint::Dummy(_hub_repo) => false, Checkpoint::Ephemeral => true, From 0ea2ef753a5bcb0fa0f27a3ddee4f34f1bc469dc Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Mon, 5 May 2025 16:24:34 -0300 Subject: [PATCH 04/33] Add test_backup_data_provider This modifies some of the syntax in the config files as now you can specify if you want a working Dummy data provider or a failing one. 
--- .../testing/src/docker_watcher.rs | 22 +++++ .../testing/tests/integration_tests.rs | 96 +++++++++++++++++++ .../light-config-dummy-failing.toml | 40 ++++++++ config/solana-test/light-config.toml | 1 - config/solana-test/light-two-min-clients.toml | 11 +-- shared/client/src/fetch_data.rs | 23 ++++- shared/client/src/state/init.rs | 24 ++--- shared/client/src/state/train.rs | 2 - shared/client/src/testing.rs | 6 ++ shared/coordinator/src/model.rs | 33 ++++++- shared/data-provider/src/dummy.rs | 7 ++ 11 files changed, 236 insertions(+), 29 deletions(-) create mode 100644 config/solana-test/light-config-dummy-failing.toml diff --git a/architectures/decentralized/testing/src/docker_watcher.rs b/architectures/decentralized/testing/src/docker_watcher.rs index bf0a78772..856a41a25 100644 --- a/architectures/decentralized/testing/src/docker_watcher.rs +++ b/architectures/decentralized/testing/src/docker_watcher.rs @@ -29,6 +29,8 @@ pub enum Response { SolanaSubscription(String, String), WitnessElected(String), Error(ObservedErrorKind, String), + DataProviderFetchSuccess(u64), + DataProviderFetchError(u64), } #[derive(thiserror::Error, Debug)] @@ -310,6 +312,26 @@ impl DockerWatcher { println!("Probably the test ended so we drop the log sender"); } } + IntegrationTestLogMarker::DataProviderFetchSuccess => { + let provider_idx = parsed_log + .get("provider_idx") + .and_then(|v| v.as_u64()) + .unwrap(); + let response = Response::DataProviderFetchSuccess(provider_idx); + if log_sender.send(response).await.is_err() { + println!("Probably the test ended so we drop the log sender"); + } + } + IntegrationTestLogMarker::DataProviderFetchError => { + let provider_idx = parsed_log + .get("provider_idx") + .and_then(|v| v.as_u64()) + .unwrap(); + let response = Response::DataProviderFetchError(provider_idx); + if log_sender.send(response).await.is_err() { + println!("Probably the test ended so we drop the log sender"); + } + } } } Ok(()) diff --git 
a/architectures/decentralized/testing/tests/integration_tests.rs b/architectures/decentralized/testing/tests/integration_tests.rs index bb653f979..e2241e36d 100644 --- a/architectures/decentralized/testing/tests/integration_tests.rs +++ b/architectures/decentralized/testing/tests/integration_tests.rs @@ -1023,3 +1023,99 @@ async fn test_lost_only_peer_go_back_to_hub_checkpoint() { } } } + +/// spawn 2 clients and run for 3 epochs but the first defined data provider fails +/// this tests checks that the logic for retrying the failing data provider and switching to the new is working +#[test_log::test(tokio::test(flavor = "multi_thread"))] +#[serial] +async fn test_backup_data_provider() { + let run_id = "test".to_string(); + let mut saw_provider_0_error = false; + let mut successful_fetches_after_error = 0; + let mut current_epoch = -1; + + let docker = Arc::new(Docker::connect_with_socket_defaults().unwrap()); + let mut watcher = DockerWatcher::new(docker.clone()); + + let _cleanup = e2e_testing_setup( + docker.clone(), + 2, + Some(PathBuf::from( + "../../config/solana-test/light-config-dummy-failing.toml", + )), + ) + .await; + + let _monitor_client_1 = watcher + .monitor_container( + &format!("{CLIENT_CONTAINER_PREFIX}-1"), + vec![ + IntegrationTestLogMarker::Loss, + IntegrationTestLogMarker::DataProviderFetchError, + IntegrationTestLogMarker::DataProviderFetchSuccess, + ], + ) + .unwrap(); + let _monitor_client_2 = watcher + .monitor_container( + &format!("{CLIENT_CONTAINER_PREFIX}-2"), + vec![ + IntegrationTestLogMarker::Loss, + IntegrationTestLogMarker::DataProviderFetchError, + IntegrationTestLogMarker::DataProviderFetchSuccess, + ], + ) + .unwrap(); + + // Initialize solana client to query the coordinator state + let solana_client = SolanaTestClient::new(run_id).await; + let mut live_interval = time::interval(Duration::from_secs(10)); + + loop { + tokio::select! 
{ + _ = live_interval.tick() => { + if let Err(e) = watcher.monitor_clients_health(2).await { + panic!("{}", e); + } + } + response = watcher.log_rx.recv() => { + match response { + Some(Response::DataProviderFetchError(provider_idx)) => { + println!("Data provider {} fetch error", provider_idx); + if provider_idx == 0 { + saw_provider_0_error = true; + } + } + Some(Response::DataProviderFetchSuccess(provider_idx)) => { + println!("Data provider {} fetch success", provider_idx); + if provider_idx == 1 && saw_provider_0_error { + successful_fetches_after_error += 1; + println!("Successful fetch {} after error", successful_fetches_after_error); + if successful_fetches_after_error >= 2 { + println!("Saw 2 successful fetches after error, test successful!"); + return; + } + } + } + Some(Response::Loss(client, epoch, step, loss)) => { + println!( + "client: {:?}, epoch: {}, step: {}, Loss: {:?}", + client, epoch, step, loss + ); + // assert that the loss decreases each epoch + if epoch as i64 > current_epoch { + current_epoch = epoch as i64; + + if epoch > 1 { + assert!(saw_provider_0_error, "Should have seen error from provider 0"); + assert!(successful_fetches_after_error >= 2, "Should have seen successful fetch after error"); + return; + } + } + } + _ => {} + } + } + } + } +} diff --git a/config/solana-test/light-config-dummy-failing.toml b/config/solana-test/light-config-dummy-failing.toml new file mode 100644 index 000000000..8029cc0e5 --- /dev/null +++ b/config/solana-test/light-config-dummy-failing.toml @@ -0,0 +1,40 @@ +[config] +warmup_time = 30 +cooldown_time = 30 +rounds_per_epoch = 20 +max_round_train_time = 30 +round_witness_time = 1 +min_clients = 1 +init_min_clients = 1 +verification_percent = 0 +witness_nodes = 1 +global_batch_size_start = 8 +global_batch_size_end = 8 +global_batch_size_warmup_tokens = 0 +total_steps = 25000 + +[model.LLM] +architecture = "HfLlama" +data_type = "Pretraining" +max_seq_len = 2048 +cold_start_warmup_steps = 0 +data_locations 
= [ + { Dummy = "Failing" }, + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } +] +[model.LLM.checkpoint.Hub] +repo_id = "emozilla/llama2-20m-init" + +[model.LLM.lr_schedule.Cosine] +base_lr = 4.0e-4 +warmup_steps = 250 +warmup_init_lr = 0.0 +total_steps = 25000 +final_lr = 4.0e-5 + +[model.LLM.optimizer.Distro] +clip_grad_norm = 1.0 +compression_decay = 0.999 +compression_chunk = 64 +compression_topk = 8 +quantize_1bit = true diff --git a/config/solana-test/light-config.toml b/config/solana-test/light-config.toml index 038337efd..29448da5b 100644 --- a/config/solana-test/light-config.toml +++ b/config/solana-test/light-config.toml @@ -21,7 +21,6 @@ cold_start_warmup_steps = 0 checkpoint = { Hub = { repo_id = "emozilla/llama2-20m-init" } } data_locations = [ - { Dummy = {} }, { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } ] diff --git a/config/solana-test/light-two-min-clients.toml b/config/solana-test/light-two-min-clients.toml index 0ff0834b7..0d13afedf 100644 --- a/config/solana-test/light-two-min-clients.toml +++ b/config/solana-test/light-two-min-clients.toml @@ -19,16 +19,13 @@ data_type = "Pretraining" max_seq_len = 2048 cold_start_warmup_steps = 0 -[model.LLM.checkpoint.Hub] -repo_id = "emozilla/llama2-20m-init" - -[model.LLM.data_location.Http] -token_size_in_bytes = "TwoBytes" -shuffle = "DontShuffle" - [model.LLM.data_location.Http.location.Gcp] bucket_name = "nous-pretraining-public-us" filter_directory = "fineweb-edu-tokenized-llama2" +checkpoint = { Hub = { repo_id = "emozilla/llama2-20m-init" } } +data_locations = [ + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, 
token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } +] [model.LLM.lr_schedule.Cosine] base_lr = 4.0e-4 diff --git a/shared/client/src/fetch_data.rs b/shared/client/src/fetch_data.rs index e2b6f27f9..16a819b67 100644 --- a/shared/client/src/fetch_data.rs +++ b/shared/client/src/fetch_data.rs @@ -16,6 +16,8 @@ use tokio::{ }; use tracing::{debug, error, info, trace, trace_span, warn, Instrument}; +use crate::IntegrationTestLogMarker; + pub type BatchStep = u32; pub type BatchIdSet = HashSet; @@ -32,7 +34,10 @@ pub struct DataFetcher { impl DataFetcher { pub fn new(data_providers: Vec>, buffer_size: usize) -> Self { - assert!(!data_providers.is_empty(), "Must provide at least one data provider"); + assert!( + !data_providers.is_empty(), + "Must provide at least one data provider" + ); Self { data_providers: data_providers .into_iter() @@ -109,7 +114,12 @@ impl DataFetcher { loop { match data_provider.lock().await.get_samples(batch_id).await { Ok(batch) => { - info!(batch_id = %batch_id, provider_idx, "Successfully fetched batch with provider {}", provider_idx); + info!( + integration_test_log_marker = %IntegrationTestLogMarker::DataProviderFetchSuccess, + batch_id = %batch_id, + provider_idx, + "Successfully fetched batch with provider", + ); batch_option = Some(batch); // Update the last successful index *last_successful_provider_idx.lock().await = provider_idx; @@ -136,7 +146,14 @@ impl DataFetcher { continue; // Continue retry loop } Err(err) => { - error!(batch_id = %batch_id, provider_idx, error = %err, "Data fetch failed permanently for provider {}", provider_idx); + error!( + integration_test_log_marker = %IntegrationTestLogMarker::DataProviderFetchError, + batch_id = %batch_id, + provider_idx, + error = %err, + "Data fetch failed permanently for provider {}", + provider_idx + ); break; // Break retry loop, provider failed permanently for this batch } } diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 
2a0a9e0d7..4d1fb76a0 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -155,7 +155,7 @@ impl RunInitConfigAndIO DataProvider::Server( @@ -167,15 +167,15 @@ impl RunInitConfigAndIO todo!(), - LLMTrainingDataLocation::Dummy => { - DataProvider::Dummy(DummyDataProvider::new(TokenSize::TwoBytes, 2048, u64::MAX)) - } + LLMTrainingDataLocation::Dummy(dummy_type) => DataProvider::Dummy( + DummyDataProvider::new(TokenSize::TwoBytes, 2048, u64::MAX, *dummy_type), + ), LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { location, token_size_in_bytes, shuffle, }) => { - let file_urls = FileURLs::from_location(&location).await?; + let file_urls = FileURLs::from_location(location).await?; DataProvider::Http(HttpDataProvider::new( file_urls, *token_size_in_bytes, @@ -183,13 +183,15 @@ impl RunInitConfigAndIO DataProvider::WeightedHttp( - WeightedDataProvider::::from_config_url( - &String::from(config_url), - llm.max_seq_len, + LLMTrainingDataLocation::WeightedHttp(config_url) => { + DataProvider::WeightedHttp( + WeightedDataProvider::::from_config_url( + &String::from(config_url), + llm.max_seq_len, + ) + .await?, ) - .await?, - ), + } }; data_providers.push(provider); } diff --git a/shared/client/src/state/train.rs b/shared/client/src/state/train.rs index a610aee45..6fd017aa8 100644 --- a/shared/client/src/state/train.rs +++ b/shared/client/src/state/train.rs @@ -286,8 +286,6 @@ impl TrainingStepMetadata }; let finished = finished.clone(); - - info!("fetching data!!!"); let TrainingDataForStep { step, mut next_sample, diff --git a/shared/client/src/testing.rs b/shared/client/src/testing.rs index 630308035..7b9202a5d 100644 --- a/shared/client/src/testing.rs +++ b/shared/client/src/testing.rs @@ -10,6 +10,8 @@ pub enum IntegrationTestLogMarker { SolanaSubscription, WitnessElected, Error, + DataProviderFetchSuccess, + DataProviderFetchError, } impl std::fmt::Display for IntegrationTestLogMarker { @@ -26,6 +28,8 @@ impl std::fmt::Display 
for IntegrationTestLogMarker { Self::SolanaSubscription => "solana_subscription", Self::WitnessElected => "witness_elected", Self::Error => "error", + Self::DataProviderFetchSuccess => "data_provider_fetch_success", + Self::DataProviderFetchError => "data_provider_fetch_error", } ) } @@ -44,6 +48,8 @@ impl FromStr for IntegrationTestLogMarker { "solana_subscription" => Self::SolanaSubscription, "witness_elected" => Self::WitnessElected, "error" => Self::Error, + "data_provider_fetch_success" => Self::DataProviderFetchSuccess, + "data_provider_fetch_error" => Self::DataProviderFetchError, _ => return Err(()), }) } diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index cf2351619..a5e4ab22d 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -73,7 +73,7 @@ pub enum LLMTrainingDataType { #[repr(C)] #[allow(clippy::large_enum_variant)] pub enum LLMTrainingDataLocation { - Dummy, + Dummy(DummyType), Server(FixedString<{ SOLANA_MAX_STRING_LEN }>), Local(FixedString<{ SOLANA_MAX_URL_STRING_LEN }>), Http(HttpLLMTrainingDataLocation), @@ -81,9 +81,29 @@ pub enum LLMTrainingDataLocation { WeightedHttp(FixedString<{ SOLANA_MAX_URL_STRING_LEN }>), } +#[derive( + AnchorSerialize, + AnchorDeserialize, + InitSpace, + Serialize, + Deserialize, + Clone, + Debug, + Zeroable, + Copy, + TS, + PartialEq, + Eq, +)] +#[repr(C)] +pub enum DummyType { + Working, + Failing, +} + impl Default for LLMTrainingDataLocation { fn default() -> Self { - Self::Dummy + Self::Dummy(DummyType::Working) } } @@ -272,11 +292,12 @@ impl Model { for data_location in llm.data_locations.iter() { let bad_data_location = match data_location { - LLMTrainingDataLocation::Dummy => false, + LLMTrainingDataLocation::Dummy(_) => false, LLMTrainingDataLocation::Server(url) => url.is_empty(), LLMTrainingDataLocation::Local(_) => false, LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { - location, .. + location, + .. 
}) => match location { HttpTrainingDataLocation::SingleUrl(url) => url.is_empty(), HttpTrainingDataLocation::NumberedFiles { @@ -284,7 +305,9 @@ impl Model { num_files, .. } => url_template.is_empty() || *num_files == 0, - HttpTrainingDataLocation::Gcp { bucket_name, .. } => bucket_name.is_empty(), + HttpTrainingDataLocation::Gcp { bucket_name, .. } => { + bucket_name.is_empty() + } }, LLMTrainingDataLocation::WeightedHttp(url) => url.is_empty(), }; diff --git a/shared/data-provider/src/dummy.rs b/shared/data-provider/src/dummy.rs index 050518287..9838a4fa5 100644 --- a/shared/data-provider/src/dummy.rs +++ b/shared/data-provider/src/dummy.rs @@ -1,11 +1,13 @@ use crate::{traits::TokenizedDataProvider, LengthKnownDataProvider}; use anyhow::{bail, Result}; +use psyche_coordinator::model::DummyType; use psyche_core::{BatchId, TokenSize}; pub struct DummyDataProvider { seq_len: usize, token_size_in_bytes: TokenSize, num_sequences: u64, + dummy_type: DummyType, } impl DummyDataProvider { @@ -13,11 +15,13 @@ impl DummyDataProvider { token_size_in_bytes: TokenSize, num_tokens_per_sequence: usize, // num tokens per sequence num_sequences: u64, + dummy_type: DummyType, ) -> Self { Self { seq_len: num_tokens_per_sequence, token_size_in_bytes, num_sequences, + dummy_type, } } @@ -45,6 +49,9 @@ impl DummyDataProvider { impl TokenizedDataProvider for DummyDataProvider { async fn get_samples(&mut self, data_ids: BatchId) -> Result>> { + if self.dummy_type == DummyType::Failing { + return Err(anyhow::anyhow!("DummyDataProvider dummy error")); + } for id in data_ids.iter() { if id > self.num_sequences { bail!("id {id} > self.num_sequences {}", self.num_sequences) From 4ccbdf7b12f85ca71482f016f9b63e28cbb5fe22 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Tue, 6 May 2025 17:07:11 -0300 Subject: [PATCH 05/33] make DataFetcher::new use anyhow errors --- shared/client/src/fetch_data.rs | 14 +++++++------- shared/client/src/state/init.rs | 2 +- 2 files changed, 8 insertions(+), 
8 deletions(-) diff --git a/shared/client/src/fetch_data.rs b/shared/client/src/fetch_data.rs index 16a819b67..e1e9f4698 100644 --- a/shared/client/src/fetch_data.rs +++ b/shared/client/src/fetch_data.rs @@ -1,3 +1,4 @@ +use anyhow::{bail, Result}; use psyche_coordinator::{get_batch_ids_for_node, Coordinator}; use psyche_core::{BatchId, NodeIdentity}; use psyche_data_provider::{DataProvider, TokenizedDataProvider}; @@ -33,12 +34,11 @@ pub struct DataFetcher { } impl DataFetcher { - pub fn new(data_providers: Vec>, buffer_size: usize) -> Self { - assert!( - !data_providers.is_empty(), - "Must provide at least one data provider" - ); - Self { + pub fn new(data_providers: Vec>, buffer_size: usize) -> Result { + if data_providers.is_empty() { + bail!("Must provide at least one data provider"); + } + Ok(Self { data_providers: data_providers .into_iter() .map(|dp| Arc::new(Mutex::new(dp))) @@ -47,7 +47,7 @@ impl DataFetcher { buffer_size, last_successful_provider_idx: Arc::new(Mutex::new(0)), // Start with the first provider _phantom: Default::default(), - } + }) } pub fn fetch_data( diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 4d1fb76a0..236e302c7 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -547,7 +547,7 @@ impl RunInitConfigAndIO Date: Wed, 7 May 2025 12:25:39 -0300 Subject: [PATCH 06/33] fix memnet tests --- .../tests/suites/memnet_coordinator_full_cycle.rs | 8 +++++++- .../tests/suites/memnet_treasurer_full_epoch.rs | 11 ++++++++++- shared/coordinator/src/model.rs | 6 ++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_cycle.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_cycle.rs index 2a7f61050..0530cb47e 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_cycle.rs +++ 
b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_cycle.rs @@ -82,6 +82,12 @@ pub async fn run() { RunState::Uninitialized ); + let mut data_locations: FixedVec< + LLMTrainingDataLocation, + MAX_DATA_LOCATIONS, + > = FixedVec::default(); + data_locations.push(LLMTrainingDataLocation::Dummy(DummyType::Working)); + // update the coordinator's model process_update( &mut endpoint, @@ -110,7 +116,7 @@ pub async fn run() { checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - data_location: LLMTrainingDataLocation::default(), + data_locations, lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), optimizer: OptimizerDefinition::Distro { clip_grad_norm: None, diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs index 32442879a..8ecf7e567 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs @@ -1,13 +1,16 @@ use psyche_coordinator::model::Checkpoint; +use psyche_coordinator::model::DummyType; use psyche_coordinator::model::HubRepo; use psyche_coordinator::model::LLMArchitecture; use psyche_coordinator::model::LLMTrainingDataLocation; use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; use psyche_coordinator::model::LLM; +use psyche_coordinator::model::MAX_DATA_LOCATIONS; use psyche_coordinator::CoordinatorConfig; use psyche_coordinator::WitnessProof; use psyche_core::ConstantLR; +use psyche_core::FixedVec; use psyche_core::LearningRateSchedule; use psyche_core::OptimizerDefinition; use psyche_solana_authorizer::logic::AuthorizationGranteeUpdateParams; @@ -180,6 +183,12 @@ pub async fn run() { .await .unwrap(); + let mut data_locations: FixedVec< + 
LLMTrainingDataLocation, + MAX_DATA_LOCATIONS, + > = FixedVec::default(); + data_locations.push(LLMTrainingDataLocation::Dummy(DummyType::Working)); + // Prepare the coordinator's config process_treasurer_run_update( &mut endpoint, @@ -210,7 +219,7 @@ pub async fn run() { checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - data_location: LLMTrainingDataLocation::default(), + data_locations, lr_schedule: LearningRateSchedule::Constant( ConstantLR::default(), ), diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index a5e4ab22d..bc4702ff5 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -191,6 +191,8 @@ pub enum HttpTrainingDataLocation { }, } +pub const MAX_DATA_LOCATIONS: usize = 4; + #[derive( AnchorSerialize, AnchorDeserialize, Serialize, Deserialize, Clone, Debug, Zeroable, Copy, TS, )] @@ -201,14 +203,14 @@ pub struct LLM { pub architecture: LLMArchitecture, pub checkpoint: Checkpoint, pub data_type: LLMTrainingDataType, - pub data_locations: FixedVec, + pub data_locations: FixedVec, pub lr_schedule: LearningRateSchedule, pub optimizer: OptimizerDefinition, } impl LLM { pub fn dummy() -> Self { - let data_locations: FixedVec = FixedVec::new(); + let data_locations: FixedVec = FixedVec::new(); Self { architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), From 7565ce4559babc412e5e53cad94d3700ae5c5b31 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 7 May 2025 12:56:16 -0300 Subject: [PATCH 07/33] update configs to use new data_locations --- .../tests/suites/memnet_coordinator_full_cycle.rs | 7 ++++++- .../tests/suites/memnet_treasurer_full_epoch.rs | 4 +++- .../testing/tests/integration_tests.rs | 3 --- config/solana-test/config-three-clients.toml | 11 +++-------- config/solana-test/config.toml | 15 ++++----------- .../solana-test/light-config-dummy-failing.toml | 1 + 
config/solana-test/light-config.toml | 5 +++-- config/solana-test/light-two-min-clients.toml | 8 +++----- psyche-book/src/enduser/run-config.md | 12 ++++-------- shared/coordinator/src/model.rs | 8 ++++++-- shared/data-provider/tests/weighted.rs | 5 +++-- 11 files changed, 36 insertions(+), 43 deletions(-) diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_cycle.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_cycle.rs index 0530cb47e..354b9a54c 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_cycle.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_cycle.rs @@ -1,14 +1,17 @@ use psyche_coordinator::model::Checkpoint; +use psyche_coordinator::model::DummyType; use psyche_coordinator::model::HubRepo; use psyche_coordinator::model::LLMArchitecture; use psyche_coordinator::model::LLMTrainingDataLocation; use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; use psyche_coordinator::model::LLM; +use psyche_coordinator::model::MAX_DATA_LOCATIONS; use psyche_coordinator::CoordinatorConfig; use psyche_coordinator::RunState; use psyche_coordinator::WitnessProof; use psyche_core::ConstantLR; +use psyche_core::FixedVec; use psyche_core::LearningRateSchedule; use psyche_core::OptimizerDefinition; use psyche_solana_authorizer::logic::AuthorizationGrantorUpdateParams; @@ -86,7 +89,9 @@ pub async fn run() { LLMTrainingDataLocation, MAX_DATA_LOCATIONS, > = FixedVec::default(); - data_locations.push(LLMTrainingDataLocation::Dummy(DummyType::Working)); + data_locations + .push(LLMTrainingDataLocation::Dummy(DummyType::Working)) + .unwrap(); // update the coordinator's model process_update( diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs index 
8ecf7e567..47c3de862 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs @@ -187,7 +187,9 @@ pub async fn run() { LLMTrainingDataLocation, MAX_DATA_LOCATIONS, > = FixedVec::default(); - data_locations.push(LLMTrainingDataLocation::Dummy(DummyType::Working)); + data_locations + .push(LLMTrainingDataLocation::Dummy(DummyType::Working)) + .unwrap(); // Prepare the coordinator's config process_treasurer_run_update( diff --git a/architectures/decentralized/testing/tests/integration_tests.rs b/architectures/decentralized/testing/tests/integration_tests.rs index e2241e36d..0d3b14c60 100644 --- a/architectures/decentralized/testing/tests/integration_tests.rs +++ b/architectures/decentralized/testing/tests/integration_tests.rs @@ -1067,10 +1067,7 @@ async fn test_backup_data_provider() { ) .unwrap(); - // Initialize solana client to query the coordinator state - let solana_client = SolanaTestClient::new(run_id).await; let mut live_interval = time::interval(Duration::from_secs(10)); - loop { tokio::select! 
{ _ = live_interval.tick() => { diff --git a/config/solana-test/config-three-clients.toml b/config/solana-test/config-three-clients.toml index bbe29ad41..932fccdd5 100644 --- a/config/solana-test/config-three-clients.toml +++ b/config/solana-test/config-three-clients.toml @@ -15,18 +15,13 @@ checkpointers = [] architecture = "HfLlama" data_type = "Pretraining" max_seq_len = 2048 +data_locations = [ + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } +] [model.LLM.checkpoint.Hub] repo_id = "emozilla/llama2-20m-init" -[model.LLM.data_location.Http] -token_size_in_bytes = "TwoBytes" -shuffle = "DontShuffle" - -[model.LLM.data_location.Http.location.Gcp] -bucket_name = "nous-pretraining-public-us" -filter_directory = "fineweb-edu-tokenized-llama2" - [model.LLM.lr_schedule.Cosine] base_lr = 4.0e-4 warmup_steps = 250 diff --git a/config/solana-test/config.toml b/config/solana-test/config.toml index aeac198de..a8392b93c 100644 --- a/config/solana-test/config.toml +++ b/config/solana-test/config.toml @@ -18,17 +18,10 @@ architecture = "HfLlama" data_type = "Pretraining" max_seq_len = 2048 cold_start_warmup_steps = 0 - -[model.LLM.checkpoint.Hub] -repo_id = "emozilla/llama2-1.1b-gqa-init" - -[model.LLM.data_location.Http] -token_size_in_bytes = "TwoBytes" -shuffle = "DontShuffle" - -[model.LLM.data_location.Http.location.Gcp] -bucket_name = "nous-pretraining-public-us" -filter_directory = "fineweb-edu-tokenized-llama2" +checkpoint = { Hub = { repo_id = "emozilla/llama2-1.1b-gqa-init" } } +data_locations = [ + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } +] [model.LLM.lr_schedule.Cosine] base_lr = 4.0e-4 diff --git a/config/solana-test/light-config-dummy-failing.toml 
b/config/solana-test/light-config-dummy-failing.toml index 8029cc0e5..a3d840ff1 100644 --- a/config/solana-test/light-config-dummy-failing.toml +++ b/config/solana-test/light-config-dummy-failing.toml @@ -22,6 +22,7 @@ data_locations = [ { Dummy = "Failing" }, { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } ] + [model.LLM.checkpoint.Hub] repo_id = "emozilla/llama2-20m-init" diff --git a/config/solana-test/light-config.toml b/config/solana-test/light-config.toml index 29448da5b..829a40d0e 100644 --- a/config/solana-test/light-config.toml +++ b/config/solana-test/light-config.toml @@ -18,12 +18,13 @@ architecture = "HfLlama" data_type = "Pretraining" max_seq_len = 2048 cold_start_warmup_steps = 0 - -checkpoint = { Hub = { repo_id = "emozilla/llama2-20m-init" } } data_locations = [ { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } ] +[model.LLM.checkpoint.Hub] +repo_id = "emozilla/llama2-20m-init" + [model.LLM.lr_schedule.Cosine] base_lr = 4.0e-4 warmup_steps = 250 diff --git a/config/solana-test/light-two-min-clients.toml b/config/solana-test/light-two-min-clients.toml index 0d13afedf..a770d0822 100644 --- a/config/solana-test/light-two-min-clients.toml +++ b/config/solana-test/light-two-min-clients.toml @@ -18,15 +18,13 @@ architecture = "HfLlama" data_type = "Pretraining" max_seq_len = 2048 cold_start_warmup_steps = 0 - -[model.LLM.data_location.Http.location.Gcp] -bucket_name = "nous-pretraining-public-us" -filter_directory = "fineweb-edu-tokenized-llama2" -checkpoint = { Hub = { repo_id = "emozilla/llama2-20m-init" } } data_locations = [ { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, 
token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } ] +[model.LLM.checkpoint.Hub] +repo_id = "emozilla/llama2-20m-init" + [model.LLM.lr_schedule.Cosine] base_lr = 4.0e-4 warmup_steps = 250 diff --git a/psyche-book/src/enduser/run-config.md b/psyche-book/src/enduser/run-config.md index 299ab98ea..d225fc666 100644 --- a/psyche-book/src/enduser/run-config.md +++ b/psyche-book/src/enduser/run-config.md @@ -66,18 +66,14 @@ total_steps = 25000 architecture = "HfLlama" data_type = "Pretraining" max_seq_len = 2048 +# You may define more than one data location, to use as backup +data_locations = [ + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } +] [model.LLM.checkpoint.Hub] repo_id = "emozilla/llama2-20m-init" -[model.LLM.data_location.Http] -token_size_in_bytes = "TwoBytes" -shuffle = "DontShuffle" - -[model.LLM.data_location.Http.location.Gcp] -bucket_name = "nous-pretraining-public-us" -filter_directory = "fineweb-edu-tokenized-llama2" - [model.LLM.lr_schedule.Cosine] base_lr = 4.0e-4 warmup_steps = 250 diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index bc4702ff5..8b19c49e3 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -203,14 +203,18 @@ pub struct LLM { pub architecture: LLMArchitecture, pub checkpoint: Checkpoint, pub data_type: LLMTrainingDataType, - pub data_locations: FixedVec, + pub data_locations: FixedVec, pub lr_schedule: LearningRateSchedule, pub optimizer: OptimizerDefinition, } impl LLM { pub fn dummy() -> Self { - let data_locations: FixedVec = FixedVec::new(); + let mut data_locations: FixedVec = + FixedVec::new(); + data_locations + .push(LLMTrainingDataLocation::Dummy(DummyType::Working)) + .unwrap(); Self { architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), diff --git 
a/shared/data-provider/tests/weighted.rs b/shared/data-provider/tests/weighted.rs index 24a736985..f0ada7fcd 100644 --- a/shared/data-provider/tests/weighted.rs +++ b/shared/data-provider/tests/weighted.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use psyche_coordinator::model::DummyType; use psyche_core::{BatchId, ClosedInterval, Shuffle, TokenSize}; use psyche_data_provider::{ DummyDataProvider, LengthKnownDataProvider, TokenizedDataProvider, WeightedDataProvider, @@ -184,8 +185,8 @@ async fn test_weighted_data_provider_consistency() -> Result<()> { #[test(tokio::test)] async fn test_weighted_data_provider_with_dummy_provider() -> Result<()> { - let dummy1 = DummyDataProvider::new(TokenSize::TwoBytes, 10, 50); // 10 tokens per sequence - let dummy2 = DummyDataProvider::new(TokenSize::TwoBytes, 10, 50); + let dummy1 = DummyDataProvider::new(TokenSize::TwoBytes, 10, 50, DummyType::Working); // 10 tokens per sequence + let dummy2 = DummyDataProvider::new(TokenSize::TwoBytes, 10, 50, DummyType::Working); let mut weighted_provider = WeightedDataProvider::new( vec![(dummy1, 0.5), (dummy2, 0.5)], From b3237adf2dcf1f2c7345a0f9968058a1a8318c5b Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 7 May 2025 14:59:37 -0300 Subject: [PATCH 08/33] centralized-server: throw error if no data providers present --- architectures/centralized/server/src/app.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/architectures/centralized/server/src/app.rs b/architectures/centralized/server/src/app.rs index eafaf3bbd..389aaab0e 100644 --- a/architectures/centralized/server/src/app.rs +++ b/architectures/centralized/server/src/app.rs @@ -185,7 +185,7 @@ impl App { .. 
}) => { // If you fail with the 1st data provider at the start don't even bother trying with the fallbacks - let data_location = &data_locations[0]; + let data_location = data_locations.get(0).ok_or_else(|| anyhow!("No data location provided"))?; if let LLMTrainingDataType::Finetuning = data_type { panic!("Finetuning is not supported yet.") } From e410055c17b348536a6c0b7b0dee86d10a051d60 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Thu, 8 May 2025 16:52:43 -0300 Subject: [PATCH 09/33] Simplify and shorten fetch retries --- shared/client/src/fetch_data.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/shared/client/src/fetch_data.rs b/shared/client/src/fetch_data.rs index e1e9f4698..44a15d84a 100644 --- a/shared/client/src/fetch_data.rs +++ b/shared/client/src/fetch_data.rs @@ -22,8 +22,8 @@ use crate::IntegrationTestLogMarker; pub type BatchStep = u32; pub type BatchIdSet = HashSet; -const MAX_RETRIES: u32 = 7; -const BASE_DELAY_MS: u64 = 2000; +const MAX_RETRIES: u32 = 4; +const BASE_DELAY_MS: u64 = 500; pub struct DataFetcher { data_providers: Vec>>>, @@ -127,10 +127,8 @@ impl DataFetcher { }, Err(err) if retry_count < MAX_RETRIES => { retry_count += 1; - // Use exponential backoff with full jitter let delay_ms = BASE_DELAY_MS * 2u64.pow(retry_count - 1); - let jitter = rand::random::() % delay_ms; - let final_delay = Duration::from_millis(delay_ms / 2 + jitter); + let delay_ms = Duration::from_millis(delay_ms / 2); warn!( batch_id = %batch_id, @@ -138,11 +136,10 @@ impl DataFetcher { attempt = retry_count, max_retries = MAX_RETRIES, error = %err, - delay_ms = final_delay.as_millis(), "Data fetch error with provider {}. 
Retrying in {}ms", - provider_idx, final_delay.as_millis() + provider_idx, delay_ms.as_millis() ); - sleep(final_delay).await; + sleep(delay_ms).await; continue; // Continue retry loop } Err(err) => { From c26c77332b529fda6d3d93fd3b086e3ae0aa1079 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 18 Jun 2025 15:24:08 -0300 Subject: [PATCH 10/33] nix fmt --- .../decentralized/testing/tests/integration_tests.rs | 1 - config/solana-test/config-three-clients.toml | 2 +- config/solana-test/config.toml | 2 +- config/solana-test/light-config-dummy-failing.toml | 4 ++-- config/solana-test/light-config.toml | 2 +- config/solana-test/light-two-min-clients.toml | 2 +- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/architectures/decentralized/testing/tests/integration_tests.rs b/architectures/decentralized/testing/tests/integration_tests.rs index 0d3b14c60..e54555f87 100644 --- a/architectures/decentralized/testing/tests/integration_tests.rs +++ b/architectures/decentralized/testing/tests/integration_tests.rs @@ -1029,7 +1029,6 @@ async fn test_lost_only_peer_go_back_to_hub_checkpoint() { #[test_log::test(tokio::test(flavor = "multi_thread"))] #[serial] async fn test_backup_data_provider() { - let run_id = "test".to_string(); let mut saw_provider_0_error = false; let mut successful_fetches_after_error = 0; let mut current_epoch = -1; diff --git a/config/solana-test/config-three-clients.toml b/config/solana-test/config-three-clients.toml index 932fccdd5..45a9763dc 100644 --- a/config/solana-test/config-three-clients.toml +++ b/config/solana-test/config-three-clients.toml @@ -16,7 +16,7 @@ architecture = "HfLlama" data_type = "Pretraining" max_seq_len = 2048 data_locations = [ - { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", 
filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } }, ] [model.LLM.checkpoint.Hub] diff --git a/config/solana-test/config.toml b/config/solana-test/config.toml index a8392b93c..284214dec 100644 --- a/config/solana-test/config.toml +++ b/config/solana-test/config.toml @@ -20,7 +20,7 @@ max_seq_len = 2048 cold_start_warmup_steps = 0 checkpoint = { Hub = { repo_id = "emozilla/llama2-1.1b-gqa-init" } } data_locations = [ - { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } }, ] [model.LLM.lr_schedule.Cosine] diff --git a/config/solana-test/light-config-dummy-failing.toml b/config/solana-test/light-config-dummy-failing.toml index a3d840ff1..d8c48ef89 100644 --- a/config/solana-test/light-config-dummy-failing.toml +++ b/config/solana-test/light-config-dummy-failing.toml @@ -19,8 +19,8 @@ data_type = "Pretraining" max_seq_len = 2048 cold_start_warmup_steps = 0 data_locations = [ - { Dummy = "Failing" }, - { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } + { Dummy = "Failing" }, + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } }, ] [model.LLM.checkpoint.Hub] diff --git a/config/solana-test/light-config.toml b/config/solana-test/light-config.toml index 829a40d0e..19e36a5e6 100644 --- a/config/solana-test/light-config.toml +++ b/config/solana-test/light-config.toml @@ -19,7 +19,7 @@ data_type = 
"Pretraining" max_seq_len = 2048 cold_start_warmup_steps = 0 data_locations = [ - { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } }, ] [model.LLM.checkpoint.Hub] diff --git a/config/solana-test/light-two-min-clients.toml b/config/solana-test/light-two-min-clients.toml index a770d0822..a51d12689 100644 --- a/config/solana-test/light-two-min-clients.toml +++ b/config/solana-test/light-two-min-clients.toml @@ -19,7 +19,7 @@ data_type = "Pretraining" max_seq_len = 2048 cold_start_warmup_steps = 0 data_locations = [ - { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } } + { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } }, ] [model.LLM.checkpoint.Hub] From 1878512345b681e1aae401a618b6190cb94e3b6d Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Thu, 19 Jun 2025 16:28:25 -0300 Subject: [PATCH 11/33] client: handle multiple data providers initialization in init.rs --- architectures/centralized/server/src/app.rs | 1 - shared/client/src/state/init.rs | 80 +++++++++++++++------ 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/architectures/centralized/server/src/app.rs b/architectures/centralized/server/src/app.rs index 389aaab0e..169e98455 100644 --- a/architectures/centralized/server/src/app.rs +++ b/architectures/centralized/server/src/app.rs @@ -184,7 +184,6 @@ impl App { checkpoint, .. 
}) => { - // If you fail with the 1st data provider at the start don't even bother trying with the fallbacks let data_location = data_locations.get(0).ok_or_else(|| anyhow!("No data location provided"))?; if let LLMTrainingDataType::Finetuning = data_type { panic!("Finetuning is not supported yet.") diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 236e302c7..8325f6a76 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -25,7 +25,7 @@ use tokio::{ sync::{mpsc::UnboundedSender, oneshot}, task::{JoinError, JoinHandle}, }; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use super::{ cooldown::CooldownStepMetadata, evals::EvalRunner, stats::StatsLogger, steps::StepStateMachine, @@ -158,44 +158,84 @@ impl RunInitConfigAndIO DataProvider::Server( - DataProviderTcpClient::connect( + LLMTrainingDataLocation::Server(data_server) => { + let client = match DataProviderTcpClient::connect( data_server.into(), init_config.network_identity.clone(), init_config.private_key.clone(), ) - .await?, - ), + .await + { + Ok(client) => client, + Err(e) => { + warn!("Failed to connect to data server at {}: {}", data_server, e); + continue; + } + }; + Some(DataProvider::Server(client)) + } LLMTrainingDataLocation::Local(_) => todo!(), - LLMTrainingDataLocation::Dummy(dummy_type) => DataProvider::Dummy( + LLMTrainingDataLocation::Dummy(dummy_type) => Some(DataProvider::Dummy( DummyDataProvider::new(TokenSize::TwoBytes, 2048, u64::MAX, *dummy_type), - ), + )), LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { location, token_size_in_bytes, shuffle, }) => { - let file_urls = FileURLs::from_location(location).await?; - DataProvider::Http(HttpDataProvider::new( - file_urls, - *token_size_in_bytes, - llm.max_seq_len, - *shuffle, - )?) 
+ if let Ok(file_urls) = FileURLs::from_location(location).await { + if let Ok(provider) = HttpDataProvider::new( + file_urls, + *token_size_in_bytes, + llm.max_seq_len, + *shuffle, + ) { + Some(DataProvider::Http(provider)) + } else { + warn!( + "Failed to create HTTP data provider for location: {:?}", + location + ); + None + } + } else { + warn!( + "Failed to create HTTP data provider for location: {:?}", + location + ); + None + } } LLMTrainingDataLocation::WeightedHttp(config_url) => { - DataProvider::WeightedHttp( + if let Ok(provider) = WeightedDataProvider::::from_config_url( &String::from(config_url), llm.max_seq_len, ) - .await?, - ) + .await + { + Some(DataProvider::WeightedHttp(provider)) + } else { + warn!( + "Failed to create Weighted HTTP data provider for config URL: {}", + config_url + ); + None + } } }; - data_providers.push(provider); + if let Some(provider) = provider { + data_providers.push(provider); + } + } + if data_providers.is_empty() { + Err(InitRunError::DataProviderConnect(anyhow::anyhow!( + "No valid data providers could be initialized." 
+ ))) + } else { + info!("Initialized {} data providers", data_providers.len()); + Ok(data_providers) } - Ok(data_providers) }; let model_future: JoinHandle> = match &llm.architecture @@ -489,7 +529,7 @@ impl RunInitConfigAndIO::new(data_providers, init_config.data_parallelism * 2); From a6fe9b8e9315d8416d53055e6d6c15dedef1c488 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Fri, 5 Dec 2025 18:53:25 -0300 Subject: [PATCH 12/33] Fix typos and correct old docs --- .../solana-client/src/backend.rs | 6 +- justfile | 4 - psyche-book/src/enduser/create-run.md | 6 +- psyche-book/src/explain/general-workflow.md | 89 ++++++++++++------- psyche-book/src/explain/glossary.md | 2 +- psyche-book/src/explain/index.md | 51 ++++++----- psyche-book/src/explain/model-sharing.md | 8 +- psyche-book/src/explain/rewards.md | 2 +- shared/client/src/state/train.rs | 6 +- shared/coordinator/src/committee_selection.rs | 2 +- 10 files changed, 102 insertions(+), 74 deletions(-) diff --git a/architectures/decentralized/solana-client/src/backend.rs b/architectures/decentralized/solana-client/src/backend.rs index 25eae27d0..ce4ea99a5 100644 --- a/architectures/decentralized/solana-client/src/backend.rs +++ b/architectures/decentralized/solana-client/src/backend.rs @@ -134,9 +134,9 @@ impl SolanaBackend { cluster: Cluster, backup_clusters: Vec, payer: Arc, - committment: CommitmentConfig, + commitment: CommitmentConfig, ) -> Result { - let client = Client::new_with_options(cluster.clone(), payer.clone(), committment); + let client = Client::new_with_options(cluster.clone(), payer.clone(), commitment); let mut program_coordinators = vec![]; program_coordinators.push(Arc::new(client.program(psyche_solana_coordinator::ID)?)); @@ -144,7 +144,7 @@ impl SolanaBackend { let backup_program_coordinators: Result, _> = backup_clusters .iter() .map(|cluster| { - Client::new_with_options(cluster.clone(), payer.clone(), committment) + Client::new_with_options(cluster.clone(), payer.clone(), commitment) 
.program(psyche_solana_coordinator::ID) }) .collect(); diff --git a/justfile b/justfile index 4fba7fe4e..85205e9b4 100644 --- a/justfile +++ b/justfile @@ -119,10 +119,6 @@ start-training-devnet-light-client run_id="test" *args='': solana-client-tests: cargo test --package psyche-solana-client --features solana-localnet-tests -# install deps for building mdbook -book_deps: - cargo install mdbook mdbook-mermaid mdbook-linkcheck - build_book output-dir="../book": generate_cli_docs mdbook build psyche-book -d {{ output-dir }} diff --git a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index bdc23d3dc..0c721826d 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -1,13 +1,13 @@ # Creating a run -To create a new training run and make it available for nodes to join, you'll need to create it, configure it, and unpause it. +To create a new training run and make it available for nodes to join, you'll need to create it, configure it, and unpause it. By default every new run stays in the pause state until being unpaused by the owner and can be paused anytime. -## Creating the account +## Creating the Coordinator account First, create the run on-chain. You'll need to provide: -- the RPC & websocket RPC urls so the client can communicate with an RPC node. +- The RPC & websocket RPC urls so the client can communicate with an RPC node. - a unique run ID - just a few characters to uniquely identify your run. 
- a name & description for your run diff --git a/psyche-book/src/explain/general-workflow.md b/psyche-book/src/explain/general-workflow.md index 8fd7a4a29..2333fb5f4 100644 --- a/psyche-book/src/explain/general-workflow.md +++ b/psyche-book/src/explain/general-workflow.md @@ -29,6 +29,8 @@ sequenceDiagram end ``` +In this case the backend is just the layer of the Clients that communicates with the Coordinator, depending on the run nature it will communicate with Solana blockchain or just via TCP to the coordinator. + ### Beginning an Epoch (state: WaitingForMembers) The Coordinator begins in the `WaitingForMembers` phase, with no clients connected. @@ -45,70 +47,98 @@ sequenceDiagram Note over Coordinator: Entering Warmup Client1->>Client2: Connect Client2->>Client1: Connect - Note over Coordinator: The Warmup countdown elapses - Note over Coordinator: Entering Training ``` ### Model Loading (state: Warmup) This phase is designed to let all clients download the model & load it onto their GPUs. -If a client has dropped whilst waiting for the warmup time, the Backend then removes the client from the Coordinator's clients list. - -If the number of clients falls below min_clients, the Coordinator goes back to the `WaitingForMembers` phase. +If a client has dropped whilst waiting for the warmup time to elapse, the Backend then removes the client from the Coordinator's clients list and in case the number of clients falls below `min_clients`, the Coordinator goes back to the `WaitingForMembers` phase and wait for more clients to join. -Once the `Warmup` time passes, the Coordinator loads all the information for the next training round and change its phase to `RoundTrain`. The Server will broadcast this `Training` Coordinator state to all clients. - -### Training (state: RoundTrain) +There's two different ways the coordinator will transition to the `RoundTrain` phase: -In this phase, the Coordinator provides a random seed. 
+- If all the participant clients have finished loading the model and are ready to start training, they send a specific message to the Coordinator and if the Coordinator receives this message from all clients, it transitions to the `RoundTrain` phase earlier.
+- If the `Warmup` max time passes, the Coordinator will transition to the `RoundTrain` phase even if not all clients have finished loading the model. This max time is configurable and can be set in the configuration file.
 
-Each client can use this seed, alongside the current round index and epoch index to determine which indices of the training data to use.
+The Backend will watch for the state transition and all clients will be notified of this new `Training` Coordinator state.
 
-Each client then proceeds to run the training on the selected training data.
-
-This state will end when clients later exchanges `Witness` messages.
+### Training (state: RoundTrain)
 
-#### Witnessing training results
+In this phase, the Coordinator provides a random seed; each client can use this seed, alongside the current round index and epoch index to determine which indices of the whole training batch they will train on. Basically every client will train on a different subset of the training data.
 
 As clients complete their training, they send their results to all other clients, including the Witnesses. The witnesses will each send a **witness proof** to the Coordinator, building towards a **witness quorum**. A witness proof contains a bloom filter describing which pieces of data the witness received training results for, and which clients did that work. Elected witnesses are responsible for creating these witness proofs and and sending them to the Coordinator.
 
-The witnesses for each round are chosen randomly from all the clients, using the same random seed as for data assignments. 
A witness will attempt to send an **opportunistic witness** message once it's seen a received a training result for every single batch in the current round.
-
-#### Witness Quorum
+The witnesses for each round are chosen randomly from all the clients, using the same random seed as for data assignments. A witness will attempt to send an **opportunistic witness** message once it's received a training result for every single batch in the current round. That message lets the Coordinator know that it can transition to the _Witness_ phase without waiting for the whole training time.
 
 The Coordinator advances the run from the _Training_ phase to the _Witness_ phase in one of two ways:
 
 - If enough witnesses observe all results and reach a **witness quorum** for the round, they notify the Coordinator that it is safe to advance. This process, named **opportunistic witnessing**, accelerates the transition to the _Witness_ phase, rather than having to wait a fixed time for training results.
 - If witnesses do not receive all required results from other clients before the maximum time specified for the _Training_ phase, the Coordinator will nonetheless transition to the _Witness_ phase after the maximum _Training_ time elapses.
 
+The Backend will watch for the state transition and all clients will be notified of this new `Witness` Coordinator state.
+
 ### Witness phase (state: RoundWitness)
 
 This phase exists to give the witnesses an opportunity to send their proofs to the Coordinator in the event that they have not received enough training results from other clients to have reached the quorum and send their proofs opportunistically. There is also brief slack period for non-witness nodes to catch up by downloading any remaining results they might have not received. 
-When the _Witness_ phase finishes via timeout, the Coordinator transitions from _Witness_ to the _Cooldown_ phase in three cases:
+When the _Witness_ phase finishes upon reaching the maximum witness time, the Coordinator transitions from _Witness_ to the _Training_ phase again in most cases; it only transitions to a new state known as _Cooldown_ in the following three cases:
 
 - If we are in the last round of the epoch.
 - If the clients have dropped to less than the minimum required by the config.
 - If the number of witnesses for the round is less than the quorum specified by the config.
 
-Any clients that have failed health checks will also be removed from the current epoch.
+Any clients that have failed [health checks](#health-checks) will also be removed from the current epoch.
 
 ### Cooldown phase (state: Cooldown)
 
-The _Cooldown_ phase is the last phase of an epoch, during which the Coordinator waits for either the _Cooldown_ period to elapse, or a checkpoint to have happened.
+The _Cooldown_ phase is the last phase of an epoch, during which the Coordinator waits for the _Cooldown_ period to elapse. At this point the clients will begin to do a new checkpoint of the model; this means saving the state of the model at that time to an external storage, such as Hugging Face.
 
-When the _Cooldown_ phase begins, the Coordinator resets the current model checkpoint state to `Checkpoint::P2P`, signifying that new joiners should download the latest copy of the model from the other participants.
+When the _Cooldown_ phase begins, the Coordinator also resets the current model checkpoint state to `Checkpoint::P2P`, indicating that new joiners should download the latest copy of the model from the other participants and not from the usual checkpoint.
 
-Upon exiting the _Cooldown_ phase, the Coordinator transitions to the next epoch, saving the previous epoch state, and moving back to the _WaitingForMembers_ phase. 
+Upon exiting the _Cooldown_ phase, the Coordinator transitions to the next epoch, saving the previous epoch state, and moving back to the _WaitingForMembers_ phase. All the clients that were participating in the previous epoch automatically join to the new epoch unless they exit manually. ### It all comes together -Here is an overview of the whole process from a high level perspective: +Here's is an overview of how the state of the run can change depending on the situation: + +```mermaid +%%{init: {'theme':'base', 'themeVariables': { 'fontSize':'35px'}}}%% +flowchart LR + WFM((Waiting For Members)) + W((Warmup)) + T((Training)) + WI((Witness)) + CD((Cooldown)) + a{Are enough clients to start} + b{All clients loaded the model} + c{Max warmup time passed} + d{Witness quorum reached} + e{Max training time passed} + f{All rounds completed} + + WFM --> a + a -->|Yes| W + a -->|No| WFM + b -->|Yes| T + b -->|No| c + W --> b + c -->|Yes| T + c -->|No| W + T --> d + d -->|Yes| WI + d -->|No| e + e -->|Yes| WI + WI --> f + f -->|Yes| CD + f -->|No| T + CD --> WFM +``` + +And this is how it fits with real the real clients and how they interact in each of the stages. The committee in this case is the structure that contains all the witness data for the round. ```mermaid sequenceDiagram @@ -127,7 +157,7 @@ sequenceDiagram Note over Client1: Train Note over Client2: Train Note over Client2: Fill bloom filters - Client2->>Backend: try send opportunistic witness + Client2->>Backend: send opportunistic witness Backend->>Coordinator: Witness message Note over Coordinator: Enough witnesses for round Coordinator->>Coordinator: Update state to RoundWitness @@ -151,17 +181,14 @@ A client also sends a list of other clients it considers unhealthy to the server In this Backend, the Coordinator is owned and ticked forwards by a Server that communicates via clients over TCP. -The Server's Coordinator is initially configured in `main.rs`. 
-It's loaded using the configuration file `state.toml`. +The Server's Coordinator is initially configured in the main file of the server. +It's loaded using the configuration a specific configuration file `state.toml` ```mermaid flowchart LR S[Server] --run--> A[App] S --new--> C[Coordinator] - C --run_id - init warmup - min clients - model--> A + C --Run config--> A ``` The Server uses some parts of the Coordinator configuration, like the data server configuration, if enabled, to boot up all the functionality it needs. @@ -192,6 +219,8 @@ flowchart LR ### Decentralized training flow +Here's a more detailed diagram including mostly every component involved in the Psyche training flow with a little more implementation details: + ```mermaid flowchart TD subgraph sg_solana["Solana"] diff --git a/psyche-book/src/explain/glossary.md b/psyche-book/src/explain/glossary.md index c57176116..6dfd313cd 100644 --- a/psyche-book/src/explain/glossary.md +++ b/psyche-book/src/explain/glossary.md @@ -39,7 +39,7 @@ The status of a `Client` as tracked by the `Coordinator`. Key states include `He **Commitment** A cryptographic hash (SHA-256) of a client's computational results for a given `Batch`. Submitting commitments allows the `Coordinator` and `Witnesses` to verify work was done without transferring the full results initially. -**Commitee** +**Committee** The particular role of a client in a given round. Can be one of `Trainer`, `Verifier` or `TieBreaker`. **Cooldown** diff --git a/psyche-book/src/explain/index.md b/psyche-book/src/explain/index.md index 46f0182e9..b4262e47c 100644 --- a/psyche-book/src/explain/index.md +++ b/psyche-book/src/explain/index.md @@ -8,6 +8,8 @@ The core system is composed of three main actors: - **[Data Provider](./data-provider.md)**: Each run requires training data. This data could be served by the Psyche Data Provider server, over HTTP, or loaded from local copies of a dataset. 
+Psyche provides two different implementations of the network, one for [decentralized](./general-workflow.md#decentralized-backend) runs that use the Solana Blockchain with the coordinator running in it and another for [centralized](./general-workflow.md#centralized-backend) runs that use the Coordinator as a regular TCP server and is mostly used to test local runs and as a dev-oriented tool.
+
 ## Sample topologies
 
 ```mermaid
@@ -29,7 +31,7 @@ and model snapshots
 
 ```mermaid
 ---
-title: Centralized Run, training data provided thru TCP data server
+title: Centralized Run, training data provided thru TCP data server.
 ---
 flowchart TB
     subgraph "Coordinator Server"
@@ -46,7 +48,7 @@ and model snapshots
 
 ## What constitutes a training run?
 
-The training process for a given model is divided into small steps that incrementally train the model in a coordinated manner. A training run is divided into **epochs**, where clients can join and leave the run, and **epochs** are further divided into **steps**, where the model is incrementally trained.
+The training process for a given model is divided into small steps that incrementally train the model in a coordinated manner. A training run is divided into **epochs**, where clients can join and leave the run, and **epochs** are further divided into **rounds** that will be further divided into **steps**, where the model is incrementally trained.
 
 During a training run, clients primarily perform three tasks:
 
 - **Training**: Train the model on assigned data.
 - **Witnessing**: Verify the liveness and correctness of other participants.
 - **Verifying**: Recompute and compare training results to identify and punish malicious participants.
 
+These three phases constitute a **round** of training and will loop until the **epoch** is completed. 
+ ## Waiting for Members & Warmup -At the start of an **epoch**, all clients have a window of time to join the run by requesting to be added by coordinator, and then connecting to the other participating clients. +At the start of an **epoch**, all clients have a window of time to join the run by requesting to be added by coordinator, and then connecting to the other participating clients. This state will be known as the _Waiting for Members_ phase. -Once a minimum threshold of clients has been met, the run will transition to the _Warmup_ phase and begin a countdown to allow connected clients to update their copy of the model, after which it will enter the _Training_ phase. +Once a minimum threshold of clients has been met, the run will transition to the _Warmup_ phase and begin a countdown to allow connected clients to update their copy of the model. To obtain a copy of the model, the Coordinator will either direct clients to a checkpoint uploaded somewhere like HuggingFace and they will have to download it from there or direct clients to [download the model from other clients](./model-sharing.md) via the p2p network. In the first epoch, all clients will download the model from HuggingFace and after that every new epoch, clients will download the model from other clients via the p2p network. -To obtain a copy of the model, the Coordinator will either direct clients to a checkpoint uploaded somewhere like: HuggingFace or direct clients to [download the model from other clients](./model-sharing.md) via the p2p network. +After the _Warmup_ phase ends, it will enter the _Training_ phase. ## Training -At the beginning of an **epoch**, after the _Warmup_ phase ends, clients are assigned specific tasks that require them to train the model on a portion of the data. +At the beginning of a **round**, either after the _Warmup_ or _Witness_ phase ends, clients are assigned specific tasks that require them to train the model on a portion of the data. 
The coordinator contains information that uniquely assigns pieces of training data to clients based on the current **round**. @@ -72,7 +76,7 @@ If clients have already been training (i.e., it is not the first round of the ep After completing the training on their assigned data, each client emits a p2p broadcast to all other clients containing their training results and a cryptographic commitment that binds them to those results. -As the training results are received from other clients, they are downloaded to be later incorporated into the current model. +As the training results are received from other clients, they are downloaded to be later incorporated into the current copy of the model of each client. ## Witnessing @@ -80,15 +84,13 @@ At the start of each round, one or more clients are randomly selected as witness These bloom filters are sent to the coordinator, which then combines them into a provable consensus of which results to apply to the model. -Once a witness quorum is reached, the coordinator advances to the _Training_ phase to allow all clients a brief window to download every training result. - -Once the _Witness_ phase concludes, the coordinator returns to the _Training_ phase. Clients are assigned new data, and the process repeats. After a predefined number of rounds, a _Cooldown_ round occurs, marking the end of an **epoch**. +Once a witness quorum is reached, the coordinator advances to the _Training_ phase to allow all clients a brief window to download every training result of the previous round, clients are assigned new data, and the process repeats. After a predefined number of rounds, a _Cooldown_ round occurs, marking the end of an **epoch**. ## The witness/train loop visualized Here's a high-level overview of the process. 
-Additional details exist, but this captures the overall flow of a single Round from an Epoch: +There's additional implementation details, but this captures the overall flow of a single Round in an Epoch: ```mermaid sequenceDiagram @@ -96,18 +98,21 @@ sequenceDiagram participant Client2 participant Coordinator participant Data Hosting - Client1 ->> Data Hosting: get_data - Client2 ->> Data Hosting: get_data - Coordinator ->> Client2: witness - Note over Client1: Train - Note over Client2: Train - Client1 ->> Client2: Send results - Client2 ->> Client1: Send results - Note over Client1: Download results - Note over Client2: Download results - Client2 ->> Coordinator: Send witness - Note over Coordinator: Quorum reached - Note over Coordinator: Starting Witness phase + loop Every round + Client1 ->> Data Hosting: get_data + Client2 ->> Data Hosting: get_data + Coordinator ->> Client2: witness + Note over Client1: Train + Note over Client2: Train + Client1 ->> Client2: Send results + Client2 ->> Client1: Send results + Note over Client1: Download results + Note over Client2: Download results + Client2 ->> Coordinator: Send witness + Note over Coordinator: Quorum reached + Note over Coordinator: Starting Witness phase + Note over Coordinator: Starting Training phase + end ``` ## Glossary diff --git a/psyche-book/src/explain/model-sharing.md b/psyche-book/src/explain/model-sharing.md index d8b8d322d..ba61ebb12 100644 --- a/psyche-book/src/explain/model-sharing.md +++ b/psyche-book/src/explain/model-sharing.md @@ -2,9 +2,7 @@ When an **epoch** starts, all clients must have an identical model to train with. -At the beginning of a run, all clients must download the model parameters, tokenizer configuration, and model configuration from HuggingFace, where the model must have been previously uploaded - -(TODO: add more details on uploading a model). 
+At the beginning of a run, all clients must download the model parameters, tokenizer configuration, and model configuration from HuggingFace, where the model must have been previously uploaded or updated. Each client will then modify their copy of the model by receiving new training results from other clients and applying them. This keeps everyone's copy of model identical within an **epoch** without an additional full synchronization step. @@ -16,11 +14,11 @@ To address this, we **checkpoint** the model at the end of an **epoch**, where c ## HuggingFace checkpoint -In this approach, a client or a set of clients are designated as the **checkpointers** for the run. These clients upload their copy of updated model to HuggingFace after each epoch, and send the URL for this checkpoint to the coordinator. When a new client joins the run, it retrieves the checkpoint URL from the coordinator, and connects to HuggingFace to download the latest copy of the model parameters and configuration files. +In this approach, a client or a set of clients can optionally run as **checkpointers** if they declare a checkpoint URL when joining the run. These clients upload their copy of updated model to HuggingFace after each epoch, and send the URL for this checkpoint to the coordinator. When a new client joins the run, it retrieves the checkpoint URL from the coordinator, and connects to HuggingFace to download the latest copy of the model parameters and configuration files. ## P2P checkpoint -In the peer-to-peer (P2P) approach, a new client synchronizes by obtaining the latest model directly from other peers. It receives the model information and parameters from any available peer, requesting a set of parameters for each layer from different clients. This process allows the client to assemble the latest model state and participate in the training without an explicit upload step to a central server occuring. 
+In the peer-to-peer (P2P) approach, a new client synchronizes by obtaining the latest model directly from other peers. It receives the model information and parameters from any available peer, requesting a set of parameters for each layer from different clients. This process allows the client to assemble the latest model state and participate in the training without an explicit upload step to a central server occurring. Here's an example of a P2P model sharing interaction: diff --git a/psyche-book/src/explain/rewards.md b/psyche-book/src/explain/rewards.md index bc8dda8e6..c644c1472 100644 --- a/psyche-book/src/explain/rewards.md +++ b/psyche-book/src/explain/rewards.md @@ -2,7 +2,7 @@ When clients participate in a training run, the `Coordinator` keeps track of the compute contributions. -Each client is rewarded at the end of an epoch if the client sucessfully completed the whole epoch. A pool of reward "points" is shared equally among all the finishing clients of a given epoch. The reward is accounted through a counter of `earned` "points" for each client. The points can then be used as proof of contribution in rewards mechanisms such as the `Treasurer` (see below) +Each client is rewarded at the end of an epoch if the client successfully completed the whole epoch. A pool of reward "points" is shared equally among all the finishing clients of a given epoch. The reward is accounted through a counter of `earned` "points" for each client. 
The points can then be used as proof of contribution in rewards mechanisms such as the `Treasurer` (see below) ## Run Treasurer, Compute Incentives diff --git a/shared/client/src/state/train.rs b/shared/client/src/state/train.rs index f30b6ebb3..c67ff6953 100644 --- a/shared/client/src/state/train.rs +++ b/shared/client/src/state/train.rs @@ -84,7 +84,7 @@ pub enum TrainError { #[error("Failed to send health checks, channel must be closed")] SendHealthChecks, - #[error("Healthcheck thread crashed")] + #[error("Health check thread crashed")] HealthCheckCrashed, #[error("Coordinator error: {0}")] @@ -241,7 +241,7 @@ impl TrainingStepMetadata round = round.height, epoch = epoch, index = client_index, - comittee_position = committee_proof.position, + committee_position = committee_proof.position, committee = %committee_proof.committee, witness_position = witness_proof.position, witness = %witness_proof.witness, @@ -682,7 +682,7 @@ fn start_sending_health_checks( } if !checks.is_empty() { - info!("Sending health check for following indicies: {:?}", checks); + info!("Sending health check for following indices: {:?}", checks); tx_health_check .send(checks) .map_err(|_| TrainError::SendHealthChecks) diff --git a/shared/coordinator/src/committee_selection.rs b/shared/coordinator/src/committee_selection.rs index f7240e0ca..1edc3c0f5 100644 --- a/shared/coordinator/src/committee_selection.rs +++ b/shared/coordinator/src/committee_selection.rs @@ -385,7 +385,7 @@ mod tests { } #[test] - fn test_invalid_comittee_selections() { + fn test_invalid_committee_selections() { // verification_percent > 100 assert!(CommitteeSelection::new(10, 5, 101, 100, 12345).is_err()); // total_nodes < tie_breaker_nodes From 063970e6ea3707cc505fc320da080517d1fe8678 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Tue, 9 Dec 2025 18:29:27 -0300 Subject: [PATCH 13/33] wip --- justfile | 11 ++-- .../src/development/running-onchain.md | 54 ++++++++++--------- psyche-book/src/development/setup.md | 49 
+++++++---------- psyche-book/src/explain/glossary.md | 2 +- 4 files changed, 54 insertions(+), 62 deletions(-) diff --git a/justfile b/justfile index 85205e9b4..abd1bd634 100644 --- a/justfile +++ b/justfile @@ -3,13 +3,8 @@ mod nix default: just --list -# format & lint-fix code -fmt: - echo "deprecated, use 'nix fmt' instead..." - sleep 5 - cargo clippy --fix --allow-staged --all-targets - cargo fmt - nixfmt . +check-client: + cargo run -p psyche-solana-client -- --help # spin up a local testnet local-testnet *args='': @@ -136,7 +131,7 @@ generate_cli_docs: run_docker_client *ARGS: just nix build_docker_solana_client - docker run -d {{ ARGS }} --gpus all psyche-prod-solana-client + docker run -d {{ ARGS }} --gpus all psyche-solana-client # Setup clients assigning one available GPU to each of them. diff --git a/psyche-book/src/development/running-onchain.md b/psyche-book/src/development/running-onchain.md index b0d356e77..c40d56d25 100644 --- a/psyche-book/src/development/running-onchain.md +++ b/psyche-book/src/development/running-onchain.md @@ -1,6 +1,6 @@ # Running Psyche on-chain -To build the Solana programs, you'll need a handful of Solana tools installed. See [the setup](./setup.md) if you're not using Nix. +To build the Solana programs, you'll need a handful of Solana tools installed. See [the setup](./setup.md) if you're not using Nix. If you're using Nix, make sure you are in the development environment by running `nix develop`. To start, you'll need to create a Solana wallet to fund your transactions. @@ -8,25 +8,34 @@ To start, you'll need to create a Solana wallet to fund your transactions. solana-keygen new ``` -## Run on a local validator (localnet) +By default the KeyPair will be generated in `~/.config/solana/id.json`. -In a new terminal, run the following command to: +## Run on a local validator (localnet) -- setup a `solana-test-validator` -- Deploy all the required programs -- Create a local run with name ``. 
If no run name is provided, the name `test` will be used by default.
 
 ```bash
 just setup-solana-localnet-test-run run_id=
 ```
 
+This will:
+
+- Setup a `solana-test-validator`
+- Deploy all the required programs (Coordinator and Authorizer)
+- Create a local run with name ``. If no run name is provided, the name `test` will be used by default. The run id should not exceed 32 characters; it will be truncated if it exceeds this limit.
+
 Then, in another terminal, run a client to train the test model and joining the run with name `RUN_ID`. If no run name is provided, the name `test` will be used by default.
 
 ```bash
 just start-training-localnet-client run_id=
 ```
 
-This will start a run to train a 1.1b parameter model with all the parallelism features enabled.
+This will start a run to train a 1.1b parameter model with all the parallelism features enabled. This Psyche client will use a temporary private key, which will be generated and deleted automatically when running the mentioned command. In case you want to check these keys, they will be stored in `~/solana-keys`. To run it with a specific private key, you can run the same command but adding the `WALLET_FILE` env var:
+
+```bash
+WALLET_FILE=/path/to/wallet.json just start-training-localnet-client run_id=
+```
+
 For a more lightweight run to avoid OOM errors, or just to use your hardware less, (we see you 8gb VRAM cards!) there's also:
 
 ```bash
@@ -34,38 +43,34 @@
 just setup-solana-localnet-light-test-run
 just start-training-localnet-light-client
 ```
 
-By default the client will use the private key generated by `solana-keygen new` above (located by default in `~/.config/solana/id.json`).
+This will train a 12m parameter model which should fit on most GPUs. 
-To spin up another client and join the run we'll have to create another keypair using:
+To spin up another client and join the run you can run the same command as before:
 
 ```bash
-solana-keygen new --outfile 
+just start-training-localnet-client run_id=
 ```
 
-and run the same `just` command but using the new created keypair:
+or
 
 ```bash
-WALLET_FILE= just start-training-localnet-client run_id=
+just start-training-localnet-light-client run_id=
 ```
 
-or:
-
-```bash
-WALLET_FILE= just start-training-localnet-light-client run_id=
-```
+Like before, this will create a temporary Solana keypair in `~/solana-keys`, which will be removed when the client is stopped.
 
 ## Run on Solana's Devnet
 
 You'll need to fund your wallet to make transactions on Devnet.
 
-You can [request an airdrop](https://faucet.solana.com/) from the Solana foundation of up to 10 devnet sol every 8 hours. Simply run
+You can [request an airdrop](https://faucet.solana.com/) from the Solana foundation of up to 10 devnet sol every 8 hours. To get your public key, run:
 
 ```bash
-solana-keygen pubkey 
+solana-keygen pubkey 
 ```
 
-and paste the resulting key into the airdrop website.
+If no path to a keypair is provided, it will use the default keypair located at `~/.config/solana/id.json`. Paste the resulting key into the airdrop website to get tokens.
 
-You can then use the same steps for deploying the programs, creating a run, and training on localnet above, but using following `just` commands:
+You can then use the same steps for deploying the programs, creating a run, and training on localnet above, but using the following `just` commands:
 
 ```bash
 just setup-solana-devnet-test-run
@@ -79,11 +84,12 @@
 just setup-solana-devnet-light-test-run
 just start-training-devnet-light-client
 ```
 
-## Regenerating program keypairs
+Remember to set the `WALLET_FILE` environment variable to the path of your Solana keypair file, since this will be the one with the devnet funds. 
-If you're developing things that change the structure of the program's accounts layout, deploying an update to the coordinator program will likely cause breakage with existing runs that have coordinator accounts already instantiated. +## Changing contracts -Any programs, including the Psyche website's indexer, will fail to read the content of the on-chain data if you use a new IDL with an old in-memory layout. +Psyche uses two main accounts that are deployed to Solana, the coordinator and the authorizer. +If you're developing things that change the structure of the program's accounts layout, deploying an update to the coordinator program will likely cause breakage with existing runs that have coordinator accounts already instantiated. Therefore, changes to the data structures that end up on-chain will require a deployment of a new coordinator program under a new ProgramID to prevent breakage of existing runs. diff --git a/psyche-book/src/development/setup.md b/psyche-book/src/development/setup.md index 8d1110681..7141c1734 100644 --- a/psyche-book/src/development/setup.md +++ b/psyche-book/src/development/setup.md @@ -25,23 +25,30 @@ To speed up your builds & your local dev shell, we recommend enabling the binary In order to use the cache that garnix provides, change your `nix.conf`, adding `https://cache.garnix.io` to substituters, and `cache.garnix.io:CTFPyKSLcx5RMJKfLo5EEPUObbA78b0YQ2DTCJXqr9g=` to `trusted-public-keys`. 
-If you've just installed Nix via the Determinite Systems installer above, you can do this by adding these lines to `/etc/nix/nix.conf`: +If you've just installed Nix via the Determinate Systems installer above, you can do this by adding these lines to `/etc/nix/nix.conf`: ```conf extra-substituters = https://cache.garnix.io extra-trusted-public-keys = cache.garnix.io:CTFPyKSLcx5RMJKfLo5EEPUObbA78b0YQ2DTCJXqr9g= ``` +#### Setup + +Each time you open a new shell in the Psyche directory, run `nix develop` to enter the Psyche development shell with all the necessary dependencies. + #### Setup Using `direnv` You can optionally use `direnv` to automatically enter a Nix environment when you `cd` into the Psyche folder. -Install `direnv` from your system's package manager. -After running `direnv allow` in the Psyche directory once, your terminal will automatically enter a development shell when you subsequently `cd` into the Psyche directory. +1. Install `direnv` from your system's package manager: -#### Setup Without `direnv` +- `sudo apt install direnv` on Debian-based systems +- `brew install direnv` on macOS -Each time you open a new shell in the Psyche directory, run `nix develop` to enter a development shell. +2. Run `nix profile install nixpkgs#nix-direnv` to install the `direnv` plugin in nix. +3. Run `echo "source ~/.nix-profile/share/nix-direnv/direnvrc" > ~/.direnvrc` to enable the plugin. +4. Add `eval "$(direnv hook bash)"` line to your shell configuration file (e.g., `~/.bashrc` or `~/.zshrc`). +5. Run `direnv allow` in the Psyche directory once, your terminal will automatically enter a development shell when you subsequently `cd` into the Psyche directory. #### Platform Differences @@ -139,41 +146,25 @@ OPENSSL_LIB_DIR = /lib/VC/x64/MT OPENSSL_INCLUDE_DIR = /include ``` -### Docker - -> requires Nix! - -Create a Docker image with the necessary dependencies to run a Psyche client: - -1. 
Install the necessary NVIDIA and CUDA drivers as explained in the previous sections. -2. Install the NVIDIA [container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). If using Ubuntu, just run: - -```bash -sudo apt-get update -sudo apt-get install -y nvidia-container-toolkit -``` - -3. Create an `.env` file following the `.env.example` in `psyche/config/client` and update the necessary environment variables. -4. Run `just nix build_docker_solana_client`. - ## Useful commands Psyche uses [`just`](https://github.com/casey/just) to run some common tasks. - -You can run `just` to see the whole list of commands! +You can run `just` to see the whole list of available commands! ### Running checks -> requires Nix! - ```bash -just check +just check-client ``` -If it passes, CI will pass. +Will run the psyche-solana-client package with the `--help` flag. If you see a list of commands, it means that the env compiles and can run the basic commands. ### Formatting +> requires Nix! + ```bash -just fmt +nix fmt ``` + +Format all the project files. diff --git a/psyche-book/src/explain/glossary.md b/psyche-book/src/explain/glossary.md index 6dfd313cd..e7ec2a6a5 100644 --- a/psyche-book/src/explain/glossary.md +++ b/psyche-book/src/explain/glossary.md @@ -46,7 +46,7 @@ The particular role of a client in a given round. Can be one of `Trainer`, `Veri A phase (`RunState` and `ActiveStep`) at the end of an `Epoch` where model `Checkpoints` are saved and the system prepares for the next epoch. **Coordinator** -The central orchestrator of the Psyche training system, implemented as a Solana program. It manages the training lifecycle (`RunState`), client participation (`ClientState`), data batch assignment, and `Witnessing`. +The central orchestrator of the Psyche training system, implemented as a Solana program. 
It manages the training life cycle (`RunState`), client participation (`ClientState`), data batch assignment, and `Witnessing`. **CoordinatorConfig** The set of parameters defining how a specific training run operates (e.g., `warmup_time`, `witness_quorum`, `rounds_per_epoch`). From 206b173af69b077c4b91e78fd3426995d928c934 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Tue, 9 Dec 2025 14:17:19 -0800 Subject: [PATCH 14/33] Fix localnet --- .../state.toml | 2 +- psyche-book/src/development/running-offchain.md | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/config/consilience-match-llama2-20m-fineweb-pretrain-dev/state.toml b/config/consilience-match-llama2-20m-fineweb-pretrain-dev/state.toml index e021957ac..f5cf8c726 100644 --- a/config/consilience-match-llama2-20m-fineweb-pretrain-dev/state.toml +++ b/config/consilience-match-llama2-20m-fineweb-pretrain-dev/state.toml @@ -1,4 +1,4 @@ -run_id = "consilience-match-llama2-20m-fineweb" +run_id = "consilience-llama2-20m-fineweb" run_state = "WaitingForMembers" [config] warmup_time = 30 diff --git a/psyche-book/src/development/running-offchain.md b/psyche-book/src/development/running-offchain.md index 7f4106323..be8105581 100644 --- a/psyche-book/src/development/running-offchain.md +++ b/psyche-book/src/development/running-offchain.md @@ -20,6 +20,13 @@ Since we want to run many clients and the server we'll need several terminal win ### Running +Since the local-testnet examples uses a local server to provide the data for the clients to train on, you'll need to download the data first. 
+The best way to do this is to install the HuggingFace CLI tool by running `curl -LsSf https://hf.co/cli/install.sh | bash`; once installed, just run the following command to get some random data and place it in the correct place for the local server to use it:
+
+```bash
+hf download emozilla/fineweb-10bt-tokenized-datatrove-llama2 --repo-type dataset --local-dir ./data/fineweb-10bt
+```
+
 A sample invocation that fires up 3 clients to train on a 20m model might look like this:
 
 ```bash
@@ -28,7 +35,9 @@ just local-testnet \
   --config-path ./config/consilience-match-llama2-20m-fineweb-pretrain-dev/
 ```
 
-There's a _lot_ of options to configure the local testnet. Check em out below!
+This will run a server locally that acts as the coordinator and 3 clients that will connect to the server and start training on the downloaded data. We'll talk about the configuration of the run later on, but this example will use the config located at `./config/consilience-match-llama2-20m-fineweb-pretrain-dev/state.toml`, where you can have a glimpse of the configuration options.
+
+There's a _lot_ of options to configure the local testnet. Check em out below to configure runs as you see fit:
Command-line options From 2214d2a45522484c88001c45be6ead30df32da3d Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 10 Dec 2025 13:24:02 -0300 Subject: [PATCH 15/33] Fix decentralized section --- justfile | 1 + psyche-book/src/development/python.md | 6 +++--- psyche-book/src/development/running-onchain.md | 11 +++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/justfile b/justfile index abd1bd634..3f99e297c 100644 --- a/justfile +++ b/justfile @@ -128,6 +128,7 @@ generate_cli_docs: cargo run -p psyche-centralized-server print-all-help --markdown > psyche-book/generated/cli/psyche-centralized-server.md cargo run -p psyche-centralized-local-testnet print-all-help --markdown > psyche-book/generated/cli/psyche-centralized-local-testnet.md cargo run -p psyche-sidecar print-all-help --markdown > psyche-book/generated/cli/psyche-sidecar.md + cargo run -p psyche-solana-client print-all-help --markdown > psyche-book/generated/cli/psyche-solana-client.md run_docker_client *ARGS: just nix build_docker_solana_client diff --git a/psyche-book/src/development/python.md b/psyche-book/src/development/python.md index 574061a9b..3b6e8384f 100644 --- a/psyche-book/src/development/python.md +++ b/psyche-book/src/development/python.md @@ -7,13 +7,13 @@ ## Overview -Psyche provides a Python integration that allows you to write modeling code in Python using libraries like [Hugging Face Transformers](https://github.com/huggingface/transformers) while leveraging Psyche's Rust core for training orchestration. This integration is designed for research where you want the flexibility of Python modeling with Psyche's training infrastructure, and production-scale training where you want to take advanted of highly optimized training frameworks already built in Python. 
+Psyche provides a Python integration that allows you to write modeling code in Python using libraries like [Hugging Face Transformers](https://github.com/huggingface/transformers) while leveraging Psyche's Rust core for training orchestration. This integration is designed for research where you want the flexibility of Python modeling with Psyche's training infrastructure, and production-scale training where you want to take advantage of highly optimized training frameworks already built in Python. The Python integration works through a "sidecar" process that Psyche spawns and communicates with during training. ## Development Setup -To develop with the Python integration, we have a Python development shell available. +To develop with the Python integration, we have a Nix development shell with Python available. This shell provides: @@ -24,7 +24,7 @@ This shell provides: ### Development Workflow -You can use `uv pip` to install arbitrary packages. Dependencies are tracked via `uv.lock`, so if you don't have `direnv` set up, you must exit and re-enter the development shell with `nix develop`. +You can use `uv pip` to install arbitrary packages. Dependencies are tracked via `uv.lock`, so if you don't have `direnv` set up, you must exit and re-enter the development shell with `nix develop .#dev-python`. When you enter the dev shell, it compiles the Rust extension that provides the `psyche` Python module. **If you modify any Rust code in the Python extension or its dependencies, you must exit and re-enter the dev shell** to recompile the extension. 
diff --git a/psyche-book/src/development/running-onchain.md b/psyche-book/src/development/running-onchain.md index c40d56d25..aa8009ca9 100644 --- a/psyche-book/src/development/running-onchain.md +++ b/psyche-book/src/development/running-onchain.md @@ -86,6 +86,17 @@ just start-training-devnet-light-client Remember to set the `WALLET_FILE` environment variable to the path of your Solana keypair file, since this will be the one with the devnet funds. +## Psyche decentralized client reference + +All the commands above will use the same package `psyche-solana-client` with specific parameters to be able to do a quick train on the local validator but it actually has a _lot_ of different configs to be able to test and run different scenarios. + +Here's a summary of all the available commands and options that can be used: + +
+ Command-line options + {{#include ../../generated/cli/psyche-solana-client.md}} +
+ ## Changing contracts Psyche uses two main accounts that are deployed to Solana, the coordinator and the authorizer. From db5341b821bb0ebdcba0b1b125939bff405c2f0b Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 10 Dec 2025 15:41:31 -0300 Subject: [PATCH 16/33] Fix python section --- psyche-book/src/development/python.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/psyche-book/src/development/python.md b/psyche-book/src/development/python.md index 3b6e8384f..62339776d 100644 --- a/psyche-book/src/development/python.md +++ b/psyche-book/src/development/python.md @@ -33,7 +33,7 @@ We recommend running commands directly through the dev shell without entering it For example, to run the `train` program using python: ```bash -nix develop .#dev-python --command cargo run --features python --example train -- \ +nix develop .#dev-python --command just train-model-python \ --model emozilla/llama2-20m-init \ --data-path ./data/fineweb-10bt/ \ --total-batch 2 \ @@ -41,7 +41,7 @@ nix develop .#dev-python --command cargo run --features python --example train - --python ``` -Alternatively, you _could_ enter the shell with +Alternatively, you _could_ enter the shell and run the commands with: ```bash nix develop .#dev-python @@ -62,21 +62,21 @@ When you use the `--python` flag, Psyche automatically spawns Python sidecar pro python -m psyche.sidecar --parent-pid --backend --init-method --world-size --rank ``` +By default only one sidecar using one GPU will be spawned, the amount will change depending on two different arguments `--data-parallelism` and `--tensor-parallelism`. The first one will spawned one entire copy of the model per GPU and the latter will split the model across multiple GPUs. The amount of sidecars spawned will be the product of these two arguments. Take into account that you will need `tensor_parallelism * data_parallelism` GPUs to run that amount of sidecars. 
+ +Here's an overview of the different options that the `psyche-sidecar` provides in case you want to test sidecars with different configurations. + +
+ Command-line options + {{#include ../../generated/cli/psyche-sidecar.md}} +
+ ## Testing Your Changes To test modifications to the Python integration: 1. **Modify the sidecar code** in the Python extension -2. **Run the training example**: - -```bash -nix develop .#dev-python --command cargo run --features python --example train -- \ - --model emozilla/llama2-20m-init \ - --data-path ./data/fineweb-10bt/ \ - --total-batch 2 \ - --micro-batch 1 \ - --python -``` +2. **Run the training example** with the same `just train-model-python` command we outlined earlier. ## How It Works From 48646fd4b3e68713dbceef6679cb403cc8cec1c9 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 10 Dec 2025 19:10:13 -0300 Subject: [PATCH 17/33] Fix latest development docs --- psyche-book/src/development/agenix.md | 2 +- psyche-book/src/development/book.md | 2 +- psyche-book/src/development/contributing.md | 4 ++-- psyche-book/src/explain/general-workflow.md | 6 +++--- psyche-book/src/explain/index.md | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/psyche-book/src/development/agenix.md b/psyche-book/src/development/agenix.md index dbda95634..56eff7b22 100644 --- a/psyche-book/src/development/agenix.md +++ b/psyche-book/src/development/agenix.md @@ -9,7 +9,7 @@ You can read more about agenix and how secrets are used in our deployment [HERE] ## What secrets do we store? ```nix -{{#include ../../generated/secrets.nix}} +{{#include ../../../secrets.nix}} ``` ## Editing a secret diff --git a/psyche-book/src/development/book.md b/psyche-book/src/development/book.md index f5d7865f4..cdac7fc5f 100644 --- a/psyche-book/src/development/book.md +++ b/psyche-book/src/development/book.md @@ -4,7 +4,7 @@ That's the document you're reading! :D ## Development -Simply run `just serve_book` to serve the book over http on localhost! +Simply run `just serve_book` to serve the book over http on localhost! This will also automatically rebuild the book when changes are made. 
## Building
 
diff --git a/psyche-book/src/development/contributing.md b/psyche-book/src/development/contributing.md
index 99ff15730..8bf4c0d4e 100644
--- a/psyche-book/src/development/contributing.md
+++ b/psyche-book/src/development/contributing.md
@@ -37,7 +37,7 @@ It's not a deal-breaker, but rebase makes us happy \<3
 ### Clean Linear History
 
 Rebasing creates a linear commit history without merges going back and forth, making it much easier to identify the place a change was made.
-Fixups in merge commits that introduce bugs are no longer associated with the original code, wheras with with rebase you'd find the bug as part of its original commit.
+Fix-ups in merge commits that introduce bugs are no longer associated with the original code, whereas with rebase you'd find the bug as part of its original commit.
 
 Merge commits add extra noise to the history without adding meaningful content about what changed.
 
@@ -62,4 +62,4 @@ While we advocate for rebase, **we do not advocate for squashing all commits**.
 - **Don't squash meaningful commits together** - this buries important changes in large diffs and loses the step-by-step narrative
 - **Don't use merge commits** within feature branches
-- **Don't include "fix up" or "oops" commits** in your final PR - these are fine to have during development, but before opening your PR, use `git commit --amend` or interactive rebase to clean these up. A typical rebase workflow is explained [in this blog post](https://simondosda.github.io/posts/2022-01-03-git-rebase-workflow.html). [git absorb](https://andrewlock.net/super-charging-git-rebase-with-git-absorb/) is also very useful for small fixups.
+- **Don't include "fix up" or "oops" commits** in your final PR - these are fine to have during development, but before opening your PR, use `git commit --amend` or interactive rebase to clean these up. A typical rebase workflow is explained [in this blog post](https://simondosda.github.io/posts/2022-01-03-git-rebase-workflow.html). 
[git absorb](https://andrewlock.net/super-charging-git-rebase-with-git-absorb/) is also very useful for small fix-ups. diff --git a/psyche-book/src/explain/general-workflow.md b/psyche-book/src/explain/general-workflow.md index 2333fb5f4..4ac8bfa63 100644 --- a/psyche-book/src/explain/general-workflow.md +++ b/psyche-book/src/explain/general-workflow.md @@ -118,7 +118,7 @@ flowchart LR c{Max warmup time passed} d{Witness quorum reached} e{Max training time passed} - f{All rounds completed} + f{End of the epoch reached} WFM --> a a -->|Yes| W @@ -163,8 +163,8 @@ sequenceDiagram Coordinator->>Coordinator: Update state to RoundWitness Note over Coordinator: Timeout round witness time alt step > total steps - Coordinator->>Coordinator: Update state to Waitingformembers - else height == rounds per epoch + Coordinator->>Coordinator: Update state to Finished + else current_epoch_time == max_time_per_epoch Coordinator->>Coordinator: Update state to Cooldown else Coordinator->>Coordinator: Update state to RoundTrain with step + 1 diff --git a/psyche-book/src/explain/index.md b/psyche-book/src/explain/index.md index b4262e47c..7e012123a 100644 --- a/psyche-book/src/explain/index.md +++ b/psyche-book/src/explain/index.md @@ -84,7 +84,7 @@ At the start of each round, one or more clients are randomly selected as witness These bloom filters are sent to the coordinator, which then combines them into a provable consensus of which results to apply to the model. -Once a witness quorum is reached, the coordinator advances to the _Training_ phase to allow all clients a brief window to download every training result of the previous round, clients are assigned new data, and the process repeats. After a predefined number of rounds, a _Cooldown_ round occurs, marking the end of an **epoch**. 
+Once a witness quorum is reached, the coordinator advances to the _Training_ phase to allow all clients a brief window to download every training result of the previous round, clients are assigned new data, and the process repeats. After a fixed amount of time, a _Cooldown_ round occurs, marking the end of an **epoch**. This time is configurable in the run creation process that we'll explore in the other sections. ## The witness/train loop visualized From b739d1afea39f5af00d89bc7abdc831713671d9e Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 10 Dec 2025 21:14:57 -0300 Subject: [PATCH 18/33] Improve create run docs --- justfile | 3 +++ psyche-book/src/enduser/create-run.md | 12 ++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/justfile b/justfile index 3f99e297c..675d824f5 100644 --- a/justfile +++ b/justfile @@ -18,6 +18,9 @@ integration-test test_name="": cargo test --release -p psyche-centralized-testing --test integration_tests -- --nocapture "{{ test_name }}"; \ fi +run_authorizer rpc="http://127.0.0.1:8899" grantor="~/.config/solana/id.json" grantee="11111111111111111111111111111111": + sh scripts/join-authorization-create.sh {{ rpc }} {{ grantor }} {{ grantee }} + # Determine whether to use Python support based on environment variable use_python := env("USE_PYTHON", "0") diff --git a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index 0c721826d..00ee98d99 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -18,22 +18,26 @@ Also, for all the commands you will need to provide the path to you Solana priva Before we can get started we need to decide who will be able to join the run. You can read more about [authorization here](./authentication.md). 
-We'll need a private key that manages join permissions, we'll call it: `join_authority.json` +We'll need a key-pair file that manages join permissions, it can be the default created by Solana when you do `solana-keygen new` located in `~/.config/solana/id.json` #### Join Authority for Public Runs If we're looking to make a permissionless run (anyone can join), we'll need to create an authorization that's valid for everyone. +Running: + ```sh -sh scripts/join-authorization-create.sh [RPC] join_authority.json 11111111111111111111111111111111 +just run_authorizer rpc= grantor= grantee= ``` +By default the `just run_authorizer` command will use the values needed to create an authorizer in a solana local validator using the default Solana key-pair mentioned above and with permissionless access. Basically everyone can join the run without restrictions. + #### Join Authority for Private Runs -If we'll only allow some users to join the run we'll need to create one authorization per user (each user can then set multiple delegate keys later) +If we'll only allow some users to join the run we'll need to create one authorization per user (each user can then set multiple delegate keys later) For example to use it locally we can do ```sh -sh scripts/join-authorization-create.sh [RPC] join_authority.json [MY_USER_PUBKEY] +just run_authorizer rpc= grantee= ``` ### Creating a run without token rewards From b77ba97ca8c4f5f8d784956f87e50599e3d59817 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 11 Dec 2025 08:42:30 -0800 Subject: [PATCH 19/33] Update create run section --- justfile | 6 ++- psyche-book/src/enduser/create-run.md | 58 +++++++++++++++++---------- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/justfile b/justfile index 675d824f5..0c7006dbf 100644 --- a/justfile +++ b/justfile @@ -18,7 +18,11 @@ integration-test test_name="": cargo test --release -p psyche-centralized-testing --test integration_tests -- --nocapture "{{ test_name }}"; \ fi 
-run_authorizer rpc="http://127.0.0.1:8899" grantor="~/.config/solana/id.json" grantee="11111111111111111111111111111111": +rpc := "http://127.0.0.1:8899" +grantor := "~/.config/solana/id.json" +grantee := "11111111111111111111111111111111" + +run_authorizer: sh scripts/join-authorization-create.sh {{ rpc }} {{ grantor }} {{ grantee }} # Determine whether to use Python support based on environment variable diff --git a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index 00ee98d99..da1018468 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -2,7 +2,7 @@ To create a new training run and make it available for nodes to join, you'll need to create it, configure it, and unpause it. By default every new run stays in the pause state until being unpaused by the owner and can be paused anytime. -## Creating the Coordinator account +## Setting up the Run First, create the run on-chain. You'll need to provide: @@ -13,7 +13,7 @@ You'll need to provide: Also, for all the commands you will need to provide the path to you Solana private key. -### Setup Joining Authorizations +### Setting up Join Authorizations Before we can get started we need to decide who will be able to join the run. You can read more about [authorization here](./authentication.md). @@ -27,10 +27,22 @@ If we're looking to make a permissionless run (anyone can join), we'll need to c Running: ```sh -just run_authorizer rpc= grantor= grantee= +just run_authorizer ``` -By default the `just run_authorizer` command will use the values needed to create an authorizer in a solana local validator using the default Solana key-pair mentioned above and with permissionless access. Basically everyone can join the run without restrictions. +By default the command will use the values needed to create an authorizer in a Solana localnet using the default Solana key-pair mentioned above and with permissionless access. 
Basically everyone can join the run without restrictions.
+
+There are three variables that this command can receive:
+
+- `rpc`: The RPC URL to use for the Solana network. By default: `http://127.0.0.1:8899`
+- `grantor`: The path to the file with a Solana Keypair that will be used to create the authorization and grant access to the run. By default: `~/.config/solana/id.json`
+- `grantee`: The public key of the user that will be granted access to the run. By default: `11111111111111111111111111111111`, which means the run is permissionless.
+
+You can override any of these values like this:
+
+```sh
+just rpc= grantor= grantee= run_authorizer
+```
 
 #### Join Authority for Private Runs
 
@@ -40,39 +52,43 @@ If we'll only allow some users to join the run we'll need to create one authoriz
 just run_authorizer rpc= grantee=
 ```
 
-### Creating a run without token rewards
+### Creating the run
 
-For a standard run without token incentive distribution layer
+> For all the following commands you can use the psyche client with the docker image or directly cloning the Psyche repo and running the package there using `cargo run --bin psyche-solana-client -- ...`.
+
+The run creation will accept a variety of different parameters; we'll go through the fundamentals first and then the rest of the options. Primarily a run needs the RPC of Solana depending on the validator we want to use, a unique identifier known as the `run id`, a join authority that will be the public key that will manage the access to the run (by default it will be the one that creates the run) and the private key of the wallet that will be used to create the run. 
+ +For a standard run without token incentive distribution layer (see [rewards](../explain/rewards.md) for more details) ```bash psyche-solana-client create-run \ --rpc [RPC] \ --run-id [RUN_ID] \ + --description "A description of your run" \ --join-authority [JOIN_AUTHORITY_PUBKEY] \ --wallet-private-key-path [JSON_PRIVATE_KEY_PATH] ``` -### Creating a run with token rewards - -For a run that distributes tokens as reward to the training participants, we need to specify the mint of the collateral token to be distributed: +For a run that distributes tokens as reward to the training participants, we need to specify the pubkey of the created token in the Solana Blockchain, this will be used as the mint of the collateral token to be distributed: ```bash psyche-solana-client create-run \ --rpc [RPC] \ --run-id [RUN_ID] \ + --description "A description of your run" \ --join-authority [JOIN_AUTHORITY_PUBKEY] \ --treasurer-collateral-mint [COLLATERAL_MINT_PUBKEY] \ --wallet-private-key-path [JSON_PRIVATE_KEY_PATH] ``` -## Initializing configuration +At that point we successfully created our run. -Then, set the run's config. -You'll need to provide: +### Initializing configuration + +At first the run will not hold any configuration on its behavior and will be paused so no client can join yet. -- the RPC & websocket RPC urls so the client can communicate with an RPC node. -- the run ID you previously used -- the path to a `config.toml` file, following the [run config schema](./run-config.md) +To set the run's config. +You'll need to provide mostly the same parameters as when creating the run and also the path to a `config.toml` file, that follows the [run config schema](./run-config.md). ```bash psyche-solana-client update-config \ @@ -82,7 +98,7 @@ psyche-solana-client update-config \ --wallet-private-key-path [JSON_PRIVATE_KEY_PATH] ``` -## Starting the training +### Unpausing the run At this point, your run is ready to go! 
 You can now set its state to "unpaused", and let clients join & begin training your model.
 
@@ -94,11 +110,11 @@ psyche-solana-client set-paused \
   --wallet-private-key-path [JSON_PRIVATE_KEY_PATH]
 ```
 
-Congratulations! As soon as your first client joins, your model is being trained.
+Congratulations! As soon as your first client joins, your model will start being trained.
 
 ## Configuring training rewards
 
-You can configure how many points does each client earns and loses for each epoch of training.
+If you created a run with rewards, you can configure how many points each client earns and loses for each epoch of training.
 
 ```bash
 psyche-solana-client set-future-epoch-rates \
@@ -109,8 +125,6 @@ psyche-solana-client set-future-epoch-rates \
   --wallet-private-key-path [JSON_PRIVATE_KEY_PATH]
 ```
 
-## Funding the run with collateral
-
 To distribute collateral to users, we need to periodically top-up the run's treasury so that points earned by users during compute can then be claimed against the treasury.
 
 ```sh
@@ -121,9 +135,9 @@ psyche-solana-client treasurer-top-up-rewards \
   --wallet-private-key-path [JSON_PRIVATE_KEY_PATH]
 ```
 
-## Inspect the content of a run
+## Getting information about a run
 
-Optionally you can get detailled technical information about a run that was previously created for troubleshooting purposes.
+Optionally you can get detailed technical information about a run that was previously created for troubleshooting purposes. 
```bash psyche-solana-client json-dump-run \ From e095f858f967e5c48de375161f65021206a68931 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 11 Dec 2025 10:36:27 -0800 Subject: [PATCH 20/33] Fix rest of the sections --- .../decentralized/solana-authorizer/README.md | 2 +- psyche-book/src/enduser/authentication.md | 34 +++---------------- psyche-book/src/enduser/client-faq.md | 1 + psyche-book/src/enduser/run-config.md | 13 +++++-- scripts/join-authorization-create.sh | 4 +-- 5 files changed, 19 insertions(+), 35 deletions(-) diff --git a/architectures/decentralized/solana-authorizer/README.md b/architectures/decentralized/solana-authorizer/README.md index 6fd3e32ba..9d1b96ea2 100644 --- a/architectures/decentralized/solana-authorizer/README.md +++ b/architectures/decentralized/solana-authorizer/README.md @@ -49,7 +49,7 @@ impl Authorization { The smart contract then exposes a set of instruction to manipulate those `Authorization` PDAs: -- `authoziation_create`, create a new PDA +- `authorization_create`, create a new PDA - `authorization_grantor_update`, allow the grantor to activate/deactivate the authorization - `authorization_grantee_update`, allow the grantee to add/remove delegates - `authorization_close` allow the grantor to remove the PDA diff --git a/psyche-book/src/enduser/authentication.md b/psyche-book/src/enduser/authentication.md index 209ca99f3..c3ca630a1 100644 --- a/psyche-book/src/enduser/authentication.md +++ b/psyche-book/src/enduser/authentication.md @@ -28,35 +28,11 @@ This is done through the following steps: 2. The `authorizer` (the grantee) sets a list of `delegate` keys that can join the run on its behalf 3. 
The `delegate` key then can join a run
 
-## Keys Authorizations
-
-Make sure to install the scripting dependencies:
-
-```bash
-sudo apt-get install jq
-cargo install solana_toolbox_cli
-```
-
-For the `join_authority` (the grantor) to issues new `authorization` a script is provided:
-
-```sh
-# We assume that "grantor.json" contains the Private Key of the "join_authority"
-# The "grantor.json" can be created using: $ solana-keygen new -o grantee.json
-# We assume that $GRANTEE_PUBKEY is set to the public key of the "authorizer" (or grantee)
-# The $GRANTEE_PUBKEY can be retrieved by using: $ solana-keygen pubkey grantee.json
-sh scripts/join-authorization-create.sh devnet grantor.json $GRANTEE_PUBKEY
-```
-
-For the `authorizer` (the grantee) to set a list of delegate, the following script is provided:
-
-```sh
-# We assume that $GRANTOR_PUBKEY is set to the public key of the "join_authority" of the run
-# The $GRANTOR_PUBKEY can be retrieved by using: $ solana-keygen pubkey grantor.json
-# We assume that "grantee.json" contains the Private Key of the "authorizer"
-# The "grantee.json" can be created using: $ solana-keygen new -o grantee.json
-# We assume that a set of keypairs exist at path: delegate1.json, delegate2.json, etc
-sh scripts/join-authorization-set-delegates.sh devnet $GRANTOR_PUBKEY grantee.json delegate*.json
-```
+## Permissionless Runs
+
+Permissionless runs are open to anyone without any `authorization` required. The owner of the run can set this for a run when creating it.
+
+To see how to create an authorization for a permissioned or permissionless run, see the [Authorization section](./create-run.md#setting-up-join-authorizations) in the create run guide. 
## Further information diff --git a/psyche-book/src/enduser/client-faq.md b/psyche-book/src/enduser/client-faq.md index 1cc64ad17..62d04dc13 100644 --- a/psyche-book/src/enduser/client-faq.md +++ b/psyche-book/src/enduser/client-faq.md @@ -15,5 +15,6 @@ - How do I update the client to the latest version? - You can force Docker to pull the latest image by running `docker pull nousresearch/psyche-client:latest` before running the client. - Do I need a Solana wallet to train? Does it need to have funds? + - Yes, even if you want to join a run that does not track rewards you will need a Solana wallet with funds to pay for the transactions to the coordinator. - Are the client and coordinator open-source? Can I report bugs? - Yes, you may check [Psyche's github repo](https://github.com/PsycheFoundation/psyche) diff --git a/psyche-book/src/enduser/run-config.md b/psyche-book/src/enduser/run-config.md index 299ab98ea..8440dd0a7 100644 --- a/psyche-book/src/enduser/run-config.md +++ b/psyche-book/src/enduser/run-config.md @@ -19,8 +19,8 @@ warmup_time = 30 # time, in seconds, to let nodes bring the model from the GPU to disk, and to opt to join the next round. cooldown_time = 30 -# how many training rounds in one "epoch", from warmup to cooldown. -rounds_per_epoch = 20 +# time, in seconds, that an epoch will last. +epoch_time = 60 # maximum time, in seconds, to allow nodes to train in one round. # this will limit the types of GPUs your model can be trained on, @@ -38,13 +38,14 @@ round_witness_time = 1 min_clients = 1 # minumum number of clients required before we transition from WaitingForMembers to Warmup. -# must be equal to or greater than min_clients +# must be equal to or greater than min_clients. init_min_clients = 1 # what percent of nodes are dedicated to verifying correctness. always set to 0 for now. verification_percent = 0 # how many nodes are selected each round to publish witness proofs +# Can be set to 0 to select all nodes as witnesses. 
witness_nodes = 1 # the total number of training data batches per-step. this also determines your maximum number of clients. @@ -63,17 +64,23 @@ total_steps = 25000 ```toml # so far only LLMs are supported. [model.LLM] +# Architecture of the model to train on can be HfLlama or HfDeepseek for now. +# If running with Python sidecars this must be set to HfAuto. architecture = "HfLlama" data_type = "Pretraining" max_seq_len = 2048 [model.LLM.checkpoint.Hub] +# Repo where the model is located in HugggingFace, will be used to download the model at the beginning of training. repo_id = "emozilla/llama2-20m-init" [model.LLM.data_location.Http] +# Token size in bytes, can be "TwoBytes" or "FourBytes" token_size_in_bytes = "TwoBytes" +# Shuffle or not tokens for training, can be "Seeded" with a seed value or "DontShuffle" shuffle = "DontShuffle" +# Data location to train on [model.LLM.data_location.Http.location.Gcp] bucket_name = "nous-pretraining-public-us" filter_directory = "fineweb-edu-tokenized-llama2" diff --git a/scripts/join-authorization-create.sh b/scripts/join-authorization-create.sh index 7b2e521f2..22c7cff44 100644 --- a/scripts/join-authorization-create.sh +++ b/scripts/join-authorization-create.sh @@ -54,7 +54,7 @@ AUTHORIZATION_CREATE_JSON=$( \ --args=params.scope:$PSYCHE_AUTH_SCOPE \ --execute ) -echo $AUTHORIZATION_CREATE_JSON | jq -r .outcome.explorer +echo $AUTHORIZATION_CREATE_JSON | jq -r .outcome.explorer_url echo "----" # Extract the authorization PDA from the JSON response @@ -72,5 +72,5 @@ AUTHORIZATION_ACTIVATE_JSON=$( \ authorization:$AUTHORIZATION_PUBKEY \ --execute ) -echo $AUTHORIZATION_ACTIVATE_JSON | jq -r .outcome.explorer +echo $AUTHORIZATION_ACTIVATE_JSON | jq -r .outcome.explorer_url echo "----" From 3882805a565f75bf7415b8547f1049f32792bb73 Mon Sep 17 00:00:00 2001 From: Nacho Avecilla Date: Thu, 11 Dec 2025 15:43:00 -0300 Subject: [PATCH 21/33] Update psyche-book/src/explain/index.md Co-authored-by: Dylan Socolobsky --- 
psyche-book/src/explain/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/psyche-book/src/explain/index.md b/psyche-book/src/explain/index.md index 7e012123a..16d6eb6aa 100644 --- a/psyche-book/src/explain/index.md +++ b/psyche-book/src/explain/index.md @@ -90,7 +90,7 @@ Once a witness quorum is reached, the coordinator advances to the _Training_ pha Here's a high-level overview of the process. -There's additional implementation details, but this captures the overall flow of a single Round in an Epoch: +There are additional implementation details, but this captures the overall flow of a single Round in an Epoch: ```mermaid sequenceDiagram From 5f928d57f257728b0b552b9046a8adcca6cf4d6d Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Sun, 14 Dec 2025 19:26:02 -0300 Subject: [PATCH 22/33] Improve deploy script and docs --- docker/README.md | 2 +- justfile | 43 +------- .../src/development/running-onchain.md | 52 +++++++--- psyche-book/src/enduser/create-run.md | 2 +- scripts/deploy-solana-test.sh | 97 ++++++++++++++++--- scripts/train-solana-test.sh | 4 +- telemetry/README.md | 2 +- website/README.md | 4 +- 8 files changed, 127 insertions(+), 79 deletions(-) diff --git a/docker/README.md b/docker/README.md index e125ea143..2377920e9 100644 --- a/docker/README.md +++ b/docker/README.md @@ -55,7 +55,7 @@ validator, build and deploy the Coordinator program and create a training run. 
For creating a lightweight run, you can use ```bash -just setup-solana-localnet-light-test-run +just dev setup-solana-localnet-light-test-run ``` When the Solana setup finishes, you will see a log saying diff --git a/justfile b/justfile index 0c7006dbf..55c16761b 100644 --- a/justfile +++ b/justfile @@ -1,4 +1,5 @@ mod nix +mod dev 'architectures/decentralized/justfile' default: just --list @@ -76,48 +77,6 @@ decentralized-chaos-integration-test test_name="": cargo test --release -p psyche-decentralized-testing --test chaos_tests -- --nocapture "{{ test_name }}"; \ fi -# Deploy coordinator on localnet and create a "test" run for 1.1b model. -setup-solana-localnet-test-run run_id="test" *args='': - RUN_ID={{ run_id }} ./scripts/setup-and-deploy-solana-test.sh {{ args }} - -# Deploy coordinator on localnet and create a "test" run for 20m model. -setup-solana-localnet-light-test-run run_id="test" *args='': - RUN_ID={{ run_id }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/setup-and-deploy-solana-test.sh {{ args }} - -# Start client for training on localnet. -start-training-localnet-client run_id="test" *args='': - RUN_ID={{ run_id }} ./scripts/train-solana-test.sh {{ args }} - -# Start client for training on localnet without data parallelism features and using light model. 
-start-training-localnet-light-client run_id="test" *args='': - RUN_ID={{ run_id }} BATCH_SIZE=1 DP=1 ./scripts/train-solana-test.sh {{ args }} - -OTLP_METRICS_URL := "http://localhost:4318/v1/metrics" -OTLP_LOGS_URL := "http://localhost:4318/v1/logs" - -# The same command as above but with arguments set to export telemetry data -start-training-localnet-light-client-telemetry run_id="test" *args='': - OTLP_METRICS_URL={{ OTLP_METRICS_URL }} OTLP_LOGS_URL={{ OTLP_LOGS_URL }} RUN_ID={{ run_id }} BATCH_SIZE=1 DP=1 ./scripts/train-solana-test.sh {{ args }} - -DEVNET_RPC := "https://api.devnet.solana.com" -DEVNET_WS_RPC := "wss://api.devnet.solana.com" - -# Deploy coordinator on Devnet and create a "test" run for 1.1b model. -setup-solana-devnet-test-run run_id="test" *args='': - RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} ./scripts/deploy-solana-test.sh {{ args }} - -# Deploy coordinator on Devnet and create a "test" run for 20m model. -setup-solana-devnet-light-test-run run_id="test" *args='': - RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/deploy-solana-test.sh {{ args }} - -# Start client for training on Devnet. -start-training-devnet-client run_id="test" *args='': - RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} ./scripts/train-solana-test.sh {{ args }} - -# Start client for training on localnet without data parallelism features and using light model. 
-start-training-devnet-light-client run_id="test" *args='': - RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} BATCH_SIZE=1 DP=1 ./scripts/train-solana-test.sh {{ args }} - solana-client-tests: cargo test --package psyche-solana-client --features solana-localnet-tests diff --git a/psyche-book/src/development/running-onchain.md b/psyche-book/src/development/running-onchain.md index aa8009ca9..8a1d1459a 100644 --- a/psyche-book/src/development/running-onchain.md +++ b/psyche-book/src/development/running-onchain.md @@ -12,10 +12,10 @@ By default the KeyPair will be generated in `~/.config/solana/id.json`. ## Run on a local validator (localnet) -In a new terminal, run the following command: +To quickly be able to test the decentralized training you can spin up a Solana Validator locally an fund your Solana wallet with fake tokens to be able to make transactions there. To setup a new training run with this tool, in a new terminal, run the following command: ```bash -just setup-solana-localnet-test-run run_id= +just dev setup-solana-localnet-test-run run_id= ``` This will: @@ -24,23 +24,23 @@ This will: - Deploy all the required programs (Coordinator and Authorizer) - Create a local run with name ``. If no run name is provided, the name `test` will be used by default. The run id should not exceed 32 characters, it will be truncated if it exceeds this limit. -Then, in another terminal, run a client to train the test model and joining the run with name `RUN_ID`. If no run name is provided, the name `test` will be used by default. +Then, in another terminal, run a client to train the test model and joining the run with name `RUN_ID`. ```bash -just start-training-localnet-client run_id= +just dev start-training-localnet-client run_id= ``` -This will start a run to train a 1.1b parameter model with all the parallelism features enabled. 
This Psyche client will use a temporal private key, which will be generated and deleted automatically running the mentioned command. In case you want to check these keys, they will be stored in `~/solana-keys`. To run it with a specific private key, you can run the same command but adding the `WALLET_FILE` env var: +This will start a run to train a 1.1b parameter model with all the parallelism features enabled. This Psyche client will use a temporal private key, which will be generated and deleted automatically running the mentioned command. In case you want to check these keys, they will be stored in `~/.config/solana/solana-keys`. To run it with a specific private key, you can run the same command but adding the `WALLET_FILE` env var: ```bash -WALLET_FILE=/path/to/wallet.json just start-training-localnet-client run_id= +WALLET_FILE=/path/to/wallet.json just dev start-training-localnet-client run_id= ``` For a more lightweight run to avoid OOM errors, or just to use your hardware less, (we see you 8gb VRAM cards!) there's also: ```bash -just setup-solana-localnet-light-test-run -just start-training-localnet-light-client +just dev setup-solana-localnet-light-test-run +just dev start-training-localnet-light-client ``` This will train a 12m which should fit on most GPUs. @@ -48,16 +48,16 @@ This will train a 12m which should fit on most GPUs. To spin up another client and join the run you can run the same command as before: ```bash -just start-training-localnet-client run_id= +just dev start-training-localnet-client run_id= ``` or ```bash -just start-training-localnet-light-client run_id= +just dev start-training-localnet-light-client run_id= ``` -Like before this will create a temporal solana keypair in `~/solana-keys` and be removed when the client is stopped. +This will create a new temporal solana keypair in `~/.config/solana/solana-keys` and be removed when the client is stopped so you can spawn as many clients you want. 
## Run on Solana's Devnet @@ -73,19 +73,39 @@ If no path to keypair is provided, it will use the default keypair located at `~ You can then use the same steps for deploying the programs, creating a run, and training on localnet above, but using the following `just` commands: ```bash -just setup-solana-devnet-test-run -just start-training-devnet-client +just dev setup-solana-devnet-test-run +just dev start-training-devnet-client ``` alongside the `-light` variants ```bash -just setup-solana-devnet-light-test-run -just start-training-devnet-light-client +just dev setup-solana-devnet-light-test-run +just dev start-training-devnet-light-client ``` Remember to set the `WALLET_FILE` environment variable to the path of your Solana keypair file, since this will be the one with the devnet funds. +These commands work almost the same as the ones using localnet, but they use the public Solana Devnet RPC endpoint (`https://api.devnet.solana.com`). Also, for all the programs (Coordinator, Authorizer, and Treasurer), we need to generate new program IDs, basically the “addresses” where the contracts will be deployed, since the current IDs are the ones used by the Psyche team for development and can’t be overridden. More details on how we update the program ids in the [changing contracts section](#changing-contracts). + +## Running run with rewards + +There's another program that adds a new layer to the whole Psyche run named the `Treasurer` when this program gets deployed it adds a whole new layer of rewards on top of the Coordinator that will calculate the amount of a specific token that a client will get for their training time. This contract is not mandatory to test a run it will only add the functionality in case we want to test the rewards. You can check a more in depth explanation on the [rewards section](../explain/rewards.md). 
+
+To be able to test this, all the commands we already mentioned also have a version with the treasurer usage, like:
+
+```bash
+# Localnet
+just dev setup-solana-localnet-test-run-treasurer
+just dev setup-solana-localnet-light-test-run-treasurer
+
+# Devnet
+just dev setup-solana-devnet-test-run-treasurer
+just dev setup-solana-devnet-light-test-run-treasurer
+```
+
+These commands will deploy the Treasurer along the other contracts and also create a new test token with the [SPL Token tool](https://solana.com/docs/tokens/basics) in the used network to be able to use it as collateral of in the run.
+
 ## Psyche decentralized client reference
 
 All the commands above will use the same package `psyche-solana-client` with specific parameters to be able to do a quick train on the local validator but it actually has a _lot_ of different configs to be able to test and run different scenarios.
@@ -99,7 +119,7 @@ Here's a summary of all the available commands and options that can be used:
 
 ## Changing contracts
 
-Psyche uses two main accounts that are deployed to Solana, the coordinator and the authorizer.
+Psyche uses two main accounts that are deployed to Solana, the coordinator and the authorizer, and one optional account, the Treasurer.
 
 If you're developing things that change the structure of the program's accounts layout, deploying an update to the coordinator program will likely cause breakage with existing runs that have coordinator accounts already instantiated. Therefore, changes to the data structures that end up on-chain will require a deployment of a new coordinator program under a new ProgramID to prevent breakage of existing runs.
 
diff --git a/psyche-book/src/enduser/create-run.md b/psyche-book/src/enduser/create-run.md index da1018468..da96be552 100644 --- a/psyche-book/src/enduser/create-run.md +++ b/psyche-book/src/enduser/create-run.md @@ -77,7 +77,7 @@ psyche-solana-client create-run \ --run-id [RUN_ID] \ --description "A description of your run" \ --join-authority [JOIN_AUTHORITY_PUBKEY] \ - --treasurer-collateral-mint [COLLATERAL_MINT_PUBKEY] \ + --treasurer-collateral-mint [TOKEN_PUBKEY] \ --wallet-private-key-path [JSON_PRIVATE_KEY_PATH] ``` diff --git a/scripts/deploy-solana-test.sh b/scripts/deploy-solana-test.sh index 5e1bc6ed6..385a7abdb 100755 --- a/scripts/deploy-solana-test.sh +++ b/scripts/deploy-solana-test.sh @@ -4,8 +4,19 @@ set -o errexit set -e set -m -# use the agenix provided wallet if you have it +# Parse command line arguments +DEPLOY_TREASURER=false +EXTRA_ARGS=() + +for arg in "$@"; do + if [[ "$arg" == "--treasurer" ]]; then + DEPLOY_TREASURER=true + else + EXTRA_ARGS+=("$arg") + fi +done +# use the agenix provided wallet if you have it if [[ -n "${devnet__keypair__wallet_PATH}" && -f "${devnet__keypair__wallet_PATH}" ]]; then DEFAULT_WALLET="${devnet__keypair__wallet_PATH}" else @@ -17,47 +28,102 @@ WS_RPC=${WS_RPC:-"ws://127.0.0.1:8900"} RUN_ID=${RUN_ID:-"test"} CONFIG_FILE=${CONFIG_FILE:-"./config/solana-test/config.toml"} +# Detect if we're deploying to devnet +IS_DEVNET=false +if [[ "$RPC" == *"devnet.solana.com"* ]]; then + IS_DEVNET=true +fi + echo -e "\n[+] deploy info:" echo -e "[+] WALLET_FILE = $WALLET_FILE" echo -e "[+] RPC = $RPC" echo -e "[+] WS_RPC = $WS_RPC" echo -e "[+] RUN_ID = $RUN_ID" echo -e "[+] CONFIG_FILE = $CONFIG_FILE" +echo -e "[+] IS_DEVNET = $IS_DEVNET" +echo -e "[+] DEPLOY_TREASURER = $DEPLOY_TREASURER" echo -e "[+] -----------------------------------------------------------" -echo -e "\n[+] Starting authorizor deploy" +# Deploy Coordinator +echo -e "\n[+] Starting coordinator deploy" +pushd 
$(pwd)/architectures/decentralized/solana-coordinator + +if [[ "$IS_DEVNET" == "true" ]]; then + echo -e "\n[+] - generating new keypair for devnet..." + solana-keygen new -o ./target/deploy/psyche_solana_coordinator-keypair.json -f --no-bip39-passphrase + anchor keys sync +fi + +echo -e "\n[+] - building..." +anchor build --no-idl + +echo -e "\n[+] - deploying..." +anchor deploy --provider.cluster devnet --provider.wallet ${WALLET_FILE} -- --max-len 500000 +sleep 1 + +echo -e "\n[+] Coordinator program deployed successfully!" +popd + +# Deploy Authorizer +echo -e "\n[+] Starting authorizer deploy" pushd architectures/decentralized/solana-authorizer +if [[ "$IS_DEVNET" == "true" ]]; then + echo -e "\n[+] - generating new keypair for devnet..." + solana-keygen new -o ./target/deploy/psyche_solana_authorizer-keypair.json -f --no-bip39-passphrase + anchor keys sync +fi + echo -e "\n[+] - building..." anchor build echo -e "\n[+] - deploying..." anchor deploy --provider.cluster ${RPC} --provider.wallet ${WALLET_FILE} -sleep 1 # wait for the program to be deployed and ready in the validator +sleep 1 echo -e "\n[+] - init-idl..." +AUTHORIZER_PUBKEY=$(solana-keygen pubkey ./target/deploy/psyche_solana_authorizer-keypair.json) anchor idl init \ --provider.cluster ${RPC} \ --provider.wallet ${WALLET_FILE} \ --filepath target/idl/psyche_solana_authorizer.json \ - PsyAUmhpmiUouWsnJdNGFSX8vZ6rWjXjgDPHsgqPGyw + ${AUTHORIZER_PUBKEY} echo -e "\n[+] Authorizer program deployed successfully!" popd -echo -e "\n[+] Starting coordinator deploy" -pushd architectures/decentralized/solana-coordinator +# Deploy Treasurer (if flag is set) +TREASURER_ARGS="" +if [[ "$DEPLOY_TREASURER" == "true" ]]; then + echo -e "\n[+] Starting treasurer deploy" + pushd architectures/decentralized/solana-treasurer -echo -e "\n[+] - building..." -anchor build --no-idl + if [[ "$IS_DEVNET" == "true" ]]; then + echo -e "\n[+] - generating new keypair for devnet..." 
+ solana-keygen new -o ./target/deploy/psyche_solana_treasurer-keypair.json -f --no-bip39-passphrase + anchor keys sync + fi -echo -e "\n[+] - deploying..." -anchor deploy --provider.cluster ${RPC} --provider.wallet ${WALLET_FILE} -- --max-len 500000 -sleep 1 # wait for the program to be deployed and ready in the validator + echo -e "\n[+] - building..." + anchor build -echo -e "\n[+] Coordinator program deployed successfully!" -popd + echo -e "\n[+] - deploying..." + anchor deploy --provider.cluster ${RPC} --provider.wallet ${WALLET_FILE} + sleep 1 + + echo -e "\n[+] Treasurer program deployed successfully!" + popd + + # Create token + echo -e "\n[+] Creating token" + TOKEN_ADDRESS=$(spl-token create-token --decimals 0 --url ${RPC} | grep "Address:" | awk '{print $2}') + spl-token create-account ${TOKEN_ADDRESS} --url ${RPC} + spl-token mint ${TOKEN_ADDRESS} 1000000 --url ${RPC} + TREASURER_ARGS="--treasurer-collateral-mint ${TOKEN_ADDRESS}" +fi + +# Create permisionless authorization echo -e "\n[+] Creating authorization for everyone to join the run" bash ./scripts/join-authorization-create.sh ${RPC} ${WALLET_FILE} 11111111111111111111111111111111 @@ -67,7 +133,10 @@ cargo run --release --bin psyche-solana-client -- \ --wallet-private-key-path ${WALLET_FILE} \ --rpc ${RPC} \ --ws-rpc ${WS_RPC} \ - --run-id ${RUN_ID} "$@" + --run-id ${RUN_ID} \ + --client-version test \ + ${TREASURER_ARGS} \ + "${EXTRA_ARGS[@]}" echo -e "\n[+] Update training run config..." cargo run --release --bin psyche-solana-client -- \ diff --git a/scripts/train-solana-test.sh b/scripts/train-solana-test.sh index c366fcc15..9f448d78a 100755 --- a/scripts/train-solana-test.sh +++ b/scripts/train-solana-test.sh @@ -8,8 +8,8 @@ if [[ -n "${devnet__keypair__wallet_PATH}" && -f "${devnet__keypair__wallet_PATH elif [[ -z "${WALLET_FILE:-}" ]]; then echo "No wallet file specified, generating ephemeral keypair..." 
# Create a named pipe for the keypair data - mkdir -p ~/solana-keys - WALLET_FILE=$(mktemp ~/solana-keys/solana-wallet-XXXXXXXXX) + mkdir -p ~/.config/solana/solana-keys + WALLET_FILE=$(mktemp ~/.config/solana/solana-keys/solana-wallet-XXXXXXXXX) # Generate keypair and write to the generated file solana-keygen new --no-bip39-passphrase --force --outfile "${WALLET_FILE}" diff --git a/telemetry/README.md b/telemetry/README.md index 8428a5446..fd7705645 100644 --- a/telemetry/README.md +++ b/telemetry/README.md @@ -17,5 +17,5 @@ OTLP_METRICS_URL = "http://localhost:4318/v1/metrics" # OpenTelemetry collector OTLP_LOGS_URL = "http://localhost:4318/v1/logs" # OpenTelemetry collector logs endpoint ``` -For convenience, you can run `just start-training-localnet-light-client-telemetry` to start the Psyche client with +For convenience, you can run `just dev start-training-localnet-light-client-telemetry` to start the Psyche client with the arguments already set for telemetry collection diff --git a/website/README.md b/website/README.md index 76b552011..0435a285a 100644 --- a/website/README.md +++ b/website/README.md @@ -44,8 +44,8 @@ Optional: ### setting up a localnet run for data 1. start `solana-test-validator --limit-ledger-size 10000000` in another terminal. -2. deploy a run to the localnet. locally, you probably want to use a small model, so do `just setup-solana-localnet-light-test-run RUN_ID --name "\"silly run name\"" --description "\"this is a test run set up locally. it's used for training a silly model.\"" --num-parameters 12345678` -3. start training your run! `just start-training-localnet-light-client RUN_ID` in another terminal. +2. deploy a run to the localnet. locally, you probably want to use a small model, so do `just dev setup-solana-localnet-light-test-run RUN_ID --name "\"silly run name\"" --description "\"this is a test run set up locally. it's used for training a silly model.\"" --num-parameters 12345678` +3. start training your run! 
`just dev start-training-localnet-light-client RUN_ID` in another terminal. ### running with the backend pointed to localnet From cf047c60c0f7858d15e8da5ba18005589ded10c0 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Mon, 15 Dec 2025 10:31:00 -0300 Subject: [PATCH 23/33] Add justfile for dev commands --- architectures/decentralized/justfile | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 architectures/decentralized/justfile diff --git a/architectures/decentralized/justfile b/architectures/decentralized/justfile new file mode 100644 index 000000000..af8a88770 --- /dev/null +++ b/architectures/decentralized/justfile @@ -0,0 +1,52 @@ +# Run these commands from the root + +set working-directory := '../../' + +default: + just --list + +# LOCALNET +setup-solana-localnet-test-run run_id="test" *args='': + RUN_ID={{ run_id }} ./scripts/deploy-solana-test.sh {{ args }} + +setup-solana-localnet-light-test-run run_id="test" *args='': + RUN_ID={{ run_id }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/deploy-solana-test.sh {{ args }} + +setup-solana-localnet-test-run-treasurer run_id="test" *args='': + RUN_ID={{ run_id }} ./scripts/deploy-solana-test.sh --treasurer {{ args }} + +setup-solana-localnet-light-test-run-treasurer run_id="test" *args='': + RUN_ID={{ run_id }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/deploy-solana-test.sh --treasurer {{ args }} + +start-training-localnet-client run_id="test" *args='': + RUN_ID={{ run_id }} ./scripts/train-solana-test.sh {{ args }} + +start-training-localnet-light-client run_id="test" *args='': + RUN_ID={{ run_id }} BATCH_SIZE=1 DP=1 ./scripts/train-solana-test.sh {{ args }} + +OTLP_METRICS_URL := "http://localhost:4318/v1/metrics" +OTLP_LOGS_URL := "http://localhost:4318/v1/logs" + +start-training-localnet-light-client-telemetry run_id="test" *args='': + OTLP_METRICS_URL={{ OTLP_METRICS_URL }} OTLP_LOGS_URL={{ OTLP_LOGS_URL }} RUN_ID={{ run_id }} BATCH_SIZE=1 
DP=1 ./scripts/train-solana-test.sh {{ args }} + +DEVNET_RPC := "https://api.devnet.solana.com" +DEVNET_WS_RPC := "wss://api.devnet.solana.com" + +setup-solana-devnet-test-run run_id="test" *args='': + RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} ./scripts/deploy-solana-test.sh {{ args }} + +setup-solana-devnet-light-test-run run_id="test" *args='': + RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/deploy-solana-test.sh {{ args }} + +setup-solana-devnet-test-run-treasurer run_id="test" *args='': + RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} ./scripts/deploy-solana-test.sh --treasurer {{ args }} + +setup-solana-devnet-light-test-run-treasurer run_id="test" *args='': + RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/deploy-solana-test.sh --treasurer {{ args }} + +start-training-devnet-client run_id="test" *args='': + RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} ./scripts/train-solana-test.sh {{ args }} + +start-training-devnet-light-client run_id="test" *args='': + RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} BATCH_SIZE=1 DP=1 ./scripts/train-solana-test.sh {{ args }} From b6b7e2ff14734bd03fd75b0d779b9dab2880c5f9 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Mon, 15 Dec 2025 08:15:25 -0800 Subject: [PATCH 24/33] Finish testing new commands --- architectures/decentralized/justfile | 18 ++++-- config/solana-test/light-config.toml | 2 +- .../src/development/running-onchain.md | 14 ++++- scripts/close-programs.sh | 9 +++ scripts/deploy-solana-test.sh | 59 ++++++++++--------- scripts/join-authorization-create.sh | 2 +- scripts/setup-and-deploy-solana-test.sh | 20 ++++++- 7 files changed, 86 insertions(+), 38 deletions(-) create mode 100755 scripts/close-programs.sh diff --git a/architectures/decentralized/justfile 
b/architectures/decentralized/justfile index af8a88770..28a3b1bae 100644 --- a/architectures/decentralized/justfile +++ b/architectures/decentralized/justfile @@ -2,21 +2,25 @@ set working-directory := '../../' +# In case a recipe is not found here, it will fallback to the root justfile. + +set fallback := true + default: just --list -# LOCALNET +# Localnet setup-solana-localnet-test-run run_id="test" *args='': - RUN_ID={{ run_id }} ./scripts/deploy-solana-test.sh {{ args }} + RUN_ID={{ run_id }} ./scripts/setup-and-deploy-solana-test.sh {{ args }} setup-solana-localnet-light-test-run run_id="test" *args='': - RUN_ID={{ run_id }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/deploy-solana-test.sh {{ args }} + RUN_ID={{ run_id }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/setup-and-deploy-solana-test.sh {{ args }} setup-solana-localnet-test-run-treasurer run_id="test" *args='': - RUN_ID={{ run_id }} ./scripts/deploy-solana-test.sh --treasurer {{ args }} + RUN_ID={{ run_id }} ./scripts/setup-and-deploy-solana-test.sh --treasurer {{ args }} setup-solana-localnet-light-test-run-treasurer run_id="test" *args='': - RUN_ID={{ run_id }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/deploy-solana-test.sh --treasurer {{ args }} + RUN_ID={{ run_id }} CONFIG_FILE=./config/solana-test/light-config.toml ./scripts/setup-and-deploy-solana-test.sh --treasurer {{ args }} start-training-localnet-client run_id="test" *args='': RUN_ID={{ run_id }} ./scripts/train-solana-test.sh {{ args }} @@ -33,6 +37,7 @@ start-training-localnet-light-client-telemetry run_id="test" *args='': DEVNET_RPC := "https://api.devnet.solana.com" DEVNET_WS_RPC := "wss://api.devnet.solana.com" +# Devnet setup-solana-devnet-test-run run_id="test" *args='': RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} ./scripts/deploy-solana-test.sh {{ args }} @@ -50,3 +55,6 @@ start-training-devnet-client run_id="test" *args='': 
start-training-devnet-light-client run_id="test" *args='': RUN_ID={{ run_id }} RPC={{ DEVNET_RPC }} WS_RPC={{ DEVNET_WS_RPC }} BATCH_SIZE=1 DP=1 ./scripts/train-solana-test.sh {{ args }} + +close-dev-programs: + ./scripts/close-programs.sh diff --git a/config/solana-test/light-config.toml b/config/solana-test/light-config.toml index 5de8ca8a4..eab015342 100644 --- a/config/solana-test/light-config.toml +++ b/config/solana-test/light-config.toml @@ -3,7 +3,7 @@ warmup_time = 30 cooldown_time = 30 epoch_time = 60 max_round_train_time = 15 -round_witness_time = 1 +round_witness_time = 5 min_clients = 1 init_min_clients = 1 verification_percent = 0 diff --git a/psyche-book/src/development/running-onchain.md b/psyche-book/src/development/running-onchain.md index 8a1d1459a..b4168967d 100644 --- a/psyche-book/src/development/running-onchain.md +++ b/psyche-book/src/development/running-onchain.md @@ -84,7 +84,7 @@ just dev setup-solana-devnet-light-test-run just dev start-training-devnet-light-client ``` -Remember to set the `WALLET_FILE` environment variable to the path of your Solana keypair file, since this will be the one with the devnet funds. +Remember to set the `WALLET_FILE` environment variable to the path of your Solana keypair file when running the training commands, since this will be the one with the devnet funds. These commands work almost the same as the ones using localnet, but they use the public Solana Devnet RPC endpoint (`https://api.devnet.solana.com`). Also, for all the programs (Coordinator, Authorizer, and Treasurer), we need to generate new program IDs, basically the “addresses” where the contracts will be deployed, since the current IDs are the ones used by the Psyche team for development and can’t be overridden. More details on how we update the program ids in the [changing contracts section](#changing-contracts). 
@@ -104,7 +104,17 @@ just dev setup-solana-devnet-test-run-treasurer just dev setup-solana-devnet-light-test-run-treasurer ``` -These commands will deploy the Treasurer along the other contracts and also create a new test token with the [SPL Token tool](https://solana.com/docs/tokens/basics) in the used network to be able to use it as collateral of in the run. +These commands will deploy the Treasurer along with the other contracts, create a new test token with the [SPL Token tool](https://solana.com/docs/tokens/basics) in the used network to be able to use it as collateral in the run, and also top up the run with the rewards and some collateral distribution for all the clients that train for more than one epoch. + +### Recovering dev tokens + +Most of the tokens used in devnet are needed to deploy the different contracts; you can reclaim those tokens once you have finished your testing. This is very useful since the devnet faucet for Solana is limited. You can run: + +```bash +just dev close-dev-programs +``` + +This will close all the deployed accounts in devnet, giving the tokens back to the wallet used for the deployment. Be aware that this is an irreversible change: once you close a program you cannot reuse that same program ID and will have to generate a new one. ## Psyche decentralized client reference diff --git a/scripts/close-programs.sh b/scripts/close-programs.sh new file mode 100755 index 000000000..b68b5925b --- /dev/null +++ b/scripts/close-programs.sh @@ -0,0 +1,9 @@ +echo -e "\n[+] Closing deployed Solana programs from devnet, are you sure? (y/N)" +read -r CONFIRMATION +if [[ "$CONFIRMATION" != "y" && "$CONFIRMATION" != "Y" ]]; then + echo "Aborting." 
+ exit 0 +fi +solana program close $(solana-keygen pubkey architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json) --bypass-warning --url devnet +solana program close $(solana-keygen pubkey architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json) --bypass-warning --url devnet +solana program close $(solana-keygen pubkey architectures/decentralized/solana-treasurer/target/deploy/psyche_solana_treasurer-keypair.json) --bypass-warning --url devnet diff --git a/scripts/deploy-solana-test.sh b/scripts/deploy-solana-test.sh index 4c65b07e8..01a10ec1b 100755 --- a/scripts/deploy-solana-test.sh +++ b/scripts/deploy-solana-test.sh @@ -44,21 +44,29 @@ echo -e "[+] IS_DEVNET = $IS_DEVNET" echo -e "[+] DEPLOY_TREASURER = $DEPLOY_TREASURER" echo -e "[+] -----------------------------------------------------------" -# Deploy Coordinator -echo -e "\n[+] Starting coordinator deploy" -pushd $(pwd)/architectures/decentralized/solana-coordinator - if [[ "$IS_DEVNET" == "true" ]]; then - echo -e "\n[+] - generating new keypair for devnet..." - solana-keygen new -o ./target/deploy/psyche_solana_coordinator-keypair.json -f --no-bip39-passphrase - anchor keys sync + echo -e "\n[+] - generating new keypairs for devnet..." 
+ solana-keygen new -o architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json -f --no-bip39-passphrase + solana-keygen new -o architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json -f --no-bip39-passphrase + if [[ "$DEPLOY_TREASURER" == "true" ]]; then + solana-keygen new -o architectures/decentralized/solana-treasurer/target/deploy/psyche_solana_treasurer-keypair.json -f --no-bip39-passphrase + fi + cd architectures/decentralized/solana-coordinator && anchor keys sync && cd - + cd architectures/decentralized/solana-authorizer && anchor keys sync && cd - + if [[ "$DEPLOY_TREASURER" == "true" ]]; then + cd architectures/decentralized/solana-treasurer && anchor keys sync && cd - + fi fi +# Deploy Coordinator +echo -e "\n[+] Starting coordinator deploy" +pushd architectures/decentralized/solana-coordinator + echo -e "\n[+] - building..." anchor build --no-idl echo -e "\n[+] - deploying..." -anchor deploy --provider.cluster devnet --provider.wallet ${WALLET_FILE} -- --max-len 500000 +anchor deploy --provider.cluster ${RPC} --provider.wallet ${WALLET_FILE} -- --max-len 500000 sleep 1 echo -e "\n[+] Coordinator program deployed successfully!" @@ -68,12 +76,6 @@ popd echo -e "\n[+] Starting authorizer deploy" pushd architectures/decentralized/solana-authorizer -if [[ "$IS_DEVNET" == "true" ]]; then - echo -e "\n[+] - generating new keypair for devnet..." - solana-keygen new -o ./target/deploy/psyche_solana_authorizer-keypair.json -f --no-bip39-passphrase - anchor keys sync -fi - echo -e "\n[+] - building..." anchor build @@ -98,12 +100,6 @@ if [[ "$DEPLOY_TREASURER" == "true" ]]; then echo -e "\n[+] Starting treasurer deploy" pushd architectures/decentralized/solana-treasurer - if [[ "$IS_DEVNET" == "true" ]]; then - echo -e "\n[+] - generating new keypair for devnet..." 
- solana-keygen new -o ./target/deploy/psyche_solana_treasurer-keypair.json -f --no-bip39-passphrase - anchor keys sync - fi - echo -e "\n[+] - building..." anchor build @@ -138,14 +134,21 @@ cargo run --release --bin psyche-solana-client -- \ ${TREASURER_ARGS} \ "${EXTRA_ARGS[@]}" -echo -e "\n[+] Setting training run earning rate..." -cargo run --release --bin psyche-solana-client -- \ - set-future-epoch-rates \ - --wallet-private-key-path ${WALLET_FILE} \ - --rpc ${RPC} \ - --ws-rpc ${WS_RPC} \ - --run-id ${RUN_ID} \ - --earning-rate-total-shared 100.0 +if [[ "$DEPLOY_TREASURER" == "true" ]]; then + echo -e "\n[+] Setting treasurer collateral requirements..." + cargo run --release --bin psyche-solana-client treasurer-top-up-rewards \ + --run-id ${RUN_ID} \ + --collateral-amount 10 \ + --wallet-private-key-path ${WALLET_FILE} \ + --rpc ${RPC} + + cargo run --release --bin psyche-solana-client -- set-future-epoch-rates \ + --rpc ${RPC} \ + --run-id ${RUN_ID} \ + --wallet-private-key-path ${WALLET_FILE} \ + --earning-rate-total-shared 10 \ + --slashing-rate-per-client 10 +fi echo -e "\n[+] Update training run config..." 
cargo run --release --bin psyche-solana-client -- \ diff --git a/scripts/join-authorization-create.sh b/scripts/join-authorization-create.sh index 22c7cff44..c02aebd6d 100644 --- a/scripts/join-authorization-create.sh +++ b/scripts/join-authorization-create.sh @@ -31,7 +31,7 @@ GRANTOR_PUBKEY=$(solana-keygen pubkey $GRANTOR_KEYPAIR_FILE) GRANTEE_PUBKEY="$1" shift -PSYCHE_AUTHORIZER_ID="PsyAUmhpmiUouWsnJdNGFSX8vZ6rWjXjgDPHsgqPGyw" +PSYCHE_AUTHORIZER_ID=$(solana-keygen pubkey architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json) PSYCHE_AUTH_SCOPE="utf8:CoordinatorJoinRun" # Make sure all is good to go diff --git a/scripts/setup-and-deploy-solana-test.sh b/scripts/setup-and-deploy-solana-test.sh index 873c3dbc4..c1c692f14 100755 --- a/scripts/setup-and-deploy-solana-test.sh +++ b/scripts/setup-and-deploy-solana-test.sh @@ -4,6 +4,18 @@ set -o errexit set -e set -m +# Parse command line arguments +DEPLOY_TREASURER=false +EXTRA_ARGS=() + +for arg in "$@"; do + if [[ "$arg" == "--treasurer" ]]; then + DEPLOY_TREASURER=true + else + EXTRA_ARGS+=("$arg") + fi +done + RPC=${RPC:-"http://127.0.0.1:8899"} CONFIG_FILE=${CONFIG_FILE:-"./config/solana-test/config.toml"} # use the agenix provided wallet if you have it @@ -27,7 +39,13 @@ echo -e "\n[+] Started test validator!" 
sleep 3 solana airdrop 10 --url ${RPC} --keypair ${WALLET_FILE} -CONFIG_FILE=${CONFIG_FILE} WALLET_FILE=${WALLET_FILE} ./scripts/deploy-solana-test.sh + +# Pass treasurer flag to deploy script if set +if [[ "$DEPLOY_TREASURER" == "true" ]]; then + CONFIG_FILE=${CONFIG_FILE} WALLET_FILE=${WALLET_FILE} ./scripts/deploy-solana-test.sh --treasurer "${EXTRA_ARGS[@]}" +else + CONFIG_FILE=${CONFIG_FILE} WALLET_FILE=${WALLET_FILE} ./scripts/deploy-solana-test.sh "${EXTRA_ARGS[@]}" +fi echo -e "\n[+] Testing Solana setup ready, starting Solana logs...\n" From 4dbc24462c68e697b3028137d77198836ee6a576 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Tue, 16 Dec 2025 13:45:59 -0300 Subject: [PATCH 25/33] Solana mem fix wip --- architectures/centralized/server/src/app.rs | 2 +- .../programs/solana-authorizer/src/lib.rs | 2 +- .../psyche_solana_authorizer-keypair.json | 7 +- .../src/command/json_dump_run.rs | 1 + .../src/command/update_config.rs | 67 +++++++++- .../solana-client/src/instructions.rs | 20 +++ .../decentralized/solana-client/src/main.rs | 24 ++++ .../solana-coordinator/src/instance_state.rs | 62 +++++++++ .../programs/solana-coordinator/src/lib.rs | 12 +- .../psyche_solana_coordinator-keypair.json | 2 +- .../suites/memnet_coordinator_full_round.rs | 2 +- .../suites/memnet_coordinator_rewards.rs | 8 +- .../suites/memnet_treasurer_create_update.rs | 8 +- .../suites/memnet_treasurer_full_epoch.rs | 2 +- .../programs/solana-treasurer/src/lib.rs | 2 +- scripts/deploy-solana-treasurer-test.sh | 121 ++++++++++++++++++ scripts/join-authorization-create.sh | 2 +- shared/client/src/state/init.rs | 5 +- shared/coordinator/src/coordinator.rs | 4 +- shared/coordinator/src/model.rs | 76 ++++++----- 20 files changed, 368 insertions(+), 61 deletions(-) create mode 100755 scripts/deploy-solana-treasurer-test.sh diff --git a/architectures/centralized/server/src/app.rs b/architectures/centralized/server/src/app.rs index 909477880..75a3827eb 100644 --- 
a/architectures/centralized/server/src/app.rs +++ b/architectures/centralized/server/src/app.rs @@ -177,10 +177,10 @@ impl App { let training_data_server = match &coordinator.model { Model::LLM(LLM { - data_locations, checkpoint, .. }) => { + let data_locations = &coordinator.data_locations; let data_location_server_urls:Vec<_> = data_locations.iter().filter_map(|l| match l {LLMTrainingDataLocation::Server(url) => Some(url.to_string()), _=> None}).collect(); if data_location_server_urls.is_empty() { diff --git a/architectures/decentralized/solana-authorizer/programs/solana-authorizer/src/lib.rs b/architectures/decentralized/solana-authorizer/programs/solana-authorizer/src/lib.rs index 0cbeaaad1..a23aa2a7d 100644 --- a/architectures/decentralized/solana-authorizer/programs/solana-authorizer/src/lib.rs +++ b/architectures/decentralized/solana-authorizer/programs/solana-authorizer/src/lib.rs @@ -4,7 +4,7 @@ pub mod state; use anchor_lang::prelude::*; use logic::*; -declare_id!("PsyAUmhpmiUouWsnJdNGFSX8vZ6rWjXjgDPHsgqPGyw"); +declare_id!("AbPtyECaUE9kYyyu5hiEa5LF1yffU7kwso2AnDJebvr1"); pub fn find_authorization( grantor: &Pubkey, diff --git a/architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json b/architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json index b406272a7..cefb6b946 100644 --- a/architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json +++ b/architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json @@ -1,6 +1 @@ -[ - 133, 168, 22, 30, 172, 129, 250, 222, 16, 179, 41, 22, 161, 2, 171, 87, 19, - 134, 196, 81, 67, 2, 234, 210, 48, 19, 234, 27, 192, 50, 161, 118, 5, 220, - 104, 106, 206, 33, 252, 167, 179, 12, 115, 187, 217, 59, 144, 210, 71, 193, - 163, 175, 67, 155, 20, 145, 152, 109, 251, 190, 151, 5, 10, 82 -] 
+[148,173,21,87,228,46,77,36,82,244,20,158,176,165,14,32,145,188,74,19,67,237,0,24,105,77,140,96,46,211,192,150,142,136,209,134,211,194,76,210,164,42,110,223,90,57,233,190,42,214,249,60,238,206,241,73,224,182,127,44,202,58,254,78] \ No newline at end of file diff --git a/architectures/decentralized/solana-client/src/command/json_dump_run.rs b/architectures/decentralized/solana-client/src/command/json_dump_run.rs index 90aabacef..6882e7078 100644 --- a/architectures/decentralized/solana-client/src/command/json_dump_run.rs +++ b/architectures/decentralized/solana-client/src/command/json_dump_run.rs @@ -105,6 +105,7 @@ pub async fn command_json_dump_run_execute( "setup": { "metadata": coordinator_account_state.state.metadata, "model": coordinator_account_state.state.coordinator.model, + "data_locations": coordinator_account_state.state.coordinator.data_locations, "config": coordinator_account_state.state.coordinator.config, }, "status": { diff --git a/architectures/decentralized/solana-client/src/command/update_config.rs b/architectures/decentralized/solana-client/src/command/update_config.rs index 250a7042c..4d85bb558 100644 --- a/architectures/decentralized/solana-client/src/command/update_config.rs +++ b/architectures/decentralized/solana-client/src/command/update_config.rs @@ -1,11 +1,13 @@ use std::path::PathBuf; +use anchor_client::anchor_lang::prelude::borsh; use anyhow::{Context, Result, bail}; use clap::Args; use psyche_coordinator::{ CoordinatorConfig, CoordinatorProgress, get_data_index_for_step, - model::{Checkpoint, Model}, + model::{Checkpoint, LLMDataLocations, LLMTrainingDataLocation, Model}, }; +use psyche_core::FixedVec; use psyche_solana_treasurer::logic::RunUpdateParams; use serde::{Deserialize, Serialize}; @@ -76,22 +78,60 @@ pub async fn command_update_config_execute( ), }); - let (config, mut model) = match config_path { + let (config, mut model, data_locations) = match config_path { Some(config_path) => { + #[derive(Serialize, Deserialize)] + 
struct ModelWrapper { + #[serde(flatten)] + pub model: Model, // This will deserialize the enum variant (LLM, etc.) + } + #[derive(Serialize, Deserialize)] struct State { pub config: CoordinatorConfig, - pub model: Model, + pub model: ModelWrapper, } + + // First, parse without data_locations to get the Model enum let state: State = toml::from_str(std::str::from_utf8( &std::fs::read(&config_path) .with_context(|| format!("failed to read config toml file {config_path:?}"))?, )?) .with_context(|| format!("failed to parse config toml file {config_path:?}"))?; - (Some(state.config), Some(state.model)) + // Then parse just the data_locations separately + #[derive(Serialize, Deserialize)] + struct DataLocationsWrapper { + pub data_locations: Vec, + } + + #[derive(Serialize, Deserialize)] + struct LLMSection { + #[serde(rename = "LLM")] + pub llm: DataLocationsWrapper, + } + + #[derive(Serialize, Deserialize)] + struct ModelSection { + pub model: LLMSection, + } + + let data_section: ModelSection = toml::from_str(std::str::from_utf8( + &std::fs::read(&config_path) + .with_context(|| format!("failed to read config toml file {config_path:?}"))?, + )?)?; + + let data_locs = LLMDataLocations { + data_locations: FixedVec::from_iter( + data_section.model.llm.data_locations.into_iter(), + ), + }; + + println!("DATA LOCS: {data_locs:#?}"); + + (Some(state.config), Some(state.model.model), Some(data_locs)) } - None => (None, None), + None => (None, None, None), }; model = if switch_to_hub { @@ -132,12 +172,17 @@ pub async fn command_update_config_execute( (metadata != coordinator_account_state.state.metadata).then_some(metadata) }; - let coordinator_update = - metadata.is_some() || config.is_some() || model.is_some() || progress.is_some(); + let coordinator_update = metadata.is_some() + || config.is_some() + || model.is_some() + || progress.is_some() + || data_locations.is_some(); if !coordinator_update && client_version.is_none() { bail!("this invocation would not update 
anything, bailing.") } + let serialized = borsh::to_vec(&data_locations)?; + println!("Serialized data_locations size: {} bytes", serialized.len()); let instructions = if let Some(treasurer_index) = backend .resolve_treasurer_index(&run_id, treasurer_index) .await? @@ -171,6 +216,12 @@ pub async fn command_update_config_execute( model, progress, )); + instructions.push(instructions::coordinator_update_data_locations( + &run_id, + &coordinator_account, + &main_authority, + data_locations.as_ref(), + )); } if let Some(client_version) = client_version.clone() { @@ -184,6 +235,7 @@ pub async fn command_update_config_execute( instructions }; + println!("SENDING INSTRUCTIONS: {instructions:#?}"); let signature = backend .send_and_retry("Update config", &instructions, &[]) .await?; @@ -192,6 +244,7 @@ pub async fn command_update_config_execute( println!(" - Metadata: {metadata:#?}"); println!(" - Config: {config:#?}"); println!(" - Model: {model:#?}"); + println!(" - Data locations: {data_locations:#?}"); println!(" - Progress: {progress:#?}"); println!(" - Client version: {client_version:#?}"); diff --git a/architectures/decentralized/solana-client/src/instructions.rs b/architectures/decentralized/solana-client/src/instructions.rs index e8a38531d..a8b985362 100644 --- a/architectures/decentralized/solana-client/src/instructions.rs +++ b/architectures/decentralized/solana-client/src/instructions.rs @@ -53,6 +53,26 @@ pub fn coordinator_close_run( ) } +pub fn coordinator_update_data_locations( + run_id: &str, + coordinator_account: &Pubkey, + main_authority: &Pubkey, + data_locations: Option<&psyche_coordinator::model::LLMDataLocations>, +) -> Instruction { + let coordinator_instance = psyche_solana_coordinator::find_coordinator_instance(run_id); + anchor_instruction( + psyche_solana_coordinator::ID, + psyche_solana_coordinator::accounts::OwnerCoordinatorAccounts { + authority: *main_authority, + coordinator_instance, + coordinator_account: *coordinator_account, + }, + 
psyche_solana_coordinator::instruction::UpdateDataLocations { + data_locations: data_locations.cloned(), + }, + ) +} + pub fn coordinator_update( run_id: &str, coordinator_account: &Pubkey, diff --git a/architectures/decentralized/solana-client/src/main.rs b/architectures/decentralized/solana-client/src/main.rs index 53f060861..132e84d1d 100644 --- a/architectures/decentralized/solana-client/src/main.rs +++ b/architectures/decentralized/solana-client/src/main.rs @@ -23,6 +23,8 @@ use crate::command::treasurer_top_up_rewards::CommandTreasurerTopUpRewardsParams use crate::command::treasurer_top_up_rewards::command_treasurer_top_up_rewards_execute; use crate::command::update_config::CommandUpdateConfigParams; use crate::command::update_config::command_update_config_execute; +// use crate::command::update_model::CommandUpdateModelParams; +// use crate::command::update_model::command_update_model_execute; use crate::{ app::{AppParams, TAB_NAMES, Tabs}, backend::SolanaBackend, @@ -121,6 +123,14 @@ enum Commands { #[clap(flatten)] params: CommandUpdateConfigParams, }, + // UpdateModel { + // #[clap(flatten)] + // cluster: ClusterArgs, + // #[clap(flatten)] + // wallet: WalletArgs, + // #[clap(flatten)] + // params: CommandUpdateModelParams, + // }, Tick { #[clap(flatten)] cluster: ClusterArgs, @@ -251,6 +261,9 @@ impl TryInto for WalletArgs { async fn async_main() -> Result<()> { let args = CliArgs::parse(); + let logger = psyche_tui::logging() + .with_output(LogOutput::Console) + .init()?; match args.command { Commands::ShowStaticP2PIdentity { @@ -308,6 +321,17 @@ async fn async_main() -> Result<()> { .unwrap(); command_update_config_execute(backend, params).await } + // Commands::UpdateModel { cluster, wallet, params } => { + // let key_pair: Arc = Arc::new(wallet.try_into()?); + // let backend = SolanaBackend::new( + // cluster.into(), + // vec![], + // key_pair.clone(), + // CommitmentConfig::confirmed(), + // ) + // .unwrap(); + // 
command_update_model_execute(backend, params).await + // } Commands::SetPaused { cluster, wallet, diff --git a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/instance_state.rs b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/instance_state.rs index 254588222..8de426506 100644 --- a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/instance_state.rs +++ b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/instance_state.rs @@ -10,7 +10,10 @@ use psyche_coordinator::RunState; use psyche_coordinator::SOLANA_MAX_STRING_LEN; use psyche_coordinator::TickResult; use psyche_coordinator::Witness; +use psyche_coordinator::model::HttpLLMTrainingDataLocation; +use psyche_coordinator::model::HttpTrainingDataLocation; use psyche_coordinator::model::HubRepo; +use psyche_coordinator::model::LLMTrainingDataLocation; use psyche_coordinator::model::Model; use psyche_core::FixedString; use psyche_core::SmallBoolean; @@ -192,6 +195,42 @@ impl CoordinatorInstanceState { return err!(ProgramError::ModelSanityCheckFailed); } + for data_location in self.coordinator.data_locations.iter() { + let bad_data_location = match data_location { + LLMTrainingDataLocation::Dummy(_) => false, + LLMTrainingDataLocation::Server(url) => url.is_empty(), + LLMTrainingDataLocation::Local(_) => false, + LLMTrainingDataLocation::Http( + HttpLLMTrainingDataLocation { location, .. }, + ) => match location { + HttpTrainingDataLocation::SingleUrl(url) => { + url.is_empty() + }, + HttpTrainingDataLocation::NumberedFiles { + url_template, + num_files, + .. + } => url_template.is_empty() || *num_files == 0, + HttpTrainingDataLocation::Gcp { + bucket_name, + .. 
+ } => bucket_name.is_empty(), + }, + LLMTrainingDataLocation::WeightedHttp(url) => { + url.is_empty() + }, + LLMTrainingDataLocation::Preprocessed(url) => { + url.is_empty() + }, + }; + if bad_data_location { + msg!( + "model check failed: bad LLM training data location." + ); + return err!(ProgramError::ModelSanityCheckFailed); + } + } + if self.coordinator.run_state == RunState::Uninitialized { // this is the only way to get out of RunState::Uninitialized // by doing this we force the sanity checks on the config and model @@ -275,6 +314,29 @@ impl CoordinatorInstanceState { Ok(()) } + pub fn update_data_locations( + &mut self, + data_locations: Option, + ) -> Result<()> { + if self.coordinator.run_state == RunState::Finished { + return err!(ProgramError::UpdateConfigFinished); + } else if !self.coordinator.halted() && data_locations.is_some() { + return err!(ProgramError::UpdateConfigNotHalted); + } + + if let Some(data_locations) = data_locations { + // if !data_locations.check() { + // return err!(ProgramError::ModelSanityCheckFailed); + // } + let _ = std::mem::replace( + &mut self.coordinator.data_locations, + data_locations, + ); + } + + Ok(()) + } + pub fn update( &mut self, metadata: Option, diff --git a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs index 39e4c57ad..3768a9c29 100644 --- a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs +++ b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs @@ -29,7 +29,7 @@ use ts_rs::TS; pub use crate::instance_state::RunMetadata; -declare_id!("4SHugWqSXwKE5fqDchkJcPEqnoZE22VYKtSTVm7axbT7"); +declare_id!("6DvXZnaJd2RHmmMVFZE8P4GpCdZXGYpvVda7LDVBUK7v"); pub const SOLANA_MAX_NUM_PENDING_CLIENTS: usize = SOLANA_MAX_NUM_CLIENTS; @@ -165,6 +165,7 @@ impl CoordinatorInstance { pub mod psyche_solana_coordinator { use super::*; + 
use psyche_coordinator::model::LLMDataLocations; use psyche_core::FixedString; pub fn init_coordinator( @@ -181,6 +182,15 @@ pub mod psyche_solana_coordinator { free_coordinator_processor(context, params) } + pub fn update_data_locations( + ctx: Context, + data_locations: Option, + ) -> Result<()> { + let mut account = ctx.accounts.coordinator_account.load_mut()?; + account.increment_nonce(); + account.state.update_data_locations(data_locations) + } + pub fn update( ctx: Context, metadata: Option, diff --git a/architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json b/architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json index e7a29e9b4..7bfe03c0c 100644 --- a/architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json +++ b/architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json @@ -1 +1 @@ -[64,238,5,158,112,133,38,180,4,62,68,219,46,236,189,68,44,131,70,134,229,152,44,218,72,233,162,120,147,52,99,51,51,13,179,3,249,169,215,84,254,219,157,144,170,99,145,211,144,51,17,103,241,3,92,148,244,17,156,198,157,197,61,26] \ No newline at end of file +[27,96,35,184,165,2,254,184,93,154,151,51,92,73,10,13,208,2,234,87,55,141,189,96,202,13,104,226,195,88,241,154,77,153,237,112,135,152,74,74,2,139,208,162,159,49,8,104,38,172,1,251,202,41,79,20,234,97,104,137,100,127,66,241] \ No newline at end of file diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs index 94460991b..9910f5e33 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs @@ -125,7 +125,7 @@ pub async fn run() { checkpoint: Checkpoint::Dummy(HubRepo::dummy()), 
max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - data_locations, + // data_locations, lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), optimizer: OptimizerDefinition::Distro { clip_grad_norm: None, diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs index f99afd43f..450852c8b 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs @@ -111,10 +111,10 @@ pub async fn run() { checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - data_locations: FixedVec::try_from_iter([ - LLMTrainingDataLocation::default(), - ]) - .unwrap(), + // data_locations: FixedVec::try_from_iter([ + // LLMTrainingDataLocation::default(), + // ]) + // .unwrap(), lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), optimizer: OptimizerDefinition::Distro { clip_grad_norm: None, diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs index e947f047c..83aab96ef 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs @@ -57,10 +57,10 @@ pub async fn run() { checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - data_locations: FixedVec::try_from_iter([ - LLMTrainingDataLocation::default(), - ]) - .unwrap(), + // data_locations: FixedVec::try_from_iter([ + // LLMTrainingDataLocation::default(), + // ]) + // .unwrap(), lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), 
optimizer: OptimizerDefinition::Distro { clip_grad_norm: None, diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs index 73897f6ab..4c9b9df0c 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs @@ -243,7 +243,7 @@ pub async fn run() { checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - data_locations, + // data_locations, lr_schedule: LearningRateSchedule::Constant( ConstantLR::default(), ), diff --git a/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/lib.rs b/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/lib.rs index 79137e1f9..0e37dfcea 100644 --- a/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/lib.rs +++ b/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/lib.rs @@ -4,7 +4,7 @@ pub mod state; use anchor_lang::prelude::*; use logic::*; -declare_id!("vVeH6Xd43HAScbxjVtvfwDGqBMaMvNDLsAxwM5WK1pG"); +declare_id!("DJwSz92yTRBUWxSDvcVB9ggoeWS6Jo59XUGMjLRDFJkC"); pub fn find_run(index: u64) -> Pubkey { Pubkey::find_program_address( diff --git a/scripts/deploy-solana-treasurer-test.sh b/scripts/deploy-solana-treasurer-test.sh new file mode 100755 index 000000000..a4ac75de2 --- /dev/null +++ b/scripts/deploy-solana-treasurer-test.sh @@ -0,0 +1,121 @@ +#!/usr/bin/env bash + +set -o errexit +set -e +set -m + +# use the agenix provided wallet if you have it +if [[ -n "${devnet__keypair__wallet_PATH}" && -f "${devnet__keypair__wallet_PATH}" ]]; then + DEFAULT_WALLET="${devnet__keypair__wallet_PATH}" +else + DEFAULT_WALLET="$HOME/.config/solana/id.json" +fi +WALLET_FILE=${KEY_FILE:-"$DEFAULT_WALLET"} 
+RPC=${RPC:-"http://127.0.0.1:8899"} +WS_RPC=${WS_RPC:-"ws://127.0.0.1:8900"} +RUN_ID=${RUN_ID:-"test"} +CONFIG_FILE=${CONFIG_FILE:-"./config/solana-test/config.toml"} + +echo -e "\n[+] deploy info:" +echo -e "[+] WALLET_FILE = $WALLET_FILE" +echo -e "[+] RPC = $RPC" +echo -e "[+] WS_RPC = $WS_RPC" +echo -e "[+] RUN_ID = $RUN_ID" +echo -e "[+] CONFIG_FILE = $CONFIG_FILE" +echo -e "[+] -----------------------------------------------------------" + +echo -e "\n[+] Starting coordinator deploy" +pushd architectures/decentralized/solana-coordinator +solana-keygen new -o ./target/deploy/psyche_solana_coordinator-keypair.json -f --no-bip39-passphrase +anchor keys sync + +echo -e "\n[+] - building..." +anchor build --no-idl + +echo -e "\n[+] - deploying..." +anchor deploy --provider.cluster devnet --provider.wallet ${WALLET_FILE} -- --max-len 500000 +sleep 1 # wait for the program to be deployed and ready in the validator + +echo -e "\n[+] Coordinator program deployed successfully!" +popd + +echo -e "\n[+] Starting authorizor deploy" +pushd architectures/decentralized/solana-authorizer + +solana-keygen new -o ./target/deploy/psyche_solana_authorizer-keypair.json -f --no-bip39-passphrase +anchor keys sync + +echo -e "\n[+] - building..." +anchor build + +echo -e "\n[+] - deploying..." +anchor deploy --provider.cluster devnet --provider.wallet ${WALLET_FILE} +sleep 1 # wait for the program to be deployed and ready in the validator + +echo -e "\n[+] - init-idl..." +anchor idl init \ + --provider.cluster devnet \ + --provider.wallet ${WALLET_FILE} \ + --filepath target/idl/psyche_solana_authorizer.json \ + $(solana-keygen pubkey ./target/deploy/psyche_solana_authorizer-keypair.json) + +echo -e "\n[+] Authorizer program deployed successfully!" 
+popd + +# echo -e "\n[+] Starting treasurer deploy" +# pushd architectures/decentralized/solana-treasurer +# solana-keygen new -o ./target/deploy/psyche_solana_treasurer-keypair.json -f --no-bip39-passphrase +# anchor keys sync + +# echo -e "\n[+] - building..." +# anchor build + +# echo -e "\n[+] - deploying..." +# anchor deploy --provider.cluster devnet --provider.wallet ${WALLET_FILE} +# sleep 1 # wait for the program to be deployed and ready in the validator +# echo -e "\n[+] Treasurer program deployed successfully!" +# popd + +echo -e "\n[+] Creating authorization for everyone to join the run" +bash ./scripts/join-authorization-create.sh "https://api.devnet.solana.com" ${WALLET_FILE} 11111111111111111111111111111111 + +# echo -e "\n[+] Creating token" +# TOKEN_ADDRESS=$(spl-token create-token --decimals 0 --url "https://api.devnet.solana.com" | grep "Address:" | awk '{print $2}') +# spl-token create-account ${TOKEN_ADDRESS} --url "https://api.devnet.solana.com" +# spl-token mint ${TOKEN_ADDRESS} 1000000 --url "https://api.devnet.solana.com" + +echo -e "\n[+] Creating training run..." +cargo run --release --bin psyche-solana-client -- \ + create-run \ + --wallet-private-key-path ${WALLET_FILE} \ + --rpc "https://api.devnet.solana.com" \ + --ws-rpc "wss://api.devnet.solana.com" \ + --client-version "test" \ + --run-id ${RUN_ID} "$@" + +echo -e "\n[+] Update training run config..." +cargo run --release --bin psyche-solana-client -- \ + update-config \ + --wallet-private-key-path ${WALLET_FILE} \ + --rpc "https://api.devnet.solana.com" \ + --ws-rpc "wss://api.devnet.solana.com" \ + --run-id ${RUN_ID} \ + --config-path ${CONFIG_FILE} + +# echo -e "\n[+] Update training run model..." 
+# cargo run --release --bin psyche-solana-client -- \ + # update-model \ + # --wallet-private-key-path ${WALLET_FILE} \ + # --rpc "https://api.devnet.solana.com" \ + # --ws-rpc "wss://api.devnet.solana.com" \ + # --run-id ${RUN_ID} \ + # --config-path ${CONFIG_FILE} + +# echo -e "\n[+] Unpause the training run..." +# cargo run --release --bin psyche-solana-client -- \ + # set-paused \ + # --wallet-private-key-path ${WALLET_FILE} \ + # --rpc "https://api.devnet.solana.com" \ + # --ws-rpc "wss://api.devnet.solana.com" \ + # --run-id ${RUN_ID} \ + # --resume diff --git a/scripts/join-authorization-create.sh b/scripts/join-authorization-create.sh index 7b2e521f2..fba7d8201 100644 --- a/scripts/join-authorization-create.sh +++ b/scripts/join-authorization-create.sh @@ -31,7 +31,7 @@ GRANTOR_PUBKEY=$(solana-keygen pubkey $GRANTOR_KEYPAIR_FILE) GRANTEE_PUBKEY="$1" shift -PSYCHE_AUTHORIZER_ID="PsyAUmhpmiUouWsnJdNGFSX8vZ6rWjXjgDPHsgqPGyw" +PSYCHE_AUTHORIZER_ID=$(solana-keygen pubkey ./architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json) PSYCHE_AUTH_SCOPE="utf8:CoordinatorJoinRun" # Make sure all is good to go diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 55f8cce8b..48a278332 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -193,14 +193,15 @@ impl RunInitConfigAndIO { let client = match DataProviderTcpClient::connect( diff --git a/shared/coordinator/src/coordinator.rs b/shared/coordinator/src/coordinator.rs index ead4e342f..993389897 100644 --- a/shared/coordinator/src/coordinator.rs +++ b/shared/coordinator/src/coordinator.rs @@ -1,6 +1,6 @@ use crate::{ Commitment, Committee, CommitteeProof, CommitteeSelection, WitnessProof, - model::{Checkpoint, HubRepo, Model}, + model::{Checkpoint, HubRepo, LLMDataLocations, Model}, }; use anchor_lang::{AnchorDeserialize, AnchorSerialize, InitSpace, prelude::borsh}; @@ -303,6 +303,8 @@ pub struct Coordinator { pub 
model: Model, + pub data_locations: LLMDataLocations, + pub config: CoordinatorConfig, #[serde(default)] diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index bff9aa036..2fcd03c25 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -1,3 +1,5 @@ +use std::path::Iter; + use crate::{SOLANA_MAX_STRING_LEN, coordinator::SOLANA_MAX_URL_STRING_LEN}; use anchor_lang::{ @@ -206,11 +208,28 @@ pub struct LLM { pub architecture: LLMArchitecture, pub checkpoint: Checkpoint, pub data_type: LLMTrainingDataType, - pub data_locations: FixedVec, pub lr_schedule: LearningRateSchedule, pub optimizer: OptimizerDefinition, } +#[derive( + AnchorSerialize, AnchorDeserialize, Serialize, Deserialize, Clone, Debug, Zeroable, Copy, TS, +)] +#[repr(C)] +pub struct LLMDataLocations { + pub data_locations: FixedVec, +} + +impl LLMDataLocations { + pub fn iter(&self) -> impl DoubleEndedIterator { + self.data_locations.iter() + } + + pub fn iter_mut(&mut self) -> impl DoubleEndedIterator { + self.data_locations.iter_mut() + } +} + impl LLM { pub fn dummy() -> Self { let mut data_locations: FixedVec = @@ -221,7 +240,6 @@ impl LLM { Self { architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), - data_locations, data_type: LLMTrainingDataType::Pretraining, lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), max_seq_len: 2048, @@ -299,33 +317,33 @@ impl Model { return false; } - for data_location in llm.data_locations.iter() { - let bad_data_location = match data_location { - LLMTrainingDataLocation::Dummy(_) => false, - LLMTrainingDataLocation::Server(url) => url.is_empty(), - LLMTrainingDataLocation::Local(_) => false, - LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { - location, - .. - }) => match location { - HttpTrainingDataLocation::SingleUrl(url) => url.is_empty(), - HttpTrainingDataLocation::NumberedFiles { - url_template, - num_files, - .. 
- } => url_template.is_empty() || *num_files == 0, - HttpTrainingDataLocation::Gcp { bucket_name, .. } => { - bucket_name.is_empty() - } - }, - LLMTrainingDataLocation::WeightedHttp(url) => url.is_empty(), - LLMTrainingDataLocation::Preprocessed(url) => url.is_empty(), - }; - if bad_data_location { - msg!("model check failed: bad LLM training data location."); - return false; - } - } + // for data_location in llm.data_locations.iter() { + // let bad_data_location = match data_location { + // LLMTrainingDataLocation::Dummy(_) => false, + // LLMTrainingDataLocation::Server(url) => url.is_empty(), + // LLMTrainingDataLocation::Local(_) => false, + // LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { + // location, + // .. + // }) => match location { + // HttpTrainingDataLocation::SingleUrl(url) => url.is_empty(), + // HttpTrainingDataLocation::NumberedFiles { + // url_template, + // num_files, + // .. + // } => url_template.is_empty() || *num_files == 0, + // HttpTrainingDataLocation::Gcp { bucket_name, .. 
} => { + // bucket_name.is_empty() + // } + // }, + // LLMTrainingDataLocation::WeightedHttp(url) => url.is_empty(), + // LLMTrainingDataLocation::Preprocessed(url) => url.is_empty(), + // }; + // if bad_data_location { + // msg!("model check failed: bad LLM training data location."); + // return false; + // } + // } let bad_checkpoint = match llm.checkpoint { Checkpoint::Dummy(_hub_repo) => false, From 38ddfc0682d156d91d17eec0ced7ddfbe9f89e56 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Fri, 19 Dec 2025 12:52:10 -0300 Subject: [PATCH 26/33] Send instructions for each data location --- .../src/command/set_future_epoch_rates.rs | 1 + .../solana-client/src/command/set_paused.rs | 1 + .../src/command/update_config.rs | 39 +++++++++++++++++-- .../solana-client/src/instructions.rs | 23 +++++++++-- .../decentralized/solana-client/src/main.rs | 3 -- .../solana-coordinator/src/instance_state.rs | 28 ++++++++++--- .../programs/solana-coordinator/src/lib.rs | 18 +++++++-- .../suites/memnet_treasurer_create_update.rs | 5 +-- .../suites/memnet_treasurer_full_epoch.rs | 2 +- .../solana-treasurer/src/logic/run_update.rs | 23 +++++++++++ shared/coordinator/src/model.rs | 5 +-- 11 files changed, 118 insertions(+), 30 deletions(-) diff --git a/architectures/decentralized/solana-client/src/command/set_future_epoch_rates.rs b/architectures/decentralized/solana-client/src/command/set_future_epoch_rates.rs index 9b188de57..3eecd541e 100644 --- a/architectures/decentralized/solana-client/src/command/set_future_epoch_rates.rs +++ b/architectures/decentralized/solana-client/src/command/set_future_epoch_rates.rs @@ -54,6 +54,7 @@ pub async fn command_set_future_epoch_rates_execute( epoch_slashing_rate_per_client: slashing_rate_per_client, paused: None, client_version: None, + data_location: None, }, ) } else { diff --git a/architectures/decentralized/solana-client/src/command/set_paused.rs b/architectures/decentralized/solana-client/src/command/set_paused.rs index 1479e2d27..1c97faf1d 
100644 --- a/architectures/decentralized/solana-client/src/command/set_paused.rs +++ b/architectures/decentralized/solana-client/src/command/set_paused.rs @@ -52,6 +52,7 @@ pub async fn command_set_paused_execute( epoch_slashing_rate_per_client: None, paused: Some(paused), client_version: None, + data_location: None, }, ) } else { diff --git a/architectures/decentralized/solana-client/src/command/update_config.rs b/architectures/decentralized/solana-client/src/command/update_config.rs index 4d85bb558..5f8b66008 100644 --- a/architectures/decentralized/solana-client/src/command/update_config.rs +++ b/architectures/decentralized/solana-client/src/command/update_config.rs @@ -187,7 +187,7 @@ pub async fn command_update_config_execute( .resolve_treasurer_index(&run_id, treasurer_index) .await? { - vec![instructions::treasurer_run_update( + let mut instructions = vec![instructions::treasurer_run_update( &run_id, treasurer_index, &coordinator_account, @@ -201,10 +201,34 @@ pub async fn command_update_config_execute( epoch_slashing_rate_per_client: None, paused: None, client_version: client_version.clone(), + data_location: None, }, - )] + )]; + if let Some(data_locations) = data_locations { + for dl in data_locations.data_locations.iter() { + instructions.push(instructions::treasurer_run_update( + &run_id, + treasurer_index, + &coordinator_account, + &main_authority, + RunUpdateParams { + metadata: None, + config: None, + model: None, + progress: None, + epoch_earning_rate_total_shared: None, + epoch_slashing_rate_per_client: None, + paused: None, + client_version: None, + data_location: Some(*dl), + }, + )); + } + } + instructions } else { let mut instructions = Vec::new(); + let data_locations_iter = data_locations.unwrap().iter().cloned().collect::>(); if coordinator_update { instructions.push(instructions::coordinator_update( @@ -216,12 +240,19 @@ pub async fn command_update_config_execute( model, progress, )); - 
instructions.push(instructions::coordinator_update_data_locations( + instructions.push(instructions::clear_data_locations( &run_id, &coordinator_account, &main_authority, - data_locations.as_ref(), )); + for dl in data_locations_iter.iter() { + instructions.push(instructions::coordinator_update_data_locations( + &run_id, + &coordinator_account, + &main_authority, + Some(*dl), + )); + } } if let Some(client_version) = client_version.clone() { diff --git a/architectures/decentralized/solana-client/src/instructions.rs b/architectures/decentralized/solana-client/src/instructions.rs index a8b985362..feb092a81 100644 --- a/architectures/decentralized/solana-client/src/instructions.rs +++ b/architectures/decentralized/solana-client/src/instructions.rs @@ -53,11 +53,10 @@ pub fn coordinator_close_run( ) } -pub fn coordinator_update_data_locations( +pub fn clear_data_locations( run_id: &str, coordinator_account: &Pubkey, main_authority: &Pubkey, - data_locations: Option<&psyche_coordinator::model::LLMDataLocations>, ) -> Instruction { let coordinator_instance = psyche_solana_coordinator::find_coordinator_instance(run_id); anchor_instruction( @@ -67,9 +66,25 @@ pub fn coordinator_update_data_locations( coordinator_instance, coordinator_account: *coordinator_account, }, - psyche_solana_coordinator::instruction::UpdateDataLocations { - data_locations: data_locations.cloned(), + psyche_solana_coordinator::instruction::ClearDataLocations {}, + ) +} + +pub fn coordinator_update_data_locations( + run_id: &str, + coordinator_account: &Pubkey, + main_authority: &Pubkey, + data_location: Option, +) -> Instruction { + let coordinator_instance = psyche_solana_coordinator::find_coordinator_instance(run_id); + anchor_instruction( + psyche_solana_coordinator::ID, + psyche_solana_coordinator::accounts::OwnerCoordinatorAccounts { + authority: *main_authority, + coordinator_instance, + coordinator_account: *coordinator_account, }, + psyche_solana_coordinator::instruction::UpdateDataLocations 
{ data_location }, ) } diff --git a/architectures/decentralized/solana-client/src/main.rs b/architectures/decentralized/solana-client/src/main.rs index 132e84d1d..e540ea763 100644 --- a/architectures/decentralized/solana-client/src/main.rs +++ b/architectures/decentralized/solana-client/src/main.rs @@ -261,9 +261,6 @@ impl TryInto for WalletArgs { async fn async_main() -> Result<()> { let args = CliArgs::parse(); - let logger = psyche_tui::logging() - .with_output(LogOutput::Console) - .init()?; match args.command { Commands::ShowStaticP2PIdentity { diff --git a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/instance_state.rs b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/instance_state.rs index 8de426506..436f1bb05 100644 --- a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/instance_state.rs +++ b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/instance_state.rs @@ -16,6 +16,7 @@ use psyche_coordinator::model::HubRepo; use psyche_coordinator::model::LLMTrainingDataLocation; use psyche_coordinator::model::Model; use psyche_core::FixedString; +use psyche_core::FixedVec; use psyche_core::SmallBoolean; use psyche_core::sha256v; use serde::Deserialize; @@ -316,18 +317,19 @@ impl CoordinatorInstanceState { pub fn update_data_locations( &mut self, - data_locations: Option, + data_location: Option< + psyche_coordinator::model::LLMTrainingDataLocation, + >, ) -> Result<()> { if self.coordinator.run_state == RunState::Finished { return err!(ProgramError::UpdateConfigFinished); - } else if !self.coordinator.halted() && data_locations.is_some() { + } else if !self.coordinator.halted() && data_location.is_some() { return err!(ProgramError::UpdateConfigNotHalted); } - if let Some(data_locations) = data_locations { - // if !data_locations.check() { - // return err!(ProgramError::ModelSanityCheckFailed); - // } + let mut data_locations = 
self.coordinator.data_locations; + if let Some(dl) = data_location { + let _ = data_locations.data_locations.push(dl); let _ = std::mem::replace( &mut self.coordinator.data_locations, data_locations, @@ -337,6 +339,20 @@ impl CoordinatorInstanceState { Ok(()) } + pub fn clear_data_locations(&mut self) -> Result<()> { + if self.coordinator.run_state == RunState::Finished { + return err!(ProgramError::UpdateConfigFinished); + } else if !self.coordinator.halted() { + return err!(ProgramError::UpdateConfigNotHalted); + } + let _ = std::mem::replace( + &mut self.coordinator.data_locations.data_locations, + FixedVec::new(), + ); + + Ok(()) + } + pub fn update( &mut self, metadata: Option, diff --git a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs index 3768a9c29..d6cc7376d 100644 --- a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs +++ b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs @@ -164,10 +164,10 @@ impl CoordinatorInstance { #[program] pub mod psyche_solana_coordinator { - use super::*; - use psyche_coordinator::model::LLMDataLocations; use psyche_core::FixedString; + use super::*; + pub fn init_coordinator( context: Context, params: InitCoordinatorParams, @@ -184,11 +184,21 @@ pub mod psyche_solana_coordinator { pub fn update_data_locations( ctx: Context, - data_locations: Option, + data_location: Option< + psyche_coordinator::model::LLMTrainingDataLocation, + >, + ) -> Result<()> { + let mut account = ctx.accounts.coordinator_account.load_mut()?; + account.increment_nonce(); + account.state.update_data_locations(data_location) + } + + pub fn clear_data_locations( + ctx: Context, ) -> Result<()> { let mut account = ctx.accounts.coordinator_account.load_mut()?; account.increment_nonce(); - account.state.update_data_locations(data_locations) + 
account.state.clear_data_locations() } pub fn update( diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs index 83aab96ef..eaa9d8e65 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs @@ -57,10 +57,6 @@ pub async fn run() { checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - // data_locations: FixedVec::try_from_iter([ - // LLMTrainingDataLocation::default(), - // ]) - // .unwrap(), lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), optimizer: OptimizerDefinition::Distro { clip_grad_norm: None, @@ -77,6 +73,7 @@ pub async fn run() { epoch_slashing_rate_per_client: None, paused: Some(false), client_version: None, + data_location: Some(LLMTrainingDataLocation::default()), }; // Prepare the collateral mint diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs index 4c9b9df0c..6d2ac8ec9 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs @@ -243,7 +243,6 @@ pub async fn run() { checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - // data_locations, lr_schedule: LearningRateSchedule::Constant( ConstantLR::default(), ), @@ -264,6 +263,7 @@ pub async fn run() { epoch_slashing_rate_per_client: None, paused: Some(false), client_version: None, + data_location: Some(data_locations[0].clone()), }, ) .await diff --git 
a/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/logic/run_update.rs b/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/logic/run_update.rs index b1f4e02bc..ff9c55592 100644 --- a/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/logic/run_update.rs +++ b/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/logic/run_update.rs @@ -1,6 +1,7 @@ use anchor_lang::prelude::*; use psyche_coordinator::CoordinatorConfig; use psyche_coordinator::CoordinatorProgress; +use psyche_coordinator::model::LLMTrainingDataLocation; use psyche_coordinator::model::Model; use psyche_solana_coordinator::CoordinatorAccount; use psyche_solana_coordinator::CoordinatorInstance; @@ -47,6 +48,7 @@ pub struct RunUpdateParams { pub epoch_slashing_rate_per_client: Option, pub paused: Option, pub client_version: Option, + pub data_location: Option, } pub fn run_update_processor( @@ -130,6 +132,27 @@ pub fn run_update_processor( )?; } + if let Some(data_location) = params.data_location { + psyche_solana_coordinator::cpi::update_data_locations( + CpiContext::new( + context.accounts.coordinator_program.to_account_info(), + OwnerCoordinatorAccounts { + authority: context.accounts.run.to_account_info(), + coordinator_instance: context + .accounts + .coordinator_instance + .to_account_info(), + coordinator_account: context + .accounts + .coordinator_account + .to_account_info(), + }, + ) + .with_signer(run_signer_seeds), + Some(data_location), + )?; + } + if let Some(client_version) = params.client_version { update_client_version( CpiContext::new( diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index 2fcd03c25..012fbb983 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -1,5 +1,3 @@ -use std::path::Iter; - use crate::{SOLANA_MAX_STRING_LEN, coordinator::SOLANA_MAX_URL_STRING_LEN}; use anchor_lang::{ @@ -179,7 +177,6 @@ impl 
LLMTrainingDataLocationAndWeight { TS, )] #[repr(C)] -#[allow(clippy::large_enum_variant)] pub enum HttpTrainingDataLocation { SingleUrl(FixedString<{ SOLANA_MAX_URL_STRING_LEN }>), NumberedFiles { @@ -196,7 +193,7 @@ pub enum HttpTrainingDataLocation { }, } -pub const MAX_DATA_LOCATIONS: usize = 4; +pub const MAX_DATA_LOCATIONS: usize = 3; #[derive( AnchorSerialize, AnchorDeserialize, Serialize, Deserialize, Clone, Debug, Zeroable, Copy, TS, From b010e1cc75375671c40fab1676866ab5721d5691 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Fri, 19 Dec 2025 15:40:00 -0300 Subject: [PATCH 27/33] Fix clippy and remove commented code --- .../programs/solana-authorizer/src/lib.rs | 2 +- .../psyche_solana_authorizer-keypair.json | 7 ++++- .../src/command/update_config.rs | 6 ++-- .../decentralized/solana-client/src/main.rs | 21 -------------- .../programs/solana-coordinator/src/lib.rs | 2 +- .../psyche_solana_coordinator-keypair.json | 2 +- .../suites/memnet_coordinator_full_round.rs | 1 - shared/coordinator/src/model.rs | 28 ------------------- 8 files changed, 12 insertions(+), 57 deletions(-) diff --git a/architectures/decentralized/solana-authorizer/programs/solana-authorizer/src/lib.rs b/architectures/decentralized/solana-authorizer/programs/solana-authorizer/src/lib.rs index a23aa2a7d..0cbeaaad1 100644 --- a/architectures/decentralized/solana-authorizer/programs/solana-authorizer/src/lib.rs +++ b/architectures/decentralized/solana-authorizer/programs/solana-authorizer/src/lib.rs @@ -4,7 +4,7 @@ pub mod state; use anchor_lang::prelude::*; use logic::*; -declare_id!("AbPtyECaUE9kYyyu5hiEa5LF1yffU7kwso2AnDJebvr1"); +declare_id!("PsyAUmhpmiUouWsnJdNGFSX8vZ6rWjXjgDPHsgqPGyw"); pub fn find_authorization( grantor: &Pubkey, diff --git a/architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json b/architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json index cefb6b946..b406272a7 100644 --- 
a/architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json +++ b/architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json @@ -1 +1,6 @@ -[148,173,21,87,228,46,77,36,82,244,20,158,176,165,14,32,145,188,74,19,67,237,0,24,105,77,140,96,46,211,192,150,142,136,209,134,211,194,76,210,164,42,110,223,90,57,233,190,42,214,249,60,238,206,241,73,224,182,127,44,202,58,254,78] \ No newline at end of file +[ + 133, 168, 22, 30, 172, 129, 250, 222, 16, 179, 41, 22, 161, 2, 171, 87, 19, + 134, 196, 81, 67, 2, 234, 210, 48, 19, 234, 27, 192, 50, 161, 118, 5, 220, + 104, 106, 206, 33, 252, 167, 179, 12, 115, 187, 217, 59, 144, 210, 71, 193, + 163, 175, 67, 155, 20, 145, 152, 109, 251, 190, 151, 5, 10, 82 +] diff --git a/architectures/decentralized/solana-client/src/command/update_config.rs b/architectures/decentralized/solana-client/src/command/update_config.rs index f7e35586f..e61b81f0e 100644 --- a/architectures/decentralized/solana-client/src/command/update_config.rs +++ b/architectures/decentralized/solana-client/src/command/update_config.rs @@ -75,7 +75,7 @@ pub async fn command_update_config_execute( #[derive(Serialize, Deserialize)] struct ModelWrapper { #[serde(flatten)] - pub model: Model, // This will deserialize the enum variant (LLM, etc.) 
+ pub model: Model, // This will deserialize the enum variant (LLM) } #[derive(Serialize, Deserialize)] @@ -173,8 +173,8 @@ pub async fn command_update_config_execute( coordinator_account_state.state.coordinator.model = model; } - if let Some(data_locations) = &data_locations { - coordinator_account_state.state.coordinator.data_locations = data_locations.clone(); + if let Some(data_locations) = data_locations { + coordinator_account_state.state.coordinator.data_locations = data_locations; } let progress = restart_from_step.map(|step| CoordinatorProgress { diff --git a/architectures/decentralized/solana-client/src/main.rs b/architectures/decentralized/solana-client/src/main.rs index a8baf1694..284c8f002 100644 --- a/architectures/decentralized/solana-client/src/main.rs +++ b/architectures/decentralized/solana-client/src/main.rs @@ -23,8 +23,6 @@ use crate::command::treasurer_top_up_rewards::CommandTreasurerTopUpRewardsParams use crate::command::treasurer_top_up_rewards::command_treasurer_top_up_rewards_execute; use crate::command::update_config::CommandUpdateConfigParams; use crate::command::update_config::command_update_config_execute; -// use crate::command::update_model::CommandUpdateModelParams; -// use crate::command::update_model::command_update_model_execute; use crate::{ app::{AppParams, TAB_NAMES, Tabs}, backend::SolanaBackend, @@ -124,14 +122,6 @@ enum Commands { #[clap(flatten)] params: CommandUpdateConfigParams, }, - // UpdateModel { - // #[clap(flatten)] - // cluster: ClusterArgs, - // #[clap(flatten)] - // wallet: WalletArgs, - // #[clap(flatten)] - // params: CommandUpdateModelParams, - // }, Tick { #[clap(flatten)] cluster: ClusterArgs, @@ -328,17 +318,6 @@ async fn async_main() -> Result<()> { .unwrap(); command_update_config_execute(backend, params).await } - // Commands::UpdateModel { cluster, wallet, params } => { - // let key_pair: Arc = Arc::new(wallet.try_into()?); - // let backend = SolanaBackend::new( - // cluster.into(), - // vec![], - // 
key_pair.clone(), - // CommitmentConfig::confirmed(), - // ) - // .unwrap(); - // command_update_model_execute(backend, params).await - // } Commands::SetPaused { cluster, wallet, diff --git a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs index 938b48e90..2f1feb19e 100644 --- a/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs +++ b/architectures/decentralized/solana-coordinator/programs/solana-coordinator/src/lib.rs @@ -29,7 +29,7 @@ use ts_rs::TS; pub use crate::instance_state::RunMetadata; -declare_id!("6DvXZnaJd2RHmmMVFZE8P4GpCdZXGYpvVda7LDVBUK7v"); +declare_id!("4SHugWqSXwKE5fqDchkJcPEqnoZE22VYKtSTVm7axbT7"); pub const SOLANA_MAX_NUM_PENDING_CLIENTS: usize = SOLANA_MAX_NUM_CLIENTS; diff --git a/architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json b/architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json index 7bfe03c0c..49e92701b 100644 --- a/architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json +++ b/architectures/decentralized/solana-coordinator/target/deploy/psyche_solana_coordinator-keypair.json @@ -1 +1 @@ -[27,96,35,184,165,2,254,184,93,154,151,51,92,73,10,13,208,2,234,87,55,141,189,96,202,13,104,226,195,88,241,154,77,153,237,112,135,152,74,74,2,139,208,162,159,49,8,104,38,172,1,251,202,41,79,20,234,97,104,137,100,127,66,241] \ No newline at end of file +[64,238,5,158,112,133,38,180,4,62,68,219,46,236,189,68,44,131,70,134,229,152,44,218,72,233,162,120,147,52,99,51,51,13,179,3,249,169,215,84,254,219,157,144,170,99,145,211,144,51,17,103,241,3,92,148,244,17,156,198,157,197,61,26] diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs 
b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs index 9c50c7736..7cddbc275 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs @@ -126,7 +126,6 @@ pub async fn run() { checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - // data_locations, lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), optimizer: OptimizerDefinition::Distro { clip_grad_norm: None, diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index 012fbb983..dd802a691 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -314,34 +314,6 @@ impl Model { return false; } - // for data_location in llm.data_locations.iter() { - // let bad_data_location = match data_location { - // LLMTrainingDataLocation::Dummy(_) => false, - // LLMTrainingDataLocation::Server(url) => url.is_empty(), - // LLMTrainingDataLocation::Local(_) => false, - // LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { - // location, - // .. - // }) => match location { - // HttpTrainingDataLocation::SingleUrl(url) => url.is_empty(), - // HttpTrainingDataLocation::NumberedFiles { - // url_template, - // num_files, - // .. - // } => url_template.is_empty() || *num_files == 0, - // HttpTrainingDataLocation::Gcp { bucket_name, .. 
} => { - // bucket_name.is_empty() - // } - // }, - // LLMTrainingDataLocation::WeightedHttp(url) => url.is_empty(), - // LLMTrainingDataLocation::Preprocessed(url) => url.is_empty(), - // }; - // if bad_data_location { - // msg!("model check failed: bad LLM training data location."); - // return false; - // } - // } - let bad_checkpoint = match llm.checkpoint { Checkpoint::Dummy(_hub_repo) => false, Checkpoint::Ephemeral => true, From 1241efa82f1a223b09314921c3178ddaa2f40f09 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Fri, 19 Dec 2025 16:15:33 -0300 Subject: [PATCH 28/33] Fix tests --- architectures/centralized/testing/src/server.rs | 2 ++ .../tests/suites/memnet_coordinator_rewards.rs | 2 -- .../tests/suites/memnet_treasurer_create_update.rs | 1 - shared/coordinator/src/model.rs | 8 +++++++- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/architectures/centralized/testing/src/server.rs b/architectures/centralized/testing/src/server.rs index 0c0ec60a8..59f9686ec 100644 --- a/architectures/centralized/testing/src/server.rs +++ b/architectures/centralized/testing/src/server.rs @@ -3,6 +3,7 @@ use crate::{MAX_ROUND_TRAIN_TIME, ROUND_WITNESS_TIME, WARMUP_TIME}; use bytemuck::Zeroable; use psyche_centralized_server::app::App as ServerApp; use psyche_centralized_shared::ClientId; +use psyche_coordinator::model::LLMDataLocations; use psyche_coordinator::{Client, Round}; use psyche_coordinator::{ Coordinator, CoordinatorConfig, CoordinatorEpochState, RunState, SOLANA_MAX_NUM_CLIENTS, @@ -94,6 +95,7 @@ impl CoordinatorServer { model: Model::LLM(LLM::dummy()), config: coordinator_config, epoch_state, + data_locations: LLMDataLocations::dummy(), ..Coordinator::::zeroed() }; diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs index 76c405d4b..cf28751f5 100644 --- 
a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs @@ -6,11 +6,9 @@ use psyche_coordinator::model::Checkpoint; use psyche_coordinator::model::HubRepo; use psyche_coordinator::model::LLM; use psyche_coordinator::model::LLMArchitecture; -use psyche_coordinator::model::LLMTrainingDataLocation; use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; use psyche_core::ConstantLR; -use psyche_core::FixedVec; use psyche_core::LearningRateSchedule; use psyche_core::OptimizerDefinition; use psyche_solana_authorizer::logic::AuthorizationGrantorUpdateParams; diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs index 80ff26948..fe0724550 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs @@ -8,7 +8,6 @@ use psyche_coordinator::model::LLMTrainingDataLocation; use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; use psyche_core::ConstantLR; -use psyche_core::FixedVec; use psyche_core::LearningRateSchedule; use psyche_core::OptimizerDefinition; use psyche_solana_coordinator::CoordinatorAccount; diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index dd802a691..a249b7b1b 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -227,13 +227,19 @@ impl LLMDataLocations { } } -impl LLM { +impl LLMDataLocations { pub fn dummy() -> Self { let mut data_locations: FixedVec = FixedVec::new(); data_locations .push(LLMTrainingDataLocation::Dummy(DummyType::Working)) .unwrap(); + Self { data_locations } + } +} + +impl LLM { + pub fn dummy() -> Self { Self 
{ architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), From a2452c1c86c6d87b83e3253a8b3d694a25298969 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Fri, 19 Dec 2025 17:52:20 -0300 Subject: [PATCH 29/33] Fix memnet tests --- .../solana-treasurer/src/logic/run_update.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/logic/run_update.rs b/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/logic/run_update.rs index ff9c55592..65e4dc5c0 100644 --- a/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/logic/run_update.rs +++ b/architectures/decentralized/solana-treasurer/programs/solana-treasurer/src/logic/run_update.rs @@ -111,8 +111,8 @@ pub fn run_update_processor( )?; } - if let Some(paused) = params.paused { - set_paused( + if let Some(data_location) = params.data_location { + psyche_solana_coordinator::cpi::update_data_locations( CpiContext::new( context.accounts.coordinator_program.to_account_info(), OwnerCoordinatorAccounts { @@ -128,12 +128,12 @@ pub fn run_update_processor( }, ) .with_signer(run_signer_seeds), - paused, + Some(data_location), )?; } - if let Some(data_location) = params.data_location { - psyche_solana_coordinator::cpi::update_data_locations( + if let Some(paused) = params.paused { + set_paused( CpiContext::new( context.accounts.coordinator_program.to_account_info(), OwnerCoordinatorAccounts { @@ -149,7 +149,7 @@ pub fn run_update_processor( }, ) .with_signer(run_signer_seeds), - Some(data_location), + paused, )?; } From 8f528dd482c1e0f8ca0df90d0d590f74fcadb899 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Fri, 19 Dec 2025 18:08:52 -0300 Subject: [PATCH 30/33] Fix backend compilation --- website/backend/src/coordinatorChainLoop.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/backend/src/coordinatorChainLoop.ts 
b/website/backend/src/coordinatorChainLoop.ts index 21aa04dc2..d50a313c4 100644 --- a/website/backend/src/coordinatorChainLoop.ts +++ b/website/backend/src/coordinatorChainLoop.ts @@ -353,6 +353,17 @@ export async function startWatchCoordinatorChainLoop( }) break } + case 'clear_data_locations': { + const runPdaAddr = i.accounts[1].toString() + const coordinatorAddr = i.accounts[2].toString() + runUpdates.getAndTouchCurrentRun({ + runPdaAddr, + coordinatorAddr, + decoded, + tx, + }) + break + } default: { const _missed_tx: never = decoded throw new Error( From 71ff15185c06c301f38bf94063fc366e4ea78b04 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Fri, 19 Dec 2025 18:15:03 -0300 Subject: [PATCH 31/33] Add instruction in backend --- website/backend/src/coordinatorChainLoop.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/backend/src/coordinatorChainLoop.ts b/website/backend/src/coordinatorChainLoop.ts index d50a313c4..2076d8651 100644 --- a/website/backend/src/coordinatorChainLoop.ts +++ b/website/backend/src/coordinatorChainLoop.ts @@ -364,6 +364,17 @@ export async function startWatchCoordinatorChainLoop( }) break } + case 'update_data_locations': { + const runPdaAddr = i.accounts[1].toString() + const coordinatorAddr = i.accounts[2].toString() + runUpdates.getAndTouchCurrentRun({ + runPdaAddr, + coordinatorAddr, + decoded, + tx, + }) + break + } default: { const _missed_tx: never = decoded throw new Error( From 56d5152a202a4fb1ca38b53a1f67e3d0efb5e2b7 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Mon, 22 Dec 2025 08:38:23 -0800 Subject: [PATCH 32/33] Fix decentralized tests --- nix/docker.nix | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nix/docker.nix b/nix/docker.nix index 6fa2e2774..34e153ea1 100644 --- a/nix/docker.nix +++ b/nix/docker.nix @@ -66,9 +66,11 @@ let '') (pkgs.runCommand "entrypoint" { } '' mkdir -p $out/bin + mkdir -p $out/architectures/decentralized/solana-authorizer/target/deploy cp 
${../docker/test/client_test_entrypoint.sh} $out/bin/client_test_entrypoint.sh cp ${../docker/test/run_owner_entrypoint.sh} $out/bin/run_owner_entrypoint.sh cp ${../scripts/join-authorization-create.sh} $out/bin/join-authorization-create.sh + cp ${../architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json} $out/architectures/decentralized/solana-authorizer/target/deploy/psyche_solana_authorizer-keypair.json chmod +x $out/bin/client_test_entrypoint.sh chmod +x $out/bin/run_owner_entrypoint.sh chmod +x $out/bin/join-authorization-create.sh From 46718d9a1140dba160186226eef1bf0613493d61 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Mon, 22 Dec 2025 16:53:12 -0300 Subject: [PATCH 33/33] Remove unused scripts --- .../src/command/update_config.rs | 62 +++------ .../src/process_coordinator_instructions.rs | 26 ++++ .../suites/memnet_coordinator_full_round.rs | 20 +-- .../suites/memnet_coordinator_rewards.rs | 17 ++- .../suites/memnet_treasurer_full_epoch.rs | 10 +- config/solana-test/config-three-clients.toml | 37 ------ scripts/deploy-solana-treasurer-test.sh | 121 ------------------ 7 files changed, 73 insertions(+), 220 deletions(-) delete mode 100644 config/solana-test/config-three-clients.toml delete mode 100755 scripts/deploy-solana-treasurer-test.sh diff --git a/architectures/decentralized/solana-client/src/command/update_config.rs b/architectures/decentralized/solana-client/src/command/update_config.rs index e61b81f0e..4e1288c84 100644 --- a/architectures/decentralized/solana-client/src/command/update_config.rs +++ b/architectures/decentralized/solana-client/src/command/update_config.rs @@ -5,7 +5,7 @@ use anyhow::{Context, Result, bail}; use clap::Args; use psyche_coordinator::{ CoordinatorConfig, CoordinatorProgress, get_data_index_for_step, - model::{Checkpoint, LLMDataLocations, LLMTrainingDataLocation, Model}, + model::{Checkpoint, LLM, LLMDataLocations, LLMTrainingDataLocation, Model}, }; use psyche_core::FixedVec; 
use psyche_solana_treasurer::logic::RunUpdateParams; @@ -73,55 +73,36 @@ pub async fn command_update_config_execute( let (config, mut model, data_locations) = match config_path { Some(config_path) => { #[derive(Serialize, Deserialize)] - struct ModelWrapper { - #[serde(flatten)] - pub model: Model, // This will deserialize the enum variant (LLM) - } - - #[derive(Serialize, Deserialize)] - struct State { - pub config: CoordinatorConfig, - pub model: ModelWrapper, - } - - // First, parse without data_locations to get the Model enum - let state: State = toml::from_str(std::str::from_utf8( - &std::fs::read(&config_path) - .with_context(|| format!("failed to read config toml file {config_path:?}"))?, - )?) - .with_context(|| format!("failed to parse config toml file {config_path:?}"))?; - - // Then parse just the data_locations separately - #[derive(Serialize, Deserialize)] - struct DataLocationsWrapper { - pub data_locations: Vec, + struct ConfigFile { + config: CoordinatorConfig, + model: ModelWithData, } #[derive(Serialize, Deserialize)] - struct LLMSection { - #[serde(rename = "LLM")] - pub llm: DataLocationsWrapper, + #[serde(tag = "LLM")] // This handles the enum variant name + struct ModelWithData { + // LLM struct fields + #[serde(flatten)] + llm: LLM, + // Additional field not in the Copy struct + data_locations: Vec, } - #[derive(Serialize, Deserialize)] - struct ModelSection { - pub model: LLMSection, - } + let content = std::fs::read(&config_path) + .with_context(|| format!("failed to read config toml file {config_path:?}"))?; - let data_section: ModelSection = toml::from_str(std::str::from_utf8( - &std::fs::read(&config_path) - .with_context(|| format!("failed to read config toml file {config_path:?}"))?, - )?)?; + let config_file: ConfigFile = toml::from_str(std::str::from_utf8(&content)?) 
+ .with_context(|| format!("failed to parse config toml file {config_path:?}"))?; let data_locs = LLMDataLocations { - data_locations: FixedVec::from_iter( - data_section.model.llm.data_locations.into_iter(), - ), + data_locations: FixedVec::from_iter(config_file.model.data_locations.into_iter()), }; - println!("DATA LOCS: {data_locs:#?}"); - - (Some(state.config), Some(state.model.model), Some(data_locs)) + ( + Some(config_file.config), + Some(Model::LLM(config_file.model.llm)), + Some(data_locs), + ) } None => (None, None, None), }; @@ -277,7 +258,6 @@ pub async fn command_update_config_execute( instructions }; - println!("SENDING INSTRUCTIONS: {instructions:#?}"); let signature = backend .send_and_retry("Update config", &instructions, &[]) .await?; diff --git a/architectures/decentralized/solana-tooling/src/process_coordinator_instructions.rs b/architectures/decentralized/solana-tooling/src/process_coordinator_instructions.rs index 90489607e..d73b08730 100644 --- a/architectures/decentralized/solana-tooling/src/process_coordinator_instructions.rs +++ b/architectures/decentralized/solana-tooling/src/process_coordinator_instructions.rs @@ -3,6 +3,7 @@ use anchor_lang::ToAccountMetas; use anyhow::Result; use psyche_coordinator::CoordinatorConfig; use psyche_coordinator::CoordinatorProgress; +use psyche_coordinator::model::LLMTrainingDataLocation; use psyche_coordinator::model::Model; use psyche_solana_coordinator::ClientId; use psyche_solana_coordinator::RunMetadata; @@ -19,6 +20,7 @@ use psyche_solana_coordinator::instruction::SetFutureEpochRates; use psyche_solana_coordinator::instruction::SetPaused; use psyche_solana_coordinator::instruction::Tick; use psyche_solana_coordinator::instruction::Update; +use psyche_solana_coordinator::instruction::UpdateDataLocations; use psyche_solana_coordinator::instruction::Witness; use psyche_solana_coordinator::logic::FreeCoordinatorParams; use psyche_solana_coordinator::logic::InitCoordinatorParams; @@ -114,6 +116,30 @@ pub 
async fn process_update( Ok(()) } +pub async fn process_data_locations_update( + endpoint: &mut ToolboxEndpoint, + payer: &Keypair, + authority: &Keypair, + coordinator_instance: &Pubkey, + coordinator_account: &Pubkey, + data_location: Option, +) -> Result<()> { + let accounts = OwnerCoordinatorAccounts { + authority: authority.pubkey(), + coordinator_instance: *coordinator_instance, + coordinator_account: *coordinator_account, + }; + let instruction = Instruction { + accounts: accounts.to_account_metas(None), + data: UpdateDataLocations { data_location }.data(), + program_id: psyche_solana_coordinator::ID, + }; + endpoint + .process_instruction_with_signers(payer, instruction, &[authority]) + .await?; + Ok(()) +} + pub async fn process_coordinator_join_run( endpoint: &mut ToolboxEndpoint, payer: &Keypair, diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs index 7cddbc275..3545e5686 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs @@ -30,6 +30,7 @@ use psyche_solana_tooling::process_coordinator_instructions::process_coordinator use psyche_solana_tooling::process_coordinator_instructions::process_coordinator_set_paused; use psyche_solana_tooling::process_coordinator_instructions::process_coordinator_tick; use psyche_solana_tooling::process_coordinator_instructions::process_coordinator_witness; +use psyche_solana_tooling::process_coordinator_instructions::process_data_locations_update; use psyche_solana_tooling::process_coordinator_instructions::process_update; use solana_sdk::signature::Keypair; use solana_sdk::signer::Signer; @@ -89,14 +90,6 @@ pub async fn run() { RunState::Uninitialized ); - let mut data_locations: FixedVec< - LLMTrainingDataLocation, - MAX_DATA_LOCATIONS, - > 
= FixedVec::default(); - data_locations - .push(LLMTrainingDataLocation::Dummy(DummyType::Working)) - .unwrap(); - // update the coordinator's model process_update( &mut endpoint, @@ -142,6 +135,17 @@ pub async fn run() { .await .unwrap(); + process_data_locations_update( + &mut endpoint, + &payer, + &main_authority, + &coordinator_instance, + &coordinator_account, + Some(LLMTrainingDataLocation::default()), + ) + .await + .unwrap(); + // Coordinator's state should now have changed assert_eq!( get_coordinator_account_state(&mut endpoint, &coordinator_account) diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs index cf28751f5..a08c7d0ed 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs @@ -6,6 +6,7 @@ use psyche_coordinator::model::Checkpoint; use psyche_coordinator::model::HubRepo; use psyche_coordinator::model::LLM; use psyche_coordinator::model::LLMArchitecture; +use psyche_coordinator::model::LLMTrainingDataLocation; use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; use psyche_core::ConstantLR; @@ -27,6 +28,7 @@ use psyche_solana_tooling::process_coordinator_instructions::process_coordinator use psyche_solana_tooling::process_coordinator_instructions::process_coordinator_set_paused; use psyche_solana_tooling::process_coordinator_instructions::process_coordinator_tick; use psyche_solana_tooling::process_coordinator_instructions::process_coordinator_witness; +use psyche_solana_tooling::process_coordinator_instructions::process_data_locations_update; use psyche_solana_tooling::process_coordinator_instructions::process_update; use solana_sdk::signature::Keypair; use solana_sdk::signer::Signer; @@ -111,10 +113,6 @@ pub async fn run() { checkpoint: 
Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, data_type: LLMTrainingDataType::Pretraining, - // data_locations: FixedVec::try_from_iter([ - // LLMTrainingDataLocation::default(), - // ]) - // .unwrap(), lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), optimizer: OptimizerDefinition::Distro { clip_grad_norm: None, @@ -131,6 +129,17 @@ pub async fn run() { .await .unwrap(); + process_data_locations_update( + &mut endpoint, + &payer, + &main_authority, + &coordinator_instance, + &coordinator_account, + Some(LLMTrainingDataLocation::default()), + ) + .await + .unwrap(); + // Set the reward rate for the epoch process_coordiantor_set_future_epoch_rates( &mut endpoint, diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs index c2971bc64..dffafc5ff 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs @@ -205,14 +205,6 @@ pub async fn run() { .await .unwrap_err(); - let mut data_locations: FixedVec< - LLMTrainingDataLocation, - MAX_DATA_LOCATIONS, - > = FixedVec::default(); - data_locations - .push(LLMTrainingDataLocation::Dummy(DummyType::Working)) - .unwrap(); - // Prepare the coordinator's config process_treasurer_run_update( &mut endpoint, @@ -264,7 +256,7 @@ pub async fn run() { epoch_slashing_rate_per_client: None, paused: Some(false), client_version: None, - data_location: Some(data_locations[0].clone()), + data_location: Some(LLMTrainingDataLocation::default()), }, ) .await diff --git a/config/solana-test/config-three-clients.toml b/config/solana-test/config-three-clients.toml deleted file mode 100644 index 45a9763dc..000000000 --- a/config/solana-test/config-three-clients.toml +++ /dev/null @@ -1,37 +0,0 @@ -[config] -warmup_time = 30 -cooldown_time = 30 
-rounds_per_epoch = 10 -max_round_train_time = 30 -round_witness_time = 1 -min_clients = 3 -verification_percent = 0 -witness_nodes = 3 -global_batch_size = 8 -total_steps = 25000 -checkpointers = [] - -[model.LLM] -architecture = "HfLlama" -data_type = "Pretraining" -max_seq_len = 2048 -data_locations = [ - { Http = { location = { Gcp = { bucket_name = "nous-pretraining-public-us", filter_directory = "fineweb-edu-tokenized-llama2" } }, token_size_in_bytes = "TwoBytes", shuffle = "DontShuffle" } }, -] - -[model.LLM.checkpoint.Hub] -repo_id = "emozilla/llama2-20m-init" - -[model.LLM.lr_schedule.Cosine] -base_lr = 4.0e-4 -warmup_steps = 250 -warmup_init_lr = 0.0 -total_steps = 25000 -final_lr = 4.0e-5 - -[model.LLM.optimizer.Distro] -clip_grad_norm = 1.0 -compression_decay = 0.999 -compression_chunk = 64 -compression_topk = 8 -quantize_1bit = true diff --git a/scripts/deploy-solana-treasurer-test.sh b/scripts/deploy-solana-treasurer-test.sh deleted file mode 100755 index a4ac75de2..000000000 --- a/scripts/deploy-solana-treasurer-test.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env bash - -set -o errexit -set -e -set -m - -# use the agenix provided wallet if you have it -if [[ -n "${devnet__keypair__wallet_PATH}" && -f "${devnet__keypair__wallet_PATH}" ]]; then - DEFAULT_WALLET="${devnet__keypair__wallet_PATH}" -else - DEFAULT_WALLET="$HOME/.config/solana/id.json" -fi -WALLET_FILE=${KEY_FILE:-"$DEFAULT_WALLET"} -RPC=${RPC:-"http://127.0.0.1:8899"} -WS_RPC=${WS_RPC:-"ws://127.0.0.1:8900"} -RUN_ID=${RUN_ID:-"test"} -CONFIG_FILE=${CONFIG_FILE:-"./config/solana-test/config.toml"} - -echo -e "\n[+] deploy info:" -echo -e "[+] WALLET_FILE = $WALLET_FILE" -echo -e "[+] RPC = $RPC" -echo -e "[+] WS_RPC = $WS_RPC" -echo -e "[+] RUN_ID = $RUN_ID" -echo -e "[+] CONFIG_FILE = $CONFIG_FILE" -echo -e "[+] -----------------------------------------------------------" - -echo -e "\n[+] Starting coordinator deploy" -pushd architectures/decentralized/solana-coordinator -solana-keygen 
new -o ./target/deploy/psyche_solana_coordinator-keypair.json -f --no-bip39-passphrase -anchor keys sync - -echo -e "\n[+] - building..." -anchor build --no-idl - -echo -e "\n[+] - deploying..." -anchor deploy --provider.cluster devnet --provider.wallet ${WALLET_FILE} -- --max-len 500000 -sleep 1 # wait for the program to be deployed and ready in the validator - -echo -e "\n[+] Coordinator program deployed successfully!" -popd - -echo -e "\n[+] Starting authorizor deploy" -pushd architectures/decentralized/solana-authorizer - -solana-keygen new -o ./target/deploy/psyche_solana_authorizer-keypair.json -f --no-bip39-passphrase -anchor keys sync - -echo -e "\n[+] - building..." -anchor build - -echo -e "\n[+] - deploying..." -anchor deploy --provider.cluster devnet --provider.wallet ${WALLET_FILE} -sleep 1 # wait for the program to be deployed and ready in the validator - -echo -e "\n[+] - init-idl..." -anchor idl init \ - --provider.cluster devnet \ - --provider.wallet ${WALLET_FILE} \ - --filepath target/idl/psyche_solana_authorizer.json \ - $(solana-keygen pubkey ./target/deploy/psyche_solana_authorizer-keypair.json) - -echo -e "\n[+] Authorizer program deployed successfully!" -popd - -# echo -e "\n[+] Starting treasurer deploy" -# pushd architectures/decentralized/solana-treasurer -# solana-keygen new -o ./target/deploy/psyche_solana_treasurer-keypair.json -f --no-bip39-passphrase -# anchor keys sync - -# echo -e "\n[+] - building..." -# anchor build - -# echo -e "\n[+] - deploying..." -# anchor deploy --provider.cluster devnet --provider.wallet ${WALLET_FILE} -# sleep 1 # wait for the program to be deployed and ready in the validator -# echo -e "\n[+] Treasurer program deployed successfully!" 
-# popd - -echo -e "\n[+] Creating authorization for everyone to join the run" -bash ./scripts/join-authorization-create.sh "https://api.devnet.solana.com" ${WALLET_FILE} 11111111111111111111111111111111 - -# echo -e "\n[+] Creating token" -# TOKEN_ADDRESS=$(spl-token create-token --decimals 0 --url "https://api.devnet.solana.com" | grep "Address:" | awk '{print $2}') -# spl-token create-account ${TOKEN_ADDRESS} --url "https://api.devnet.solana.com" -# spl-token mint ${TOKEN_ADDRESS} 1000000 --url "https://api.devnet.solana.com" - -echo -e "\n[+] Creating training run..." -cargo run --release --bin psyche-solana-client -- \ - create-run \ - --wallet-private-key-path ${WALLET_FILE} \ - --rpc "https://api.devnet.solana.com" \ - --ws-rpc "wss://api.devnet.solana.com" \ - --client-version "test" \ - --run-id ${RUN_ID} "$@" - -echo -e "\n[+] Update training run config..." -cargo run --release --bin psyche-solana-client -- \ - update-config \ - --wallet-private-key-path ${WALLET_FILE} \ - --rpc "https://api.devnet.solana.com" \ - --ws-rpc "wss://api.devnet.solana.com" \ - --run-id ${RUN_ID} \ - --config-path ${CONFIG_FILE} - -# echo -e "\n[+] Update training run model..." -# cargo run --release --bin psyche-solana-client -- \ - # update-model \ - # --wallet-private-key-path ${WALLET_FILE} \ - # --rpc "https://api.devnet.solana.com" \ - # --ws-rpc "wss://api.devnet.solana.com" \ - # --run-id ${RUN_ID} \ - # --config-path ${CONFIG_FILE} - -# echo -e "\n[+] Unpause the training run..." -# cargo run --release --bin psyche-solana-client -- \ - # set-paused \ - # --wallet-private-key-path ${WALLET_FILE} \ - # --rpc "https://api.devnet.solana.com" \ - # --ws-rpc "wss://api.devnet.solana.com" \ - # --run-id ${RUN_ID} \ - # --resume