From 560b7681db3f23045a5342f3558e37f5bba2959e Mon Sep 17 00:00:00 2001 From: Jacob-Chmura Date: Tue, 20 May 2025 10:50:06 -0400 Subject: [PATCH 1/5] Add ppo ewc jobs --- jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh | 40 +++++++++++++++++++ .../ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh | 39 ++++++++++++++++++ jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh | 40 +++++++++++++++++++ .../ppo_ewc_long_piecewise_multi_gpu.sh | 39 ++++++++++++++++++ jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh | 40 +++++++++++++++++++ .../ppo_ewc_short_piecewise_multi_gpu.sh | 38 ++++++++++++++++++ 6 files changed, 236 insertions(+) create mode 100644 jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh create mode 100644 jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh create mode 100644 jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh create mode 100644 jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh create mode 100644 jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh create mode 100644 jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh diff --git a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh new file mode 100644 index 0000000..fde542d --- /dev/null +++ b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-ppo-ewc-cppo +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='CPPO-RL' + +accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/ppo_ewc/ppo_EWC_continual.py \ + --wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --dataset_name "$dataset_name" \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 16 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh new file mode 100644 index 0000000..5260b1e --- /dev/null +++ b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-ppo-ewc-domain_shift +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email +source .env + +dataset_name='aifgen-domain-preference-shift' + +accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/ppo_ewc/ppo_EWC_continual.py \ + --wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 16 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh new file mode 100644 index 0000000..3cb21e6 --- /dev/null +++ b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-ppo-ewc-lipschitz +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-lipschitz' + +accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/ppo_ewc/ppo_EWC_continual.py \ + --wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 8 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh new file mode 100644 index 0000000..9779503 --- /dev/null +++ b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-ppo-ewc-long-piecewise +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email +source .env + +dataset_name='aifgen-long-piecewise' + +accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/ppo_ewc/ppo_EWC_continual.py \ + --wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 16 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh new file mode 100644 index 0000000..07f3735 --- /dev/null +++ b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-ppo-piecewise +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-piecewise-preference-shift' + +accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/ppo/ppo_continual.py \ + --wandb_project "$dataset_name-post-May-19" \ + --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 1.0e-6 \ + --kl_coef 0.37 \ + --cliprange 0.1 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 8 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 200 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \ + --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh new file mode 100644 index 0000000..af9852a --- /dev/null +++ b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh @@ -0,0 +1,38 @@ +#!/bin/bash +#SBATCH --job-name=aif-gen-ppo-short-piecewise +#SBATCH --nodes=1 # Request 2 nodes +#SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node +#SBATCH --ntasks-per-node=4 # One task per GPU +#SBATCH --cpus-per-task=6 +#SBATCH --mem=64G +#SBATCH --time=24:00:00 +#SBATCH --output=out/%x.%j.out # Include job name + job ID +#SBATCH --error=out/%x.%j.err # Include job name + job ID +#SBATCH --mail-type=ALL +#SBATCH --account=aip-rrabba +#SBATCH --mail-user=shahrad_m@icloud.com # Update with your email + +source .env + +dataset_name='aifgen-short-piecewise' + +accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ + benchmarks/ppo/ppo_continual.py \ + --wandb_project $dataset_name \ + --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --dataset_name $dataset_name \ + --sft_model_path Qwen/Qwen2-0.5B-Instruct \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --learning_rate 5.0e-6 \ + --response_length 256 \ + --num_train_epochs 4 \ + --gradient_checkpointing \ + --per_device_train_batch_size 16 \ + --logging_steps 10 \ + --eval_strategy steps \ + --eval_steps 300 \ + --save_steps 300 \ + --bf16 \ + --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \ + --no_remove_unused_columns From 27e57d1f8c39022fab2aea7b00d12a3b4f001152 Mon Sep 17 00:00:00 2001 From: Jacob-Chmura Date: Tue, 20 May 2025 11:08:59 -0400 Subject: [PATCH 2/5] Update benchmark entry points --- benchmarks/ppo/ppo_continual.py | 28 +++---- benchmarks/ppo_ewc/ppo_EWC_continual.py | 107 +++++++++++++++--------- 2 files changed, 80 insertions(+), 55 deletions(-) diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py index d1759d0..5c98e83 100644 --- a/benchmarks/ppo/ppo_continual.py +++ b/benchmarks/ppo/ppo_continual.py @@ -4,11 +4,6 @@ import torch import wandb as wb -from continual_ppo_trainer import ( - ContinualPPOArguments, - ContinualPPOConfig, - ContinualPPOTrainer, -) from datasets import Dataset from transformers import ( AutoModelForCausalLM, @@ -21,10 +16,15 @@ get_kbit_device_map, get_peft_config, get_quantization_config, + setup_chat_format, ) -from trl import setup_chat_format from benchmarks.dataloading import init_continual_dataset +from benchmarks.ppo.continual_ppo_trainer import ( + ContinualPPOArguments, + ContinualPPOConfig, + ContinualPPOTrainer, +) def main( @@ -106,7 +106,9 @@ def main( value_model_path = script_args.value_model_path else: model_path = os.path.join(training_args.output_dir, 'last') - value_model_path = os.path.join(training_args.output_dir, 'last', 'value_model') + value_model_path = os.path.join( + training_args.output_dir, 'last', 'value_model' + ) policy = AutoModelForCausalLM.from_pretrained( pretrained_model_name_or_path=model_path, trust_remote_code=model_args.trust_remote_code, @@ -126,7 +128,7 @@ def main( value_model_path, trust_remote_code=model_args.trust_remote_code, num_labels=1, - from_tf=True, # or use `subfolder="safetensors"` if you saved a .safetensors file + from_tf=True, # or use `subfolder="safetensors"` if you saved a .safetensors file ) # Build custom repository name for this task @@ -173,9 +175,6 @@ def main( peft_config=peft_config, ) - # if i == 0: - # trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0')) - # Set current task in trainer for task-based logging trainer.set_task(f'task_{i}') @@ -208,9 +207,8 @@ def main( value_model_dir = os.path.join(last_dir, 'value_model') os.makedirs(value_model_dir, exist_ok=True) - value_model.save_pretrained(value_model_dir, - safe_serialization=False) - + value_model.save_pretrained(value_model_dir, safe_serialization=False) + trainer.accelerator.wait_for_everyone() if training_args.push_to_hub: @@ -226,4 +224,4 @@ def main( dataclass_types = (ContinualPPOArguments, ContinualPPOConfig, ModelConfig) parser = TrlParser(dataclass_types) script_args, training_args, model_args = parser.parse_args_and_config() - main(script_args, training_args, model_args) \ No newline at end of file + main(script_args, training_args, model_args) diff --git a/benchmarks/ppo_ewc/ppo_EWC_continual.py b/benchmarks/ppo_ewc/ppo_EWC_continual.py index c71e90e..f6cb496 100644 --- a/benchmarks/ppo_ewc/ppo_EWC_continual.py +++ b/benchmarks/ppo_ewc/ppo_EWC_continual.py @@ -16,8 +16,8 @@ get_kbit_device_map, get_peft_config, get_quantization_config, + setup_chat_format, ) -from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE from benchmarks.dataloading import init_continual_dataset from benchmarks.ppo_ewc.continual_ppo_EWC_trainer import ( @@ -52,15 +52,7 @@ def main( quantization_config=quantization_config, ) - # Load main model and (optionally) reference model model = str(training_args.sft_model_path) - policy = AutoModelForCausalLM.from_pretrained( - training_args.sft_model_path, - trust_remote_code=model_args.trust_remote_code, - **model_kwargs, - ) - - # Configure PEFT if needed peft_config = get_peft_config(model_args) if peft_config is None: ref_policy = AutoModelForCausalLM.from_pretrained( @@ -71,32 +63,11 @@ def main( else: ref_policy = None - # Load value model - value_model = None - if script_args.value_model_path: - value_model = AutoModelForSequenceClassification.from_pretrained( - script_args.value_model_path, - trust_remote_code=model_args.trust_remote_code, - num_labels=1, - ) - # Load tokenizer and set chat template if needed tokenizer = AutoTokenizer.from_pretrained( training_args.sft_model_path, trust_remote_code=model_args.trust_remote_code, ) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - if tokenizer.chat_template is None: - tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE - - # EWC-specific: DDPT distributed setup - if script_args.ignore_bias_buffers: - policy._ddp_params_and_buffers_to_ignore = [ - name - for name, buffer in policy.named_buffers() - if buffer.dtype == torch.bool - ] # Initialize continual dataset continual_dataset: list[dict[str, Dataset]] = init_continual_dataset( @@ -112,6 +83,7 @@ def main( if '.' in clean_dataset_name: clean_dataset_name = clean_dataset_name.split('.')[0] + print(f'Training PPO-EWC on {len(continual_dataset)} tasks') # check if the reward models are present either in the path or in the hub if training_args.reward_model_path is not None: for i in range(len(continual_dataset)): @@ -128,6 +100,44 @@ def main( # Task Loop for i, dataset in enumerate(continual_dataset): + # Load main model and (optionally) reference model + if i == 0: + model_path = training_args.sft_model_path + value_model_path = script_args.value_model_path + else: + model_path = os.path.join(training_args.output_dir, 'last') + value_model_path = os.path.join( + training_args.output_dir, 'last', 'value_model' + ) + policy = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_path, + trust_remote_code=model_args.trust_remote_code, + **model_kwargs, + ) + # EWC-specific: DDPT distributed setup + if script_args.ignore_bias_buffers: + policy._ddp_params_and_buffers_to_ignore = [ + name + for name, buffer in policy.named_buffers() + if buffer.dtype == torch.bool + ] + + # Load value model and policy model (main model) + try: + value_model = AutoModelForSequenceClassification.from_pretrained( + value_model_path, + trust_remote_code=model_args.trust_remote_code, + num_labels=1, + ) + except OSError: + # Maybe it was saved as safetensors? + value_model = AutoModelForSequenceClassification.from_pretrained( + value_model_path, + trust_remote_code=model_args.trust_remote_code, + num_labels=1, + from_tf=True, # or use `subfolder="safetensors"` if you saved a .safetensors file + ) + # Build custom repository name for this task custom_repo_name = ( model.split('/')[-1] + '_' + clean_dataset_name + '_PPO_EWC_' + str(i) @@ -141,6 +151,22 @@ def main( training_args.reward_model_path + '_' + str(i), num_labels=1 ) + for idx, _model in enumerate([policy, value_model, reward_model]): + # Align padding tokens between tokenizer and model + _model.config.pad_token_id = tokenizer.pad_token_id + + # Use ChatML format if the tokenizer doesn't already have a chat template + if tokenizer.chat_template is None: + updated_model, updated_tokenizer = setup_chat_format(_model, tokenizer) + # Actually store the updated model + if idx == 0: + policy = updated_model + elif idx == 1: + value_model = updated_model + else: + reward_model = updated_model + tokenizer = updated_tokenizer + ################ # Training and Evaluation ################ @@ -181,21 +207,22 @@ def main( wb.log({f'task/{custom_repo_name}/last': metrics}) # type: ignore[attr-defined] # Save model checkpoint and optionally push - if not training_args.push_to_hub: - trainer.save_model(os.path.join(training_args.output_dir, 'last')) - else: + last_dir = os.path.join(training_args.output_dir, 'last') + policy.save_pretrained(last_dir) + tokenizer.save_pretrained(last_dir) + + value_model_dir = os.path.join(last_dir, 'value_model') + os.makedirs(value_model_dir, exist_ok=True) + value_model.save_pretrained(value_model_dir, safe_serialization=False) + + trainer.accelerator.wait_for_everyone() + + if training_args.push_to_hub: trainer.push_to_hub( model_name=custom_repo_name, dataset_name='Continual_PPO_EWC_' + clean_dataset_name + '_' + str(i), ) - # Clean up for next task - EWC specific - if hasattr(trainer, 'deepspeed') and trainer.deepspeed is not None: - # Remove reference to the DeepSpeed engine to allow proper cleanup - del trainer.deepspeed - # Free cached GPU memory - torch.cuda.empty_cache() - print('Training completed for all tasks!') From 070bf644db9fcfa5e20e5d3d2dfd362ce5d6a591 Mon Sep 17 00:00:00 2001 From: Jacob-Chmura Date: Tue, 20 May 2025 11:20:17 -0400 Subject: [PATCH 3/5] Update trainers --- benchmarks/ppo/continual_ppo_trainer.py | 14 +++++++------- benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py | 14 ++++++-------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/benchmarks/ppo/continual_ppo_trainer.py b/benchmarks/ppo/continual_ppo_trainer.py index 235d4b5..79ea9c6 100644 --- a/benchmarks/ppo/continual_ppo_trainer.py +++ b/benchmarks/ppo/continual_ppo_trainer.py @@ -113,7 +113,6 @@ class ContinualPPOConfig(PPOConfig): class ContinualPPOTrainer(PPOTrainer): - def __init__( self, args: Optional[PPOConfig] = None, @@ -142,7 +141,9 @@ def __init__( self.shared_accelerator: Optional[Accelerator] = None self.current_task_index: Optional[int] = None self.policy_value_models: Any = None # the policy and value model wrapper - self.ds_wrapped_models: Any = None # TODO work with this after deepspeed is initialized + self.ds_wrapped_models: Any = ( + None # TODO work with this after deepspeed is initialized + ) self.accelerator: Accelerator = None # now non-optional after creation # Basic setup and validation @@ -1192,13 +1193,12 @@ def mark_final_eval(self, is_final: bool = True) -> 'ContinualPPOTrainer': return self def save_model(self, output_dir: str, _internal_call=True) -> None: - """ - Manually save the model (and training state) to a specified directory. + """Manually save the model (and training state) to a specified directory. This follows a similar procedure as _save_checkpoint. """ - # Save the model files to output_dir (marking _internal_call True) from transformers import Trainer # ensure Trainer is imported + Trainer.save_model(self, output_dir, _internal_call=True) # If not saving only the model, save optimizer, scheduler, and RNG state @@ -1208,9 +1208,9 @@ def save_model(self, output_dir: str, _internal_call=True) -> None: self._save_rng_state(output_dir) # Save the trainer state - trainer_state_path = os.path.join(output_dir, "trainer_state.json") + trainer_state_path = os.path.join(output_dir, 'trainer_state.json') self.state.save_to_json(trainer_state_path) # Optionally push to hub if that option is enabled if self.args.push_to_hub: - self._push_from_checkpoint(output_dir) \ No newline at end of file + self._push_from_checkpoint(output_dir) diff --git a/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py b/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py index 22717eb..c47cab2 100644 --- a/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py +++ b/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py @@ -112,9 +112,7 @@ def __init__( # Store EWC-specific parameters self.ewc_lambda = args.ewc_lambda - # Track if we're on the first task - is_first_task = ContinualPPOTrainer.current_task_index == 0 - if is_first_task: + if self.current_task_index == 0: # Initialize empty dictionaries for first task ContinualPPOEWCTrainer.class_fisher_information = {} ContinualPPOEWCTrainer.class_old_params = {} @@ -775,15 +773,15 @@ def repeat_generator() -> DataLoader: if self.ref_model is None and original_ref_model is not None: print('Reference model was cleared during training - restoring') self.ref_model = original_ref_model - ContinualPPOTrainer.class_ref_model = original_ref_model + self.class_ref_model = original_ref_model # Ensure the class variable is updated - ContinualPPOTrainer.class_ref_model = self.ref_model + self.class_ref_model = self.ref_model if self.is_deepspeed_enabled: - ContinualPPOTrainer.ds_wrapped_models = self.deepspeed + self.ds_wrapped_models = self.deepspeed else: - ContinualPPOTrainer.ds_wrapped_models = self.model - ContinualPPOTrainer.policy_value_models = self.model + self.ds_wrapped_models = self.model + self.policy_value_models = self.model def update_fisher_and_params(self) -> None: """Explicitly update the Fisher information and parameter values. From bd5d76e90833cb8d205f02ee899746e2a31482a5 Mon Sep 17 00:00:00 2001 From: Jacob-Chmura Date: Tue, 20 May 2025 11:23:10 -0400 Subject: [PATCH 4/5] Typo in CPPO job --- jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh index fde542d..fbfbf1f 100644 --- a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh +++ b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh @@ -22,8 +22,8 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ --dataset_name "$dataset_name" \ --sft_model_path Qwen/Qwen2-0.5B-Instruct \ - --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ - --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \ + --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_CPPO_REWARD_0 \ + --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_CPPO_REWARD \ --learning_rate 1.0e-6 \ --kl_coef 0.37 \ --cliprange 0.1 \ From cb094279efb130fcb3ffd0dcdd32db6c9bd39e4d Mon Sep 17 00:00:00 2001 From: Jacob-Chmura Date: Tue, 20 May 2025 11:29:20 -0400 Subject: [PATCH 5/5] Update jobs output dir --- jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh | 4 ++-- jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh | 4 ++-- jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh | 4 ++-- jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh | 4 ++-- jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh | 8 ++++---- jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh | 8 ++++---- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh index fbfbf1f..4615ccd 100644 --- a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh +++ b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh @@ -19,7 +19,7 @@ dataset_name='CPPO-RL' accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ benchmarks/ppo_ewc/ppo_EWC_continual.py \ --wandb_project "$dataset_name-post-May-19" \ - --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \ --dataset_name "$dataset_name" \ --sft_model_path Qwen/Qwen2-0.5B-Instruct \ --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_CPPO_REWARD_0 \ @@ -36,5 +36,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero --eval_steps 200 \ --save_steps 300 \ --bf16 \ - --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \ --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh index 5260b1e..22a943c 100644 --- a/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh +++ b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh @@ -18,7 +18,7 @@ dataset_name='aifgen-domain-preference-shift' accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ benchmarks/ppo_ewc/ppo_EWC_continual.py \ --wandb_project "$dataset_name-post-May-19" \ - --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \ --dataset_name $dataset_name \ --sft_model_path Qwen/Qwen2-0.5B-Instruct \ --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ @@ -35,5 +35,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero --eval_steps 200 \ --save_steps 300 \ --bf16 \ - --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \ --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh index 3cb21e6..cc1406a 100644 --- a/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh +++ b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh @@ -19,7 +19,7 @@ dataset_name='aifgen-lipschitz' accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ benchmarks/ppo_ewc/ppo_EWC_continual.py \ --wandb_project "$dataset_name-post-May-19" \ - --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \ --dataset_name $dataset_name \ --sft_model_path Qwen/Qwen2-0.5B-Instruct \ --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ @@ -36,5 +36,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero --eval_steps 200 \ --save_steps 300 \ --bf16 \ - --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \ --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh index 9779503..22cf720 100644 --- a/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh +++ b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh @@ -18,7 +18,7 @@ dataset_name='aifgen-long-piecewise' accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ benchmarks/ppo_ewc/ppo_EWC_continual.py \ --wandb_project "$dataset_name-post-May-19" \ - --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \ --dataset_name $dataset_name \ --sft_model_path Qwen/Qwen2-0.5B-Instruct \ --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ @@ -35,5 +35,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero --eval_steps 200 \ --save_steps 300 \ --bf16 \ - --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \ --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh index 07f3735..721a46c 100644 --- a/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh +++ b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=aif-gen-ppo-piecewise +#SBATCH --job-name=aif-gen-ppo-ewc-piecewise #SBATCH --nodes=1 # Request 2 nodes #SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node #SBATCH --ntasks-per-node=4 # One task per GPU @@ -17,9 +17,9 @@ source .env dataset_name='aifgen-piecewise-preference-shift' accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ - benchmarks/ppo/ppo_continual.py \ + benchmarks/ppo_ewc/ppo_EWC_continual.py \ --wandb_project "$dataset_name-post-May-19" \ - --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \ --dataset_name $dataset_name \ --sft_model_path Qwen/Qwen2-0.5B-Instruct \ --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ @@ -36,5 +36,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero --eval_steps 200 \ --save_steps 300 \ --bf16 \ - --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \ --no_remove_unused_columns diff --git a/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh index af9852a..029eb2e 100644 --- a/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh +++ b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh @@ -1,5 +1,5 @@ #!/bin/bash -#SBATCH --job-name=aif-gen-ppo-short-piecewise +#SBATCH --job-name=aif-gen-ppo-ewc-short-piecewise #SBATCH --nodes=1 # Request 2 nodes #SBATCH --gpus-per-node=h100:4 # Request 4 H100 GPUs per node #SBATCH --ntasks-per-node=4 # One task per GPU @@ -17,9 +17,9 @@ source .env dataset_name='aifgen-short-piecewise' accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \ - benchmarks/ppo/ppo_continual.py \ + benchmarks/ppo_ewc/ppo_EWC_continual.py \ --wandb_project $dataset_name \ - --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \ + --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \ --dataset_name $dataset_name \ --sft_model_path Qwen/Qwen2-0.5B-Instruct \ --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \ @@ -34,5 +34,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero --eval_steps 300 \ --save_steps 300 \ --bf16 \ - --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \ + --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \ --no_remove_unused_columns