From 560b7681db3f23045a5342f3558e37f5bba2959e Mon Sep 17 00:00:00 2001
From: Jacob-Chmura <jacobpaul.chmura@gmail.com>
Date: Tue, 20 May 2025 10:50:06 -0400
Subject: [PATCH 1/5] Add ppo ewc jobs

---
 jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh        | 40 +++++++++++++++++++
 .../ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh | 39 ++++++++++++++++++
 jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh   | 40 +++++++++++++++++++
 .../ppo_ewc_long_piecewise_multi_gpu.sh       | 39 ++++++++++++++++++
 jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh   | 40 +++++++++++++++++++
 .../ppo_ewc_short_piecewise_multi_gpu.sh      | 38 ++++++++++++++++++
 6 files changed, 236 insertions(+)
 create mode 100644 jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
 create mode 100644 jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh
 create mode 100644 jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh
 create mode 100644 jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh
 create mode 100644 jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh
 create mode 100644 jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh

diff --git a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
new file mode 100644
index 0000000..fde542d
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-cppo
+#SBATCH --nodes=1                 # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='CPPO-RL'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name "$dataset_name" \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh
new file mode 100644
index 0000000..5260b1e
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-domain_shift
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+source .env
+
+dataset_name='aifgen-domain-preference-shift'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh
new file mode 100644
index 0000000..3cb21e6
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-lipschitz
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-lipschitz'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 8 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh
new file mode 100644
index 0000000..9779503
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-ewc-long-piecewise
+#SBATCH --nodes=1                  # Request 1 node
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+source .env
+
+dataset_name='aifgen-long-piecewise'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh
new file mode 100644
index 0000000..07f3735
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-piecewise
+#SBATCH --nodes=1                  # Request 2 nodes
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-piecewise-preference-shift'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo/ppo_continual.py \
+    --wandb_project "$dataset_name-post-May-19" \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 1.0e-6 \
+    --kl_coef 0.37 \
+    --cliprange 0.1 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 8 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 200 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh
new file mode 100644
index 0000000..af9852a
--- /dev/null
+++ b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#SBATCH --job-name=aif-gen-ppo-short-piecewise
+#SBATCH --nodes=1                 # Request 2 nodes
+#SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
+#SBATCH --ntasks-per-node=4        # One task per GPU
+#SBATCH --cpus-per-task=6
+#SBATCH --mem=64G
+#SBATCH --time=24:00:00
+#SBATCH --output=out/%x.%j.out     # Include job name + job ID
+#SBATCH --error=out/%x.%j.err      # Include job name + job ID
+#SBATCH --mail-type=ALL
+#SBATCH --account=aip-rrabba
+#SBATCH --mail-user=shahrad_m@icloud.com  # Update with your email
+
+source .env
+
+dataset_name='aifgen-short-piecewise'
+
+accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
+    benchmarks/ppo/ppo_continual.py \
+    --wandb_project $dataset_name \
+    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --dataset_name $dataset_name \
+    --sft_model_path Qwen/Qwen2-0.5B-Instruct \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --learning_rate 5.0e-6 \
+    --response_length 256 \
+    --num_train_epochs 4 \
+    --gradient_checkpointing \
+    --per_device_train_batch_size 16 \
+    --logging_steps 10 \
+    --eval_strategy steps \
+    --eval_steps 300 \
+    --save_steps 300 \
+    --bf16 \
+    --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \
+    --no_remove_unused_columns

From 27e57d1f8c39022fab2aea7b00d12a3b4f001152 Mon Sep 17 00:00:00 2001
From: Jacob-Chmura <jacobpaul.chmura@gmail.com>
Date: Tue, 20 May 2025 11:08:59 -0400
Subject: [PATCH 2/5] Update benchmark entry points

---
 benchmarks/ppo/ppo_continual.py         |  28 +++----
 benchmarks/ppo_ewc/ppo_EWC_continual.py | 107 +++++++++++++++---------
 2 files changed, 80 insertions(+), 55 deletions(-)

diff --git a/benchmarks/ppo/ppo_continual.py b/benchmarks/ppo/ppo_continual.py
index d1759d0..5c98e83 100644
--- a/benchmarks/ppo/ppo_continual.py
+++ b/benchmarks/ppo/ppo_continual.py
@@ -4,11 +4,6 @@
 
 import torch
 import wandb as wb
-from continual_ppo_trainer import (
-    ContinualPPOArguments,
-    ContinualPPOConfig,
-    ContinualPPOTrainer,
-)
 from datasets import Dataset
 from transformers import (
     AutoModelForCausalLM,
@@ -21,10 +16,15 @@
     get_kbit_device_map,
     get_peft_config,
     get_quantization_config,
+    setup_chat_format,
 )
-from trl import setup_chat_format
 
 from benchmarks.dataloading import init_continual_dataset
+from benchmarks.ppo.continual_ppo_trainer import (
+    ContinualPPOArguments,
+    ContinualPPOConfig,
+    ContinualPPOTrainer,
+)
 
 
 def main(
@@ -106,7 +106,9 @@ def main(
             value_model_path = script_args.value_model_path
         else:
             model_path = os.path.join(training_args.output_dir, 'last')
-            value_model_path = os.path.join(training_args.output_dir, 'last', 'value_model')
+            value_model_path = os.path.join(
+                training_args.output_dir, 'last', 'value_model'
+            )
         policy = AutoModelForCausalLM.from_pretrained(
             pretrained_model_name_or_path=model_path,
             trust_remote_code=model_args.trust_remote_code,
@@ -126,7 +128,7 @@ def main(
                 value_model_path,
                 trust_remote_code=model_args.trust_remote_code,
                 num_labels=1,
-                from_tf=True,            # or use `subfolder="safetensors"` if you saved a .safetensors file
+                from_tf=True,  # or use `subfolder="safetensors"` if you saved a .safetensors file
             )
 
         # Build custom repository name for this task
@@ -173,9 +175,6 @@ def main(
             peft_config=peft_config,
         )
 
-        # if i == 0:
-        #     trainer.save_model(os.path.join(training_args.output_dir, 'checkpoint-0'))
-
         # Set current task in trainer for task-based logging
         trainer.set_task(f'task_{i}')
 
@@ -208,9 +207,8 @@ def main(
 
         value_model_dir = os.path.join(last_dir, 'value_model')
         os.makedirs(value_model_dir, exist_ok=True)
-        value_model.save_pretrained(value_model_dir,
-                                    safe_serialization=False)
-        
+        value_model.save_pretrained(value_model_dir, safe_serialization=False)
+
         trainer.accelerator.wait_for_everyone()
 
         if training_args.push_to_hub:
@@ -226,4 +224,4 @@ def main(
     dataclass_types = (ContinualPPOArguments, ContinualPPOConfig, ModelConfig)
     parser = TrlParser(dataclass_types)
     script_args, training_args, model_args = parser.parse_args_and_config()
-    main(script_args, training_args, model_args)
\ No newline at end of file
+    main(script_args, training_args, model_args)
diff --git a/benchmarks/ppo_ewc/ppo_EWC_continual.py b/benchmarks/ppo_ewc/ppo_EWC_continual.py
index c71e90e..f6cb496 100644
--- a/benchmarks/ppo_ewc/ppo_EWC_continual.py
+++ b/benchmarks/ppo_ewc/ppo_EWC_continual.py
@@ -16,8 +16,8 @@
     get_kbit_device_map,
     get_peft_config,
     get_quantization_config,
+    setup_chat_format,
 )
-from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 from benchmarks.dataloading import init_continual_dataset
 from benchmarks.ppo_ewc.continual_ppo_EWC_trainer import (
@@ -52,15 +52,7 @@ def main(
         quantization_config=quantization_config,
     )
 
-    # Load main model and (optionally) reference model
     model = str(training_args.sft_model_path)
-    policy = AutoModelForCausalLM.from_pretrained(
-        training_args.sft_model_path,
-        trust_remote_code=model_args.trust_remote_code,
-        **model_kwargs,
-    )
-
-    # Configure PEFT if needed
     peft_config = get_peft_config(model_args)
     if peft_config is None:
         ref_policy = AutoModelForCausalLM.from_pretrained(
@@ -71,32 +63,11 @@ def main(
     else:
         ref_policy = None
 
-    # Load value model
-    value_model = None
-    if script_args.value_model_path:
-        value_model = AutoModelForSequenceClassification.from_pretrained(
-            script_args.value_model_path,
-            trust_remote_code=model_args.trust_remote_code,
-            num_labels=1,
-        )
-
     # Load tokenizer and set chat template if needed
     tokenizer = AutoTokenizer.from_pretrained(
         training_args.sft_model_path,
         trust_remote_code=model_args.trust_remote_code,
     )
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
-
-    # EWC-specific: DDPT distributed setup
-    if script_args.ignore_bias_buffers:
-        policy._ddp_params_and_buffers_to_ignore = [
-            name
-            for name, buffer in policy.named_buffers()
-            if buffer.dtype == torch.bool
-        ]
 
     # Initialize continual dataset
     continual_dataset: list[dict[str, Dataset]] = init_continual_dataset(
@@ -112,6 +83,7 @@ def main(
     if '.' in clean_dataset_name:
         clean_dataset_name = clean_dataset_name.split('.')[0]
 
+    print(f'Training PPO-EWC on {len(continual_dataset)} tasks')
     # check if the reward models are present either in the path or in the hub
     if training_args.reward_model_path is not None:
         for i in range(len(continual_dataset)):
@@ -128,6 +100,44 @@ def main(
 
     # Task Loop
     for i, dataset in enumerate(continual_dataset):
+        # Pick checkpoints: initial paths for task 0, the previous task's 'last' output afterwards
+        if i == 0:
+            model_path = training_args.sft_model_path
+            value_model_path = script_args.value_model_path
+        else:
+            model_path = os.path.join(training_args.output_dir, 'last')
+            value_model_path = os.path.join(
+                training_args.output_dir, 'last', 'value_model'
+            )
+        policy = AutoModelForCausalLM.from_pretrained(
+            pretrained_model_name_or_path=model_path,
+            trust_remote_code=model_args.trust_remote_code,
+            **model_kwargs,
+        )
+        # EWC-specific DDP setup: list boolean buffers for DDP to ignore
+        if script_args.ignore_bias_buffers:
+            policy._ddp_params_and_buffers_to_ignore = [
+                name
+                for name, buffer in policy.named_buffers()
+                if buffer.dtype == torch.bool
+            ]
+
+        # Load value model (the policy model was loaded above)
+        try:
+            value_model = AutoModelForSequenceClassification.from_pretrained(
+                value_model_path,
+                trust_remote_code=model_args.trust_remote_code,
+                num_labels=1,
+            )
+        except OSError:
+            # Maybe it was saved as safetensors?
+            value_model = AutoModelForSequenceClassification.from_pretrained(
+                value_model_path,
+                trust_remote_code=model_args.trust_remote_code,
+                num_labels=1,
+                from_tf=True,  # or use `subfolder="safetensors"` if you saved a .safetensors file
+            )
+
         # Build custom repository name for this task
         custom_repo_name = (
             model.split('/')[-1] + '_' + clean_dataset_name + '_PPO_EWC_' + str(i)
@@ -141,6 +151,22 @@ def main(
             training_args.reward_model_path + '_' + str(i), num_labels=1
         )
 
+        for idx, _model in enumerate([policy, value_model, reward_model]):
+            # Align padding tokens between tokenizer and model
+            _model.config.pad_token_id = tokenizer.pad_token_id
+
+            # Use ChatML format if the tokenizer doesn't already have a chat template
+            if tokenizer.chat_template is None:
+                updated_model, updated_tokenizer = setup_chat_format(_model, tokenizer)
+                # Actually store the updated model
+                if idx == 0:
+                    policy = updated_model
+                elif idx == 1:
+                    value_model = updated_model
+                else:
+                    reward_model = updated_model
+                tokenizer = updated_tokenizer
+
         ################
         # Training and Evaluation
         ################
@@ -181,21 +207,22 @@ def main(
                 wb.log({f'task/{custom_repo_name}/last': metrics})  # type: ignore[attr-defined]
 
         # Save model checkpoint and optionally push
-        if not training_args.push_to_hub:
-            trainer.save_model(os.path.join(training_args.output_dir, 'last'))
-        else:
+        last_dir = os.path.join(training_args.output_dir, 'last')
+        policy.save_pretrained(last_dir)
+        tokenizer.save_pretrained(last_dir)
+
+        value_model_dir = os.path.join(last_dir, 'value_model')
+        os.makedirs(value_model_dir, exist_ok=True)
+        value_model.save_pretrained(value_model_dir, safe_serialization=False)
+
+        trainer.accelerator.wait_for_everyone()
+
+        if training_args.push_to_hub:
             trainer.push_to_hub(
                 model_name=custom_repo_name,
                 dataset_name='Continual_PPO_EWC_' + clean_dataset_name + '_' + str(i),
             )
 
-        # Clean up for next task - EWC specific
-        if hasattr(trainer, 'deepspeed') and trainer.deepspeed is not None:
-            # Remove reference to the DeepSpeed engine to allow proper cleanup
-            del trainer.deepspeed
-        # Free cached GPU memory
-        torch.cuda.empty_cache()
-
     print('Training completed for all tasks!')
 
 

From 070bf644db9fcfa5e20e5d3d2dfd362ce5d6a591 Mon Sep 17 00:00:00 2001
From: Jacob-Chmura <jacobpaul.chmura@gmail.com>
Date: Tue, 20 May 2025 11:20:17 -0400
Subject: [PATCH 3/5] Update trainers

---
 benchmarks/ppo/continual_ppo_trainer.py         | 14 +++++++-------
 benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py | 14 ++++++--------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/benchmarks/ppo/continual_ppo_trainer.py b/benchmarks/ppo/continual_ppo_trainer.py
index 235d4b5..79ea9c6 100644
--- a/benchmarks/ppo/continual_ppo_trainer.py
+++ b/benchmarks/ppo/continual_ppo_trainer.py
@@ -113,7 +113,6 @@ class ContinualPPOConfig(PPOConfig):
 
 
 class ContinualPPOTrainer(PPOTrainer):
-
     def __init__(
         self,
         args: Optional[PPOConfig] = None,
@@ -142,7 +141,9 @@ def __init__(
         self.shared_accelerator: Optional[Accelerator] = None
         self.current_task_index: Optional[int] = None
         self.policy_value_models: Any = None  # the policy and value model wrapper
-        self.ds_wrapped_models: Any = None # TODO work with this after deepspeed is initialized
+        self.ds_wrapped_models: Any = (
+            None  # TODO work with this after deepspeed is initialized
+        )
         self.accelerator: Accelerator = None  # now non-optional after creation
 
         # Basic setup and validation
@@ -1192,13 +1193,12 @@ def mark_final_eval(self, is_final: bool = True) -> 'ContinualPPOTrainer':
         return self
 
     def save_model(self, output_dir: str, _internal_call=True) -> None:
-        """
-        Manually save the model (and training state) to a specified directory.
+        """Manually save the model (and training state) to a specified directory.
         This follows a similar procedure as _save_checkpoint.
         """
-
         # Save the model files to output_dir (marking _internal_call True)
         from transformers import Trainer  # ensure Trainer is imported
+
         Trainer.save_model(self, output_dir, _internal_call=True)
 
         # If not saving only the model, save optimizer, scheduler, and RNG state
@@ -1208,9 +1208,9 @@ def save_model(self, output_dir: str, _internal_call=True) -> None:
             self._save_rng_state(output_dir)
 
         # Save the trainer state
-        trainer_state_path = os.path.join(output_dir, "trainer_state.json")
+        trainer_state_path = os.path.join(output_dir, 'trainer_state.json')
         self.state.save_to_json(trainer_state_path)
 
         # Optionally push to hub if that option is enabled
         if self.args.push_to_hub:
-            self._push_from_checkpoint(output_dir)
\ No newline at end of file
+            self._push_from_checkpoint(output_dir)
diff --git a/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py b/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py
index 22717eb..c47cab2 100644
--- a/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py
+++ b/benchmarks/ppo_ewc/continual_ppo_EWC_trainer.py
@@ -112,9 +112,7 @@ def __init__(
         # Store EWC-specific parameters
         self.ewc_lambda = args.ewc_lambda
 
-        # Track if we're on the first task
-        is_first_task = ContinualPPOTrainer.current_task_index == 0
-        if is_first_task:
+        if self.current_task_index == 0:
             # Initialize empty dictionaries for first task
             ContinualPPOEWCTrainer.class_fisher_information = {}
             ContinualPPOEWCTrainer.class_old_params = {}
@@ -775,15 +773,15 @@ def repeat_generator() -> DataLoader:
         if self.ref_model is None and original_ref_model is not None:
             print('Reference model was cleared during training - restoring')
             self.ref_model = original_ref_model
-            ContinualPPOTrainer.class_ref_model = original_ref_model
+            self.class_ref_model = original_ref_model
 
         # Ensure the class variable is updated
-        ContinualPPOTrainer.class_ref_model = self.ref_model
+        self.class_ref_model = self.ref_model
         if self.is_deepspeed_enabled:
-            ContinualPPOTrainer.ds_wrapped_models = self.deepspeed
+            self.ds_wrapped_models = self.deepspeed
         else:
-            ContinualPPOTrainer.ds_wrapped_models = self.model
-        ContinualPPOTrainer.policy_value_models = self.model
+            self.ds_wrapped_models = self.model
+        self.policy_value_models = self.model
 
     def update_fisher_and_params(self) -> None:
         """Explicitly update the Fisher information and parameter values.

From bd5d76e90833cb8d205f02ee899746e2a31482a5 Mon Sep 17 00:00:00 2001
From: Jacob-Chmura <jacobpaul.chmura@gmail.com>
Date: Tue, 20 May 2025 11:23:10 -0400
Subject: [PATCH 4/5] Typo in CPPO job

---
 jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
index fde542d..fbfbf1f 100644
--- a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
+++ b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
@@ -22,8 +22,8 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero
     --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
     --dataset_name "$dataset_name" \
     --sft_model_path Qwen/Qwen2-0.5B-Instruct \
-    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
-    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD \
+    --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_CPPO_REWARD_0 \
+    --reward_model_path LifelongAlignment/Qwen2-0.5B-Instruct_CPPO_REWARD \
     --learning_rate 1.0e-6 \
     --kl_coef 0.37 \
     --cliprange 0.1 \

From cb094279efb130fcb3ffd0dcdd32db6c9bd39e4d Mon Sep 17 00:00:00 2001
From: Jacob-Chmura <jacobpaul.chmura@gmail.com>
Date: Tue, 20 May 2025 11:29:20 -0400
Subject: [PATCH 5/5] Update jobs output dir

---
 jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh            | 4 ++--
 jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh    | 4 ++--
 jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh       | 4 ++--
 jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh  | 4 ++--
 jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh       | 8 ++++----
 jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh | 8 ++++----
 6 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
index fbfbf1f..4615ccd 100644
--- a/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
+++ b/jobs/ppo_ewc/ppo_ewc_cppo_multi_gpu.sh
@@ -19,7 +19,7 @@ dataset_name='CPPO-RL'
 accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
     benchmarks/ppo_ewc/ppo_EWC_continual.py \
     --wandb_project "$dataset_name-post-May-19" \
-    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
     --dataset_name "$dataset_name" \
     --sft_model_path Qwen/Qwen2-0.5B-Instruct \
     --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_CPPO_REWARD_0 \
@@ -36,5 +36,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero
     --eval_steps 200 \
     --save_steps 300 \
     --bf16 \
-    --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
     --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh
index 5260b1e..22a943c 100644
--- a/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh
+++ b/jobs/ppo_ewc/ppo_ewc_domain_shift_multi_gpu.sh
@@ -18,7 +18,7 @@ dataset_name='aifgen-domain-preference-shift'
 accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
     benchmarks/ppo_ewc/ppo_EWC_continual.py \
     --wandb_project "$dataset_name-post-May-19" \
-    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
     --dataset_name $dataset_name \
     --sft_model_path Qwen/Qwen2-0.5B-Instruct \
     --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
@@ -35,5 +35,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero
     --eval_steps 200 \
     --save_steps 300 \
     --bf16 \
-    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
     --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh
index 3cb21e6..cc1406a 100644
--- a/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh
+++ b/jobs/ppo_ewc/ppo_ewc_lipschitz_multi_gpu.sh
@@ -19,7 +19,7 @@ dataset_name='aifgen-lipschitz'
 accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
     benchmarks/ppo_ewc/ppo_EWC_continual.py \
     --wandb_project "$dataset_name-post-May-19" \
-    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
     --dataset_name $dataset_name \
     --sft_model_path Qwen/Qwen2-0.5B-Instruct \
     --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
@@ -36,5 +36,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero
     --eval_steps 200 \
     --save_steps 300 \
     --bf16 \
-    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
     --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh
index 9779503..22cf720 100644
--- a/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh
+++ b/jobs/ppo_ewc/ppo_ewc_long_piecewise_multi_gpu.sh
@@ -18,7 +18,7 @@ dataset_name='aifgen-long-piecewise'
 accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
     benchmarks/ppo_ewc/ppo_EWC_continual.py \
     --wandb_project "$dataset_name-post-May-19" \
-    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
     --dataset_name $dataset_name \
     --sft_model_path Qwen/Qwen2-0.5B-Instruct \
     --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
@@ -35,5 +35,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero
     --eval_steps 200 \
     --save_steps 300 \
     --bf16 \
-    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
     --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh
index 07f3735..721a46c 100644
--- a/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh
+++ b/jobs/ppo_ewc/ppo_ewc_piecewise_multi_gpu.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --job-name=aif-gen-ppo-piecewise
+#SBATCH --job-name=aif-gen-ppo-ewc-piecewise
 #SBATCH --nodes=1                  # Request 2 nodes
 #SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
 #SBATCH --ntasks-per-node=4        # One task per GPU
@@ -17,9 +17,9 @@ source .env
 dataset_name='aifgen-piecewise-preference-shift'
 
 accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
-    benchmarks/ppo/ppo_continual.py \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
     --wandb_project "$dataset_name-post-May-19" \
-    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
     --dataset_name $dataset_name \
     --sft_model_path Qwen/Qwen2-0.5B-Instruct \
     --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
@@ -36,5 +36,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero
     --eval_steps 200 \
     --save_steps 300 \
     --bf16 \
-    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-${dataset_name}" \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
     --no_remove_unused_columns
diff --git a/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh
index af9852a..029eb2e 100644
--- a/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh
+++ b/jobs/ppo_ewc/ppo_ewc_short_piecewise_multi_gpu.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --job-name=aif-gen-ppo-short-piecewise
+#SBATCH --job-name=aif-gen-ppo-ewc-short-piecewise
 #SBATCH --nodes=1                 # Request 2 nodes
 #SBATCH --gpus-per-node=h100:4     # Request 4 H100 GPUs per node
 #SBATCH --ntasks-per-node=4        # One task per GPU
@@ -17,9 +17,9 @@ source .env
 dataset_name='aifgen-short-piecewise'
 
 accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero2.yaml \
-    benchmarks/ppo/ppo_continual.py \
+    benchmarks/ppo_ewc/ppo_EWC_continual.py \
     --wandb_project $dataset_name \
-    --wandb_run_name "Qwen2-0.5B-PPO-${dataset_name}-multi-gpu" \
+    --wandb_run_name "Qwen2-0.5B-PPO-EWC-${dataset_name}-multi-gpu" \
     --dataset_name $dataset_name \
     --sft_model_path Qwen/Qwen2-0.5B-Instruct \
     --value_model_path LifelongAlignment/Qwen2-0.5B-Instruct_${dataset_name}_REWARD_0 \
@@ -34,5 +34,5 @@ accelerate launch --config_file benchmarks/ppo/accelerate_configs/deepspeed_zero
     --eval_steps 300 \
     --save_steps 300 \
     --bf16 \
-    --output_dir "$HOME/Qwen2-0.5B-PPO-${dataset_name}" \
+    --output_dir "$SCRATCH/Qwen2-0.5B-PPO-EWC-${dataset_name}" \
     --no_remove_unused_columns