Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added asyncpp/optim/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion asyncpp/optim/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def append_to_queue(self, data):
def get_from_queue(self, index):
if self.save_dir is not None:
fname = self.queue[index]
d = torch.load(fname)
d = torch.load(fname, weights_only=False)
return d["state_dicts"], d["version"]
else:
return self.queue[index]
Expand Down
Empty file added asyncpp/runtime/__init__.py
Empty file.
5 changes: 3 additions & 2 deletions asyncpp/runtime/threadsafe_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
# Licensed under the MIT license.

import threading
from collections import deque

"""
Implementation of a thread-safe queue with one producer and one consumer.
"""
class Queue:
def __init__(self):
self.queue = []
self.queue = deque()
self.cv = threading.Condition()

def add(self, tensor):
Expand All @@ -21,6 +22,6 @@ def remove(self):
self.cv.acquire()
while len(self.queue) == 0:
self.cv.wait()
tensor = self.queue.pop(0)
tensor = self.queue.popleft()
self.cv.release()
return tensor
Empty file added examples/models/__init__.py
Empty file.
12 changes: 6 additions & 6 deletions examples/pp_diloco_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ def main(args):
parser.add_argument('--eval_iters', type=int, default=25, help='Number of evaluation iterations')
parser.add_argument('--ckpt_interval', type=int, default=1000, help='Checkpoint interval')
parser.add_argument('--diloco_interval', type=int, default=10000000, help='Diloco interval') # disable diloco by default
parser.add_argument('--cosine_anneal', type=bool, default=True, help='Use cosine annealing')
parser.add_argument('--cosine_anneal', type=lambda v: v.lower() in ('true', '1', 'yes'), default=True, help='Use cosine annealing')
parser.add_argument('--warmup_steps', type=int, default=3000, help='Number of warmup steps')
parser.add_argument('--max_local_step', type=int, default=30000, help='Maximum local step')
parser.add_argument('--wandb_project', type=str, default=None, help='WandB project name')
Expand All @@ -409,24 +409,24 @@ def main(args):
parser.add_argument('--wandb_name', type=str, default=None, help='WandB name')
parser.add_argument('--eval_interval', type=int, default=1000, help='Evaluation interval')
parser.add_argument('--num_microbatches', type=int, default=1, help='Number of microbatches')
parser.add_argument('--deterministic', type=bool, default=False, help='Deterministic training')
parser.add_argument('--deterministic', type=lambda v: v.lower() in ('true', '1', 'yes'), default=False, help='Deterministic training')
parser.add_argument('--max_norm', type=float, default=1.0, help='Maximum norm')
parser.add_argument('--num_inner_steps', type=int, default=1000, help='Number of inner steps')
parser.add_argument('--adaptive_momentum', type=bool, default=False, help='Use adaptive momentum')
parser.add_argument('--adaptive_momentum', type=lambda v: v.lower() in ('true', '1', 'yes'), default=False, help='Use adaptive momentum')
parser.add_argument('--optimizer', type=str, default="nadamw", help='Optimizer class')
parser.add_argument('--backend', type=str, default="gloo", help='Backend')
parser.add_argument('--sparta_interval', type=int, default=1, help='Sparta interval')
parser.add_argument('--method', type=str, default='diloco', help='Method')
parser.add_argument('--sparta_method', type=str, default='avg', help='Sparta method')
parser.add_argument('--sparta_lambda', type=float, default=1.0, help='Sparta lambda')
parser.add_argument('--sparta_momentum', type=float, default=0.5, help='Sparta momentum')
parser.add_argument('--sparta_nesterov', type=bool, default=False, help='Sparta nesterov')
parser.add_argument('--sparta_adaptive_momentum', type=bool, default=True, help='Sparta adaptive momentum')
parser.add_argument('--sparta_nesterov', type=lambda v: v.lower() in ('true', '1', 'yes'), default=False, help='Sparta nesterov')
parser.add_argument('--sparta_adaptive_momentum', type=lambda v: v.lower() in ('true', '1', 'yes'), default=True, help='Sparta adaptive momentum')
parser.add_argument('--sparta_warmup_steps', type=int, default=1000, help='Number of warmup steps')
parser.add_argument("--instance_id", type=int, default=0, help="Instance ID")
parser.add_argument("--num_nodes_per_instance", type=int, default=None, help="Number of nodes per instance")
parser.add_argument('--master_addr', type=str, default="127.0.0.1", help='Master address for distributed training')
parser.add_argument('--buffer_to_cpu', type=bool, default=False, help='Buffer to CPU')
parser.add_argument('--buffer_to_cpu', type=lambda v: v.lower() in ('true', '1', 'yes'), default=False, help='Buffer to CPU')
args = parser.parse_args()

main(args)
10 changes: 5 additions & 5 deletions examples/pp_diloco_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,15 +342,15 @@ def main(args):
parser.add_argument('--eval_iters', type=int, default=25, help='Number of evaluation iterations')
parser.add_argument('--ckpt_interval', type=int, default=1000, help='Checkpoint interval')
parser.add_argument('--diloco_interval', type=int, default=10000000, help='Diloco interval') # disable diloco by default
parser.add_argument('--cosine_anneal', type=bool, default=True, help='Use cosine annealing')
parser.add_argument('--cosine_anneal', type=lambda v: v.lower() in ('true', '1', 'yes'), default=True, help='Use cosine annealing')
parser.add_argument('--warmup_steps', type=int, default=3000, help='Number of warmup steps')
parser.add_argument('--max_local_step', type=int, default=30000, help='Maximum local step')
parser.add_argument('--wandb_project', type=str, default=None, help='WandB project name')
parser.add_argument('--port', type=int, default=12345, help='Port number')
parser.add_argument('--async_sparta_delay', type=int, default=0, help='Async Sparta delay')
parser.add_argument('--wandb_name', type=str, default=None, help='WandB name')
parser.add_argument('--eval_interval', type=int, default=1000, help='Evaluation interval')
parser.add_argument('--deterministic', type=bool, default=False, help='Deterministic training')
parser.add_argument('--deterministic', type=lambda v: v.lower() in ('true', '1', 'yes'), default=False, help='Deterministic training')
parser.add_argument('--max_norm', type=float, default=1.0, help='Maximum norm')
parser.add_argument('--num_inner_steps', type=int, default=1000, help='Number of inner steps')
parser.add_argument('--optimizer', type=str, default="adamw", help='Optimizer class')
Expand All @@ -360,13 +360,13 @@ def main(args):
parser.add_argument('--sparta_method', type=str, default='avg', help='Sparta method')
parser.add_argument('--sparta_lambda', type=float, default=1.0, help='Sparta lambda')
parser.add_argument('--sparta_momentum', type=float, default=0.5, help='Sparta momentum')
parser.add_argument('--sparta_nesterov', type=bool, default=False, help='Sparta nesterov')
parser.add_argument('--sparta_adaptive_momentum', type=bool, default=True, help='Sparta adaptive momentum')
parser.add_argument('--sparta_nesterov', type=lambda v: v.lower() in ('true', '1', 'yes'), default=False, help='Sparta nesterov')
parser.add_argument('--sparta_adaptive_momentum', type=lambda v: v.lower() in ('true', '1', 'yes'), default=True, help='Sparta adaptive momentum')
parser.add_argument('--sparta_warmup_steps', type=int, default=1000, help='Number of warmup steps')
parser.add_argument("--instance_id", type=int, default=0, help="Instance ID")
parser.add_argument("--num_nodes_per_instance", type=int, default=None, help="Number of nodes per instance")
parser.add_argument('--master_addr', type=str, default="127.0.0.1", help='Master address for distributed training')
parser.add_argument('--buffer_to_cpu', type=bool, default=False, help='Buffer to CPU')
parser.add_argument('--buffer_to_cpu', type=lambda v: v.lower() in ('true', '1', 'yes'), default=False, help='Buffer to CPU')
args = parser.parse_args()

main(args)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ Pillow
wandb
transformers
datasets
wandb
pyarrow
huggingface_hub[hf_transfer]
7 changes: 3 additions & 4 deletions sparta/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def _get_stage_master(self):
return self.pp_stage

def _cleanup(self):
if self.rank == 0:
if self.rank == 0 and self.config.wandb_project:
wandb.finish()
if self.pbar:
self.pbar.close()
Expand Down Expand Up @@ -248,6 +248,5 @@ def _setup(self, rank: int):
dist.barrier()

def load_model(self, path):
self.master_model.load_state_dict(torch.load(path))
for model in self.models:
model.load_state_dict(self.master_model.state_dict())
self.master_model.load_state_dict(torch.load(path, weights_only=True))
self.model.load_state_dict(self.master_model.state_dict())