From b21f7c98dadd92909e630817e9c54fd2c42f4f41 Mon Sep 17 00:00:00 2001 From: Andrew Choi Date: Tue, 9 Dec 2025 16:27:24 -0800 Subject: [PATCH 1/2] [Bug Fix] Make DDP workers set absl logging as default Add pretty file names to logs --- alf/bin/train.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/alf/bin/train.py b/alf/bin/train.py index ff9a6f58e..344d2d8dc 100644 --- a/alf/bin/train.py +++ b/alf/bin/train.py @@ -127,7 +127,12 @@ def _setup_logging(rank: int, log_dir: str): """ FLAGS.alsologtostderr = True logging.set_verbosity(logging.INFO) - logging.get_absl_handler().use_absl_log_file(log_dir=log_dir) + logging.get_absl_handler().use_absl_log_file( + program_name=f'rank{rank}_logs', log_dir=log_dir) + # Spawned subprocesses create a new interpreter so will change the + # default logging back to python's logging module. + # For DDP worker logging to work, we need to explicitly set it back to absl. + logging.use_absl_handler() def _setup_device(): From ff8b3f83439910b3088ecf408cdce0fc526e2884 Mon Sep 17 00:00:00 2001 From: Andrew Choi Date: Tue, 9 Dec 2025 17:01:45 -0800 Subject: [PATCH 2/2] Remove program name as it creates duplicate logs --- alf/bin/train.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/alf/bin/train.py b/alf/bin/train.py index 344d2d8dc..05fb444e8 100644 --- a/alf/bin/train.py +++ b/alf/bin/train.py @@ -118,17 +118,15 @@ def check_valid_launch(): assert not extra, f"Unexpected environment variables for non-distributed launch: {extra}" -def _setup_logging(rank: int, log_dir: str): +def _setup_logging(log_dir: str): """Setup logging for each process Args: - rank (int): The ID of the process among all of the DDP processes log_dir (str): path to the directory where log files are written to """ FLAGS.alsologtostderr = True logging.set_verbosity(logging.INFO) - logging.get_absl_handler().use_absl_log_file( - program_name=f'rank{rank}_logs', log_dir=log_dir) + logging.get_absl_handler().use_absl_log_file(log_dir=log_dir) # Spawned subprocesses create a new interpreter so will change the # default logging back to python's logging module. # For DDP worker logging to work, we need to explicitly set it back to absl. @@ -237,7 +235,7 @@ def training_worker(rank: int, in different worker processes, if multi-gpu training is used. """ try: - _setup_logging(log_dir=root_dir, rank=rank) + _setup_logging(log_dir=root_dir) _setup_device() if world_size > 1: # Specialization for distributed mode @@ -303,7 +301,7 @@ def training_worker_multi_node(local_rank: int, in different worker processes, if multi-gpu training is used. """ try: - _setup_logging(log_dir=root_dir, rank=rank) + _setup_logging(log_dir=root_dir) _setup_device() # Specialization for distributed mode