diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index bf3b2076..ef7f0a7d 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -28,7 +28,7 @@ jobs:
       run: |
         mv docs docs_src
         cd docs_src
-        pip install -U sphinx karma-sphinx-theme
+        pip install -U sphinx==6.0.0 karma-sphinx-theme
         pip install -U numpy numba tqdm
         pip install --upgrade -U pygments
         make html
diff --git a/docs/bottleneck_doctor.rst b/docs/bottleneck_doctor.rst
index f1d120be..ece997b9 100644
--- a/docs/bottleneck_doctor.rst
+++ b/docs/bottleneck_doctor.rst
@@ -23,7 +23,6 @@ either way, try:
   data sampler**, enabled using the ``order=OrderOption.QUASI_RANDOM`` argument to
   the :class:`~ffcv.loader.Loader` constructor. Quasi-random sampling tries to imitate
   random sampling while minimizing the underlying number of disk reads.
-  (Again, note that ``QUASI_RANDOM`` is not yet supported for distributed training.)
 - Another option for computer vision datasets is **storing resized images**: many
   datasets have gigantic images that end up being resized and cropped anyways in
   the data augmentation pipeline. You can avoid paying the cost of loading these
diff --git a/docs/parameter_tuning.rst b/docs/parameter_tuning.rst
index 13f1306e..3dc7c42b 100644
--- a/docs/parameter_tuning.rst
+++ b/docs/parameter_tuning.rst
@@ -34,12 +34,6 @@ FFCV should also work fine with PyTorch's ``DataParallel`` wrapper but we agree
 
 The same recommendations above related to dataset size still apply here, but we emphasize that ``os_cache=True`` is particularly beneficial in this scenario. Indeed, as multiple processes will access the same dataset, having the caching at the OS level allows for data sharing between them, reducing overall memory consumption.
 
-.. note ::
-    `QUASI_RANDOM` isn't currently supported with ``distributed=True``. While
-    this is technically possible to implement, we haven't yet invested the
-    necessary time yet. It is on the medium-term roadmap, and we also welcome
-    pull requests!
-
 We encourage users to try different values for the ``num_workers`` parameters. As FFCV is usually very CPU resource efficient it is sometimes beneficial to use fewer workers to avoid scheduling and cache inefficiencies.
 
 Scenario: Grid search (1 model per GPU)
diff --git a/ffcv/loader/loader.py b/ffcv/loader/loader.py
index ac365518..1927de94 100644
--- a/ffcv/loader/loader.py
+++ b/ffcv/loader/loader.py
@@ -4,6 +4,7 @@
 import enum
 from os import environ
 import ast
+import logging
 from multiprocessing import cpu_count
 from re import sub
 from typing import Any, Callable, Mapping, Sequence, Type, Union, Literal
@@ -67,7 +68,7 @@ class Loader:
     order : Union[OrderOption, TraversalOrder]
         Traversal order, one of: SEQUENTIAL, RANDOM, QUASI_RANDOM, or a custom TraversalOrder
 
-        QUASI_RANDOM is a random order that tries to be as uniform as possible while minimizing the amount of data read from the disk. Note that it is mostly useful when `os_cache=False`. Currently unavailable in distributed mode.
+        QUASI_RANDOM is a random order that tries to be as uniform as possible while minimizing the amount of data read from the disk. Note that it is mostly useful when `os_cache=False`.
     distributed : bool
         For distributed training (multiple GPUs). Emulates the behavior of DistributedSampler from PyTorch.
     seed : int
@@ -104,9 +105,9 @@ def __init__(self,
                  recompile: bool = False,  # Recompile at every epoch
                  ):
 
-        if distributed and order == OrderOption.RANDOM and (seed is None):
-            print('Warning: no ordering seed was specified with distributed=True. '
-                  'Setting seed to 0 to match PyTorch distributed sampler.')
+        if distributed and order != OrderOption.SEQUENTIAL and (seed is None):
+            logging.warn('No ordering seed was specified with distributed=True. '
+                         'Setting seed to 0 to match PyTorch distributed sampler.')
             seed = 0
         elif seed is None:
             tinfo = np.iinfo('int32')
diff --git a/ffcv/traversal_order/quasi_random.py b/ffcv/traversal_order/quasi_random.py
index fbc5231e..4771da9f 100644
--- a/ffcv/traversal_order/quasi_random.py
+++ b/ffcv/traversal_order/quasi_random.py
@@ -3,7 +3,7 @@
 
 from numba import njit
 import numpy as np
-from torch.utils.data import DistributedSampler
+from torch.distributed import get_rank, get_world_size
 
 from .base import TraversalOrder
 
@@ -51,10 +51,6 @@ def __init__(self, loader: 'Loader'):
             raise ValueError(
                 "Dataset won't benefit from QuasiRandom order, use regular Random")
 
-        if self.distributed:
-            raise NotImplementedError(
-                "distributed Not implemented yet for QuasiRandom")
-
         self.prepare_data_structures()
 
@@ -83,4 +79,20 @@ def sample_order(self, epoch: int) -> Sequence[int]:
                            result_order,
                            2*self.loader.batch_size)
 
-        return result_order
\ No newline at end of file
+        if self.distributed:
+            world_size = get_world_size()
+            rank = get_rank()
+
+            split_size, remainder = divmod(len(self.indices), world_size)
+
+            start_idx = rank * split_size + min(rank, remainder)
+            if rank < remainder:
+                split_size += 1
+            end_idx = start_idx + split_size
+
+            # Duplicate some samples for last rank if needed
+            if remainder != 0 and rank == world_size - 1:
+                start_idx -= remainder
+
+            return result_order[start_idx:end_idx]
+        return result_order
diff --git a/tests/test_traversal_orders.py b/tests/test_traversal_orders.py
index 21c00b6e..89b2dd0f 100644
--- a/tests/test_traversal_orders.py
+++ b/tests/test_traversal_orders.py
@@ -1,4 +1,3 @@
-from collections import defaultdict
 from tempfile import TemporaryDirectory
 from os import path
 from typing import Counter
@@ -6,8 +5,8 @@
 import pytest
 from assertpy import assert_that
 import numpy as np
-from torch.utils.data import Dataset, distributed
-from torch.multiprocessing import spawn, Queue
+from torch.utils.data import Dataset
+from torch.multiprocessing import spawn
 from torch.distributed import init_process_group
 
 from ffcv.loader.loader import ORDER_TYPE, OrderOption
@@ -120,15 +119,12 @@ def test_traversal_random_4():
 def test_traversal_quasirandom_1():
     prep_and_run_test(1, OrderOption.QUASI_RANDOM)
 
-@pytest.mark.skip()
 def test_traversal_quasirandom_2():
     prep_and_run_test(2, OrderOption.QUASI_RANDOM)
 
-@pytest.mark.skip()
 def test_traversal_quasirandom_3():
     prep_and_run_test(3, OrderOption.QUASI_RANDOM)
 
-@pytest.mark.skip()
 def test_traversal_quasirandom_4():
     prep_and_run_test(4, OrderOption.QUASI_RANDOM)
 
@@ -138,6 +134,5 @@ def test_traversal_sequential_distributed_with_indices():
 def test_traversal_random_distributed_with_indices():
     prep_and_run_test(2, OrderOption.RANDOM, True)
 
-@pytest.mark.skip()
 def test_traversal_quasi_random_distributed_with_indices():
-    prep_and_run_test(2, OrderOption.QUASI_RANDOM, True)
\ No newline at end of file
+    prep_and_run_test(2, OrderOption.QUASI_RANDOM, True)
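For reference, a minimal standalone sketch of the per-rank slicing that the new distributed branch of `sample_order` in `ffcv/traversal_order/quasi_random.py` performs. The helper name `shard_for_rank` and the explicit `rank`/`world_size` arguments are illustrative stand-ins for the values the patch obtains from `torch.distributed.get_rank()` and `get_world_size()`:

import numpy as np

def shard_for_rank(order: np.ndarray, rank: int, world_size: int) -> np.ndarray:
    # Base share per rank; the first `remainder` ranks take one extra sample.
    split_size, remainder = divmod(len(order), world_size)
    start_idx = rank * split_size + min(rank, remainder)
    if rank < remainder:
        split_size += 1
    end_idx = start_idx + split_size
    # As in the patch, the last rank re-reads `remainder` earlier samples.
    if remainder != 0 and rank == world_size - 1:
        start_idx -= remainder
    return order[start_idx:end_idx]

# 10 samples over 3 ranks -> order[0:4], order[4:7], order[6:10]:
# every index is visited each epoch, with index 6 read by the last two ranks.
shards = [shard_for_rank(np.arange(10), r, 3) for r in range(3)]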