2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -28,7 +28,7 @@ jobs:
       run: |
         mv docs docs_src
         cd docs_src
-        pip install -U sphinx karma-sphinx-theme
+        pip install -U sphinx==6.0.0 karma-sphinx-theme
         pip install -U numpy numba tqdm
         pip install --upgrade -U pygments
         make html
1 change: 0 additions & 1 deletion docs/bottleneck_doctor.rst
@@ -23,7 +23,6 @@ either way, try:
   data sampler**, enabled using the ``order=OrderOption.QUASI_RANDOM`` argument to
   the :class:`~ffcv.loader.Loader` constructor. Quasi-random sampling tries to
   imitate random sampling while minimizing the underlying number of disk reads.
-  (Again, note that ``QUASI_RANDOM`` is not yet supported for distributed training.)
 - Another option for computer vision datasets is **storing resized images**: many
   datasets have gigantic images that end up being resized and cropped anyways in
   the data augmentation pipeline. You can avoid paying the cost of loading these
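For context on the docs change above: a minimal sketch of opting into the quasi-random sampler. The `data.beton` path and the sizes are placeholder assumptions, not values from this PR:

```python
from ffcv.loader import Loader, OrderOption

# QUASI_RANDOM approximates uniform shuffling while keeping disk reads low;
# it matters most when the dataset exceeds RAM (os_cache=False).
loader = Loader('data.beton',                  # hypothetical dataset file
                batch_size=256,
                num_workers=8,
                order=OrderOption.QUASI_RANDOM,
                os_cache=False)
```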
6 changes: 0 additions & 6 deletions docs/parameter_tuning.rst
@@ -34,12 +34,6 @@ FFCV should also work fine with PyTorch's ``DataParallel`` wrapper but we agree
 
 The same recommendations above related to dataset size still apply here, but we emphasize that ``os_cache=True`` is particularly beneficial in this scenario. Indeed, as multiple processes will access the same dataset, having the caching at the OS level allows for data sharing between them, reducing overall memory consumption.
 
-.. note ::
-    `QUASI_RANDOM` isn't currently supported with ``distributed=True``. While
-    this is technically possible to implement, we haven't yet invested the
-    necessary time yet. It is on the medium-term roadmap, and we also welcome
-    pull requests!
-
 We encourage users to try different values for the ``num_workers`` parameters. As FFCV is usually very CPU resource efficient it is sometimes beneficial to use fewer workers to avoid scheduling and cache inefficiencies.
 
 Scenario: Grid search (1 model per GPU)
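As a companion to the tuning advice above, a sketch of one loader per GPU process with shared OS caching. All values here are illustrative assumptions, not recommendations from this PR:

```python
from ffcv.loader import Loader, OrderOption

# One Loader per distributed process (one process per GPU).
# os_cache=True lets every process share the dataset through the OS page
# cache, so it is read into memory only once across the whole node.
loader = Loader('data.beton',                  # hypothetical dataset file
                batch_size=128,
                num_workers=4,                 # worth sweeping; fewer can win
                order=OrderOption.RANDOM,
                os_cache=True,
                distributed=True,
                seed=0)                        # identical shuffle on every rank
```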
9 changes: 5 additions & 4 deletions ffcv/loader/loader.py
@@ -4,6 +4,7 @@
 import enum
 from os import environ
 import ast
+import logging
 from multiprocessing import cpu_count
 from re import sub
 from typing import Any, Callable, Mapping, Sequence, Type, Union, Literal
@@ -67,7 +68,7 @@ class Loader:
     order : Union[OrderOption, TraversalOrder]
         Traversal order, one of: SEQUENTIAL, RANDOM, QUASI_RANDOM, or a custom TraversalOrder
 
-        QUASI_RANDOM is a random order that tries to be as uniform as possible while minimizing the amount of data read from the disk. Note that it is mostly useful when `os_cache=False`. Currently unavailable in distributed mode.
+        QUASI_RANDOM is a random order that tries to be as uniform as possible while minimizing the amount of data read from the disk. Note that it is mostly useful when `os_cache=False`.
     distributed : bool
         For distributed training (multiple GPUs). Emulates the behavior of DistributedSampler from PyTorch.
     seed : int
@@ -104,9 +105,9 @@ def __init__(self,
                  recompile: bool = False,  # Recompile at every epoch
                  ):
 
-        if distributed and order == OrderOption.RANDOM and (seed is None):
-            print('Warning: no ordering seed was specified with distributed=True. '
-                  'Setting seed to 0 to match PyTorch distributed sampler.')
+        if distributed and order != OrderOption.SEQUENTIAL and (seed is None):
+            logging.warning('No ordering seed was specified with distributed=True. '
+                            'Setting seed to 0 to match PyTorch distributed sampler.')
             seed = 0
         elif seed is None:
             tinfo = np.iinfo('int32')
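To see the new warning path from the caller's side: with `distributed=True` and any non-sequential order, omitting `seed` now logs a warning and falls back to seed 0, so passing a seed explicitly keeps runs quiet and reproducible. A sketch with a placeholder path:

```python
from ffcv.loader import Loader, OrderOption

# An explicit seed avoids the seed-0 fallback warning, and every rank
# derives the same traversal order from it.
loader = Loader('data.beton',
                batch_size=128,
                num_workers=4,
                order=OrderOption.QUASI_RANDOM,  # now allowed with distributed=True
                distributed=True,
                seed=42)
```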
24 changes: 18 additions & 6 deletions ffcv/traversal_order/quasi_random.py
@@ -3,7 +3,7 @@
 from numba import njit
 import numpy as np
 
-from torch.utils.data import DistributedSampler
+from torch.distributed import get_rank, get_world_size
 
 from .base import TraversalOrder
 
@@ -51,10 +51,6 @@ def __init__(self, loader: 'Loader'):
             raise ValueError(
                 "Dataset won't benefit from QuasiRandom order, use regular Random")
 
-        if self.distributed:
-            raise NotImplementedError(
-                "distributed Not implemented yet for QuasiRandom")
-
         self.prepare_data_structures()
 
 
@@ -83,4 +79,20 @@ def sample_order(self, epoch: int) -> Sequence[int]:
                              result_order,
                              2*self.loader.batch_size)
 
-        return result_order
+        if self.distributed:
+            world_size = get_world_size()
+            rank = get_rank()
+
+            split_size, remainder = divmod(len(self.indices), world_size)
+
+            start_idx = rank * split_size + min(rank, remainder)
+            if rank < remainder:
+                split_size += 1
+            end_idx = start_idx + split_size
+
+            # Duplicate some samples for last rank if needed
+            if remainder != 0 and rank == world_size - 1:
+                start_idx -= remainder
+
+            return result_order[start_idx:end_idx]
+        return result_order
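The per-rank split added above, restated as a standalone sketch (assumed equivalent; the function name is mine) to make the slice arithmetic easy to check:

```python
def rank_slice(n_samples: int, rank: int, world_size: int):
    """Return the [start, end) slice of the epoch order read by `rank`."""
    split_size, remainder = divmod(n_samples, world_size)
    start_idx = rank * split_size + min(rank, remainder)
    if rank < remainder:           # first `remainder` ranks take one extra sample
        split_size += 1
    end_idx = start_idx + split_size
    if remainder != 0 and rank == world_size - 1:
        start_idx -= remainder     # last rank re-reads a few samples, never short
    return start_idx, end_idx

# 10 samples over 3 ranks -> [0:4], [4:7], [6:10]: full coverage, with the
# last rank overlapping rank 1's tail when the division is uneven.
print([rank_slice(10, r, 3) for r in range(3)])
```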
11 changes: 3 additions & 8 deletions tests/test_traversal_orders.py
@@ -1,13 +1,12 @@
 from collections import defaultdict
 from tempfile import TemporaryDirectory
 from os import path
 from typing import Counter
 
 import pytest
 from assertpy import assert_that
 import numpy as np
-from torch.utils.data import Dataset, distributed
-from torch.multiprocessing import spawn, Queue
+from torch.utils.data import Dataset
+from torch.multiprocessing import spawn
+from torch.distributed import init_process_group
 
 from ffcv.loader.loader import ORDER_TYPE, OrderOption
@@ -120,15 +119,12 @@ def test_traversal_random_4():
 def test_traversal_quasirandom_1():
     prep_and_run_test(1, OrderOption.QUASI_RANDOM)
 
-@pytest.mark.skip()
 def test_traversal_quasirandom_2():
     prep_and_run_test(2, OrderOption.QUASI_RANDOM)
 
-@pytest.mark.skip()
 def test_traversal_quasirandom_3():
     prep_and_run_test(3, OrderOption.QUASI_RANDOM)
 
-@pytest.mark.skip()
 def test_traversal_quasirandom_4():
     prep_and_run_test(4, OrderOption.QUASI_RANDOM)
 
@@ -138,6 +134,5 @@ def test_traversal_sequential_distributed_with_indices():
 def test_traversal_random_distributed_with_indices():
     prep_and_run_test(2, OrderOption.RANDOM, True)
 
-@pytest.mark.skip()
 def test_traversal_quasi_random_distributed_with_indices():
-    prep_and_run_test(2, OrderOption.QUASI_RANDOM, True)
+    prep_and_run_test(2, OrderOption.QUASI_RANDOM, True)