diff --git a/mmf/trainers/core/evaluation_loop.py b/mmf/trainers/core/evaluation_loop.py
index 9ca258b89..a254a4a89 100644
--- a/mmf/trainers/core/evaluation_loop.py
+++ b/mmf/trainers/core/evaluation_loop.py
@@ -65,7 +65,7 @@ def prediction_loop(self, dataset_type: str) -> None:
 
                 for batch in tqdm.tqdm(dataloader):
                     prepared_batch = reporter.prepare_batch(batch)
-                    prepared_batch = to_device(prepared_batch, torch.device("cuda"))
+                    prepared_batch = to_device(prepared_batch, self.device)
                     with torch.cuda.amp.autocast(enabled=self.training_config.fp16):
                         model_output = self.model(prepared_batch)
                     report = Report(prepared_batch, model_output)
diff --git a/mmf/utils/distributed.py b/mmf/utils/distributed.py
index 07f02b55c..4167145f2 100644
--- a/mmf/utils/distributed.py
+++ b/mmf/utils/distributed.py
@@ -9,7 +9,7 @@
 import torch
 from torch import distributed as dist
 
-try: 
+try:
     import torch_xla.core.xla_model as xm
 except ImportError:
     xm = None
@@ -82,10 +82,10 @@
         if is_xla():
             tensor = xm.all_to_all(
                 tensor.repeat([world_size,1]),
-                split_dimension=0, 
-                concat_dimension=0, 
-                split_count=world_size)[0] 
-        else: 
+                split_dimension=0,
+                concat_dimension=0,
+                split_count=world_size)[0]
+        else:
             dist.broadcast(tensor, src=0)
 
     return tensor
@@ -128,9 +128,9 @@
         if is_xla():
             tensor_list = xm.all_gather(tensor)
             tensor_list = tensor_list.view(world_size, *tensor.size())
-        else: 
+        else:
             dist.all_gather(tensor_list, tensor)
-            tensor_list = torch.stack(tensor_list, dim=0)
+            tensor_list = torch.stack(tuple(tensor_list), dim=0)
     return tensor_list
 
 
@@ -151,7 +151,7 @@
                 [values], scale=1.0/world_size
             )[0]
-        else: 
+        else:
             dist.reduce(values, dst=0)
 
         if dist.get_rank() == 0:
            # only main process gets accumulated, so only divide by