From 4424bda6e1cf31c6908941818572a952a008c262 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Wed, 22 Jul 2020 10:54:51 -0700
Subject: [PATCH 001/123] Initial implementation of CIFAR evaluation; it
 currently runs but does not yet reproduce SimCLR results

---
 algos/encoders.py |  41 ++++++-----
 run_cifar.py      | 176 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 200 insertions(+), 17 deletions(-)
 create mode 100644 run_cifar.py

diff --git a/algos/encoders.py b/algos/encoders.py
index 99876af0..286bdb2c 100644
--- a/algos/encoders.py
+++ b/algos/encoders.py
@@ -38,39 +38,46 @@ def encode_extra_context(self, x, traj_info):
 
 class CNNEncoder(Encoder):
     def __init__(self, obs_shape, representation_dim, architecture=None, learn_scale=False):
+        # obs_shape is assumed to be of form (H, W, C)
+        # Note in TorchVision PILImages and Numpy arrays are (H, W, C) by default but tensors are (C, H, W)
         super(CNNEncoder, self).__init__()
         if architecture is None:
             architecture = DEFAULT_CNN_ARCHITECTURE
         self.input_channel = obs_shape[2]
         self.representation_dim = representation_dim
-        shared_network_layers = []
 
-        for layer_spec in architecture['CONV']:
-            shared_network_layers.append(nn.Conv2d(self.input_channel, layer_spec['out_dim'],
+        if isinstance(architecture, nn.Module):
+            assert not learn_scale
+            self.shared_network = architecture
+            self.mean_layer = lambda x: x
+            self.scale_layer = lambda x: torch.ones(self.representation_dim)
+        else:
+            shared_network_layers = []
+            for layer_spec in architecture['CONV']:
+                shared_network_layers.append(nn.Conv2d(self.input_channel, layer_spec['out_dim'],
                                               kernel_size=layer_spec['kernel_size'], stride=layer_spec['stride']))
-            shared_network_layers.append(nn.ReLU())
-            self.input_channel = layer_spec['out_dim']
+                shared_network_layers.append(nn.ReLU())
+                self.input_channel = layer_spec['out_dim']
 
-        shared_network_layers.append(nn.Flatten())
-        for ind, layer_spec in enumerate(architecture['DENSE'][:-1]):
-            in_dim, out_dim = layer_spec.get('in_dim'), layer_spec.get('out_dim')
-            shared_network_layers.append(nn.Linear(in_dim, out_dim))
-            shared_network_layers.append(nn.ReLU())
+            shared_network_layers.append(nn.Flatten())
+            for ind, layer_spec in enumerate(architecture['DENSE'][:-1]):
+                in_dim, out_dim = layer_spec.get('in_dim'), layer_spec.get('out_dim')
+                shared_network_layers.append(nn.Linear(in_dim, out_dim))
+                shared_network_layers.append(nn.ReLU())
 
-        self.shared_network = nn.Sequential(*shared_network_layers)
+            self.shared_network = nn.Sequential(*shared_network_layers)
 
-        self.mean_layer = nn.Linear(architecture['DENSE'][-1]['in_dim'], self.representation_dim)
+            self.mean_layer = nn.Linear(architecture['DENSE'][-1]['in_dim'], self.representation_dim)
 
 
-        if learn_scale:
-            self.scale_layer = nn.Linear(architecture['DENSE'][-1]['in_dim'], self.representation_dim)
-        else:
-            self.scale_layer = lambda x: torch.ones(self.representation_dim)
+            if learn_scale:
+                self.scale_layer = nn.Linear(architecture['DENSE'][-1]['in_dim'], self.representation_dim)
+            else:
+                self.scale_layer = lambda x: torch.ones(self.representation_dim)
 
 
     def forward(self, x, traj_info=None):
         x = x.permute(0, 3, 1, 2)
-        x /= 255
         shared_repr = self.shared_network(x)
         mean = self.mean_layer(shared_repr)
         scale = torch.exp(self.scale_layer(shared_repr))
diff --git a/run_cifar.py b/run_cifar.py
new file mode 100644
index 00000000..e98d5f23
--- /dev/null
+++ b/run_cifar.py
@@ -0,0 +1,176 @@
+from algos import *
+from gym.spaces import Discrete, Box
+from sacred import Experiment
+from sacred.observers import FileStorageObserver
+from algos.utils import gaussian_blur
+
+import os
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision
+import torchvision.transforms as transforms
+from torchvision.models.resnet import resnet18
+
+
+class MockGymEnv(object):
+    """A mock Gym env for a supervised learning dataset pretending to be an RL
+    task. Action space is set to Discrete(1), observation space corresponds to
+    the original supervised learning task.
+    """
+    def __init__(self, obs_space):
+        self.observation_space = obs_space
+        self.action_space = Discrete(1)
+
+    def seed(self, seed):
+        pass
+
+    def close(self):
+        pass
+
+
+def transform_to_rl(dataset):
+    """Transforms the input supervised learning dataset into an "RL dataset", by
+    adding dummy 'actions' (always 0) and 'dones' (always False), and pretending
+    that everything is from the same 'trajectory'.
+    """
+    states = [img for img, label in dataset][:10000]
+    data_dict = {
+        'states': states,
+        'actions': [0.0] * len(states),
+        'dones': [False] * len(states),
+    }
+    return data_dict
+
+
+class LinearHead(nn.Module):
+    def __init__(self, encoder, output_dim):
+        super().__init__()
+        self.encoder = encoder
+        self.output_dim = output_dim
+        self.layer = nn.Linear(encoder.representation_dim, output_dim)
+
+    def forward(self, x):
+        encoding = self.encoder.encode_context(x, None).loc.detach()
+        return self.layer(encoding)
+
+
+def train_classifier(classifier, dataset, num_epochs):
+    trainloader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=True)
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.SGD(classifier.layer.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0)
+
+    for epoch in range(num_epochs):
+        running_loss = 0.0
+        for i, data in enumerate(trainloader, 0):
+            inputs, labels = data
+            optimizer.zero_grad()
+            outputs = classifier(inputs)
+            loss = criterion(outputs, labels)
+            loss.backward()
+            optimizer.step()
+
+            # print statistics
+            running_loss += loss.item()
+            if i % 20 == 19:    # print every 20 mini-batches
+                print('[Epoch %d, Batch %3d] Average loss: %.3f' %
+                      (epoch + 1, i + 1, running_loss / 20))
+                running_loss = 0.0
+
+
+def evaluate_classifier(classifier, dataset):
+    testloader = torch.utils.data.DataLoader(dataset, batch_size=100, shuffle=False)
+    correct = 0
+    total = 0
+    with torch.no_grad():
+        for data in testloader:
+            images, labels = data
+            outputs = classifier(images)
+            _, predicted = torch.max(outputs.data, 1)
+            total += labels.size(0)
+            correct += (predicted == labels).sum().item()
+
+    print('Accuracy: %d %%' % (100 * correct / total))
+
+
+cifar_ex = Experiment('cifar')
+
+
+@cifar_ex.config
+def default_config():
+    seed = 0
+    algo = SimCLR
+    data_dir = 'cifar10/'
+    pretrain_epochs = 1000
+    finetune_epochs = 100
+    _ = locals()
+    del _
+
+
+@cifar_ex.main
+def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
+
+    # TODO fix this hacky nonsense
+    log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
+    os.mkdir(log_dir)
+    #with TemporaryDirectory() as tmp_dir:
+    if isinstance(algo, str):
+        algo = globals()[algo]
+    assert issubclass(algo, RepresentationLearner)
+    ## TODO allow passing in of kwargs here
+    #trainloader = torch.utils.data.DataLoader(
+    #    trainset, batch_size=opt.batch_size_train, shuffle=True, num_workers=2)
+
+    # Load in data
+    os.makedirs(data_dir, exist_ok=True)
+    transformations = [
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+    ]
+    transform = transforms.Compose(transformations)
+    trainset = torchvision.datasets.CIFAR10(
+        root=data_dir, train=True, download=True, transform=transform)
+    testset = torchvision.datasets.CIFAR10(
+        root=data_dir, train=False, download=True, transform=transform)
+
+    print('Creating model for representation learning')
+    env = MockGymEnv(Box(low=-1.0, high=1.0, shape=(32, 32, 3), dtype=np.float32))
+    # algo_params = {k: v for k, v in _config.items() if k in rep_learner_params.keys()}
+    rep_learning_augmentations = [
+        transforms.Lambda(torch.tensor),
+        transforms.ToPILImage(),
+        transforms.Pad(4),
+        transforms.RandomCrop(16),
+        transforms.Pad(8),
+        # transforms.Lambda(gaussian_blur), # SimCLR doesn't use blur for CIFAR-10
+        transforms.ToTensor(),
+    ]
+    # Note that the resnet18 model used here has an architecture meant for
+    # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
+    # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
+    model = algo(
+        env, log_dir=log_dir, pretrain_epochs=pretrain_epochs, batch_size=512, representation_dim=1000,
+        encoder_kwargs={'architecture': resnet18()},
+        augmenter_kwargs={'augmentations': rep_learning_augmentations},
+        optimizer_kwargs={'lr': 1e-3, 'weight_decay': 1e-4},
+    )
+
+    print('Train representation learner')
+    rep_learning_data = transform_to_rl(trainset)
+    model.learn(rep_learning_data)
+    del rep_learning_data
+
+    print('Train linear head')
+    classifier = LinearHead(model.encoder, 10)
+    train_classifier(classifier, trainset, num_epochs=finetune_epochs)
+
+    print('Evaluate accuracy on test set')
+    evaluate_classifier(classifier, testset)
+
+    env.close()
+
+
+if __name__ == '__main__':
+    cifar_ex.observers.append(FileStorageObserver('cifar_runs'))
+    cifar_ex.run_commandline()

From 53d2f2e068718ccb2c73bf4ec56bfa9d33cb2887 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Wed, 22 Jul 2020 10:54:51 -0700
Subject: [PATCH 002/123] Initial implementation of CIFAR evaluation; it
 currently runs but does not yet reproduce SimCLR results

---
 run_cifar.py | 176 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 176 insertions(+)
 create mode 100644 run_cifar.py

diff --git a/run_cifar.py b/run_cifar.py
new file mode 100644
index 00000000..e98d5f23
--- /dev/null
+++ b/run_cifar.py
@@ -0,0 +1,176 @@
+from algos import *
+from gym.spaces import Discrete, Box
+from sacred import Experiment
+from sacred.observers import FileStorageObserver
+from algos.utils import gaussian_blur
+
+import os
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision
+import torchvision.transforms as transforms
+from torchvision.models.resnet import resnet18
+
+
+class MockGymEnv(object):
+    """A mock Gym env for a supervised learning dataset pretending to be an RL
+    task. Action space is set to Discrete(1), observation space corresponds to
+    the original supervised learning task.
+    """
+    def __init__(self, obs_space):
+        self.observation_space = obs_space
+        self.action_space = Discrete(1)
+
+    def seed(self, seed):
+        pass
+
+    def close(self):
+        pass
+
+
+def transform_to_rl(dataset):
+    """Transforms the input supervised learning dataset into an "RL dataset", by
+    adding dummy 'actions' (always 0) and 'dones' (always False), and pretending
+    that everything is from the same 'trajectory'.
+    """
+    states = [img for img, label in dataset][:10000]
+    data_dict = {
+        'states': states,
+        'actions': [0.0] * len(states),
+        'dones': [False] * len(states),
+    }
+    return data_dict
+
+
+class LinearHead(nn.Module):
+    def __init__(self, encoder, output_dim):
+        super().__init__()
+        self.encoder = encoder
+        self.output_dim = output_dim
+        self.layer = nn.Linear(encoder.representation_dim, output_dim)
+
+    def forward(self, x):
+        encoding = self.encoder.encode_context(x, None).loc.detach()
+        return self.layer(encoding)
+
+
+def train_classifier(classifier, dataset, num_epochs):
+    trainloader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=True)
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.SGD(classifier.layer.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0)
+
+    for epoch in range(num_epochs):
+        running_loss = 0.0
+        for i, data in enumerate(trainloader, 0):
+            inputs, labels = data
+            optimizer.zero_grad()
+            outputs = classifier(inputs)
+            loss = criterion(outputs, labels)
+            loss.backward()
+            optimizer.step()
+
+            # print statistics
+            running_loss += loss.item()
+            if i % 20 == 19:    # print every 20 mini-batches
+                print('[Epoch %d, Batch %3d] Average loss: %.3f' %
+                      (epoch + 1, i + 1, running_loss / 20))
+                running_loss = 0.0
+
+
+def evaluate_classifier(classifier, dataset):
+    testloader = torch.utils.data.DataLoader(dataset, batch_size=100, shuffle=False)
+    correct = 0
+    total = 0
+    with torch.no_grad():
+        for data in testloader:
+            images, labels = data
+            outputs = classifier(images)
+            _, predicted = torch.max(outputs.data, 1)
+            total += labels.size(0)
+            correct += (predicted == labels).sum().item()
+
+    print('Accuracy: %d %%' % (100 * correct / total))
+
+
+cifar_ex = Experiment('cifar')
+
+
+@cifar_ex.config
+def default_config():
+    seed = 0
+    algo = SimCLR
+    data_dir = 'cifar10/'
+    pretrain_epochs = 1000
+    finetune_epochs = 100
+    _ = locals()
+    del _
+
+
+@cifar_ex.main
+def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
+
+    # TODO fix this hacky nonsense
+    log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
+    os.mkdir(log_dir)
+    #with TemporaryDirectory() as tmp_dir:
+    if isinstance(algo, str):
+        algo = globals()[algo]
+    assert issubclass(algo, RepresentationLearner)
+    ## TODO allow passing in of kwargs here
+    #trainloader = torch.utils.data.DataLoader(
+    #    trainset, batch_size=opt.batch_size_train, shuffle=True, num_workers=2)
+
+    # Load in data
+    os.makedirs(data_dir, exist_ok=True)
+    transformations = [
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+    ]
+    transform = transforms.Compose(transformations)
+    trainset = torchvision.datasets.CIFAR10(
+        root=data_dir, train=True, download=True, transform=transform)
+    testset = torchvision.datasets.CIFAR10(
+        root=data_dir, train=False, download=True, transform=transform)
+
+    print('Creating model for representation learning')
+    env = MockGymEnv(Box(low=-1.0, high=1.0, shape=(32, 32, 3), dtype=np.float32))
+    # algo_params = {k: v for k, v in _config.items() if k in rep_learner_params.keys()}
+    rep_learning_augmentations = [
+        transforms.Lambda(torch.tensor),
+        transforms.ToPILImage(),
+        transforms.Pad(4),
+        transforms.RandomCrop(16),
+        transforms.Pad(8),
+        # transforms.Lambda(gaussian_blur), # SimCLR doesn't use blur for CIFAR-10
+        transforms.ToTensor(),
+    ]
+    # Note that the resnet18 model used here has an architecture meant for
+    # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
+    # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
+    model = algo(
+        env, log_dir=log_dir, pretrain_epochs=pretrain_epochs, batch_size=512, representation_dim=1000,
+        encoder_kwargs={'architecture': resnet18()},
+        augmenter_kwargs={'augmentations': rep_learning_augmentations},
+        optimizer_kwargs={'lr': 1e-3, 'weight_decay': 1e-4},
+    )
+
+    print('Train representation learner')
+    rep_learning_data = transform_to_rl(trainset)
+    model.learn(rep_learning_data)
+    del rep_learning_data
+
+    print('Train linear head')
+    classifier = LinearHead(model.encoder, 10)
+    train_classifier(classifier, trainset, num_epochs=finetune_epochs)
+
+    print('Evaluate accuracy on test set')
+    evaluate_classifier(classifier, testset)
+
+    env.close()
+
+
+if __name__ == '__main__':
+    cifar_ex.observers.append(FileStorageObserver('cifar_runs'))
+    cifar_ex.run_commandline()

From 1fc4d9a9e8579071c13981d4e703146ee8dbff02 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Tue, 4 Aug 2020 18:39:02 -0700
Subject: [PATCH 003/123] Add support for GPU training, miscellaneous
 improvements

---
 run_cifar.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/run_cifar.py b/run_cifar.py
index e98d5f23..246ea1fb 100644
--- a/run_cifar.py
+++ b/run_cifar.py
@@ -35,7 +35,7 @@ def transform_to_rl(dataset):
     adding dummy 'actions' (always 0) and 'dones' (always False), and pretending
     that everything is from the same 'trajectory'.
     """
-    states = [img for img, label in dataset][:10000]
+    states = [img for img, label in dataset]
     data_dict = {
         'states': states,
         'actions': [0.0] * len(states),
@@ -56,15 +56,15 @@ def forward(self, x):
         return self.layer(encoding)
 
 
-def train_classifier(classifier, dataset, num_epochs):
+def train_classifier(classifier, dataset, num_epochs, device):
     trainloader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=True)
-    criterion = nn.CrossEntropyLoss()
+    criterion = nn.CrossEntropyLoss().to(device)
     optimizer = optim.SGD(classifier.layer.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0)
 
     for epoch in range(num_epochs):
         running_loss = 0.0
-        for i, data in enumerate(trainloader, 0):
-            inputs, labels = data
+        for i, (inputs, labels) in enumerate(trainloader, 0):
+            inputs, labels = inputs.to(device), labels.to(device)
             optimizer.zero_grad()
             outputs = classifier(inputs)
             loss = criterion(outputs, labels)
@@ -79,13 +79,13 @@ def train_classifier(classifier, dataset, num_epochs):
                 running_loss = 0.0
 
 
-def evaluate_classifier(classifier, dataset):
+def evaluate_classifier(classifier, dataset, device):
     testloader = torch.utils.data.DataLoader(dataset, batch_size=100, shuffle=False)
     correct = 0
     total = 0
     with torch.no_grad():
-        for data in testloader:
-            images, labels = data
+        for images, labels in testloader:
+            images, labels = images.to(device), labels.to(device)
             outputs = classifier(images)
             _, predicted = torch.max(outputs.data, 1)
             total += labels.size(0)
@@ -99,17 +99,18 @@ def evaluate_classifier(classifier, dataset):
 
 @cifar_ex.config
 def default_config():
-    seed = 0
+    seed = 1
     algo = SimCLR
     data_dir = 'cifar10/'
     pretrain_epochs = 1000
     finetune_epochs = 100
+    rep_batch_size = 512
     _ = locals()
     del _
 
 
 @cifar_ex.main
-def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
+def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, rep_batch_size, _config):
 
     # TODO fix this hacky nonsense
     log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
@@ -126,7 +127,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
     os.makedirs(data_dir, exist_ok=True)
     transformations = [
         transforms.ToTensor(),
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
     ]
     transform = transforms.Compose(transformations)
     trainset = torchvision.datasets.CIFAR10(
@@ -135,6 +136,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
         root=data_dir, train=False, download=True, transform=transform)
 
     print('Creating model for representation learning')
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     env = MockGymEnv(Box(low=-1.0, high=1.0, shape=(32, 32, 3), dtype=np.float32))
     # algo_params = {k: v for k, v in _config.items() if k in rep_learner_params.keys()}
     rep_learning_augmentations = [
@@ -150,7 +152,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
     # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
     # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
     model = algo(
-        env, log_dir=log_dir, pretrain_epochs=pretrain_epochs, batch_size=512, representation_dim=1000,
+        env, log_dir=log_dir, pretrain_epochs=pretrain_epochs, batch_size=rep_batch_size, representation_dim=1000, device=device,
         encoder_kwargs={'architecture': resnet18()},
         augmenter_kwargs={'augmentations': rep_learning_augmentations},
         optimizer_kwargs={'lr': 1e-3, 'weight_decay': 1e-4},
@@ -162,11 +164,11 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
     del rep_learning_data
 
     print('Train linear head')
-    classifier = LinearHead(model.encoder, 10)
-    train_classifier(classifier, trainset, num_epochs=finetune_epochs)
+    classifier = LinearHead(model.encoder, 10).to(device)
+    train_classifier(classifier, trainset, num_epochs=finetune_epochs, device=device)
 
     print('Evaluate accuracy on test set')
-    evaluate_classifier(classifier, testset)
+    evaluate_classifier(classifier, testset, device=device)
 
     env.close()
 

From 5f24dd9424c6ba42d2b4e4b2a9bdda1bff0017fc Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Wed, 5 Aug 2020 11:33:13 -0700
Subject: [PATCH 004/123] Pull out model training code into its own function

---
 algos/encoders.py |  1 +
 run_cifar.py      | 65 +++++++++++++++++++++++++----------------------
 2 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/algos/encoders.py b/algos/encoders.py
index 6b399b13..c365628c 100644
--- a/algos/encoders.py
+++ b/algos/encoders.py
@@ -80,6 +80,7 @@ def __init__(self, obs_space, representation_dim, architecture_module_cls=None,
         representing the mean representation z of a fixed-variance representation distribution
         """
         super(DeterministicEncoder, self).__init__()
+        self.representation_dim = representation_dim
         if architecture_module_cls is None:
             architecture_module_cls = NatureCNN
         self.network = architecture_module_cls(obs_space, representation_dim)
diff --git a/run_cifar.py b/run_cifar.py
index 246ea1fb..6a07fdaf 100644
--- a/run_cifar.py
+++ b/run_cifar.py
@@ -94,6 +94,39 @@ def evaluate_classifier(classifier, dataset, device):
     print('Accuracy: %d %%' % (100 * correct / total))
 
 
+def representation_learning(algo, trainset, device, log_dir, config):
+    print('Creating model for representation learning')
+
+    if isinstance(algo, str):
+        algo = globals()[algo]
+
+    env = MockGymEnv(Box(low=-1.0, high=1.0, shape=(32, 32, 3), dtype=np.float32))
+    rep_learning_augmentations = [
+        transforms.Lambda(torch.tensor),
+        transforms.ToPILImage(),
+        transforms.Pad(4),
+        transforms.RandomCrop(16),
+        transforms.Pad(8),
+        # SimCLR doesn't use blur for CIFAR-10
+    ]
+    # Note that the resnet18 model used here has an architecture meant for
+    # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
+    # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
+    model = algo(
+        env, log_dir=log_dir, pretrain_epochs=config['pretrain_epochs'], batch_size=config['rep_batch_size'], representation_dim=1000, device=device, shuffle_batches=True,
+        encoder_kwargs={'architecture_module_cls': lambda *args: resnet18()},
+        augmenter_kwargs={'augmentations': rep_learning_augmentations},
+        optimizer_kwargs={'lr': 1e-3, 'weight_decay': 1e-4},
+        loss_calculator_kwargs={'temp': 0.5},
+    )
+
+    print('Train representation learner')
+    rep_learning_data = transform_to_rl(trainset)
+    model.learn(rep_learning_data)
+    env.close()
+    return model
+
+
 cifar_ex = Experiment('cifar')
 
 
@@ -111,13 +144,10 @@ def default_config():
 
 @cifar_ex.main
 def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, rep_batch_size, _config):
-
     # TODO fix this hacky nonsense
     log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
     os.mkdir(log_dir)
     #with TemporaryDirectory() as tmp_dir:
-    if isinstance(algo, str):
-        algo = globals()[algo]
     assert issubclass(algo, RepresentationLearner)
     ## TODO allow passing in of kwargs here
     #trainloader = torch.utils.data.DataLoader(
@@ -135,33 +165,8 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, rep_batch_size,
     testset = torchvision.datasets.CIFAR10(
         root=data_dir, train=False, download=True, transform=transform)
 
-    print('Creating model for representation learning')
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    env = MockGymEnv(Box(low=-1.0, high=1.0, shape=(32, 32, 3), dtype=np.float32))
-    # algo_params = {k: v for k, v in _config.items() if k in rep_learner_params.keys()}
-    rep_learning_augmentations = [
-        transforms.Lambda(torch.tensor),
-        transforms.ToPILImage(),
-        transforms.Pad(4),
-        transforms.RandomCrop(16),
-        transforms.Pad(8),
-        # transforms.Lambda(gaussian_blur), # SimCLR doesn't use blur for CIFAR-10
-        transforms.ToTensor(),
-    ]
-    # Note that the resnet18 model used here has an architecture meant for
-    # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
-    # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
-    model = algo(
-        env, log_dir=log_dir, pretrain_epochs=pretrain_epochs, batch_size=rep_batch_size, representation_dim=1000, device=device,
-        encoder_kwargs={'architecture': resnet18()},
-        augmenter_kwargs={'augmentations': rep_learning_augmentations},
-        optimizer_kwargs={'lr': 1e-3, 'weight_decay': 1e-4},
-    )
-
-    print('Train representation learner')
-    rep_learning_data = transform_to_rl(trainset)
-    model.learn(rep_learning_data)
-    del rep_learning_data
+    model = representation_learning(algo, trainset, device, log_dir, _config)
 
     print('Train linear head')
     classifier = LinearHead(model.encoder, 10).to(device)
@@ -170,8 +175,6 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, rep_batch_size,
     print('Evaluate accuracy on test set')
     evaluate_classifier(classifier, testset, device=device)
 
-    env.close()
-
 
 if __name__ == '__main__':
     cifar_ex.observers.append(FileStorageObserver('cifar_runs'))

From 96dc990e6cda3f08127cb718597fa4c62ad9b237 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Wed, 5 Aug 2020 11:47:21 -0700
Subject: [PATCH 005/123] Compatibility with new learn interface

---
 run_cifar.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run_cifar.py b/run_cifar.py
index 6a07fdaf..d9be37a7 100644
--- a/run_cifar.py
+++ b/run_cifar.py
@@ -113,7 +113,7 @@ def representation_learning(algo, trainset, device, log_dir, config):
     # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
     # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
     model = algo(
-        env, log_dir=log_dir, pretrain_epochs=config['pretrain_epochs'], batch_size=config['rep_batch_size'], representation_dim=1000, device=device, shuffle_batches=True,
+        env, log_dir=log_dir, batch_size=config['rep_batch_size'], representation_dim=1000, device=device, shuffle_batches=True,
         encoder_kwargs={'architecture_module_cls': lambda *args: resnet18()},
         augmenter_kwargs={'augmentations': rep_learning_augmentations},
         optimizer_kwargs={'lr': 1e-3, 'weight_decay': 1e-4},
@@ -122,7 +122,7 @@ def representation_learning(algo, trainset, device, log_dir, config):
 
     print('Train representation learner')
     rep_learning_data = transform_to_rl(trainset)
-    model.learn(rep_learning_data)
+    model.learn(rep_learning_data, config['pretrain_epochs'])
     env.close()
     return model
 

From 89a30dc2a7e8fbd08cebfdfd75503c0a8cb8a9d6 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Thu, 6 Aug 2020 15:32:31 -0700
Subject: [PATCH 006/123] Implement the correct augmentations for SimCLR on
 CIFAR-10

---
 algos/representation_learner.py |  7 +++-
 run_cifar.py                    | 66 +++++++++++++++++++--------------
 2 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/algos/representation_learner.py b/algos/representation_learner.py
index 3d31cada..d7ad9557 100644
--- a/algos/representation_learner.py
+++ b/algos/representation_learner.py
@@ -28,6 +28,7 @@ def __init__(self, env, log_dir, encoder, decoder, loss_calculator, target_pair_
                  representation_dim=512,
                  projection_dim=None,
                  device=None,
+                 normalize=True,
                  shuffle_batches=True,
                  batch_size=256,
                  preprocess_extra_context=True,
@@ -51,6 +52,7 @@ def __init__(self, env, log_dir, encoder, decoder, loss_calculator, target_pair_
         else:
             self.device = device
 
+        self.normalize = normalize
         self.shuffle_batches = shuffle_batches
         self.batch_size = batch_size
         self.preprocess_extra_context = preprocess_extra_context
@@ -135,7 +137,8 @@ def _preprocess(self, input_data):
             input_data = input_data.permute(self.permutation_tuple)
 
         # Normalization to range [-1, 1]
-        if isinstance(self.observation_space, Box):
+        if self.normalize:
+            assert isinstance(self.observation_space, Box)
             low, high = self.observation_space.low, self.observation_space.high
             low_min, low_max, high_min, high_max = low.min(), low.max(), high.min(), high.max()
             assert low_min == low_max and high_min == high_max
@@ -143,6 +146,8 @@ def _preprocess(self, input_data):
             mid = (low + high) / 2
             delta = high - mid
             input_data = (input_data - mid) / delta
+
+        assert input_data.shape[1:] == self.observation_shape
         return input_data
 
     def _preprocess_extra_context(self, extra_context):
diff --git a/run_cifar.py b/run_cifar.py
index 3f11cc81..3d534534 100644
--- a/run_cifar.py
+++ b/run_cifar.py
@@ -4,8 +4,9 @@
 from sacred.observers import FileStorageObserver
 from algos.utils import gaussian_blur
 
-import os
 import numpy as np
+import os
+import PIL
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -56,8 +57,18 @@ def forward(self, x):
         return self.layer(encoding)
 
 
-def train_classifier(classifier, dataset, num_epochs, device):
-    trainloader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=True)
+def train_classifier(classifier, data_dir, num_epochs, device):
+    transform = transforms.Compose([
+        transforms.ToPILImage(),
+        transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
+        transforms.RandomHorizontalFlip(),
+        # No color jitter or grayscale for finetuning
+        # SimCLR doesn't use blur for CIFAR-10
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+    ])
+    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
+    trainloader = torch.utils.data.DataLoader(trainset, batch_size=512, shuffle=True)
     criterion = nn.CrossEntropyLoss().to(device)
     optimizer = optim.SGD(classifier.layer.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0)
 
@@ -79,8 +90,13 @@ def train_classifier(classifier, dataset, num_epochs, device):
                 running_loss = 0.0
 
 
-def evaluate_classifier(classifier, dataset, device):
-    testloader = torch.utils.data.DataLoader(dataset, batch_size=100, shuffle=False)
+def evaluate_classifier(classifier, data_dir, device):
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+    ])
+    testset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True, transform=transform)
+    testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False)
     correct = 0
     total = 0
     with torch.no_grad():
@@ -94,27 +110,33 @@ def evaluate_classifier(classifier, dataset, device):
     print('Accuracy: %d %%' % (100 * correct / total))
 
 
-def representation_learning(algo, trainset, device, log_dir, config):
+def representation_learning(algo, data_dir, device, log_dir, config):
     print('Creating model for representation learning')
 
     if isinstance(algo, str):
         algo = globals()[algo]
     assert issubclass(algo, RepresentationLearner)
 
-    env = MockGymEnv(Box(low=-1.0, high=1.0, shape=(32, 32, 3), dtype=np.float32))
     rep_learning_augmentations = [
         transforms.Lambda(torch.tensor),
         transforms.ToPILImage(),
-        transforms.Pad(4),
-        transforms.RandomCrop(16),
-        transforms.Pad(8),
+        transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
+        transforms.RandomHorizontalFlip(),
+        transforms.RandomApply([
+            transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
+        ], p=0.8),
+        transforms.RandomGrayscale(p=0.2),
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
         # SimCLR doesn't use blur for CIFAR-10
     ]
+    env = MockGymEnv(Box(low=0.0, high=1.0, shape=(3, 32, 32), dtype=np.float32))
     # Note that the resnet18 model used here has an architecture meant for
     # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
     # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
     model = algo(
-        env, log_dir=log_dir, batch_size=config['rep_batch_size'], representation_dim=1000, device=device, shuffle_batches=True,
+        env, log_dir=log_dir, batch_size=config['rep_batch_size'], representation_dim=1000, device=device,
+        normalize=False, shuffle_batches=True,
         encoder_kwargs={'architecture_module_cls': lambda *args: resnet18()},
         augmenter_kwargs={'augmentations': rep_learning_augmentations},
         optimizer_kwargs={'lr': 1e-3, 'weight_decay': 1e-4},
@@ -122,6 +144,8 @@ def representation_learning(algo, trainset, device, log_dir, config):
     )
 
     print('Train representation learner')
+    transform = transforms.ToTensor()
+    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
     rep_learning_data = transform_to_rl(trainset)
     model.learn(rep_learning_data, config['pretrain_epochs'])
     env.close()
@@ -148,31 +172,17 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, rep_batch_size,
     # TODO fix this hacky nonsense
     log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
     os.mkdir(log_dir)
-    ## TODO allow passing in of kwargs here
-    #trainloader = torch.utils.data.DataLoader(
-    #    trainset, batch_size=opt.batch_size_train, shuffle=True, num_workers=2)
-
-    # Load in data
     os.makedirs(data_dir, exist_ok=True)
-    transformations = [
-        transforms.ToTensor(),
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-    ]
-    transform = transforms.Compose(transformations)
-    trainset = torchvision.datasets.CIFAR10(
-        root=data_dir, train=True, download=True, transform=transform)
-    testset = torchvision.datasets.CIFAR10(
-        root=data_dir, train=False, download=True, transform=transform)
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = representation_learning(algo, trainset, device, log_dir, _config)
+    model = representation_learning(algo, data_dir, device, log_dir, _config)
 
     print('Train linear head')
     classifier = LinearHead(model.encoder, 10).to(device)
-    train_classifier(classifier, trainset, num_epochs=finetune_epochs, device=device)
+    train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 
     print('Evaluate accuracy on test set')
-    evaluate_classifier(classifier, testset, device=device)
+    evaluate_classifier(classifier, data_dir, device=device)
 
 
 if __name__ == '__main__':

From db49e9259623fdb3dd4c6aca1567c3d0ea29430f Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Sat, 8 Aug 2020 19:46:24 -0700
Subject: [PATCH 007/123] Changes to optimizers and learning rates to be more
 in line with SimCLR

---
 algos/decoders.py | 11 ++++++-----
 algos/utils.py    | 12 ++++++------
 run_cifar.py      | 29 +++++++++++++++++++----------
 3 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/algos/decoders.py b/algos/decoders.py
index 7001262d..96abb2f1 100644
--- a/algos/decoders.py
+++ b/algos/decoders.py
@@ -60,11 +60,12 @@ class ProjectionHead(LossDecoder):
     def __init__(self, representation_dim, projection_shape, sample=False, learn_scale=False):
         super(ProjectionHead, self).__init__(representation_dim, projection_shape, sample)
 
-        self.shared_mlp = nn.Sequential(nn.Linear(self.representation_dim, 256),
-                                      nn.ReLU(),
-                                      nn.Linear(256, 256),
-                                      nn.ReLU())
-        self.mean_layer = nn.Linear(256, self.projection_dim)
+        dim = self.representation_dim
+        self.shared_mlp = nn.Sequential(nn.Linear(dim, dim),
+                                        nn.ReLU(),
+                                        nn.Linear(dim, dim),
+                                        nn.ReLU())
+        self.mean_layer = nn.Linear(dim, self.projection_dim, bias=False)
 
         if learn_scale:
             self.scale_layer = nn.Linear(256, self.projection_dim)
diff --git a/algos/utils.py b/algos/utils.py
index 475189a7..e055777b 100644
--- a/algos/utils.py
+++ b/algos/utils.py
@@ -101,23 +101,23 @@ def log(self, msg):
 
 class LinearWarmupCosine(_LRScheduler):
     def __init__(self, optimizer, warmup_epoch, T_max, eta_min=0, last_epoch=-1):
-        self.T_max = T_max
         self.eta_min = eta_min
         self.warmup_epoch = warmup_epoch
+        self.cosine_epochs = T_max - warmup_epoch
         super(LinearWarmupCosine, self).__init__(optimizer, last_epoch)
 
     def get_lr(self):
         if self.warmup_epoch > 0:
             if self.last_epoch <= self.warmup_epoch:
                 return [base_lr / self.warmup_epoch * self.last_epoch for base_lr in self.base_lrs]
-        if ((self.last_epoch - self.warmup_epoch) - 1 - (self.T_max - self.warmup_epoch)) % (2 * (self.T_max - self.warmup_epoch)) == 0:
+        if ((self.last_epoch - self.warmup_epoch) - 1 - self.cosine_epochs) % (2 * self.cosine_epochs) == 0:
             return [group['lr'] + (base_lr - self.eta_min) *
-                    (1 - math.cos(math.pi / (self.T_max - self.warmup_epoch))) / 2
+                    (1 - math.cos(math.pi / self.cosine_epochs)) / 2
                     for base_lr, group in
                     zip(self.base_lrs, self.optimizer.param_groups)]
         else:
-            return [(1 + math.cos(math.pi * (self.last_epoch - self.warmup_epoch) / (self.T_max - self.warmup_epoch))) /
-                    (1 + math.cos(math.pi * ((self.last_epoch - self.warmup_epoch) - 1) / (self.T_max - self.warmup_epoch))) *
+            return [(1 + math.cos(math.pi * (self.last_epoch - self.warmup_epoch) / self.cosine_epochs)) /
+                    (1 + math.cos(math.pi * ((self.last_epoch - self.warmup_epoch) - 1) / self.cosine_epochs)) *
                     (group['lr'] - self.eta_min) + self.eta_min
                     for group in self.optimizer.param_groups]
 
@@ -166,4 +166,4 @@ def update(self, val, n=1):
         self.val = val
         self.sum += val * n
         self.count += n
-        self.avg = self.sum / self.count
\ No newline at end of file
+        self.avg = self.sum / self.count
diff --git a/run_cifar.py b/run_cifar.py
index 3d534534..8f522180 100644
--- a/run_cifar.py
+++ b/run_cifar.py
@@ -13,6 +13,7 @@
 import torchvision
 import torchvision.transforms as transforms
 from torchvision.models.resnet import resnet18
+from algos.utils import LinearWarmupCosine
 
 
 class MockGymEnv(object):
@@ -70,9 +71,11 @@ def train_classifier(classifier, data_dir, num_epochs, device):
     trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
     trainloader = torch.utils.data.DataLoader(trainset, batch_size=512, shuffle=True)
     criterion = nn.CrossEntropyLoss().to(device)
-    optimizer = optim.SGD(classifier.layer.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0)
+    optimizer = optim.SGD(classifier.layer.parameters(), lr=0.2, momentum=0.9, weight_decay=0.0, nesterov=True)
+    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
 
     for epoch in range(num_epochs):
+        print(f"Epoch {epoch}/{num_epochs} with lr {optimizer.param_groups[0]['lr']}")
         running_loss = 0.0
         for i, (inputs, labels) in enumerate(trainloader, 0):
             inputs, labels = inputs.to(device), labels.to(device)
@@ -89,6 +92,8 @@ def train_classifier(classifier, data_dir, num_epochs, device):
                       (epoch + 1, i + 1, running_loss / 20))
                 running_loss = 0.0
 
+        scheduler.step()
+
 
 def evaluate_classifier(classifier, data_dir, device):
     transform = transforms.Compose([
@@ -110,7 +115,7 @@ def evaluate_classifier(classifier, data_dir, device):
     print('Accuracy: %d %%' % (100 * correct / total))
 
 
-def representation_learning(algo, data_dir, device, log_dir, config):
+def representation_learning(algo, data_dir, num_epochs, device, log_dir):
     print('Creating model for representation learning')
 
     if isinstance(algo, str):
@@ -134,12 +139,17 @@ def representation_learning(algo, data_dir, device, log_dir, config):
     # Note that the resnet18 model used here has an architecture meant for
     # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
     # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
+    # It seems that SimCLR does not include the final fully connected layer for ResNets, so we set it to the identity.
+    resnet_without_fc = resnet18()
+    resnet_without_fc.fc = torch.nn.Identity()
     model = algo(
-        env, log_dir=log_dir, batch_size=config['rep_batch_size'], representation_dim=1000, device=device,
-        normalize=False, shuffle_batches=True,
-        encoder_kwargs={'architecture_module_cls': lambda *args: resnet18()},
+        env, log_dir=log_dir, batch_size=512, representation_dim=512, projection_dim=128,
+        device=device, normalize=False, shuffle_batches=True,
+        encoder_kwargs={'architecture_module_cls': lambda *args: resnet_without_fc},
         augmenter_kwargs={'augmentations': rep_learning_augmentations},
-        optimizer_kwargs={'lr': 1e-3, 'weight_decay': 1e-4},
+        optimizer_kwargs={'lr': 2.0, 'weight_decay': 1e-4},
+        scheduler=LinearWarmupCosine,
+        scheduler_kwargs={'warmup_epoch': 10, 'T_max': num_epochs},
         loss_calculator_kwargs={'temp': 0.5},
     )
 
@@ -147,7 +157,7 @@ def representation_learning(algo, data_dir, device, log_dir, config):
     transform = transforms.ToTensor()
     trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
     rep_learning_data = transform_to_rl(trainset)
-    model.learn(rep_learning_data, config['pretrain_epochs'])
+    model.learn(rep_learning_data, num_epochs)
     env.close()
     return model
 
@@ -162,20 +172,19 @@ def default_config():
     data_dir = 'cifar10/'
     pretrain_epochs = 1000
     finetune_epochs = 100
-    rep_batch_size = 512
     _ = locals()
     del _
 
 
 @cifar_ex.main
-def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, rep_batch_size, _config):
+def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
     # TODO fix this hacky nonsense
     log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
     os.mkdir(log_dir)
     os.makedirs(data_dir, exist_ok=True)
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = representation_learning(algo, data_dir, device, log_dir, _config)
+    model = representation_learning(algo, data_dir, pretrain_epochs, device, log_dir)
 
     print('Train linear head')
     classifier = LinearHead(model.encoder, 10).to(device)

From c29cf52ddac1861ae0fbc43727117d4c27467bc6 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Mon, 10 Aug 2020 09:56:13 -0700
Subject: [PATCH 008/123] Add momentum to optimizer

---
 run_cifar.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/run_cifar.py b/run_cifar.py
index 8f522180..0edbb4fd 100644
--- a/run_cifar.py
+++ b/run_cifar.py
@@ -142,12 +142,13 @@ def representation_learning(algo, data_dir, num_epochs, device, log_dir):
     # It seems that SimCLR does not include the final fully connected layer for ResNets, so we set it to the identity.
     resnet_without_fc = resnet18()
     resnet_without_fc.fc = torch.nn.Identity()
+    # Note SimCLR uses LARSOptimizer, which we currently do not do
     model = algo(
         env, log_dir=log_dir, batch_size=512, representation_dim=512, projection_dim=128,
         device=device, normalize=False, shuffle_batches=True,
         encoder_kwargs={'architecture_module_cls': lambda *args: resnet_without_fc},
         augmenter_kwargs={'augmentations': rep_learning_augmentations},
-        optimizer_kwargs={'lr': 2.0, 'weight_decay': 1e-4},
+        optimizer_kwargs={'lr': 2.0, 'weight_decay': 1e-4, 'momentum': 0.9},
         scheduler=LinearWarmupCosine,
         scheduler_kwargs={'warmup_epoch': 10, 'T_max': num_epochs},
         loss_calculator_kwargs={'temp': 0.5},

From 66ae3d54b20483c7891dd095fe8e765819346913 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Mon, 10 Aug 2020 11:03:32 -0700
Subject: [PATCH 009/123] Fix indentation bug

---
 algos/encoders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/algos/encoders.py b/algos/encoders.py
index 077800b3..9c8caede 100644
--- a/algos/encoders.py
+++ b/algos/encoders.py
@@ -46,7 +46,7 @@ def __init__(self, obs_space, representation_dim):
             shared_network_layers.append(nn.Linear(in_dim, out_dim))
             shared_network_layers.append(nn.ReLU())
 
-            self.shared_network = nn.Sequential(*shared_network_layers)
+        self.shared_network = nn.Sequential(*shared_network_layers)
 
         self.mean_layer = nn.Linear(DEFAULT_CNN_ARCHITECTURE['DENSE'][-1]['in_dim'], self.representation_dim)
         self.scale_layer = nn.Linear(DEFAULT_CNN_ARCHITECTURE['DENSE'][-1]['in_dim'], self.representation_dim)

From 8303ca5729ceedbef1471fcb39069547ec96d699 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Sun, 16 Aug 2020 19:31:05 -0700
Subject: [PATCH 010/123] Address comments on PR, except for LinearWarmupCosine
 documentation, which I am confused about

---
 algos/decoders.py |  1 +
 algos/utils.py    | 28 ++++++++++++++++------------
 run_cifar.py      | 27 ++++++++++++++++-----------
 3 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/algos/decoders.py b/algos/decoders.py
index 96abb2f1..da923fcb 100644
--- a/algos/decoders.py
+++ b/algos/decoders.py
@@ -61,6 +61,7 @@ def __init__(self, representation_dim, projection_shape, sample=False, learn_sca
         super(ProjectionHead, self).__init__(representation_dim, projection_shape, sample)
 
         dim = self.representation_dim
+        # TODO(rohinmshah): Make the architecture configurable rather than reusing dim everywhere
         self.shared_mlp = nn.Sequential(nn.Linear(dim, dim),
                                         nn.ReLU(),
                                         nn.Linear(dim, dim),
diff --git a/algos/utils.py b/algos/utils.py
index e055777b..70d8a2bd 100644
--- a/algos/utils.py
+++ b/algos/utils.py
@@ -100,26 +100,30 @@ def log(self, msg):
 
 
 class LinearWarmupCosine(_LRScheduler):
-    def __init__(self, optimizer, warmup_epoch, T_max, eta_min=0, last_epoch=-1):
-        self.eta_min = eta_min
+    def __init__(self, optimizer, warmup_epoch, total_epochs, initial_learning_rate=0.0001, last_epoch=-1):
+        assert warmup_epoch >= 0
+        self.eta_min = initial_learning_rate
         self.warmup_epoch = warmup_epoch
-        self.cosine_epochs = T_max - warmup_epoch
+        self.cosine_epochs = total_epochs - warmup_epoch
         super(LinearWarmupCosine, self).__init__(optimizer, last_epoch)
 
     def get_lr(self):
-        if self.warmup_epoch > 0:
-            if self.last_epoch <= self.warmup_epoch:
-                return [base_lr / self.warmup_epoch * self.last_epoch for base_lr in self.base_lrs]
-        if ((self.last_epoch - self.warmup_epoch) - 1 - self.cosine_epochs) % (2 * self.cosine_epochs) == 0:
+        # Linear scaling if we are in the warmup stage
+        if self.warmup_epoch > 0 and self.last_epoch <= self.warmup_epoch:
+            return [base_lr / self.warmup_epoch * self.last_epoch for base_lr in self.base_lrs]
+
+        # Outside the linear scaling regime, we rescale so that warmup_epoch is epoch 0
+        rescaled_epoch = self.last_epoch - self.warmup_epoch
+        if (rescaled_epoch - 1 - self.cosine_epochs) % (2 * self.cosine_epochs) == 0:
             return [group['lr'] + (base_lr - self.eta_min) *
                     (1 - math.cos(math.pi / self.cosine_epochs)) / 2
                     for base_lr, group in
                     zip(self.base_lrs, self.optimizer.param_groups)]
-        else:
-            return [(1 + math.cos(math.pi * (self.last_epoch - self.warmup_epoch) / self.cosine_epochs)) /
-                    (1 + math.cos(math.pi * ((self.last_epoch - self.warmup_epoch) - 1) / self.cosine_epochs)) *
-                    (group['lr'] - self.eta_min) + self.eta_min
-                    for group in self.optimizer.param_groups]
+
+        return [(1 + math.cos(math.pi * rescaled_epoch / self.cosine_epochs)) /
+                (1 + math.cos(math.pi * (rescaled_epoch - 1) / self.cosine_epochs)) *
+                (group['lr'] - self.eta_min) + self.eta_min
+                for group in self.optimizer.param_groups]
 
 
 def set_global_seeds(seed):
diff --git a/run_cifar.py b/run_cifar.py
index 0edbb4fd..63d53214 100644
--- a/run_cifar.py
+++ b/run_cifar.py
@@ -2,7 +2,7 @@
 from gym.spaces import Discrete, Box
 from sacred import Experiment
 from sacred.observers import FileStorageObserver
-from algos.utils import gaussian_blur
+from algos.optimizers import LARS
 
 import numpy as np
 import os
@@ -116,7 +116,7 @@ def evaluate_classifier(classifier, data_dir, device):
 
 
 def representation_learning(algo, data_dir, num_epochs, device, log_dir):
-    print('Creating model for representation learning')
+    print('Train representation learner')
 
     if isinstance(algo, str):
         algo = globals()[algo]
@@ -136,28 +136,33 @@ def representation_learning(algo, data_dir, num_epochs, device, log_dir):
         # SimCLR doesn't use blur for CIFAR-10
     ]
     env = MockGymEnv(Box(low=0.0, high=1.0, shape=(3, 32, 32), dtype=np.float32))
+
+    transform = transforms.ToTensor()
+    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
+    rep_learning_data = transform_to_rl(trainset)
+    num_examples = len(rep_learning_data)
+    batch_size = 512
+    num_steps = num_epochs * int(ceil(num_examples / batch_size))
+
     # Note that the resnet18 model used here has an architecture meant for
     # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
     # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
     # It seems that SimCLR does not include the final fully connected layer for ResNets, so we set it to the identity.
     resnet_without_fc = resnet18()
     resnet_without_fc.fc = torch.nn.Identity()
-    # Note SimCLR uses LARSOptimizer, which we currently do not do
+
     model = algo(
-        env, log_dir=log_dir, batch_size=512, representation_dim=512, projection_dim=128,
+        env, log_dir=log_dir, batch_size=batch_size, representation_dim=512, projection_dim=128,
         device=device, normalize=False, shuffle_batches=True,
         encoder_kwargs={'architecture_module_cls': lambda *args: resnet_without_fc},
         augmenter_kwargs={'augmentations': rep_learning_augmentations},
-        optimizer_kwargs={'lr': 2.0, 'weight_decay': 1e-4, 'momentum': 0.9},
+        optimizer=LARS,
+        optimizer_kwargs={'lr': 2.0, 'weight_decay': 1e-4, 'momentum': 0.9, 'max_epoch': num_steps},
         scheduler=LinearWarmupCosine,
-        scheduler_kwargs={'warmup_epoch': 10, 'T_max': num_epochs},
+        scheduler_kwargs={'warmup_epoch': 10, 'total_epochs': num_epochs, 'initial_learning_rate': 0.2},
         loss_calculator_kwargs={'temp': 0.5},
     )
 
-    print('Train representation learner')
-    transform = transforms.ToTensor()
-    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
-    rep_learning_data = transform_to_rl(trainset)
     model.learn(rep_learning_data, num_epochs)
     env.close()
     return model
@@ -188,7 +193,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
     model = representation_learning(algo, data_dir, pretrain_epochs, device, log_dir)
 
     print('Train linear head')
-    classifier = LinearHead(model.encoder, 10).to(device)
+    classifier = LinearHead(model.encoder, output_dim=10).to(device)
     train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 
     print('Evaluate accuracy on test set')

From 474b6f9b78153b62e6ea9b458aca25601c105539 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Sun, 16 Aug 2020 20:02:20 -0700
Subject: [PATCH 011/123] Rewrote LinearWarmupCosine to be more understandable

---
 algos/encoders.py |  1 +
 algos/utils.py    | 30 +++++++++++++-----------------
 run_cifar.py      |  4 ++--
 3 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/algos/encoders.py b/algos/encoders.py
index 9c8caede..a88993e9 100644
--- a/algos/encoders.py
+++ b/algos/encoders.py
@@ -34,6 +34,7 @@ def __init__(self, obs_space, representation_dim):
         self.input_channel = obs_space.shape[0]
         self.representation_dim = representation_dim
 
+        shared_network_layers = []
         for layer_spec in DEFAULT_CNN_ARCHITECTURE['CONV']:
             shared_network_layers.append(nn.Conv2d(self.input_channel, layer_spec['out_dim'],
                                                    kernel_size=layer_spec['kernel_size'], stride=layer_spec['stride']))
diff --git a/algos/utils.py b/algos/utils.py
index 9c42e2aa..5bc23008 100644
--- a/algos/utils.py
+++ b/algos/utils.py
@@ -100,30 +100,26 @@ def log(self, msg):
 
 
 class LinearWarmupCosine(_LRScheduler):
-    def __init__(self, optimizer, warmup_epoch, total_epochs, initial_learning_rate=0.0001, last_epoch=-1):
+    def __init__(self, optimizer, warmup_epoch, total_epochs, eta_min=0.0, last_epoch=-1):
         assert warmup_epoch >= 0
-        self.eta_min = initial_learning_rate
+        self.eta_min = eta_min
         self.warmup_epoch = warmup_epoch
         self.cosine_epochs = total_epochs - warmup_epoch
         super(LinearWarmupCosine, self).__init__(optimizer, last_epoch)
 
     def get_lr(self):
         # Linear scaling if we are in the warmup stage
-        if self.warmup_epoch > 0 and self.last_epoch <= self.warmup_epoch:
-            return [base_lr / self.warmup_epoch * self.last_epoch for base_lr in self.base_lrs]
-
-        # Outside the linear scaling regime, we rescale so that warmup_epoch is epoch 0
-        rescaled_epoch = self.last_epoch - self.warmup_epoch
-        if (rescaled_epoch - 1 - self.cosine_epochs) % (2 * self.cosine_epochs) == 0:
-            return [group['lr'] + (base_lr - self.eta_min) *
-                    (1 - math.cos(math.pi / self.cosine_epochs)) / 2
-                    for base_lr, group in
-                    zip(self.base_lrs, self.optimizer.param_groups)]
-
-        return [(1 + math.cos(math.pi * rescaled_epoch / self.cosine_epochs)) /
-                (1 + math.cos(math.pi * (rescaled_epoch - 1) / self.cosine_epochs)) *
-                (group['lr'] - self.eta_min) + self.eta_min
-                for group in self.optimizer.param_groups]
+        use_linear_scaling = self.warmup_epoch > 0 and self.last_epoch < self.warmup_epoch
+        result = []
+        for base_lr in self.base_lrs:
+            delta = base_lr - self.eta_min
+            if use_linear_scaling:
+                fraction = (self.last_epoch + 1) / self.warmup_epoch
+            else:
+                rescaled_epoch = self.last_epoch - self.warmup_epoch
+                fraction = 0.5 * (1 + math.cos(math.pi * rescaled_epoch / self.cosine_epochs))
+            result.append(self.eta_min + fraction * delta)
+        return result
 
 
 def set_global_seeds(seed):
diff --git a/run_cifar.py b/run_cifar.py
index 63d53214..d0fb6098 100644
--- a/run_cifar.py
+++ b/run_cifar.py
@@ -3,6 +3,7 @@
 from sacred import Experiment
 from sacred.observers import FileStorageObserver
 from algos.optimizers import LARS
+from math import ceil
 
 import numpy as np
 import os
@@ -60,7 +61,6 @@ def forward(self, x):
 
 def train_classifier(classifier, data_dir, num_epochs, device):
     transform = transforms.Compose([
-        transforms.ToPILImage(),
         transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
         transforms.RandomHorizontalFlip(),
         # No color jitter or grayscale for finetuning
@@ -159,7 +159,7 @@ def representation_learning(algo, data_dir, num_epochs, device, log_dir):
         optimizer=LARS,
         optimizer_kwargs={'lr': 2.0, 'weight_decay': 1e-4, 'momentum': 0.9, 'max_epoch': num_steps},
         scheduler=LinearWarmupCosine,
-        scheduler_kwargs={'warmup_epoch': 10, 'total_epochs': num_epochs, 'initial_learning_rate': 0.2},
+        scheduler_kwargs={'warmup_epoch': 10, 'total_epochs': num_epochs},
         loss_calculator_kwargs={'temp': 0.5},
     )
 

From 9cae4a786f0b4459cfdb35b0dc7354f85318a497 Mon Sep 17 00:00:00 2001
From: Rohin Shah <rohinmshah@gmail.com>
Date: Tue, 25 Aug 2020 17:22:27 -0700
Subject: [PATCH 012/123] Make things more parameterizable

---
 src/il_representations/scripts/run_cifar.py | 54 ++++++++++++++-------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index bbf1d554..3dbf5a36 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -1,7 +1,7 @@
 from gym.spaces import Discrete, Box
 from sacred import Experiment
 from sacred.observers import FileStorageObserver
-from il_representations.algos import *
+from il_representations import algos
 from il_representations.algos.augmenters import ColorSpace
 from il_representations.algos.optimizers import LARS
 from il_representations.algos.utils import LinearWarmupCosine
@@ -49,11 +49,11 @@ def transform_to_rl(dataset):
 
 
 class LinearHead(nn.Module):
-    def __init__(self, encoder, output_dim):
+    def __init__(self, encoder, encoder_dim, output_dim):
         super().__init__()
         self.encoder = encoder
         self.output_dim = output_dim
-        self.layer = nn.Linear(encoder.representation_dim, output_dim)
+        self.layer = nn.Linear(encoder_dim, output_dim)
 
     def forward(self, x):
         encoding = self.encoder.encode_context(x, None).loc.detach()
@@ -116,12 +116,11 @@ def evaluate_classifier(classifier, data_dir, device):
     print('Accuracy: %d %%' % (100 * correct / total))
 
 
-def representation_learning(algo, data_dir, num_epochs, device, log_dir):
+def representation_learning(algo, data_dir, device, log_dir, config):
     print('Train representation learner')
-
     if isinstance(algo, str):
-        algo = globals()[algo]
-    assert issubclass(algo, RepresentationLearner)
+        algo = getattr(algos, algo)
+    assert issubclass(algo, algos.RepresentationLearner)
 
     rep_learning_augmentations = [
         transforms.Lambda(torch.tensor),
@@ -142,7 +141,8 @@ def representation_learning(algo, data_dir, num_epochs, device, log_dir):
     trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
     rep_learning_data = transform_to_rl(trainset)
     num_examples = len(rep_learning_data)
-    batch_size = 512
+    num_epochs = config['pretrain_epochs']
+    batch_size = config['pretrain_batch_size']
     num_steps = num_epochs * int(ceil(num_examples / batch_size))
 
     # Note that the resnet18 model used here has an architecture meant for
@@ -153,16 +153,28 @@ def representation_learning(algo, data_dir, num_epochs, device, log_dir):
     resnet_without_fc.fc = torch.nn.Identity()
 
     model = algo(
-        env, log_dir=log_dir, batch_size=batch_size, representation_dim=512, projection_dim=128,
-        device=device, normalize=False, shuffle_batches=True, color_space=ColorSpace.RGB,
-        save_interval=100,
+        env,
+        log_dir=log_dir,
+        batch_size=batch_size,
+        representation_dim=config['representation_dim'],
+        projection_dim=config['projection_dim'],
+        device=device,
+        normalize=False,
+        shuffle_batches=True,
+        color_space=ColorSpace.RGB,
+        save_interval=config['pretrain_save_interval'],
         encoder_kwargs={'architecture_module_cls': lambda *args: resnet_without_fc},
         augmenter_kwargs={'augmentations': rep_learning_augmentations},
         optimizer=LARS,
-        optimizer_kwargs={'lr': 1.0, 'weight_decay': 1e-4, 'momentum': 0.9, 'max_epoch': num_steps},
+        optimizer_kwargs={
+            'lr': config['pretrain_lr'],
+            'weight_decay': config['pretrain_weight_decay'],
+            'momentum': config['pretrain_momentum'],
+            'max_epoch': num_steps,
+        },
         scheduler=LinearWarmupCosine,
         scheduler_kwargs={'warmup_epoch': 10, 'total_epochs': num_epochs},
-        loss_calculator_kwargs={'temp': 0.5},
+        loss_calculator_kwargs={'temp': config['pretrain_temperature']},
     )
 
     model.learn(rep_learning_data, num_epochs)
@@ -176,26 +188,34 @@ def representation_learning(algo, data_dir, num_epochs, device, log_dir):
 @cifar_ex.config
 def default_config():
     seed = 1
-    algo = SimCLR
+    algo = 'SimCLR'
     data_dir = 'cifar10/'
     pretrain_epochs = 1000
     finetune_epochs = 100
+    representation_dim = 512
+    projection_dim = 128
+    pretrain_lr = 1.0
+    pretrain_weight_decay = 1e-4
+    pretrain_momentum = 0.9
+    pretrain_batch_size = 512
+    pretrain_save_interval = 100
+    pretrain_temperature = 0.5
     _ = locals()
     del _
 
 
 @cifar_ex.main
-def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, _config):
+def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_dim, _config):
     # TODO fix this hacky nonsense
     log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
     os.mkdir(log_dir)
     os.makedirs(data_dir, exist_ok=True)
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = representation_learning(algo, data_dir, pretrain_epochs, device, log_dir)
+    model = representation_learning(algo, data_dir, device, log_dir, _config)
 
     print('Train linear head')
-    classifier = LinearHead(model.encoder, output_dim=10).to(device)
+    classifier = LinearHead(model.encoder, representation_dim, output_dim=10).to(device)
     train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 
     print('Evaluate accuracy on test set')

From 89d58643e4cd96fdd9178d75006b8003da32fe00 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Tue, 13 Apr 2021 16:21:49 +0800
Subject: [PATCH 013/123] Update .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 7e0d2299..e274ad10 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,4 @@ test_observer
 .idea
 *.xml
 *.iml
+venv
\ No newline at end of file

From 3fef45ea03304ad9775e4afbb81c8fc06fbec2bc Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Mon, 19 Apr 2021 13:39:27 +0800
Subject: [PATCH 014/123] update model setting

---
 src/il_representations/algos/augmenters.py  |  9 ++--
 src/il_representations/scripts/run_cifar.py | 49 ++++++++++++---------
 2 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/src/il_representations/algos/augmenters.py b/src/il_representations/algos/augmenters.py
index 24b0ebda..3af5808c 100644
--- a/src/il_representations/algos/augmenters.py
+++ b/src/il_representations/algos/augmenters.py
@@ -13,9 +13,12 @@
 
 
 class Augmenter(ABC):
-    def __init__(self, augmenter_spec, color_space):
-        augment_op = StandardAugmentations.from_string_spec(
-            augmenter_spec, color_space)
+    def __init__(self, augmenter_spec, color_space, augment_func=None):
+        if augment_func:
+            self.augment_op = augment_func
+        else:
+            augment_op = StandardAugmentations.from_string_spec(
+                augmenter_spec, color_space)
         self.augment_op = augment_op
 
     @abstractmethod
diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 3dbf5a36..b4e3a8e7 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -2,9 +2,9 @@
 from sacred import Experiment
 from sacred.observers import FileStorageObserver
 from il_representations import algos
-from il_representations.algos.augmenters import ColorSpace
 from il_representations.algos.optimizers import LARS
 from il_representations.algos.utils import LinearWarmupCosine
+from imitation.augment.color import ColorSpace
 from math import ceil
 
 import numpy as np
@@ -15,7 +15,7 @@
 import torch.optim as optim
 import torchvision
 import torchvision.transforms as transforms
-from torchvision.models.resnet import resnet18
+from torchvision.models.resnet import resnet50
 
 
 class MockGymEnv(object):
@@ -26,6 +26,7 @@ class MockGymEnv(object):
     def __init__(self, obs_space):
         self.observation_space = obs_space
         self.action_space = Discrete(1)
+        self.color_space = ColorSpace.RGB
 
     def seed(self, seed):
         pass
@@ -135,7 +136,19 @@ def representation_learning(algo, data_dir, device, log_dir, config):
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
         # SimCLR doesn't use blur for CIFAR-10
     ]
+
     env = MockGymEnv(Box(low=0.0, high=1.0, shape=(3, 32, 32), dtype=np.float32))
+    augmenter_kwargs = {
+        "augmenter_spec": "translate,flip_lr,color_jitter_ex,gray",
+        "color_space": env.color_space,
+
+        # (Cynthia) Here I'm using augmenter_func because I want our settings
+        # to be as close to SimCLR as possible
+        "augmenter_func": rep_learning_augmentations
+    }
+    optimizer_kwargs = {
+        "lr": 3e-4
+    }
 
     transform = transforms.ToTensor()
     trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
@@ -145,15 +158,15 @@ def representation_learning(algo, data_dir, device, log_dir, config):
     batch_size = config['pretrain_batch_size']
     num_steps = num_epochs * int(ceil(num_examples / batch_size))
 
-    # Note that the resnet18 model used here has an architecture meant for
-    # ImageNet, not CIFAR-10. The SimCLR implementation uses a version
-    # specialized for CIFAR, see https://github.com/google-research/simclr/blob/37ad4e01fb22e3e6c7c4753bd51a1e481c2d992e/resnet.py#L531
-    # It seems that SimCLR does not include the final fully connected layer for ResNets, so we set it to the identity.
-    resnet_without_fc = resnet18()
-    resnet_without_fc.fc = torch.nn.Identity()
+    # Modify resnet according to SimCLR paper Appendix B.9
+    simclr_resnet = resnet50()
+    simclr_resnet.fc = torch.nn.Identity()
+    simclr_resnet.conv1 = torch.nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
+    simclr_resnet.maxpool = torch.nn.Identity()
 
     model = algo(
-        env,
+        observation_space=env.observation_space,
+        action_space=env.action_space,
         log_dir=log_dir,
         batch_size=batch_size,
         representation_dim=config['representation_dim'],
@@ -163,21 +176,17 @@ def representation_learning(algo, data_dir, device, log_dir, config):
         shuffle_batches=True,
         color_space=ColorSpace.RGB,
         save_interval=config['pretrain_save_interval'],
-        encoder_kwargs={'architecture_module_cls': lambda *args: resnet_without_fc},
-        augmenter_kwargs={'augmentations': rep_learning_augmentations},
-        optimizer=LARS,
-        optimizer_kwargs={
-            'lr': config['pretrain_lr'],
-            'weight_decay': config['pretrain_weight_decay'],
-            'momentum': config['pretrain_momentum'],
-            'max_epoch': num_steps,
-        },
+        encoder_kwargs={'obs_encoder_cls': lambda *args: simclr_resnet},
+        augmenter_kwargs=augmenter_kwargs,
+        optimizer=torch.optim.Adam,
+        optimizer_kwargs=optimizer_kwargs,
         scheduler=LinearWarmupCosine,
         scheduler_kwargs={'warmup_epoch': 10, 'total_epochs': num_epochs},
         loss_calculator_kwargs={'temp': config['pretrain_temperature']},
     )
 
-    model.learn(rep_learning_data, num_epochs)
+    # TODO: Check batches per epoch
+    model.learn(rep_learning_data, 1000, num_epochs)
     env.close()
     return model
 
@@ -194,7 +203,7 @@ def default_config():
     finetune_epochs = 100
     representation_dim = 512
     projection_dim = 128
-    pretrain_lr = 1.0
+    pretrain_lr = 3e-4
     pretrain_weight_decay = 1e-4
     pretrain_momentum = 0.9
     pretrain_batch_size = 512

From a93d017662100405eaee9436ef031c50cf863df7 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Tue, 20 Apr 2021 15:15:30 +0800
Subject: [PATCH 015/123] Make CIFAR runnable for RepL!

---
 src/il_representations/algos/augmenters.py  | 12 +++-
 src/il_representations/algos/decoders.py    |  2 +-
 src/il_representations/algos/utils.py       |  2 +-
 src/il_representations/envs/auto.py         | 10 ++-
 src/il_representations/envs/cifar_envs.py   | 47 ++++++++++++
 src/il_representations/envs/config.py       |  7 +-
 src/il_representations/scripts/run_cifar.py | 79 +++++++++------------
 7 files changed, 110 insertions(+), 49 deletions(-)
 create mode 100644 src/il_representations/envs/cifar_envs.py

diff --git a/src/il_representations/algos/augmenters.py b/src/il_representations/algos/augmenters.py
index 3af5808c..37d201f4 100644
--- a/src/il_representations/algos/augmenters.py
+++ b/src/il_representations/algos/augmenters.py
@@ -11,15 +11,18 @@
 either augment just the context, or both the context and the target, depending on the algorithm.
 """
 
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
 
 class Augmenter(ABC):
     def __init__(self, augmenter_spec, color_space, augment_func=None):
+        self.augment_func = augment_func
         if augment_func:
             self.augment_op = augment_func
         else:
             augment_op = StandardAugmentations.from_string_spec(
                 augmenter_spec, color_space)
-        self.augment_op = augment_op
+            self.augment_op = augment_op
 
     @abstractmethod
     def __call__(self, contexts, targets):
@@ -36,6 +39,13 @@ def __call__(self, contexts, targets):
 
 class AugmentContextAndTarget(Augmenter):
     def __call__(self, contexts, targets):
+        if self.augment_func:
+            context_ret, target_ret = [], []
+            for context, target in zip(contexts, targets):
+                context_ret.append(self.augment_op(context))
+                target_ret.append(self.augment_op(target))
+            return torch.stack(context_ret, dim=0).to(device), \
+                   torch.stack(target_ret, dim=0).to(device)
         return self.augment_op(contexts), self.augment_op(targets)
 
 
diff --git a/src/il_representations/algos/decoders.py b/src/il_representations/algos/decoders.py
index 3319adfd..7f84c3da 100644
--- a/src/il_representations/algos/decoders.py
+++ b/src/il_representations/algos/decoders.py
@@ -131,7 +131,7 @@ def _apply_projection_layer(self, z_dist, mean_layer, stdev_layer):
             # We better not have had a learned standard deviation in
             # the encoder, since there's no clear way on how to pass
             # it forward
-            assert np.all((z_dist.stddev == 1).numpy())
+            assert np.all((z_dist.stddev == 1).cpu().numpy())
             stddev = self.ones_like_projection_dim(mean)
         else:
             stddev = stdev_layer(z_vector)
diff --git a/src/il_representations/algos/utils.py b/src/il_representations/algos/utils.py
index 831394b0..599daae2 100644
--- a/src/il_representations/algos/utils.py
+++ b/src/il_representations/algos/utils.py
@@ -99,7 +99,7 @@ def log(self, msg):
 
 
 class LinearWarmupCosine(_LRScheduler):
-    def __init__(self, optimizer, T_max, warmup_epoch=30, eta_min=0, last_epoch=-1):
+    def __init__(self, optimizer, T_max, total_epochs, warmup_epoch=30, eta_min=0, last_epoch=-1):
         self.T_max = T_max
         self.eta_min = eta_min
         self.warmup_epoch = warmup_epoch
diff --git a/src/il_representations/envs/auto.py b/src/il_representations/envs/auto.py
index bd819426..622791f0 100644
--- a/src/il_representations/envs/auto.py
+++ b/src/il_representations/envs/auto.py
@@ -21,6 +21,7 @@
 from il_representations.envs.minecraft_envs import (MinecraftVectorWrapper,
                                                     get_env_name_minecraft,
                                                     load_dataset_minecraft)
+from il_representations.envs.cifar_envs import load_dataset_cifar, MockGymEnv
 from il_representations.scripts.utils import update as dict_update
 
 ERROR_MESSAGE = "no support for benchmark_name={benchmark_name!r}"
@@ -74,6 +75,8 @@ def load_dict_dataset(benchmark_name, n_traj=None):
         dataset_dict = load_dataset_atari(n_traj=n_traj)
     elif benchmark_name == 'minecraft':
         dataset_dict = load_dataset_minecraft(n_traj=n_traj)
+    elif benchmark_name == 'cifar-10':
+        dataset_dict = load_dataset_cifar()
     else:
         raise NotImplementedError(ERROR_MESSAGE.format(**locals()))
 
@@ -100,6 +103,8 @@ def get_gym_env_name(benchmark_name, dm_control_full_env_names, task_name):
         return task_name
     elif benchmark_name == 'minecraft':
         return get_env_name_minecraft()  # uses task_name implicitly through config param
+    elif benchmark_name == 'cifar-10':
+        return 'cifar-10-cls'
     raise NotImplementedError(ERROR_MESSAGE.format(**locals()))
 
 
@@ -163,6 +168,8 @@ def load_vec_env(benchmark_name, dm_control_full_env_names,
                             parallel=venv_parallel,
                             wrapper_class=MinecraftVectorWrapper,
                             max_episode_steps=minecraft_max_env_steps)
+    elif benchmark_name == 'cifar-10':
+        return MockGymEnv()
     raise NotImplementedError(ERROR_MESSAGE.format(**locals()))
 
 
@@ -266,7 +273,8 @@ def load_color_space(benchmark_name):
         'magical': ColorSpace.RGB,
         'dm_control': ColorSpace.RGB,
         'atari': ColorSpace.GRAY,
-        'minecraft': ColorSpace.RGB
+        'minecraft': ColorSpace.RGB,
+        'cifar-10': ColorSpace.RGB
     }
     try:
         return color_spaces[benchmark_name]
diff --git a/src/il_representations/envs/cifar_envs.py b/src/il_representations/envs/cifar_envs.py
new file mode 100644
index 00000000..82e0db1b
--- /dev/null
+++ b/src/il_representations/envs/cifar_envs.py
@@ -0,0 +1,47 @@
+import torch
+import numpy as np
+import torchvision
+import torchvision.transforms as transforms
+
+from imitation.augment.color import ColorSpace
+from gym.spaces import Discrete, Box
+
+
+def load_dataset_cifar():
+    """Return a dataset dict"""
+    dataset = torchvision.datasets.CIFAR10(root='./cifar', train=True, download=True,
+                                           transform=transforms.ToTensor())
+
+    obs, acts = [], []
+    for i in range(len(dataset)):
+        img, label = dataset[i]
+        obs.append(img.cpu().numpy())
+        acts.append(label)
+
+    obs = np.stack([o for o in obs], axis=0)
+    acts = np.array(acts)
+
+    data_dict = {
+        'obs': obs,
+        'acts': acts,
+        'dones': np.array([False] * len(dataset)),
+    }
+
+    return data_dict
+
+
+class MockGymEnv(object):
+    """A mock Gym env for a supervised learning dataset pretending to be an RL
+    task. Action space is set to Discrete(1), observation space corresponds to
+    the original supervised learning task.
+    """
+    def __init__(self):
+        self.observation_space = Box(low=0.0, high=1.0, shape=(3, 32, 32), dtype=np.float32)
+        self.action_space = Discrete(1)
+        self.color_space = ColorSpace.RGB
+
+    def seed(self, seed):
+        pass
+
+    def close(self):
+        pass
\ No newline at end of file
diff --git a/src/il_representations/envs/config.py b/src/il_representations/envs/config.py
index 6a6c3bff..9676e1a4 100644
--- a/src/il_representations/envs/config.py
+++ b/src/il_representations/envs/config.py
@@ -6,7 +6,7 @@
 
 from sacred import Ingredient
 
-ALL_BENCHMARK_NAMES = {"atari", "magical", "dm_control", "minecraft"}
+ALL_BENCHMARK_NAMES = {"atari", "magical", "dm_control", "minecraft", "cifar-10"}
 
 # see env_cfg_defaults docstring for description of this ingredient
 env_cfg_ingredient = Ingredient('env_cfg')
@@ -70,6 +70,11 @@ def env_cfg_defaults():
     # ###############################
     minecraft_max_env_steps = None
 
+    # ###############################
+    # CIFAR-10-specific config variables
+    # (none currently present)
+    # ###############################
+
     _ = locals()
     del _
 
diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index b4e3a8e7..93464b01 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -1,12 +1,3 @@
-from gym.spaces import Discrete, Box
-from sacred import Experiment
-from sacred.observers import FileStorageObserver
-from il_representations import algos
-from il_representations.algos.optimizers import LARS
-from il_representations.algos.utils import LinearWarmupCosine
-from imitation.augment.color import ColorSpace
-from math import ceil
-
 import numpy as np
 import os
 import PIL
@@ -16,6 +7,26 @@
 import torchvision
 import torchvision.transforms as transforms
 from torchvision.models.resnet import resnet50
+from math import ceil
+import webdataset as wds
+
+from gym.spaces import Discrete, Box
+from sacred import Experiment
+from sacred.observers import FileStorageObserver
+from il_representations import algos
+from il_representations.algos.optimizers import LARS
+from il_representations.algos.utils import LinearWarmupCosine
+from il_representations.envs.auto import load_wds_datasets
+from il_representations.envs.config import (env_cfg_ingredient,
+                                            env_data_ingredient,
+                                            venv_opts_ingredient)
+from imitation.augment.color import ColorSpace
+
+
+cifar_ex = Experiment('cifar', ingredients=[
+                                    env_cfg_ingredient, env_data_ingredient,
+                                    venv_opts_ingredient
+                                ])
 
 
 class MockGymEnv(object):
@@ -35,20 +46,6 @@ def close(self):
         pass
 
 
-def transform_to_rl(dataset):
-    """Transforms the input supervised learning dataset into an "RL dataset", by
-    adding dummy 'actions' (always 0) and 'dones' (always False), and pretending
-    that everything is from the same 'trajectory'.
-    """
-    obs = [img for img, label in dataset]
-    data_dict = {
-        'obs': obs,
-        'acts': [0.0] * len(obs),
-        'dones': [False] * len(obs),
-    }
-    return data_dict
-
-
 class LinearHead(nn.Module):
     def __init__(self, encoder, encoder_dim, output_dim):
         super().__init__()
@@ -117,14 +114,14 @@ def evaluate_classifier(classifier, data_dir, device):
     print('Accuracy: %d %%' % (100 * correct / total))
 
 
-def representation_learning(algo, data_dir, device, log_dir, config):
+def representation_learning(algo, device, log_dir, config):
     print('Train representation learner')
     if isinstance(algo, str):
         algo = getattr(algos, algo)
     assert issubclass(algo, algos.RepresentationLearner)
 
-    rep_learning_augmentations = [
-        transforms.Lambda(torch.tensor),
+    rep_learning_augmentations = transforms.Compose([
+        transforms.Lambda(lambda x: (x.cpu().numpy() * 255).astype(np.uint8)),
         transforms.ToPILImage(),
         transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
         transforms.RandomHorizontalFlip(),
@@ -135,38 +132,36 @@ def representation_learning(algo, data_dir, device, log_dir, config):
         transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
         # SimCLR doesn't use blur for CIFAR-10
-    ]
+    ])
 
+    rep_learning_data, combined_meta = load_wds_datasets([{}])
     env = MockGymEnv(Box(low=0.0, high=1.0, shape=(3, 32, 32), dtype=np.float32))
     augmenter_kwargs = {
         "augmenter_spec": "translate,flip_lr,color_jitter_ex,gray",
-        "color_space": env.color_space,
+        "color_space": combined_meta['color_space'],
 
         # (Cynthia) Here I'm using augmenter_func because I want our settings
         # to be as close to SimCLR as possible
-        "augmenter_func": rep_learning_augmentations
+        "augment_func": rep_learning_augmentations
     }
     optimizer_kwargs = {
         "lr": 3e-4
     }
 
-    transform = transforms.ToTensor()
-    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
-    rep_learning_data = transform_to_rl(trainset)
     num_examples = len(rep_learning_data)
     num_epochs = config['pretrain_epochs']
     batch_size = config['pretrain_batch_size']
-    num_steps = num_epochs * int(ceil(num_examples / batch_size))
+    batches_per_epoch = ceil(num_examples / batch_size)
 
     # Modify resnet according to SimCLR paper Appendix B.9
     simclr_resnet = resnet50()
-    simclr_resnet.fc = torch.nn.Identity()
+    simclr_resnet.fc = torch.nn.Linear(2048, config['representation_dim'])
     simclr_resnet.conv1 = torch.nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
     simclr_resnet.maxpool = torch.nn.Identity()
 
     model = algo(
-        observation_space=env.observation_space,
-        action_space=env.action_space,
+        observation_space=combined_meta['observation_space'],
+        action_space=combined_meta['action_space'],
         log_dir=log_dir,
         batch_size=batch_size,
         representation_dim=config['representation_dim'],
@@ -174,7 +169,7 @@ def representation_learning(algo, data_dir, device, log_dir, config):
         device=device,
         normalize=False,
         shuffle_batches=True,
-        color_space=ColorSpace.RGB,
+        color_space=combined_meta['color_space'],
         save_interval=config['pretrain_save_interval'],
         encoder_kwargs={'obs_encoder_cls': lambda *args: simclr_resnet},
         augmenter_kwargs=augmenter_kwargs,
@@ -185,15 +180,11 @@ def representation_learning(algo, data_dir, device, log_dir, config):
         loss_calculator_kwargs={'temp': config['pretrain_temperature']},
     )
 
-    # TODO: Check batches per epoch
-    model.learn(rep_learning_data, 1000, num_epochs)
+    model.learn(rep_learning_data, batches_per_epoch, num_epochs)
     env.close()
     return model
 
 
-cifar_ex = Experiment('cifar')
-
-
 @cifar_ex.config
 def default_config():
     seed = 1
@@ -221,7 +212,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     os.makedirs(data_dir, exist_ok=True)
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = representation_learning(algo, data_dir, device, log_dir, _config)
+    model = representation_learning(algo, device, log_dir, _config)
 
     print('Train linear head')
     classifier = LinearHead(model.encoder, representation_dim, output_dim=10).to(device)
@@ -232,5 +223,5 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
 
 
 if __name__ == '__main__':
-    cifar_ex.observers.append(FileStorageObserver('cifar_runs'))
+    cifar_ex.observers.append(FileStorageObserver('runs/cifar_runs'))
     cifar_ex.run_commandline()

From f2fc56b6bb50eea4bc16d4e1782e9b7ddd683f3d Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Tue, 20 Apr 2021 15:57:33 +0800
Subject: [PATCH 016/123] classification + cleanup

---
 src/il_representations/scripts/run_cifar.py | 113 +++++++++++++-------
 1 file changed, 76 insertions(+), 37 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 93464b01..4243a2b4 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -1,6 +1,7 @@
 import numpy as np
 import os
 import PIL
+import json
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -29,32 +30,16 @@
                                 ])
 
 
-class MockGymEnv(object):
-    """A mock Gym env for a supervised learning dataset pretending to be an RL
-    task. Action space is set to Discrete(1), observation space corresponds to
-    the original supervised learning task.
-    """
-    def __init__(self, obs_space):
-        self.observation_space = obs_space
-        self.action_space = Discrete(1)
-        self.color_space = ColorSpace.RGB
-
-    def seed(self, seed):
-        pass
-
-    def close(self):
-        pass
-
-
 class LinearHead(nn.Module):
     def __init__(self, encoder, encoder_dim, output_dim):
         super().__init__()
         self.encoder = encoder
+        self.encoder.fc = torch.nn.Identity()
         self.output_dim = output_dim
-        self.layer = nn.Linear(encoder_dim, output_dim)
+        self.layer = nn.Linear(2048, output_dim)
 
     def forward(self, x):
-        encoding = self.encoder.encode_context(x, None).loc.detach()
+        encoding = self.encoder(x)
         return self.layer(encoding)
 
 
@@ -68,12 +53,24 @@ def train_classifier(classifier, data_dir, num_epochs, device):
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
     ])
     trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
-    trainloader = torch.utils.data.DataLoader(trainset, batch_size=512, shuffle=True)
+    trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
     criterion = nn.CrossEntropyLoss().to(device)
     optimizer = optim.SGD(classifier.layer.parameters(), lr=0.2, momentum=0.9, weight_decay=0.0, nesterov=True)
     scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
 
+    test_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+    ])
+    testset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True, transform=test_transform)
+    testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False)
+
+    progress_dict = {'loss': [], 'train_acc': [], 'test_acc': []}
+
     for epoch in range(num_epochs):
+        loss_meter = AverageMeter()
+        train_acc_meter = AverageMeter()
+
         print(f"Epoch {epoch}/{num_epochs} with lr {optimizer.param_groups[0]['lr']}")
         running_loss = 0.0
         for i, (inputs, labels) in enumerate(trainloader, 0):
@@ -85,33 +82,42 @@ def train_classifier(classifier, data_dir, num_epochs, device):
             optimizer.step()
 
             # print statistics
+            train_acc_meter.update(accuracy(outputs, labels))
+            loss_meter.update(loss.item())
             running_loss += loss.item()
+
             if i % 20 == 19:    # print every 20 mini-batches
-                print('[Epoch %d, Batch %3d] Average loss: %.3f' %
-                      (epoch + 1, i + 1, running_loss / 20))
+                # print('[Epoch %d, Batch %3d] Average loss: %.3f, Average acc' %
+                #       (epoch + 1, i + 1, running_loss / 20))
+                print(f"[Epoch {epoch}, Batch {i}] "
+                      f"Average loss: {loss_meter.avg} "
+                      f"Average acc: {train_acc_meter.avg} "
+                      f"Running loss: {running_loss / 20}")
                 running_loss = 0.0
 
         scheduler.step()
+        test_acc = evaluate_classifier(testloader, classifier, device)
 
+        progress_dict['loss'].append(loss_meter.avg)
+        progress_dict['train_acc'].append(train_acc_meter.avg)
+        progress_dict['test_acc'].append(test_acc)
 
-def evaluate_classifier(classifier, data_dir, device):
-    transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-    ])
-    testset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True, transform=transform)
-    testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False)
-    correct = 0
+        with open('./progress.json') as f:
+            json.dump(f)
+
+
+def evaluate_classifier(testloader, classifier, device):
     total = 0
+    test_acc_meter = AverageMeter()
     with torch.no_grad():
         for images, labels in testloader:
             images, labels = images.to(device), labels.to(device)
             outputs = classifier(images)
             _, predicted = torch.max(outputs.data, 1)
             total += labels.size(0)
-            correct += (predicted == labels).sum().item()
+            test_acc_meter.update(accuracy(outputs, labels))
 
-    print('Accuracy: %d %%' % (100 * correct / total))
+    return test_acc_meter.avg
 
 
 def representation_learning(algo, device, log_dir, config):
@@ -135,7 +141,6 @@ def representation_learning(algo, device, log_dir, config):
     ])
 
     rep_learning_data, combined_meta = load_wds_datasets([{}])
-    env = MockGymEnv(Box(low=0.0, high=1.0, shape=(3, 32, 32), dtype=np.float32))
     augmenter_kwargs = {
         "augmenter_spec": "translate,flip_lr,color_jitter_ex,gray",
         "color_space": combined_meta['color_space'],
@@ -180,9 +185,43 @@ def representation_learning(algo, device, log_dir, config):
         loss_calculator_kwargs={'temp': config['pretrain_temperature']},
     )
 
-    model.learn(rep_learning_data, batches_per_epoch, num_epochs)
-    env.close()
-    return model
+    _, encoder_checkpoint_path = model.learn(rep_learning_data, batches_per_epoch, num_epochs)
+    pretrained_model = torch.load(encoder_checkpoint_path)
+    return pretrained_model
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+class AverageMeter(object):
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
 
 
 @cifar_ex.config
@@ -215,7 +254,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     model = representation_learning(algo, device, log_dir, _config)
 
     print('Train linear head')
-    classifier = LinearHead(model.encoder, representation_dim, output_dim=10).to(device)
+    classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
     train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 
     print('Evaluate accuracy on test set')

From 988ec4ac0a69f784e4aaef64a0106040d1c7da1a Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Tue, 20 Apr 2021 16:21:56 +0800
Subject: [PATCH 017/123] some cleanup

---
 src/il_representations/scripts/run_cifar.py | 31 ++++++++++-----------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 4243a2b4..f1bde61e 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -9,19 +9,15 @@
 import torchvision.transforms as transforms
 from torchvision.models.resnet import resnet50
 from math import ceil
-import webdataset as wds
 
-from gym.spaces import Discrete, Box
 from sacred import Experiment
 from sacred.observers import FileStorageObserver
 from il_representations import algos
-from il_representations.algos.optimizers import LARS
 from il_representations.algos.utils import LinearWarmupCosine
 from il_representations.envs.auto import load_wds_datasets
 from il_representations.envs.config import (env_cfg_ingredient,
                                             env_data_ingredient,
                                             venv_opts_ingredient)
-from imitation.augment.color import ColorSpace
 
 
 cifar_ex = Experiment('cifar', ingredients=[
@@ -34,13 +30,10 @@ class LinearHead(nn.Module):
     def __init__(self, encoder, encoder_dim, output_dim):
         super().__init__()
         self.encoder = encoder
-        self.encoder.fc = torch.nn.Identity()
-        self.output_dim = output_dim
-        self.layer = nn.Linear(2048, output_dim)
+        self.encoder.fc = nn.Linear(2048, output_dim)
 
     def forward(self, x):
-        encoding = self.encoder(x)
-        return self.layer(encoding)
+        return self.encoder(x)
 
 
 def train_classifier(classifier, data_dir, num_epochs, device):
@@ -55,7 +48,8 @@ def train_classifier(classifier, data_dir, num_epochs, device):
     trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
     trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
     criterion = nn.CrossEntropyLoss().to(device)
-    optimizer = optim.SGD(classifier.layer.parameters(), lr=0.2, momentum=0.9, weight_decay=0.0, nesterov=True)
+    optimizer = optim.Adam(classifier.encoder.parameters(), lr=3e-4)
+    # optimizer = optim.Adam(classifier.encoder.fc.parameters(), lr=3e-4, momentum=0.9, weight_decay=0.0, nesterov=True)
     scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
 
     test_transform = transforms.Compose([
@@ -67,6 +61,8 @@ def train_classifier(classifier, data_dir, num_epochs, device):
 
     progress_dict = {'loss': [], 'train_acc': [], 'test_acc': []}
 
+    start_time = time.time()
+
     for epoch in range(num_epochs):
         loss_meter = AverageMeter()
         train_acc_meter = AverageMeter()
@@ -82,14 +78,15 @@ def train_classifier(classifier, data_dir, num_epochs, device):
             optimizer.step()
 
             # print statistics
-            train_acc_meter.update(accuracy(outputs, labels))
+            train_acc_meter.update(accuracy(outputs, labels)[0].item())
             loss_meter.update(loss.item())
             running_loss += loss.item()
 
             if i % 20 == 19:    # print every 20 mini-batches
-                # print('[Epoch %d, Batch %3d] Average loss: %.3f, Average acc' %
-                #       (epoch + 1, i + 1, running_loss / 20))
-                print(f"[Epoch {epoch}, Batch {i}] "
+                hours, rem = divmod(time.time() - start_time, 3600)
+                minutes, seconds = divmod(rem, 60)
+                print(f"[{int(hours)}:{int(minutes)}:{int(seconds)}] "
+                      f"Epoch {epoch}, Batch {i} "
                       f"Average loss: {loss_meter.avg} "
                       f"Average acc: {train_acc_meter.avg} "
                       f"Running loss: {running_loss / 20}")
@@ -102,8 +99,8 @@ def train_classifier(classifier, data_dir, num_epochs, device):
         progress_dict['train_acc'].append(train_acc_meter.avg)
         progress_dict['test_acc'].append(test_acc)
 
-        with open('./progress.json') as f:
-            json.dump(f)
+        with open('./progress.json', 'w') as f:
+            json.dump(progress_dict, f)
 
 
 def evaluate_classifier(testloader, classifier, device):
@@ -115,7 +112,7 @@ def evaluate_classifier(testloader, classifier, device):
             outputs = classifier(images)
             _, predicted = torch.max(outputs.data, 1)
             total += labels.size(0)
-            test_acc_meter.update(accuracy(outputs, labels))
+            test_acc_meter.update(accuracy(outputs, labels)[0].item())
 
     return test_acc_meter.avg
 

From fa647ad71926969d0e0b46adb433d034bcf83cfa Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 11:57:06 -0700
Subject: [PATCH 018/123] Hardcode warmup_epochs to 2

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index f1bde61e..677fa828 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -178,7 +178,7 @@ def representation_learning(algo, device, log_dir, config):
         optimizer=torch.optim.Adam,
         optimizer_kwargs=optimizer_kwargs,
         scheduler=LinearWarmupCosine,
-        scheduler_kwargs={'warmup_epoch': 10, 'total_epochs': num_epochs},
+        scheduler_kwargs={'warmup_epoch': 2, 'total_epochs': num_epochs},
         loss_calculator_kwargs={'temp': config['pretrain_temperature']},
     )
 

From 8fe17b5e24ac95b51d641dcc6c7b2b58e6efae97 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 11:58:22 -0700
Subject: [PATCH 019/123] Import time

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 677fa828..bc266f6e 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -9,7 +9,7 @@
 import torchvision.transforms as transforms
 from torchvision.models.resnet import resnet50
 from math import ceil
-
+import time
 from sacred import Experiment
 from sacred.observers import FileStorageObserver
 from il_representations import algos

From 95a01d020c9a63db3a24b10139278293cc4a4176 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 12:03:17 -0700
Subject: [PATCH 020/123] Make testloader exist

---
 src/il_representations/scripts/run_cifar.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index bc266f6e..1719db43 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -103,7 +103,9 @@ def train_classifier(classifier, data_dir, num_epochs, device):
             json.dump(progress_dict, f)
 
 
-def evaluate_classifier(testloader, classifier, device):
+def evaluate_classifier(classifier, data_dir, device):
+    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True)
+    testloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
     total = 0
     test_acc_meter = AverageMeter()
     with torch.no_grad():

From 6869714323db09b79fe61e1dea1666fb210f3382 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 12:07:32 -0700
Subject: [PATCH 021/123] Is RepL training?

---
 src/il_representations/scripts/run_cifar.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 1719db43..79b7fe50 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -182,9 +182,11 @@ def representation_learning(algo, device, log_dir, config):
         scheduler=LinearWarmupCosine,
         scheduler_kwargs={'warmup_epoch': 2, 'total_epochs': num_epochs},
         loss_calculator_kwargs={'temp': config['pretrain_temperature']},
+        log_interval=1
     )
 
     _, encoder_checkpoint_path = model.learn(rep_learning_data, batches_per_epoch, num_epochs)
+    print("Representation Learning trained!")
     pretrained_model = torch.load(encoder_checkpoint_path)
     return pretrained_model
 
@@ -256,8 +258,8 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
     train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 
-    print('Evaluate accuracy on test set')
-    evaluate_classifier(classifier, data_dir, device=device)
+    # print('Evaluate accuracy on test set')
+    # evaluate_classifier(classifier, data_dir, device=device)
 
 
 if __name__ == '__main__':

From d977571cb6a8161acb92b7d8d1c020df8db6b1f2 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 12:13:33 -0700
Subject: [PATCH 022/123] Hardcode dataset length

---
 src/il_representations/scripts/run_cifar.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 79b7fe50..10db9b98 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -152,7 +152,9 @@ def representation_learning(algo, device, log_dir, config):
         "lr": 3e-4
     }
 
-    num_examples = len(rep_learning_data)
+    # This is currently erroneously 1
+    #num_examples = len(rep_learning_data)
+    num_examples = 49920
     num_epochs = config['pretrain_epochs']
     batch_size = config['pretrain_batch_size']
     batches_per_epoch = ceil(num_examples / batch_size)
@@ -184,7 +186,6 @@ def representation_learning(algo, device, log_dir, config):
         loss_calculator_kwargs={'temp': config['pretrain_temperature']},
         log_interval=1
     )
-
     _, encoder_checkpoint_path = model.learn(rep_learning_data, batches_per_epoch, num_epochs)
     print("Representation Learning trained!")
     pretrained_model = torch.load(encoder_checkpoint_path)

From 574d8fcac254b60cf1c48f32ab0167e73b994240 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 12:19:59 -0700
Subject: [PATCH 023/123] Remove excess logging

---
 src/il_representations/scripts/run_cifar.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 10db9b98..f8293a15 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -184,7 +184,8 @@ def representation_learning(algo, device, log_dir, config):
         scheduler=LinearWarmupCosine,
         scheduler_kwargs={'warmup_epoch': 2, 'total_epochs': num_epochs},
         loss_calculator_kwargs={'temp': config['pretrain_temperature']},
-        log_interval=1
+        log_interval=10,
+        calc_log_interval=10
     )
     _, encoder_checkpoint_path = model.learn(rep_learning_data, batches_per_epoch, num_epochs)
     print("Representation Learning trained!")

From 45b285a5121793fbf1b3f764e552468a18eb20d4 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 12:59:51 -0700
Subject: [PATCH 024/123] Remove Cosine Annealing to be consistent with repo

---
 src/il_representations/scripts/run_cifar.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index f8293a15..13df61d4 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -50,7 +50,7 @@ def train_classifier(classifier, data_dir, num_epochs, device):
     criterion = nn.CrossEntropyLoss().to(device)
     optimizer = optim.Adam(classifier.encoder.parameters(), lr=3e-4)
     # optimizer = optim.Adam(classifier.encoder.fc.parameters(), lr=3e-4, momentum=0.9, weight_decay=0.0, nesterov=True)
-    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
+    #scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
 
     test_transform = transforms.Compose([
         transforms.ToTensor(),
@@ -92,7 +92,7 @@ def train_classifier(classifier, data_dir, num_epochs, device):
                       f"Running loss: {running_loss / 20}")
                 running_loss = 0.0
 
-        scheduler.step()
+        #scheduler.step()
         test_acc = evaluate_classifier(testloader, classifier, device)
 
         progress_dict['loss'].append(loss_meter.avg)
@@ -149,7 +149,8 @@ def representation_learning(algo, device, log_dir, config):
         "augment_func": rep_learning_augmentations
     }
     optimizer_kwargs = {
-        "lr": 3e-4
+        "lr": 1e-3,
+        "weight_decay": 1e-6
     }
 
     # This is currently erroneously 1
@@ -181,7 +182,7 @@ def representation_learning(algo, device, log_dir, config):
         augmenter_kwargs=augmenter_kwargs,
         optimizer=torch.optim.Adam,
         optimizer_kwargs=optimizer_kwargs,
-        scheduler=LinearWarmupCosine,
+        #scheduler=LinearWarmupCosine,
         scheduler_kwargs={'warmup_epoch': 2, 'total_epochs': num_epochs},
         loss_calculator_kwargs={'temp': config['pretrain_temperature']},
         log_interval=10,

From f0b8f7130d8b30ea23692403803729af20d81d4b Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 13:13:41 -0700
Subject: [PATCH 025/123] Put their loss in for ours

---
 src/il_representations/algos/losses.py | 92 +++++++++++++++-----------
 1 file changed, 53 insertions(+), 39 deletions(-)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index b999860d..2a52509c 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -185,45 +185,59 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
         z_j = targets
         batch_size = z_i.shape[0]
 
-        if self.normalize:  # Use cosine similarity
-            z_i = F.normalize(z_i, dim=1)
-            z_j = F.normalize(z_j, dim=1)
-
-        mask = (torch.eye(batch_size) * self.large_num).to(self.device)
-
-        # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
-        logits_aa = torch.matmul(z_i, z_i.T)  # NxN
-
-        # Values on the diagonal line are each image's similarity with itself
-        logits_aa = logits_aa - mask
-        # Similarity of the augmented images with all other augmented images.
-        logits_bb = torch.matmul(z_j, z_j.T)  # NxN
-        logits_bb = logits_bb - mask
-        # Similarity of original images and augmented images
-        logits_ab = torch.matmul(z_i, z_j.T)  # NxN
-        logits_ba = torch.matmul(z_j, z_i.T)  # NxN
-
-        avg_self_similarity = logits_ab.diag().mean().item()
-        logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
-        avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
-
-        sb_logger.record('avg_self_similarity', avg_self_similarity)
-        sb_logger.record('avg_other_similarity', avg_other_similarity)
-        sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
-
-        # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
-        # to both original and augmented images (hence "symmetric").
-        logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
-        logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
-        logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
-        logits /= self.temp
-
-        # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
-        # represent(image_i) and represent(augmented_image_i).
-        label = torch.arange(batch_size, dtype=torch.long).to(self.device)
-        labels = torch.cat((label, label), axis=0)
-
-        return self.criterion(logits, labels)
+        out = torch.cat([z_i, z_j], dim=0)
+        # [2*B, 2*B]
+        sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / self.temp)
+        mask = (torch.ones_like(sim_matrix) - torch.eye(2 * batch_size, device=sim_matrix.device)).bool()
+        # [2*B, 2*B-1]
+        sim_matrix = sim_matrix.masked_select(mask).view(2 * batch_size, -1)
+
+        # compute loss
+        pos_sim = torch.exp(torch.sum(z_i * z_j, dim=-1) / self.temp)
+        # [2*B]
+        pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
+        loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
+        #
+        # if self.normalize:  # Use cosine similarity
+        #     z_i = F.normalize(z_i, dim=1)
+        #     z_j = F.normalize(z_j, dim=1)
+        #
+        # mask = (torch.eye(batch_size) * self.large_num).to(self.device)
+        #
+        # # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
+        # logits_aa = torch.matmul(z_i, z_i.T)  # NxN
+        #
+        # # Values on the diagonal line are each image's similarity with itself
+        # logits_aa = logits_aa - mask
+        # # Similarity of the augmented images with all other augmented images.
+        # logits_bb = torch.matmul(z_j, z_j.T)  # NxN
+        # logits_bb = logits_bb - mask
+        # # Similarity of original images and augmented images
+        # logits_ab = torch.matmul(z_i, z_j.T)  # NxN
+        # logits_ba = torch.matmul(z_j, z_i.T)  # NxN
+        #
+        # avg_self_similarity = logits_ab.diag().mean().item()
+        # logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
+        # avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
+        #
+        # sb_logger.record('avg_self_similarity', avg_self_similarity)
+        # sb_logger.record('avg_other_similarity', avg_other_similarity)
+        # sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
+        #
+        # # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
+        # # to both original and augmented images (hence "symmetric").
+        # logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
+        # logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
+        # logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
+        # logits /= self.temp
+        #
+        # # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
+        # # represent(image_i) and represent(augmented_image_i).
+        # label = torch.arange(batch_size, dtype=torch.long).to(self.device)
+        # labels = torch.cat((label, label), axis=0)
+
+        #return self.criterion(logits, labels)
+        return loss
 
 
 class NegativeLogLikelihood(RepresentationLoss):

From fc10cd4b6dbf04fe85214c8d168dfd271259fb6d Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 13:17:05 -0700
Subject: [PATCH 026/123] Comment out their loss, which is NaN for some reason

---
 src/il_representations/algos/losses.py | 102 ++++++++++++-------------
 1 file changed, 51 insertions(+), 51 deletions(-)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index 2a52509c..eea620b7 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -185,59 +185,59 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
         z_j = targets
         batch_size = z_i.shape[0]
 
-        out = torch.cat([z_i, z_j], dim=0)
-        # [2*B, 2*B]
-        sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / self.temp)
-        mask = (torch.ones_like(sim_matrix) - torch.eye(2 * batch_size, device=sim_matrix.device)).bool()
-        # [2*B, 2*B-1]
-        sim_matrix = sim_matrix.masked_select(mask).view(2 * batch_size, -1)
-
-        # compute loss
-        pos_sim = torch.exp(torch.sum(z_i * z_j, dim=-1) / self.temp)
-        # [2*B]
-        pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
-        loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
+        # out = torch.cat([z_i, z_j], dim=0)
+        # # [2*B, 2*B]
+        # sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / self.temp)
+        # mask = (torch.ones_like(sim_matrix) - torch.eye(2 * batch_size, device=sim_matrix.device)).bool()
+        # # [2*B, 2*B-1]
+        # sim_matrix = sim_matrix.masked_select(mask).view(2 * batch_size, -1)
         #
-        # if self.normalize:  # Use cosine similarity
-        #     z_i = F.normalize(z_i, dim=1)
-        #     z_j = F.normalize(z_j, dim=1)
-        #
-        # mask = (torch.eye(batch_size) * self.large_num).to(self.device)
-        #
-        # # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
-        # logits_aa = torch.matmul(z_i, z_i.T)  # NxN
-        #
-        # # Values on the diagonal line are each image's similarity with itself
-        # logits_aa = logits_aa - mask
-        # # Similarity of the augmented images with all other augmented images.
-        # logits_bb = torch.matmul(z_j, z_j.T)  # NxN
-        # logits_bb = logits_bb - mask
-        # # Similarity of original images and augmented images
-        # logits_ab = torch.matmul(z_i, z_j.T)  # NxN
-        # logits_ba = torch.matmul(z_j, z_i.T)  # NxN
-        #
-        # avg_self_similarity = logits_ab.diag().mean().item()
-        # logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
-        # avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
-        #
-        # sb_logger.record('avg_self_similarity', avg_self_similarity)
-        # sb_logger.record('avg_other_similarity', avg_other_similarity)
-        # sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
-        #
-        # # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
-        # # to both original and augmented images (hence "symmetric").
-        # logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
-        # logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
-        # logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
-        # logits /= self.temp
-        #
-        # # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
-        # # represent(image_i) and represent(augmented_image_i).
-        # label = torch.arange(batch_size, dtype=torch.long).to(self.device)
-        # labels = torch.cat((label, label), axis=0)
+        # # compute loss
+        # pos_sim = torch.exp(torch.sum(z_i * z_j, dim=-1) / self.temp)
+        # # [2*B]
+        # pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
+        # loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
+        # return loss
 
-        #return self.criterion(logits, labels)
-        return loss
+        if self.normalize:  # Use cosine similarity
+            z_i = F.normalize(z_i, dim=1)
+            z_j = F.normalize(z_j, dim=1)
+
+        mask = (torch.eye(batch_size) * self.large_num).to(self.device)
+
+        # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
+        logits_aa = torch.matmul(z_i, z_i.T)  # NxN
+
+        # Values on the diagonal line are each image's similarity with itself
+        logits_aa = logits_aa - mask
+        # Similarity of the augmented images with all other augmented images.
+        logits_bb = torch.matmul(z_j, z_j.T)  # NxN
+        logits_bb = logits_bb - mask
+        # Similarity of original images and augmented images
+        logits_ab = torch.matmul(z_i, z_j.T)  # NxN
+        logits_ba = torch.matmul(z_j, z_i.T)  # NxN
+
+        avg_self_similarity = logits_ab.diag().mean().item()
+        logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
+        avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
+
+        sb_logger.record('avg_self_similarity', avg_self_similarity)
+        sb_logger.record('avg_other_similarity', avg_other_similarity)
+        sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
+
+        # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
+        # to both original and augmented images (hence "symmetric").
+        logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
+        logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
+        logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
+        logits /= self.temp
+
+        # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
+        # represent(image_i) and represent(augmented_image_i).
+        label = torch.arange(batch_size, dtype=torch.long).to(self.device)
+        labels = torch.cat((label, label), axis=0)
+
+        return self.criterion(logits, labels)
 
 
 class NegativeLogLikelihood(RepresentationLoss):

From 97bf11ae04950e01e27c95f389f68a0af0dbc8ff Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 13:19:09 -0700
Subject: [PATCH 027/123] Add breakpoint

---
 src/il_representations/scripts/run_cifar.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 13df61d4..e4b89928 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -155,10 +155,10 @@ def representation_learning(algo, device, log_dir, config):
 
     # This is currently erroneously 1
     #num_examples = len(rep_learning_data)
-    num_examples = 49920
+    # num_examples = 49920
     num_epochs = config['pretrain_epochs']
     batch_size = config['pretrain_batch_size']
-    batches_per_epoch = ceil(num_examples / batch_size)
+    batches_per_epoch = config['batches_per_epoch']
 
     # Modify resnet according to SimCLR paper Appendix B.9
     simclr_resnet = resnet50()
@@ -234,6 +234,7 @@ def default_config():
     algo = 'SimCLR'
     data_dir = 'cifar10/'
     pretrain_epochs = 1000
+    pretrain_batches_per_epoch = 390
     finetune_epochs = 100
     representation_dim = 512
     projection_dim = 128
@@ -258,6 +259,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     model = representation_learning(algo, device, log_dir, _config)
 
     print('Train linear head')
+    breakpoint()
     classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
     train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 

From c22329fecd6f8feeb6eae12f932d9ac479c253d9 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 13:21:23 -0700
Subject: [PATCH 028/123] Fix config name

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index e4b89928..e62dd753 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -158,7 +158,7 @@ def representation_learning(algo, device, log_dir, config):
     # num_examples = 49920
     num_epochs = config['pretrain_epochs']
     batch_size = config['pretrain_batch_size']
-    batches_per_epoch = config['batches_per_epoch']
+    batches_per_epoch = config['pretrain_batches_per_epoch']
 
     # Modify resnet according to SimCLR paper Appendix B.9
     simclr_resnet = resnet50()

From 55963e0205159677087b815eadf9f70cc6bbd505 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 13:26:39 -0700
Subject: [PATCH 029/123] Switch to running their loss

---
 src/il_representations/algos/losses.py      | 107 ++++++++++----------
 src/il_representations/scripts/run_cifar.py |   1 -
 2 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index eea620b7..4fd1d039 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
 import torch
+import numpy as np
 import torch.nn.functional as F
 import stable_baselines3.common.logger as sb_logger
 from pyro.distributions import Delta
@@ -185,59 +186,61 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
         z_j = targets
         batch_size = z_i.shape[0]
 
-        # out = torch.cat([z_i, z_j], dim=0)
-        # # [2*B, 2*B]
-        # sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / self.temp)
-        # mask = (torch.ones_like(sim_matrix) - torch.eye(2 * batch_size, device=sim_matrix.device)).bool()
-        # # [2*B, 2*B-1]
-        # sim_matrix = sim_matrix.masked_select(mask).view(2 * batch_size, -1)
+        out = torch.cat([z_i, z_j], dim=0)
+        # [2*B, 2*B]
+        sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / self.temp)
+        mask = (torch.ones_like(sim_matrix) - torch.eye(2 * batch_size, device=sim_matrix.device)).bool()
+        # [2*B, 2*B-1]
+        sim_matrix = sim_matrix.masked_select(mask).view(2 * batch_size, -1)
+
+        # compute loss
+        pos_sim = torch.exp(torch.sum(z_i * z_j, dim=-1) / self.temp)
+        # [2*B]
+        pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
+        loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
+        if np.isnnan(loss.item):
+            breakpoint()
+        return loss
         #
-        # # compute loss
-        # pos_sim = torch.exp(torch.sum(z_i * z_j, dim=-1) / self.temp)
-        # # [2*B]
-        # pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
-        # loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
-        # return loss
-
-        if self.normalize:  # Use cosine similarity
-            z_i = F.normalize(z_i, dim=1)
-            z_j = F.normalize(z_j, dim=1)
-
-        mask = (torch.eye(batch_size) * self.large_num).to(self.device)
-
-        # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
-        logits_aa = torch.matmul(z_i, z_i.T)  # NxN
-
-        # Values on the diagonal line are each image's similarity with itself
-        logits_aa = logits_aa - mask
-        # Similarity of the augmented images with all other augmented images.
-        logits_bb = torch.matmul(z_j, z_j.T)  # NxN
-        logits_bb = logits_bb - mask
-        # Similarity of original images and augmented images
-        logits_ab = torch.matmul(z_i, z_j.T)  # NxN
-        logits_ba = torch.matmul(z_j, z_i.T)  # NxN
-
-        avg_self_similarity = logits_ab.diag().mean().item()
-        logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
-        avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
-
-        sb_logger.record('avg_self_similarity', avg_self_similarity)
-        sb_logger.record('avg_other_similarity', avg_other_similarity)
-        sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
-
-        # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
-        # to both original and augmented images (hence "symmetric").
-        logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
-        logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
-        logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
-        logits /= self.temp
-
-        # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
-        # represent(image_i) and represent(augmented_image_i).
-        label = torch.arange(batch_size, dtype=torch.long).to(self.device)
-        labels = torch.cat((label, label), axis=0)
-
-        return self.criterion(logits, labels)
+        # if self.normalize:  # Use cosine similarity
+        #     z_i = F.normalize(z_i, dim=1)
+        #     z_j = F.normalize(z_j, dim=1)
+        #
+        # mask = (torch.eye(batch_size) * self.large_num).to(self.device)
+        #
+        # # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
+        # logits_aa = torch.matmul(z_i, z_i.T)  # NxN
+        #
+        # # Values on the diagonal line are each image's similarity with itself
+        # logits_aa = logits_aa - mask
+        # # Similarity of the augmented images with all other augmented images.
+        # logits_bb = torch.matmul(z_j, z_j.T)  # NxN
+        # logits_bb = logits_bb - mask
+        # # Similarity of original images and augmented images
+        # logits_ab = torch.matmul(z_i, z_j.T)  # NxN
+        # logits_ba = torch.matmul(z_j, z_i.T)  # NxN
+        #
+        # avg_self_similarity = logits_ab.diag().mean().item()
+        # logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
+        # avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
+        #
+        # sb_logger.record('avg_self_similarity', avg_self_similarity)
+        # sb_logger.record('avg_other_similarity', avg_other_similarity)
+        # sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
+        #
+        # # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
+        # # to both original and augmented images (hence "symmetric").
+        # logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
+        # logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
+        # logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
+        # logits /= self.temp
+        #
+        # # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
+        # # represent(image_i) and represent(augmented_image_i).
+        # label = torch.arange(batch_size, dtype=torch.long).to(self.device)
+        # labels = torch.cat((label, label), axis=0)
+        #
+        # return self.criterion(logits, labels)
 
 
 class NegativeLogLikelihood(RepresentationLoss):
diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index e62dd753..f157ba6a 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -259,7 +259,6 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     model = representation_learning(algo, device, log_dir, _config)
 
     print('Train linear head')
-    breakpoint()
     classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
     train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 

From b51f49c5093b9035ebba278df83a97d4aec2add2 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 13:28:09 -0700
Subject: [PATCH 030/123] Add another breakpoint

---
 src/il_representations/scripts/run_cifar.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index f157ba6a..e62dd753 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -259,6 +259,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     model = representation_learning(algo, device, log_dir, _config)
 
     print('Train linear head')
+    breakpoint()
     classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
     train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 

From 2c3b69ad210161854151cc43789a1b1f3382d20a Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 13:28:53 -0700
Subject: [PATCH 031/123] Fix numpy call

---
 src/il_representations/algos/losses.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index 4fd1d039..6138815a 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -198,7 +198,7 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
         # [2*B]
         pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
         loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
-        if np.isnnan(loss.item):
+        if np.isnan(loss.item):
             breakpoint()
         return loss
         #

From 9e6c792f4e1969f5b173a6f47f1a561afd31a953 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 13:37:54 -0700
Subject: [PATCH 032/123] What if you used their loss but normalized first to
 maybe avoid infinities?

---
 src/il_representations/algos/losses.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index 6138815a..b0bf3c87 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -185,7 +185,8 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
         z_i = decoded_contexts
         z_j = targets
         batch_size = z_i.shape[0]
-
+        z_i = F.normalize(z_i, dim=1)
+        z_j = F.normalize(z_j, dim=1)
         out = torch.cat([z_i, z_j], dim=0)
         # [2*B, 2*B]
         sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / self.temp)
@@ -198,7 +199,7 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
         # [2*B]
         pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
         loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
-        if np.isnan(loss.item):
+        if torch.isnan(loss):
             breakpoint()
         return loss
         #

From 09254b149e6aa14ed03edbf955e92ffda0d31c3b Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 14:06:11 -0700
Subject: [PATCH 033/123] Add ability to do weighted KNN evaluation

---
 src/il_representations/scripts/run_cifar.py | 83 +++++++++++++++++++--
 1 file changed, 78 insertions(+), 5 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index e62dd753..0aab92af 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -8,6 +8,10 @@
 import torchvision
 import torchvision.transforms as transforms
 from torchvision.models.resnet import resnet50
+from torchvision.datasets import CIFAR10
+from PIL import Image
+
+import tqdm
 from math import ceil
 import time
 from sacred import Experiment
@@ -227,6 +231,66 @@ def update(self, val, n=1):
         self.count += n
         self.avg = self.sum / self.count
 
+# test for one epoch, use weighted knn to find the most similar images' label to assign the test image
+def test(net, memory_data_loader, test_data_loader, k, num_classes, temperature, epoch):
+    net.eval()
+    total_top1, total_top5, total_num, feature_bank = 0.0, 0.0, 0, []
+    with torch.no_grad():
+        # generate feature bank
+        for data, _, target in tqdm(memory_data_loader, desc='Feature extracting'):
+            feature = net(data.cuda(non_blocking=True))
+            feature_bank.append(feature)
+        # [D, N]
+        feature_bank = torch.cat(feature_bank, dim=0).t().contiguous()
+        # [N]
+        feature_labels = torch.tensor(memory_data_loader.dataset.targets, device=feature_bank.device)
+        # loop test data to predict the label by weighted knn search
+        test_bar = tqdm(test_data_loader)
+        for data, _, target in test_bar:
+            data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
+            feature, out = net(data)
+
+            total_num += data.size(0)
+            # compute cos similarity between each feature vector and feature bank ---> [B, N]
+            sim_matrix = torch.mm(feature, feature_bank)
+            # [B, K]
+            sim_weight, sim_indices = sim_matrix.topk(k=k, dim=-1)
+            # [B, K]
+            sim_labels = torch.gather(feature_labels.expand(data.size(0), -1), dim=-1, index=sim_indices)
+            sim_weight = (sim_weight / temperature).exp()
+
+            # counts for each class
+            one_hot_label = torch.zeros(data.size(0) * k, num_classes, device=sim_labels.device)
+            # [B*K, C]
+            one_hot_label = one_hot_label.scatter(dim=-1, index=sim_labels.view(-1, 1), value=1.0)
+            # weighted score ---> [B, C]
+            pred_scores = torch.sum(one_hot_label.view(data.size(0), -1, num_classes) * sim_weight.unsqueeze(dim=-1), dim=1)
+
+            pred_labels = pred_scores.argsort(dim=-1, descending=True)
+            total_top1 += torch.sum((pred_labels[:, :1] == target.unsqueeze(dim=-1)).any(dim=-1).float()).item()
+            total_top5 += torch.sum((pred_labels[:, :5] == target.unsqueeze(dim=-1)).any(dim=-1).float()).item()
+            test_bar.set_description('Test Epoch: [{}] Acc@1:{:.2f}% Acc@5:{:.2f}%'
+                                     .format(epoch, total_top1 / total_num * 100, total_top5 / total_num * 100))
+
+    return total_top1 / total_num * 100, total_top5 / total_num * 100
+
+
+class CIFAR10Pair(CIFAR10):
+    """CIFAR10 Dataset.
+    """
+
+    def __getitem__(self, index):
+        img, target = self.data[index], self.targets[index]
+        img = Image.fromarray(img)
+
+        if self.transform is not None:
+            pos_1 = self.transform(img)
+            pos_2 = self.transform(img)
+
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+
+        return pos_1, pos_2, target
 
 @cifar_ex.config
 def default_config():
@@ -249,7 +313,7 @@ def default_config():
 
 
 @cifar_ex.main
-def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_dim, _config):
+def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_dim, pretrain_batch_size, _config):
     # TODO fix this hacky nonsense
     log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
     os.mkdir(log_dir)
@@ -258,10 +322,19 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = representation_learning(algo, device, log_dir, _config)
 
-    print('Train linear head')
-    breakpoint()
-    classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
-    train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
+    test_transform = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
+
+    memory_data = CIFAR10Pair(root='data', train=True, transform=test_transform, download=True)
+    memory_loader = torch.utils.data.DataLoader(memory_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
+    test_data = CIFAR10Pair(root='data', train=False, transform=test_transform, download=True)
+    test_loader = torch.utils.data.DataLoader(test_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
+
+    test(model.encoder, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
+    # print('Train linear head')
+    # classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
+    # train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 
     # print('Evaluate accuracy on test set')
     # evaluate_classifier(classifier, data_dir, device=device)

From d464fca3b93f660ca8727f10601c0a51fed427ab Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 14:07:23 -0700
Subject: [PATCH 034/123] Accidentally called encoder.encoder

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 0aab92af..b8e253c2 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -331,7 +331,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     test_data = CIFAR10Pair(root='data', train=False, transform=test_transform, download=True)
     test_loader = torch.utils.data.DataLoader(test_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
 
-    test(model.encoder, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
+    test(model, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
     # print('Train linear head')
     # classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
     # train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)

From 44e6213e91e1ea5a112bceb44a7744088415d4ca Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 14:08:20 -0700
Subject: [PATCH 035/123] Import the tqdm callable from the tqdm module

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index b8e253c2..c03d8362 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -11,7 +11,7 @@
 from torchvision.datasets import CIFAR10
 from PIL import Image
 
-import tqdm
+from tqdm import tqdm
 from math import ceil
 import time
 from sacred import Experiment

From a35ba0a7d6f1264c5047da0c52c2ebc979ffbb64 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 14:09:25 -0700
Subject: [PATCH 036/123] Maybe avoid needing traj_info

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index c03d8362..ea867d74 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -331,7 +331,7 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     test_data = CIFAR10Pair(root='data', train=False, transform=test_transform, download=True)
     test_loader = torch.utils.data.DataLoader(test_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
 
-    test(model, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
+    test(model.network, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
     # print('Train linear head')
     # classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
     # train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)

From 80336826f67afc68b3ad2bbee82e68a2fed1e74b Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 14:10:43 -0700
Subject: [PATCH 037/123] Remove unused feature, out

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index ea867d74..620562db 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -248,7 +248,7 @@ def test(net, memory_data_loader, test_data_loader, k, num_classes, temperature,
         test_bar = tqdm(test_data_loader)
         for data, _, target in test_bar:
             data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
-            feature, out = net(data)
+            feature = net(data)
 
             total_num += data.size(0)
             # compute cos similarity between each feature vector and feature bank ---> [B, N]

From 720c15dd9987ccf51c87a92ad581a48c5bfa9830 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 14:52:41 -0700
Subject: [PATCH 038/123] Allow passing in a pretrained model

---
 src/il_representations/scripts/run_cifar.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 620562db..8110be50 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -308,19 +308,25 @@ def default_config():
     pretrain_batch_size = 512
     pretrain_save_interval = 100
     pretrain_temperature = 0.5
+    pretrained_model = None
     _ = locals()
     del _
 
 
 @cifar_ex.main
-def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_dim, pretrain_batch_size, _config):
+def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_dim,
+        pretrained_model, pretrain_batch_size, _config):
     # TODO fix this hacky nonsense
-    log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
-    os.mkdir(log_dir)
-    os.makedirs(data_dir, exist_ok=True)
+    if pretrained_model is None:
+        log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
+        os.mkdir(log_dir)
+        os.makedirs(data_dir, exist_ok=True)
 
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = representation_learning(algo, device, log_dir, _config)
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model = representation_learning(algo, device, log_dir, _config)
+
+    else:
+        model = torch.load(pretrained_model)
 
     test_transform = transforms.Compose([
     transforms.ToTensor(),

From 382e9fdf58d25f4be578716c0d1ca1f3e41162c4 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 15:01:41 -0700
Subject: [PATCH 039/123] Make it easier to switch between our loss and repo
 loss

---
 src/il_representations/algos/losses.py      | 122 ++++++++++----------
 src/il_representations/scripts/run_cifar.py |   6 +-
 2 files changed, 69 insertions(+), 59 deletions(-)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index b0bf3c87..5733d96d 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -162,11 +162,12 @@ class SymmetricContrastiveLoss(RepresentationLoss):
     all similarities with J, and also all similarities with I, and calculates cross-entropy on both
     """
 
-    def __init__(self, device, sample=False, temp=0.1, normalize=True):
+    def __init__(self, device, sample=False, temp=0.1, normalize=True, use_repo_loss=False):
         super(SymmetricContrastiveLoss, self).__init__(device, sample)
 
         self.criterion = torch.nn.CrossEntropyLoss()
         self.temp = temp
+        self.use_repo_loss = use_repo_loss
 
         # Most methods use either cosine similarity or matrix multiplication similarity. Since cosine similarity equals
         # taking MatMul on normalized vectors, setting normalize=True is equivalent to using torch.CosineSimilarity().
@@ -181,67 +182,72 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
         # decoded_context -> representation of context + optional projection head
         # target -> representation of target + optional projection head
         # encoded_context -> not used by this loss
+
         decoded_contexts, targets = self.get_vector_forms(decoded_context_dist, target_dist)
         z_i = decoded_contexts
         z_j = targets
         batch_size = z_i.shape[0]
-        z_i = F.normalize(z_i, dim=1)
-        z_j = F.normalize(z_j, dim=1)
-        out = torch.cat([z_i, z_j], dim=0)
-        # [2*B, 2*B]
-        sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / self.temp)
-        mask = (torch.ones_like(sim_matrix) - torch.eye(2 * batch_size, device=sim_matrix.device)).bool()
-        # [2*B, 2*B-1]
-        sim_matrix = sim_matrix.masked_select(mask).view(2 * batch_size, -1)
-
-        # compute loss
-        pos_sim = torch.exp(torch.sum(z_i * z_j, dim=-1) / self.temp)
-        # [2*B]
-        pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
-        loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
-        if torch.isnan(loss):
-            breakpoint()
-        return loss
-        #
-        # if self.normalize:  # Use cosine similarity
-        #     z_i = F.normalize(z_i, dim=1)
-        #     z_j = F.normalize(z_j, dim=1)
-        #
-        # mask = (torch.eye(batch_size) * self.large_num).to(self.device)
-        #
-        # # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
-        # logits_aa = torch.matmul(z_i, z_i.T)  # NxN
-        #
-        # # Values on the diagonal line are each image's similarity with itself
-        # logits_aa = logits_aa - mask
-        # # Similarity of the augmented images with all other augmented images.
-        # logits_bb = torch.matmul(z_j, z_j.T)  # NxN
-        # logits_bb = logits_bb - mask
-        # # Similarity of original images and augmented images
-        # logits_ab = torch.matmul(z_i, z_j.T)  # NxN
-        # logits_ba = torch.matmul(z_j, z_i.T)  # NxN
-        #
-        # avg_self_similarity = logits_ab.diag().mean().item()
-        # logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
-        # avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
-        #
-        # sb_logger.record('avg_self_similarity', avg_self_similarity)
-        # sb_logger.record('avg_other_similarity', avg_other_similarity)
-        # sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
-        #
-        # # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
-        # # to both original and augmented images (hence "symmetric").
-        # logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
-        # logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
-        # logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
-        # logits /= self.temp
-        #
-        # # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
-        # # represent(image_i) and represent(augmented_image_i).
-        # label = torch.arange(batch_size, dtype=torch.long).to(self.device)
-        # labels = torch.cat((label, label), axis=0)
-        #
-        # return self.criterion(logits, labels)
+
+
+        if self.use_repo_loss:
+            # Normalize to avoid infinities
+            z_i = F.normalize(z_i, dim=1)
+            z_j = F.normalize(z_j, dim=1)
+            out = torch.cat([z_i, z_j], dim=0)
+            # [2*B, 2*B]
+            sim_matrix = torch.exp(torch.mm(out, out.t().contiguous()) / self.temp)
+            mask = (torch.ones_like(sim_matrix) - torch.eye(2 * batch_size, device=sim_matrix.device)).bool()
+            # [2*B, 2*B-1]
+            sim_matrix = sim_matrix.masked_select(mask).view(2 * batch_size, -1)
+
+            # compute loss
+            pos_sim = torch.exp(torch.sum(z_i * z_j, dim=-1) / self.temp)
+            # [2*B]
+            pos_sim = torch.cat([pos_sim, pos_sim], dim=0)
+            loss = (- torch.log(pos_sim / sim_matrix.sum(dim=-1))).mean()
+            if torch.isnan(loss):
+                breakpoint()
+            return loss
+        else:
+            if self.normalize:  # Use cosine similarity
+                z_i = F.normalize(z_i, dim=1)
+                z_j = F.normalize(z_j, dim=1)
+
+            mask = (torch.eye(batch_size) * self.large_num).to(self.device)
+
+            # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.
+            logits_aa = torch.matmul(z_i, z_i.T)  # NxN
+
+            # Values on the diagonal line are each image's similarity with itself
+            logits_aa = logits_aa - mask
+            # Similarity of the augmented images with all other augmented images.
+            logits_bb = torch.matmul(z_j, z_j.T)  # NxN
+            logits_bb = logits_bb - mask
+            # Similarity of original images and augmented images
+            logits_ab = torch.matmul(z_i, z_j.T)  # NxN
+            logits_ba = torch.matmul(z_j, z_i.T)  # NxN
+
+            avg_self_similarity = logits_ab.diag().mean().item()
+            logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
+            avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
+
+            sb_logger.record('avg_self_similarity', avg_self_similarity)
+            sb_logger.record('avg_other_similarity', avg_other_similarity)
+            sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
+
+            # Each row now contains an image's similarity with the batch's augmented images & original images. This applies
+            # to both original and augmented images (hence "symmetric").
+            logits_i = torch.cat((logits_ab, logits_aa), 1)  # Nx2N
+            logits_j = torch.cat((logits_ba, logits_bb), 1)  # Nx2N
+            logits = torch.cat((logits_i, logits_j), axis=0)  # 2Nx2N
+            logits /= self.temp
+
+            # The values we want to maximize lie on the i-th index of each row i. i.e. the dot product of
+            # represent(image_i) and represent(augmented_image_i).
+            label = torch.arange(batch_size, dtype=torch.long).to(self.device)
+            labels = torch.cat((label, label), axis=0)
+
+            return self.criterion(logits, labels)
 
 
 class NegativeLogLikelihood(RepresentationLoss):
diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 8110be50..cea5134d 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -188,7 +188,8 @@ def representation_learning(algo, device, log_dir, config):
         optimizer_kwargs=optimizer_kwargs,
         #scheduler=LinearWarmupCosine,
         scheduler_kwargs={'warmup_epoch': 2, 'total_epochs': num_epochs},
-        loss_calculator_kwargs={'temp': config['pretrain_temperature']},
+        loss_calculator_kwargs={'temp': config['pretrain_temperature'],
+                                'use_repo_loss': config['use_repo_loss']},
         log_interval=10,
         calc_log_interval=10
     )
@@ -309,6 +310,7 @@ def default_config():
     pretrain_save_interval = 100
     pretrain_temperature = 0.5
     pretrained_model = None
+    use_repo_loss = False
     _ = locals()
     del _
 
@@ -337,6 +339,8 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     test_data = CIFAR10Pair(root='data', train=False, transform=test_transform, download=True)
     test_loader = torch.utils.data.DataLoader(test_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
 
+
+    # KNN testing from SimCLR repo for comparison
     test(model.network, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
     # print('Train linear head')
     # classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)

From 2f221c0b0683347ae6c92d66cc70c171d831d3eb Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 16:20:17 -0700
Subject: [PATCH 040/123] Normalize our features before using them in KNN

---
 src/il_representations/scripts/run_cifar.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index cea5134d..1e2d865e 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -4,6 +4,7 @@
 import json
 import torch
 import torch.nn as nn
+import torch.functional as F
 import torch.optim as optim
 import torchvision
 import torchvision.transforms as transforms
@@ -239,7 +240,7 @@ def test(net, memory_data_loader, test_data_loader, k, num_classes, temperature,
     with torch.no_grad():
         # generate feature bank
         for data, _, target in tqdm(memory_data_loader, desc='Feature extracting'):
-            feature = net(data.cuda(non_blocking=True))
+            feature = F.normalize(net(data.cuda(non_blocking=True)), dim=-1)
             feature_bank.append(feature)
         # [D, N]
         feature_bank = torch.cat(feature_bank, dim=0).t().contiguous()
@@ -249,7 +250,7 @@ def test(net, memory_data_loader, test_data_loader, k, num_classes, temperature,
         test_bar = tqdm(test_data_loader)
         for data, _, target in test_bar:
             data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
-            feature = net(data)
+            feature = F.normalize(net(data), dim=-1)
 
             total_num += data.size(0)
             # compute cos similarity between each feature vector and feature bank ---> [B, N]
@@ -275,7 +276,7 @@ def test(net, memory_data_loader, test_data_loader, k, num_classes, temperature,
 
     return total_top1 / total_num * 100, total_top5 / total_num * 100
 
-
+## data handling class copied from SimCLR implementation
 class CIFAR10Pair(CIFAR10):
     """CIFAR10 Dataset.
     """

From 81932ee902f36746a4ce02173f727cd089bc9550 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Wed, 21 Apr 2021 16:27:07 -0700
Subject: [PATCH 041/123] Unbreak torch.nn.functional import

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 1e2d865e..d883a97f 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -4,7 +4,7 @@
 import json
 import torch
 import torch.nn as nn
-import torch.functional as F
+import torch.nn.functional as F
 import torch.optim as optim
 import torchvision
 import torchvision.transforms as transforms

From 5724cef12c0683cdd4eab160a2c301bfe1adaa6c Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 09:52:11 -0700
Subject: [PATCH 042/123] Examine image scale before augmentations

---
 src/il_representations/algos/representation_learner.py | 1 +
 src/il_representations/scripts/run_cifar.py            | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 5d4d4c3b..e5d937e5 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -302,6 +302,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 contexts = self._preprocess(contexts)
                 if self.preprocess_target:
                     targets = self._preprocess(targets)
+                breakpoint()
                 contexts, targets = self.augmenter(contexts, targets)
                 extra_context = self._preprocess_extra_context(extra_context)
                 # This is typically a noop, but sometimes we also augment the extra context
diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index d883a97f..a7d24f3a 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -159,8 +159,6 @@ def representation_learning(algo, device, log_dir, config):
     }
 
     # This is currently erroneously 1
-    #num_examples = len(rep_learning_data)
-    # num_examples = 49920
     num_epochs = config['pretrain_epochs']
     batch_size = config['pretrain_batch_size']
     batches_per_epoch = config['pretrain_batches_per_epoch']

From 8592c6693100f952bc22ea1f2b593cca21ebee33 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 10:29:13 -0700
Subject: [PATCH 043/123] Explicitly use their model class

---
 .../algos/representation_learner.py           |  1 -
 src/il_representations/scripts/run_cifar.py   | 37 +++++++++++++++----
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index e5d937e5..5d4d4c3b 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -302,7 +302,6 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 contexts = self._preprocess(contexts)
                 if self.preprocess_target:
                     targets = self._preprocess(targets)
-                breakpoint()
                 contexts, targets = self.augmenter(contexts, targets)
                 extra_context = self._preprocess_extra_context(extra_context)
                 # This is typically a noop, but sometimes we also augment the extra context
diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index a7d24f3a..3a758a8e 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -123,6 +123,29 @@ def evaluate_classifier(classifier, data_dir, device):
 
     return test_acc_meter.avg
 
+class SimCLRModel(nn.Module):
+    def __init__(self, feature_dim=128):
+        super(SimCLRModel, self).__init__()
+
+        self.f = []
+        for name, module in resnet50().named_children():
+            if name == 'conv1':
+                module = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+            if not isinstance(module, nn.Linear) and not isinstance(module, nn.MaxPool2d):
+                self.f.append(module)
+        # encoder
+        # Temporarily add an extra layer to be closer to our model implementation
+        self.f = nn.Sequential(*self.f)
+        # projection head
+        self.g = nn.Sequential(nn.Linear(2048, 512, bias=False), nn.BatchNorm1d(512),
+                               nn.ReLU(inplace=True), nn.Linear(512, feature_dim, bias=True))
+
+    def forward(self, x):
+        x = self.f(x)
+        feature = torch.flatten(x, start_dim=1)
+        out = self.g(feature)
+        return F.normalize(feature, dim=-1), F.normalize(out, dim=-1)
+
 
 def representation_learning(algo, device, log_dir, config):
     print('Train representation learner')
@@ -164,10 +187,10 @@ def representation_learning(algo, device, log_dir, config):
     batches_per_epoch = config['pretrain_batches_per_epoch']
 
     # Modify resnet according to SimCLR paper Appendix B.9
-    simclr_resnet = resnet50()
-    simclr_resnet.fc = torch.nn.Linear(2048, config['representation_dim'])
-    simclr_resnet.conv1 = torch.nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
-    simclr_resnet.maxpool = torch.nn.Identity()
+    # simclr_resnet = resnet50()
+    # simclr_resnet.fc = torch.nn.Linear(2048, config['representation_dim'])
+    # simclr_resnet.conv1 = torch.nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
+    # simclr_resnet.maxpool = torch.nn.Identity()
 
     model = algo(
         observation_space=combined_meta['observation_space'],
@@ -181,7 +204,7 @@ def representation_learning(algo, device, log_dir, config):
         shuffle_batches=True,
         color_space=combined_meta['color_space'],
         save_interval=config['pretrain_save_interval'],
-        encoder_kwargs={'obs_encoder_cls': lambda *args: simclr_resnet},
+        encoder_kwargs={'obs_encoder_cls': lambda *args: SimCLRModel()},
         augmenter_kwargs=augmenter_kwargs,
         optimizer=torch.optim.Adam,
         optimizer_kwargs=optimizer_kwargs,
@@ -238,7 +261,7 @@ def test(net, memory_data_loader, test_data_loader, k, num_classes, temperature,
     with torch.no_grad():
         # generate feature bank
         for data, _, target in tqdm(memory_data_loader, desc='Feature extracting'):
-            feature = F.normalize(net(data.cuda(non_blocking=True)), dim=-1)
+            feature, out = net(data.cuda(non_blocking=True))
             feature_bank.append(feature)
         # [D, N]
         feature_bank = torch.cat(feature_bank, dim=0).t().contiguous()
@@ -248,7 +271,7 @@ def test(net, memory_data_loader, test_data_loader, k, num_classes, temperature,
         test_bar = tqdm(test_data_loader)
         for data, _, target in test_bar:
             data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
-            feature = F.normalize(net(data), dim=-1)
+            feature, out = net(data)
 
             total_num += data.size(0)
             # compute cos similarity between each feature vector and feature bank ---> [B, N]

From 6bda628eac87855f343d2a98d8eaa86565c65fce Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 10:36:14 -0700
Subject: [PATCH 044/123] Use SimCLR model for encoder at least

---
 src/il_representations/scripts/run_cifar.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 3a758a8e..ae8e005e 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -136,15 +136,15 @@ def __init__(self, feature_dim=128):
         # encoder
         # Temporarily add an extra layer to be closer to our model implementation
         self.f = nn.Sequential(*self.f)
-        # projection head
-        self.g = nn.Sequential(nn.Linear(2048, 512, bias=False), nn.BatchNorm1d(512),
-                               nn.ReLU(inplace=True), nn.Linear(512, feature_dim, bias=True))
+        # # projection head
+        # self.g = nn.Sequential(nn.Linear(2048, 512, bias=False), nn.BatchNorm1d(512),
+        #                        nn.ReLU(inplace=True), nn.Linear(512, feature_dim, bias=True))
 
     def forward(self, x):
         x = self.f(x)
         feature = torch.flatten(x, start_dim=1)
-        out = self.g(feature)
-        return F.normalize(feature, dim=-1), F.normalize(out, dim=-1)
+        #out = self.g(feature)
+        return F.normalize(feature, dim=-1) #, F.normalize(out, dim=-1)
 
 
 def representation_learning(algo, device, log_dir, config):
@@ -323,7 +323,7 @@ def default_config():
     pretrain_epochs = 1000
     pretrain_batches_per_epoch = 390
     finetune_epochs = 100
-    representation_dim = 512
+    representation_dim = 2048 ## TODO change back
     projection_dim = 128
     pretrain_lr = 3e-4
     pretrain_weight_decay = 1e-4

From d1463f5a877a02b79e7a7c23128202a68fd83399 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 10:43:25 -0700
Subject: [PATCH 045/123] No longer expect a tuple in KNN code

---
 src/il_representations/scripts/run_cifar.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index ae8e005e..8a5a6946 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -261,7 +261,7 @@ def test(net, memory_data_loader, test_data_loader, k, num_classes, temperature,
     with torch.no_grad():
         # generate feature bank
         for data, _, target in tqdm(memory_data_loader, desc='Feature extracting'):
-            feature, out = net(data.cuda(non_blocking=True))
+            feature = net(data.cuda(non_blocking=True))
             feature_bank.append(feature)
         # [D, N]
         feature_bank = torch.cat(feature_bank, dim=0).t().contiguous()
@@ -271,7 +271,7 @@ def test(net, memory_data_loader, test_data_loader, k, num_classes, temperature,
         test_bar = tqdm(test_data_loader)
         for data, _, target in test_bar:
             data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
-            feature, out = net(data)
+            feature = net(data)
 
             total_num += data.size(0)
             # compute cos similarity between each feature vector and feature bank ---> [B, N]

From 767511d5ab14a19388c4d3d173ce423705f65345 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 10:56:21 -0700
Subject: [PATCH 046/123] Modify decoder kwargs to be closer to SimCLR

---
 src/il_representations/scripts/run_cifar.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 8a5a6946..d497ef97 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -205,6 +205,7 @@ def representation_learning(algo, device, log_dir, config):
         color_space=combined_meta['color_space'],
         save_interval=config['pretrain_save_interval'],
         encoder_kwargs={'obs_encoder_cls': lambda *args: SimCLRModel()},
+        decoder_kwargs={'projection_architecture': [{'output_dim': 512}]}
         augmenter_kwargs=augmenter_kwargs,
         optimizer=torch.optim.Adam,
         optimizer_kwargs=optimizer_kwargs,

From 6e5563073c67b257dbdfab321180a666374429e7 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 10:56:50 -0700
Subject: [PATCH 047/123] Add comma back in

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index d497ef97..214858e2 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -205,7 +205,7 @@ def representation_learning(algo, device, log_dir, config):
         color_space=combined_meta['color_space'],
         save_interval=config['pretrain_save_interval'],
         encoder_kwargs={'obs_encoder_cls': lambda *args: SimCLRModel()},
-        decoder_kwargs={'projection_architecture': [{'output_dim': 512}]}
+        decoder_kwargs={'projection_architecture': [{'output_dim': 512}]},
         augmenter_kwargs=augmenter_kwargs,
         optimizer=torch.optim.Adam,
         optimizer_kwargs=optimizer_kwargs,

From 0fc1975ce78b8c768a19863bfa0b618eda1278ed Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 11:08:27 -0700
Subject: [PATCH 048/123] Add code to save images out

---
 src/il_representations/algos/representation_learner.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 5d4d4c3b..74cf9432 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -15,6 +15,7 @@
 from il_representations.algos.batch_extenders import QueueBatchExtender
 from il_representations.algos.utils import AverageMeter, LinearWarmupCosine
 from il_representations.data.read_dataset import datasets_to_loader, SubdatasetExtractor
+from il_representations.utils import save_rgb_tensor
 
 DEFAULT_HARDCODED_PARAMS = [
     'encoder', 'decoder', 'loss_calculator', 'augmenter',
@@ -303,6 +304,9 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 if self.preprocess_target:
                     targets = self._preprocess(targets)
                 contexts, targets = self.augmenter(contexts, targets)
+                if step == 0:
+                    save_rgb_tensor(contexts[0], os.path.join(self.log_dir, 'saved_images', 'contexts_0.png'))
+                    save_rgb_tensor(targets[0], os.path.join(self.log_dir, 'saved_images', 'targets_0.png'))
                 extra_context = self._preprocess_extra_context(extra_context)
                 # This is typically a noop, but sometimes we also augment the extra context
                 extra_context = self.augmenter.augment_extra_context(extra_context)

From 592a278efa71361cac348b879ed68f7f21748f66 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 11:21:06 -0700
Subject: [PATCH 049/123] Remove image preprocessing to avoid
 double-normalizing

---
 src/il_representations/algos/representation_learner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 74cf9432..7c93b3e7 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -300,9 +300,9 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 extra_context = self._prep_tensors(extra_context)
                 traj_ts_info = self._prep_tensors(traj_ts_info)
                 # Note: preprocessing might be better to do on CPU if, in future, we can parallelize doing so
-                contexts = self._preprocess(contexts)
-                if self.preprocess_target:
-                    targets = self._preprocess(targets)
+                # contexts = self._preprocess(contexts)
+                # if self.preprocess_target:
+                #     targets = self._preprocess(targets)
                 contexts, targets = self.augmenter(contexts, targets)
                 if step == 0:
                     save_rgb_tensor(contexts[0], os.path.join(self.log_dir, 'saved_images', 'contexts_0.png'))

From fc7388de5aa824e311f564f089531530ea751a35 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 11:51:16 -0700
Subject: [PATCH 050/123] Add more image saving and warnings

---
 .../algos/representation_learner.py                   | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 7c93b3e7..a2cf20d5 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -13,6 +13,7 @@
 
 from il_representations.algos.base_learner import BaseEnvironmentLearner
 from il_representations.algos.batch_extenders import QueueBatchExtender
+from il_representations.algos.encoders import warn_on_non_image_tensor
 from il_representations.algos.utils import AverageMeter, LinearWarmupCosine
 from il_representations.data.read_dataset import datasets_to_loader, SubdatasetExtractor
 from il_representations.utils import save_rgb_tensor
@@ -293,7 +294,9 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             for step, batch in enumerate(dataloader):
                 # Construct batch (currently just using Torch's default batch-creator)
                 contexts, targets, traj_ts_info, extra_context = self.unpack_batch(batch)
-
+                if step == 0:
+                    save_rgb_tensor(contexts[0], os.path.join(self.log_dir, 'saved_images', 'contexts_from_disk_0.png'))
+                    save_rgb_tensor(targets[0], os.path.join(self.log_dir, 'saved_images', 'targets_from_disk_0.png'))
                 # Use an algorithm-specific augmentation strategy to augment either
                 # just context, or both context and targets
                 contexts, targets = self._prep_tensors(contexts), self._prep_tensors(targets)
@@ -303,6 +306,9 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 # contexts = self._preprocess(contexts)
                 # if self.preprocess_target:
                 #     targets = self._preprocess(targets)
+                if step == 0:
+                    save_rgb_tensor(contexts[0], os.path.join(self.log_dir, 'saved_images', 'contexts_pre_aug_0.png'))
+                    save_rgb_tensor(targets[0], os.path.join(self.log_dir, 'saved_images', 'targets_pre_aug_0.png'))
                 contexts, targets = self.augmenter(contexts, targets)
                 if step == 0:
                     save_rgb_tensor(contexts[0], os.path.join(self.log_dir, 'saved_images', 'contexts_0.png'))
@@ -310,7 +316,8 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 extra_context = self._preprocess_extra_context(extra_context)
                 # This is typically a noop, but sometimes we also augment the extra context
                 extra_context = self.augmenter.augment_extra_context(extra_context)
-
+                warn_on_non_image_tensor(contexts)
+                warn_on_non_image_tensor(targets)
                 # These will typically just use the forward() function for the encoder, but can optionally
                 # use a specific encode_context and encode_target if one is implemented
                 encoded_contexts = self.encoder.encode_context(contexts, traj_ts_info)

From 56ebd6fe152a0a90f92dd57d7e37ee78e18abc51 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 11:51:24 -0700
Subject: [PATCH 051/123] Add more image saving and warnings

---
 src/il_representations/algos/representation_learner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index a2cf20d5..012cfe15 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -294,6 +294,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             for step, batch in enumerate(dataloader):
                 # Construct batch (currently just using Torch's default batch-creator)
                 contexts, targets, traj_ts_info, extra_context = self.unpack_batch(batch)
+                breakpoint()
                 if step == 0:
                     save_rgb_tensor(contexts[0], os.path.join(self.log_dir, 'saved_images', 'contexts_from_disk_0.png'))
                     save_rgb_tensor(targets[0], os.path.join(self.log_dir, 'saved_images', 'targets_from_disk_0.png'))

From 618f4eab756e687013a4e8f222c734b50524df37 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 12:23:26 -0700
Subject: [PATCH 052/123] Save out more images

---
 .../algos/representation_learner.py              | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 012cfe15..cd15f4e4 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -294,10 +294,10 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             for step, batch in enumerate(dataloader):
                 # Construct batch (currently just using Torch's default batch-creator)
                 contexts, targets, traj_ts_info, extra_context = self.unpack_batch(batch)
-                breakpoint()
                 if step == 0:
-                    save_rgb_tensor(contexts[0], os.path.join(self.log_dir, 'saved_images', 'contexts_from_disk_0.png'))
-                    save_rgb_tensor(targets[0], os.path.join(self.log_dir, 'saved_images', 'targets_from_disk_0.png'))
+                    for i in range(10):
+                        save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_from_disk_{i}.png'))
+                        save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_from_disk_{i}.png'))
                 # Use an algorithm-specific augmentation strategy to augment either
                 # just context, or both context and targets
                 contexts, targets = self._prep_tensors(contexts), self._prep_tensors(targets)
@@ -308,12 +308,14 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 # if self.preprocess_target:
                 #     targets = self._preprocess(targets)
                 if step == 0:
-                    save_rgb_tensor(contexts[0], os.path.join(self.log_dir, 'saved_images', 'contexts_pre_aug_0.png'))
-                    save_rgb_tensor(targets[0], os.path.join(self.log_dir, 'saved_images', 'targets_pre_aug_0.png'))
+                    for i in range(10):
+                        save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_pre_aug_{i}.png'))
+                        save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_pre_aug_{i}.png'))
                 contexts, targets = self.augmenter(contexts, targets)
                 if step == 0:
-                    save_rgb_tensor(contexts[0], os.path.join(self.log_dir, 'saved_images', 'contexts_0.png'))
-                    save_rgb_tensor(targets[0], os.path.join(self.log_dir, 'saved_images', 'targets_0.png'))
+                    for i in range(10):
+                        save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_{i}.png'))
+                        save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_{i}.png'))
                 extra_context = self._preprocess_extra_context(extra_context)
                 # This is typically a noop, but sometimes we also augment the extra context
                 extra_context = self.augmenter.augment_extra_context(extra_context)

From 8b6f75a3706091dbe1a213fde067df81d2a66df2 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 12:26:10 -0700
Subject: [PATCH 053/123] Try to get augmentations to match SimCLR

---
 src/il_representations/scripts/run_cifar.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 214858e2..17ae80fe 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -154,10 +154,10 @@ def representation_learning(algo, device, log_dir, config):
     assert issubclass(algo, algos.RepresentationLearner)
 
     rep_learning_augmentations = transforms.Compose([
-        transforms.Lambda(lambda x: (x.cpu().numpy() * 255).astype(np.uint8)),
-        transforms.ToPILImage(),
-        transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
-        transforms.RandomHorizontalFlip(),
+        # transforms.Lambda(lambda x: (x.cpu().numpy() * 255).astype(np.uint8)),
+        # transforms.ToPILImage(),
+        transforms.RandomResizedCrop(32), #, interpolation=PIL.Image.BICUBIC),
+        transforms.RandomHorizontalFlip(p=0.5),
         transforms.RandomApply([
             transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
         ], p=0.8),

From 01b6a6a63732e9670a84e2bb035bc759829cbd2e Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 12:26:59 -0700
Subject: [PATCH 054/123] Still convert to PILImage

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 17ae80fe..93ef0f27 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -155,7 +155,7 @@ def representation_learning(algo, device, log_dir, config):
 
     rep_learning_augmentations = transforms.Compose([
         # transforms.Lambda(lambda x: (x.cpu().numpy() * 255).astype(np.uint8)),
-        # transforms.ToPILImage(),
+        transforms.ToPILImage(),
         transforms.RandomResizedCrop(32), #, interpolation=PIL.Image.BICUBIC),
         transforms.RandomHorizontalFlip(p=0.5),
         transforms.RandomApply([

From c5aee28b07d8b23b8cecd24f8ea30f3920619582 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 12:27:57 -0700
Subject: [PATCH 055/123] Add back numpy conversion without x255

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 93ef0f27..938aa7fc 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -154,7 +154,7 @@ def representation_learning(algo, device, log_dir, config):
     assert issubclass(algo, algos.RepresentationLearner)
 
     rep_learning_augmentations = transforms.Compose([
-        # transforms.Lambda(lambda x: (x.cpu().numpy() * 255).astype(np.uint8)),
+        transforms.Lambda(lambda x: x.cpu().numpy().astype(np.uint8)),
         transforms.ToPILImage(),
         transforms.RandomResizedCrop(32), #, interpolation=PIL.Image.BICUBIC),
         transforms.RandomHorizontalFlip(p=0.5),

From 94f0b9affcfa76e7905c9c49306021efeb2e22b6 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 12:29:01 -0700
Subject: [PATCH 056/123] Add 255x back in

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 938aa7fc..865be7ad 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -154,7 +154,7 @@ def representation_learning(algo, device, log_dir, config):
     assert issubclass(algo, algos.RepresentationLearner)
 
     rep_learning_augmentations = transforms.Compose([
-        transforms.Lambda(lambda x: x.cpu().numpy().astype(np.uint8)),
+        transforms.Lambda(lambda x: (x.cpu().numpy() * 255).astype(np.uint8)),
         transforms.ToPILImage(),
         transforms.RandomResizedCrop(32), #, interpolation=PIL.Image.BICUBIC),
         transforms.RandomHorizontalFlip(p=0.5),

From be29ef9f017fc0c5efe7b3d9d177b3b36a83ac01 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 13:44:03 -0700
Subject: [PATCH 057/123] Normalize in the same way as SimCLR

---
 src/il_representations/scripts/run_cifar.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 865be7ad..bcefb946 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -163,7 +163,8 @@ def representation_learning(algo, device, log_dir, config):
         ], p=0.8),
         transforms.RandomGrayscale(p=0.2),
         transforms.ToTensor(),
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
+        #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
         # SimCLR doesn't use blur for CIFAR-10
     ])
 

From 9270bbcba55370584b8bd8ca3a67a2713f3b55d2 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 13:46:14 -0700
Subject: [PATCH 058/123] For some reason getting a dimension error

---
 src/il_representations/scripts/run_cifar.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index bcefb946..69c20a23 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -163,8 +163,8 @@ def representation_learning(algo, device, log_dir, config):
         ], p=0.8),
         transforms.RandomGrayscale(p=0.2),
         transforms.ToTensor(),
-        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
-        #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        #transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
+        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
         # SimCLR doesn't use blur for CIFAR-10
     ])
 

From 7039fbf61a1d6e2e0eff4dccd84737fc78ed7f58 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 13:47:41 -0700
Subject: [PATCH 059/123] Go back to other normalization

---
 src/il_representations/scripts/run_cifar.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 69c20a23..bcefb946 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -163,8 +163,8 @@ def representation_learning(algo, device, log_dir, config):
         ], p=0.8),
         transforms.RandomGrayscale(p=0.2),
         transforms.ToTensor(),
-        #transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
+        #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
         # SimCLR doesn't use blur for CIFAR-10
     ])
 

From cc24d47667207fa48c98d6c2a62136fb15c137b7 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Thu, 22 Apr 2021 18:01:43 -0700
Subject: [PATCH 060/123] Cleanup and final push for the evening

---
 src/il_representations/algos/losses.py      | 1 -
 src/il_representations/scripts/run_cifar.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index 5733d96d..cc3a7387 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -230,7 +230,6 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
             avg_self_similarity = logits_ab.diag().mean().item()
             logits_other_sim_mask = ~torch.eye(batch_size, dtype=bool, device=logits_ab.device)
             avg_other_similarity = logits_ab.masked_select(logits_other_sim_mask).mean().item()
-
             sb_logger.record('avg_self_similarity', avg_self_similarity)
             sb_logger.record('avg_other_similarity', avg_other_similarity)
             sb_logger.record('self_other_sim_delta', avg_self_similarity - avg_other_similarity)
diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index bcefb946..d5aff538 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -123,6 +123,7 @@ def evaluate_classifier(classifier, data_dir, device):
 
     return test_acc_meter.avg
 
+
 class SimCLRModel(nn.Module):
     def __init__(self, feature_dim=128):
         super(SimCLRModel, self).__init__()

From d201c9aa9f064835a75c1665d2af089b8e25407f Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 09:50:21 -0700
Subject: [PATCH 061/123] Switch from bilinear to bicubic interpolation

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index d5aff538..f3a55059 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -157,7 +157,7 @@ def representation_learning(algo, device, log_dir, config):
     rep_learning_augmentations = transforms.Compose([
         transforms.Lambda(lambda x: (x.cpu().numpy() * 255).astype(np.uint8)),
         transforms.ToPILImage(),
-        transforms.RandomResizedCrop(32), #, interpolation=PIL.Image.BICUBIC),
+        transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
         transforms.RandomHorizontalFlip(p=0.5),
         transforms.RandomApply([
             transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),

From fda28dd28c2dd82dd57171c63441aba3112b40c9 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 10:49:53 -0700
Subject: [PATCH 062/123] No longer convert to numpy array before PIL image

---
 src/il_representations/scripts/run_cifar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index f3a55059..9e095d93 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -155,7 +155,7 @@ def representation_learning(algo, device, log_dir, config):
     assert issubclass(algo, algos.RepresentationLearner)
 
     rep_learning_augmentations = transforms.Compose([
-        transforms.Lambda(lambda x: (x.cpu().numpy() * 255).astype(np.uint8)),
+        transforms.Lambda(lambda x: (x * 255).int()), # No longer convert to Numpy array so PIL image works
         transforms.ToPILImage(),
         transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
         transforms.RandomHorizontalFlip(p=0.5),

From 292df58687883b7eed395bf756f137cbaaa048f5 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 10:53:04 -0700
Subject: [PATCH 063/123] Transpose numpy array so PILImage has the right
 shape:

---
 src/il_representations/scripts/run_cifar.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 9e095d93..a678c470 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -155,7 +155,8 @@ def representation_learning(algo, device, log_dir, config):
     assert issubclass(algo, algos.RepresentationLearner)
 
     rep_learning_augmentations = transforms.Compose([
-        transforms.Lambda(lambda x: (x * 255).int()), # No longer convert to Numpy array so PIL image works
+        transforms.Lambda(lambda x: np.transpose((x.cpu().numpy() * 255).astype(np.uint8),
+                                                 axes=(1, 2, 0))),
         transforms.ToPILImage(),
         transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
         transforms.RandomHorizontalFlip(p=0.5),

From 5bdada60ee0872c82ba6d488d7b54ba6b320ab46 Mon Sep 17 00:00:00 2001
From: Cody Wild <cody@svm.bair.berkeley.edu>
Date: Fri, 23 Apr 2021 10:53:18 -0700
Subject: [PATCH 064/123] Breakpoint before augmentation

---
 src/il_representations/algos/representation_learner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index cd15f4e4..23f4c9c0 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -311,6 +311,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                     for i in range(10):
                         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_pre_aug_{i}.png'))
                         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_pre_aug_{i}.png'))
+                breakpoint()
                 contexts, targets = self.augmenter(contexts, targets)
                 if step == 0:
                     for i in range(10):

From 40d8e7a5ad5bc77ef3738496b8671edf823a5ae8 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 15:55:40 -0700
Subject: [PATCH 065/123] Uniform_ contexts and target

---
 src/il_representations/algos/representation_learner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index cd15f4e4..ae900508 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -323,8 +323,8 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 warn_on_non_image_tensor(targets)
                 # These will typically just use the forward() function for the encoder, but can optionally
                 # use a specific encode_context and encode_target if one is implemented
-                encoded_contexts = self.encoder.encode_context(contexts, traj_ts_info)
-                encoded_targets = self.encoder.encode_target(targets, traj_ts_info)
+                encoded_contexts = self.encoder.encode_context(contexts.uniform_(-2.5, 2.5), traj_ts_info)
+                encoded_targets = self.encoder.encode_target(targets.uniform_(-2.5, 2.5), traj_ts_info)
                 # Typically the identity function
                 encoded_extra_context = self.encoder.encode_extra_context(extra_context, traj_ts_info)
 

From 43c2f5714bae35c5069b151a6ed350a1ff998c8b Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 15:59:15 -0700
Subject: [PATCH 066/123] log every interval

---
 src/il_representations/scripts/run_cifar.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index a678c470..c412eb1f 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -216,8 +216,8 @@ def representation_learning(algo, device, log_dir, config):
         scheduler_kwargs={'warmup_epoch': 2, 'total_epochs': num_epochs},
         loss_calculator_kwargs={'temp': config['pretrain_temperature'],
                                 'use_repo_loss': config['use_repo_loss']},
-        log_interval=10,
-        calc_log_interval=10
+        log_interval=1,
+        calc_log_interval=1
     )
     _, encoder_checkpoint_path = model.learn(rep_learning_data, batches_per_epoch, num_epochs)
     print("Representation Learning trained!")

From d5bdbb0a556c54721edf18e209d92a0e2a5d5273 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 16:05:37 -0700
Subject: [PATCH 067/123] Make zs uniform

---
 src/il_representations/algos/losses.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index cc3a7387..1468b515 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -209,10 +209,15 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
                 breakpoint()
             return loss
         else:
+            z_i = z_i.uniform_(-.3, .3)
+            z_j = z_j.uniform_(-.3, .3)
+            if not self.normalize:
+                breakpoint()
             if self.normalize:  # Use cosine similarity
                 z_i = F.normalize(z_i, dim=1)
                 z_j = F.normalize(z_j, dim=1)
 
+
             mask = (torch.eye(batch_size) * self.large_num).to(self.device)
 
             # Similarity of the original images with all other original images in current batch. Return a matrix of NxN.

From 44466217626063b6ffc0f07a9cd7b224de9ce797 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 16:31:57 -0700
Subject: [PATCH 068/123] Set seed to 10

---
 src/il_representations/algos/representation_learner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index ae900508..8344f2af 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -330,6 +330,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
 
                 # Use an algorithm-specific decoder to "decode" the representations into a loss-compatible tensor
                 # As with encode, these will typically just use forward()
+                torch.manual_seed(10)
                 decoded_contexts = self.decoder.decode_context(encoded_contexts, traj_ts_info, encoded_extra_context)
                 decoded_targets = self.decoder.decode_target(encoded_targets, traj_ts_info, encoded_extra_context)
 

From 9884d1be9df391cdfc5358ce2419c900697450e6 Mon Sep 17 00:00:00 2001
From: Cody Wild <cody@svm.bair.berkeley.edu>
Date: Fri, 23 Apr 2021 16:33:09 -0700
Subject: [PATCH 069/123] Add seed back in

---
 src/il_representations/algos/representation_learner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 06ff22c0..adfd364d 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -324,6 +324,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 warn_on_non_image_tensor(targets)
                 # These will typically just use the forward() function for the encoder, but can optionally
                 # use a specific encode_context and encode_target if one is implemented
+                torch.manual_seed(10)
                 encoded_contexts = self.encoder.encode_context(contexts.uniform_(-2.5, 2.5), traj_ts_info)
                 encoded_targets = self.encoder.encode_target(targets.uniform_(-2.5, 2.5), traj_ts_info)
                 # Typically the identity function

From 432c7b0fff0adc315b79e4f520a4f63f6bcc1ea2 Mon Sep 17 00:00:00 2001
From: Cody Wild <cody@svm.bair.berkeley.edu>
Date: Fri, 23 Apr 2021 16:34:22 -0700
Subject: [PATCH 070/123] No longer have random zs

---
 src/il_representations/algos/losses.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index 1468b515..c5014f12 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -209,8 +209,6 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist=None)
                 breakpoint()
             return loss
         else:
-            z_i = z_i.uniform_(-.3, .3)
-            z_j = z_j.uniform_(-.3, .3)
             if not self.normalize:
                 breakpoint()
             if self.normalize:  # Use cosine similarity

From 1224059b99921b36c8170f20df4960782fe29c7e Mon Sep 17 00:00:00 2001
From: Cody Wild <cody@svm.bair.berkeley.edu>
Date: Fri, 23 Apr 2021 16:35:30 -0700
Subject: [PATCH 071/123] Remove breakpoint

---
 src/il_representations/algos/representation_learner.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index adfd364d..8cf532ac 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -311,7 +311,6 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                     for i in range(10):
                         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_pre_aug_{i}.png'))
                         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_pre_aug_{i}.png'))
-                breakpoint()
                 contexts, targets = self.augmenter(contexts, targets)
                 if step == 0:
                     for i in range(10):
@@ -332,7 +331,6 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
 
                 # Use an algorithm-specific decoder to "decode" the representations into a loss-compatible tensor
                 # As with encode, these will typically just use forward()
-                torch.manual_seed(10)
                 decoded_contexts = self.decoder.decode_context(encoded_contexts, traj_ts_info, encoded_extra_context)
                 decoded_targets = self.decoder.decode_target(encoded_targets, traj_ts_info, encoded_extra_context)
 

From e6bff326bf2a0c9f57e4222a5c7387a2e03d07a7 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 16:41:40 -0700
Subject: [PATCH 072/123] Remove random seed

---
 src/il_representations/algos/representation_learner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 8cf532ac..ae900508 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -323,7 +323,6 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 warn_on_non_image_tensor(targets)
                 # These will typically just use the forward() function for the encoder, but can optionally
                 # use a specific encode_context and encode_target if one is implemented
-                torch.manual_seed(10)
                 encoded_contexts = self.encoder.encode_context(contexts.uniform_(-2.5, 2.5), traj_ts_info)
                 encoded_targets = self.encoder.encode_target(targets.uniform_(-2.5, 2.5), traj_ts_info)
                 # Typically the identity function

From 877ecc505cc972d73bdadfd026c3b86295d4325b Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 16:45:11 -0700
Subject: [PATCH 073/123] Examine distribution after encoder

---
 src/il_representations/algos/representation_learner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index ae900508..a5414931 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -327,7 +327,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 encoded_targets = self.encoder.encode_target(targets.uniform_(-2.5, 2.5), traj_ts_info)
                 # Typically the identity function
                 encoded_extra_context = self.encoder.encode_extra_context(extra_context, traj_ts_info)
-
+                breakpoint()
                 # Use an algorithm-specific decoder to "decode" the representations into a loss-compatible tensor
                 # As with encode, these will typically just use forward()
                 decoded_contexts = self.decoder.decode_context(encoded_contexts, traj_ts_info, encoded_extra_context)

From fc9dad24a027b4dfe57c4201900da939fc54f33c Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 16:49:05 -0700
Subject: [PATCH 074/123] No longer randomize images

---
 src/il_representations/algos/representation_learner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index a5414931..fe716ae4 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -323,8 +323,8 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 warn_on_non_image_tensor(targets)
                 # These will typically just use the forward() function for the encoder, but can optionally
                 # use a specific encode_context and encode_target if one is implemented
-                encoded_contexts = self.encoder.encode_context(contexts.uniform_(-2.5, 2.5), traj_ts_info)
-                encoded_targets = self.encoder.encode_target(targets.uniform_(-2.5, 2.5), traj_ts_info)
+                encoded_contexts = self.encoder.encode_context(contexts, traj_ts_info)
+                encoded_targets = self.encoder.encode_target(targets, traj_ts_info)
                 # Typically the identity function
                 encoded_extra_context = self.encoder.encode_extra_context(extra_context, traj_ts_info)
                 breakpoint()

From 7bbb153f1ab173a4aa8ab4eb1168b2498db76e9a Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 16:50:44 -0700
Subject: [PATCH 075/123] Remove extraneous breakpoint

---
 src/il_representations/algos/representation_learner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index fe716ae4..1da0e958 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -327,7 +327,6 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 encoded_targets = self.encoder.encode_target(targets, traj_ts_info)
                 # Typically the identity function
                 encoded_extra_context = self.encoder.encode_extra_context(extra_context, traj_ts_info)
-                breakpoint()
                 # Use an algorithm-specific decoder to "decode" the representations into a loss-compatible tensor
                 # As with encode, these will typically just use forward()
                 decoded_contexts = self.decoder.decode_context(encoded_contexts, traj_ts_info, encoded_extra_context)

From cb06dbc363ac4260ec80dac60b23d94cff89e170 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Fri, 23 Apr 2021 18:03:46 -0700
Subject: [PATCH 076/123] Add parameter check to repl

---
 src/il_representations/algos/representation_learner.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 1da0e958..8c938e3f 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -279,11 +279,15 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
 
         self.encoder.train(True)
         self.decoder.train(True)
+        for pname, pval in sorted(self.encoder.named_parameters()):
+            print(f'{pname}: {pval.float().mean().item():.4g} pm {pval.float().std().item():.4g}, shape {pval.shape}')
         batches_trained = 0
         logging.debug(
             f"Training for {n_epochs} epochs, each of {batches_per_epoch} "
             f"batches (batch size {self.batch_size})")
-
+        # train_data = utils.CIFAR10Pair(root='data', train=True, transform=utils.train_transform, download=True)
+        # train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True,
+        #                           drop_last=True)
         for epoch_num in range(1, n_epochs + 1):
             loss_meter = AverageMeter()
             # Set encoder and decoder to be in training mode
@@ -294,6 +298,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             for step, batch in enumerate(dataloader):
                 # Construct batch (currently just using Torch's default batch-creator)
                 contexts, targets, traj_ts_info, extra_context = self.unpack_batch(batch)
+
                 if step == 0:
                     for i in range(10):
                         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_from_disk_{i}.png'))

From 71e129facd5ebaae3967383fb4be48be5adbbbca Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Mon, 26 Apr 2021 14:50:31 -0700
Subject: [PATCH 077/123] Swap our data loader for theirs

---
 .../algos/representation_learner.py           | 64 +++++++++++++++----
 1 file changed, 50 insertions(+), 14 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 8c938e3f..21d90ec2 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -17,6 +17,33 @@
 from il_representations.algos.utils import AverageMeter, LinearWarmupCosine
 from il_representations.data.read_dataset import datasets_to_loader, SubdatasetExtractor
 from il_representations.utils import save_rgb_tensor
+from torch.utils.data import DataLoader
+
+from PIL import Image
+from torchvision.datasets import CIFAR10
+import os
+import numpy as np
+from torchvision import transforms
+
+class CIFAR10Pair(CIFAR10):
+    """CIFAR10 Dataset.
+    """
+
+    def __getitem__(self, index):
+        img, target = self.data[index], self.targets[index]
+        img = Image.fromarray(img)
+        id_val = np.random.randint(0, 50000)
+        #save_image(img, f'results/{id_val}_img_pre_trans.png')
+        if self.transform is not None:
+            pos_1 = self.transform(img)
+            pos_2 = self.transform(img)
+            # save_rgb_tensor(pos_1, f'results/{id_val}_pos1.png')
+            # save_rgb_tensor(pos_2, f'results/{id_val}_pos2.png')
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+
+        return pos_1, pos_2, target
+
 
 DEFAULT_HARDCODED_PARAMS = [
     'encoder', 'decoder', 'loss_calculator', 'augmenter',
@@ -260,14 +287,14 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             `loss_record` is a list of average loss values encountered at each
             epoch. `most_recent_encoder_checkpoint_path` is self-explanatory.
         """
-        subdataset_extractor = SubdatasetExtractor(n_trajs=n_trajs)
-        dataloader = datasets_to_loader(
-            datasets, batch_size=self.batch_size,
-            nominal_length=batches_per_epoch * self.batch_size,
-            max_workers=self.dataset_max_workers,
-            shuffle_buffer_size=self.shuffle_buffer_size,
-            shuffle=self.shuffle_batches,
-            preprocessors=(subdataset_extractor, self.target_pair_constructor, ))
+        # subdataset_extractor = SubdatasetExtractor(n_trajs=n_trajs)
+        # dataloader = datasets_to_loader(
+        #     datasets, batch_size=self.batch_size,
+        #     nominal_length=batches_per_epoch * self.batch_size,
+        #     max_workers=self.dataset_max_workers,
+        #     shuffle_buffer_size=self.shuffle_buffer_size,
+        #     shuffle=self.shuffle_batches,
+        #     preprocessors=(subdataset_extractor, self.target_pair_constructor, ))
 
         loss_record = []
 
@@ -279,15 +306,23 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
 
         self.encoder.train(True)
         self.decoder.train(True)
-        for pname, pval in sorted(self.encoder.named_parameters()):
-            print(f'{pname}: {pval.float().mean().item():.4g} pm {pval.float().std().item():.4g}, shape {pval.shape}')
+        # for pname, pval in sorted(self.encoder.named_parameters()):
+        #     print(f'{pname}: {pval.float().mean().item():.4g} pm {pval.float().std().item():.4g}, shape {pval.shape}')
         batches_trained = 0
         logging.debug(
             f"Training for {n_epochs} epochs, each of {batches_per_epoch} "
             f"batches (batch size {self.batch_size})")
-        # train_data = utils.CIFAR10Pair(root='data', train=True, transform=utils.train_transform, download=True)
-        # train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True,
-        #                           drop_last=True)
+        # TODO add transform back in, and probably comment out our augmenter line?
+        train_transform = transforms.Compose([
+            transforms.RandomResizedCrop(32),
+            transforms.RandomHorizontalFlip(p=0.5),
+            transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
+            transforms.RandomGrayscale(p=0.2),
+            transforms.ToTensor(),
+            transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
+        train_data = CIFAR10Pair(root='data', train=True, transform=train_transform, download=True)
+        dataloader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True, num_workers=16, pin_memory=True,
+                                  drop_last=True)
         for epoch_num in range(1, n_epochs + 1):
             loss_meter = AverageMeter()
             # Set encoder and decoder to be in training mode
@@ -316,7 +351,8 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                     for i in range(10):
                         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_pre_aug_{i}.png'))
                         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_pre_aug_{i}.png'))
-                contexts, targets = self.augmenter(contexts, targets)
+                # TODO put back in when done with "swap their data in" test
+                #contexts, targets = self.augmenter(contexts, targets)
                 if step == 0:
                     for i in range(10):
                         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_{i}.png'))

From 5553d2a6d164b1f80f5122e2bc136f391193bd61 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Mon, 26 Apr 2021 14:52:04 -0700
Subject: [PATCH 078/123] Add dataloader back in and add breakpoint

---
 .../algos/representation_learner.py           | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 21d90ec2..e2d78adb 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -287,14 +287,14 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             `loss_record` is a list of average loss values encountered at each
             epoch. `most_recent_encoder_checkpoint_path` is self-explanatory.
         """
-        # subdataset_extractor = SubdatasetExtractor(n_trajs=n_trajs)
-        # dataloader = datasets_to_loader(
-        #     datasets, batch_size=self.batch_size,
-        #     nominal_length=batches_per_epoch * self.batch_size,
-        #     max_workers=self.dataset_max_workers,
-        #     shuffle_buffer_size=self.shuffle_buffer_size,
-        #     shuffle=self.shuffle_batches,
-        #     preprocessors=(subdataset_extractor, self.target_pair_constructor, ))
+        subdataset_extractor = SubdatasetExtractor(n_trajs=n_trajs)
+        dataloader = datasets_to_loader(
+            datasets, batch_size=self.batch_size,
+            nominal_length=batches_per_epoch * self.batch_size,
+            max_workers=self.dataset_max_workers,
+            shuffle_buffer_size=self.shuffle_buffer_size,
+            shuffle=self.shuffle_batches,
+            preprocessors=(subdataset_extractor, self.target_pair_constructor, ))
 
         loss_record = []
 
@@ -321,7 +321,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             transforms.ToTensor(),
             transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
         train_data = CIFAR10Pair(root='data', train=True, transform=train_transform, download=True)
-        dataloader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True, num_workers=16, pin_memory=True,
+        train_loader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True, num_workers=16, pin_memory=True,
                                   drop_last=True)
         for epoch_num in range(1, n_epochs + 1):
             loss_meter = AverageMeter()
@@ -333,6 +333,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             for step, batch in enumerate(dataloader):
                 # Construct batch (currently just using Torch's default batch-creator)
                 contexts, targets, traj_ts_info, extra_context = self.unpack_batch(batch)
+                breakpoint()
 
                 if step == 0:
                     for i in range(10):

From 294bab4fa1a4e3af64d8cefbf462e66c32396440 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Mon, 26 Apr 2021 14:56:10 -0700
Subject: [PATCH 079/123] Try to get .next() to work

---
 src/il_representations/algos/representation_learner.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index e2d78adb..291815ee 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -321,8 +321,8 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             transforms.ToTensor(),
             transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
         train_data = CIFAR10Pair(root='data', train=True, transform=train_transform, download=True)
-        train_loader = DataLoader(train_data, batch_size=self.batch_size, shuffle=True, num_workers=16, pin_memory=True,
-                                  drop_last=True)
+        train_loader = iter(DataLoader(train_data, batch_size=self.batch_size, shuffle=True, num_workers=16, pin_memory=True,
+                                  drop_last=True))
         for epoch_num in range(1, n_epochs + 1):
             loss_meter = AverageMeter()
             # Set encoder and decoder to be in training mode
@@ -333,6 +333,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             for step, batch in enumerate(dataloader):
                 # Construct batch (currently just using Torch's default batch-creator)
                 contexts, targets, traj_ts_info, extra_context = self.unpack_batch(batch)
+                contexts, targets, _ = train_loader.next()
                 breakpoint()
 
                 if step == 0:

From 2884db69d7909cba0852b6fe1c54029a8e7c30e5 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Mon, 26 Apr 2021 14:59:52 -0700
Subject: [PATCH 080/123] Swap in new contexts/targets temporarily

---
 src/il_representations/algos/representation_learner.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 291815ee..4a2e194e 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -332,9 +332,8 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             timer_last_batches_trained = batches_trained
             for step, batch in enumerate(dataloader):
                 # Construct batch (currently just using Torch's default batch-creator)
-                contexts, targets, traj_ts_info, extra_context = self.unpack_batch(batch)
+                old_contexts, old_targets, traj_ts_info, extra_context = self.unpack_batch(batch)
                 contexts, targets, _ = train_loader.next()
-                breakpoint()
 
                 if step == 0:
                     for i in range(10):

From 9eb90b94a97ba96dda81974fb2dd10e4f00cd246 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Mon, 26 Apr 2021 15:09:18 -0700
Subject: [PATCH 081/123] If we double augment that should break things...
 right?

---
 src/il_representations/algos/representation_learner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 4a2e194e..1f0760a1 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -353,7 +353,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_pre_aug_{i}.png'))
                         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_pre_aug_{i}.png'))
                 # TODO put back in when done with "swap their data in" test
-                #contexts, targets = self.augmenter(contexts, targets)
+                contexts, targets = self.augmenter(contexts, targets)
                 if step == 0:
                     for i in range(10):
                         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_{i}.png'))

From 31864d6a1fcc56c89d8e7c76b63db7b92a3ae3d7 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Mon, 26 Apr 2021 15:10:41 -0700
Subject: [PATCH 082/123] Switch back to using our data loader

---
 .../algos/representation_learner.py           | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 1f0760a1..879cf2fe 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -313,16 +313,16 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             f"Training for {n_epochs} epochs, each of {batches_per_epoch} "
             f"batches (batch size {self.batch_size})")
         # TODO add transform back in, and probably comment out our augmenter line?
-        train_transform = transforms.Compose([
-            transforms.RandomResizedCrop(32),
-            transforms.RandomHorizontalFlip(p=0.5),
-            transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
-            transforms.RandomGrayscale(p=0.2),
-            transforms.ToTensor(),
-            transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
-        train_data = CIFAR10Pair(root='data', train=True, transform=train_transform, download=True)
-        train_loader = iter(DataLoader(train_data, batch_size=self.batch_size, shuffle=True, num_workers=16, pin_memory=True,
-                                  drop_last=True))
+        # train_transform = transforms.Compose([
+        #     transforms.RandomResizedCrop(32),
+        #     transforms.RandomHorizontalFlip(p=0.5),
+        #     transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
+        #     transforms.RandomGrayscale(p=0.2),
+        #     transforms.ToTensor(),
+        #     transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
+        # train_data = CIFAR10Pair(root='data', train=True, transform=train_transform, download=True)
+        # train_loader = iter(DataLoader(train_data, batch_size=self.batch_size, shuffle=True, num_workers=16, pin_memory=True,
+        #                           drop_last=True))
         for epoch_num in range(1, n_epochs + 1):
             loss_meter = AverageMeter()
             # Set encoder and decoder to be in training mode
@@ -332,8 +332,8 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             timer_last_batches_trained = batches_trained
             for step, batch in enumerate(dataloader):
                 # Construct batch (currently just using Torch's default batch-creator)
-                old_contexts, old_targets, traj_ts_info, extra_context = self.unpack_batch(batch)
-                contexts, targets, _ = train_loader.next()
+                contexts, targets, traj_ts_info, extra_context = self.unpack_batch(batch)
+                # contexts, targets, _ = train_loader.next()
 
                 if step == 0:
                     for i in range(10):

From 502c223e8b916dcaae626a9caed6efeb8f7b0a94 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Mon, 26 Apr 2021 15:14:20 -0700
Subject: [PATCH 083/123] Skip the decoding step entirely

---
 src/il_representations/algos/representation_learner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 879cf2fe..3a53b868 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -381,7 +381,8 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 # Use an algorithm-specific loss function. Typically this only requires decoded_contexts and
                 # decoded_targets, but VAE requires encoded_contexts, so we pass it in here
 
-                loss = self.loss_calculator(decoded_contexts, decoded_targets, encoded_contexts)
+                loss = self.loss_calculator(encoded_contexts, encoded_targets, encoded_contexts)
+                #loss = self.loss_calculator(decoded_contexts, decoded_targets, encoded_contexts)
                 if batches_trained % self.calc_log_interval == 0:
                     loss_item = loss.item()
                     assert not np.isnan(loss_item), "Loss is NaN"

From c4096a18aca1db23d786cdf8f82e1b80379089aa Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Mon, 26 Apr 2021 15:15:54 -0700
Subject: [PATCH 084/123] Don't calculate norm on decoder while we're testing
 out not using it

---
 src/il_representations/algos/representation_learner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 3a53b868..e92923a9 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -185,7 +185,7 @@ def _calculate_norms(self, norm_type=2):
         norm_type = float(norm_type)
 
         encoder_params, decoder_params = self._get_trainable_parameters()
-        trainable_params = encoder_params + decoder_params
+        trainable_params = encoder_params # + decoder_params
         stacked_gradient_norms = torch.stack([torch.norm(p.grad.detach(), norm_type).to(self.device) for p in trainable_params])
         stacked_weight_norms = torch.stack([torch.norm(p.detach(), norm_type).to(self.device) for p in trainable_params])
 

From 3dd1e80aa261cfb08b74ad8ff46ecaee816de7ae Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Mon, 26 Apr 2021 15:16:35 -0700
Subject: [PATCH 085/123] Add TODO to restore decoder params in _calculate_norms

---
 src/il_representations/algos/representation_learner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index e92923a9..45b2bee1 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -185,6 +185,7 @@ def _calculate_norms(self, norm_type=2):
         norm_type = float(norm_type)
 
         encoder_params, decoder_params = self._get_trainable_parameters()
+        # TODO undo
         trainable_params = encoder_params # + decoder_params
         stacked_gradient_norms = torch.stack([torch.norm(p.grad.detach(), norm_type).to(self.device) for p in trainable_params])
         stacked_weight_norms = torch.stack([torch.norm(p.detach(), norm_type).to(self.device) for p in trainable_params])

From f2645d54eb985fd2ac37bccdd5a9e3b6f6ed39c2 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <cynthiachen@svm.bair.berkeley.edu>
Date: Mon, 26 Apr 2021 20:09:12 -0700
Subject: [PATCH 086/123] try to use direct network output instead of a
 distribution

---
 src/il_representations/algos/decoders.py | 2 ++
 src/il_representations/algos/encoders.py | 1 +
 src/il_representations/algos/losses.py   | 1 +
 3 files changed, 4 insertions(+)

diff --git a/src/il_representations/algos/decoders.py b/src/il_representations/algos/decoders.py
index 7f84c3da..da8af2fa 100644
--- a/src/il_representations/algos/decoders.py
+++ b/src/il_representations/algos/decoders.py
@@ -116,6 +116,7 @@ def get_vector(self, z_dist):
         if self.sample:
             return z_dist.rsample()
         else:
+            return z_dist
             return z_dist.mean
 
     def ones_like_projection_dim(self, x):
@@ -127,6 +128,7 @@ def passthrough(self, x):
     def _apply_projection_layer(self, z_dist, mean_layer, stdev_layer):
         z_vector = self.get_vector(z_dist)
         mean = mean_layer(z_vector)
+        return mean
         if stdev_layer is None:
             # We better not have had a learned standard deviation in
             # the encoder, since there's no clear way on how to pass
diff --git a/src/il_representations/algos/encoders.py b/src/il_representations/algos/encoders.py
index 3a630ebc..bdb3477f 100644
--- a/src/il_representations/algos/encoders.py
+++ b/src/il_representations/algos/encoders.py
@@ -399,6 +399,7 @@ def forward_with_stddev(self, x, traj_info):
 
     def forward_deterministic(self, x, traj_info):
         features = self.network(x)
+        return features
         return independent_multivariate_normal(mean=features,
                                                stddev=self.scale_constant)
 
diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index c5014f12..b412cd30 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -17,6 +17,7 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist):
         pass
 
     def get_vector_forms(self, *args):
+        return [el.rsample() if self.sample else el for el in args]
         return [el.rsample() if self.sample else el.mean for el in args]
 
 

From 29a9083566f195ab833991511510b1d8e5ec9ea2 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <cyn0531@hku.hk>
Date: Mon, 26 Apr 2021 23:58:06 -0700
Subject: [PATCH 087/123] return to using multivariate normal and adjust loss
 and temperature

---
 src/il_representations/algos/decoders.py               | 4 +---
 src/il_representations/algos/encoders.py               | 1 -
 src/il_representations/algos/losses.py                 | 1 -
 src/il_representations/algos/representation_learner.py | 5 +++--
 src/il_representations/scripts/run_cifar.py            | 5 +++--
 5 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/il_representations/algos/decoders.py b/src/il_representations/algos/decoders.py
index da8af2fa..087e0336 100644
--- a/src/il_representations/algos/decoders.py
+++ b/src/il_representations/algos/decoders.py
@@ -64,8 +64,8 @@ def get_sequential_from_architecture(architecture, representation_dim, projectio
     input_dim = representation_dim
     for layer_def in architecture:
         layers.append(nn.Linear(input_dim, layer_def['output_dim']))
-        layers.append(nn.ReLU())
         layers.append(nn.BatchNorm1d(num_features=layer_def['output_dim']))
+        layers.append(nn.ReLU(inplace=True))
         input_dim = layer_def['output_dim']
     layers.append(nn.Linear(input_dim, projection_dim))
     return nn.Sequential(*layers)
@@ -116,7 +116,6 @@ def get_vector(self, z_dist):
         if self.sample:
             return z_dist.rsample()
         else:
-            return z_dist
             return z_dist.mean
 
     def ones_like_projection_dim(self, x):
@@ -128,7 +127,6 @@ def passthrough(self, x):
     def _apply_projection_layer(self, z_dist, mean_layer, stdev_layer):
         z_vector = self.get_vector(z_dist)
         mean = mean_layer(z_vector)
-        return mean
         if stdev_layer is None:
             # We better not have had a learned standard deviation in
             # the encoder, since there's no clear way on how to pass
diff --git a/src/il_representations/algos/encoders.py b/src/il_representations/algos/encoders.py
index bdb3477f..3a630ebc 100644
--- a/src/il_representations/algos/encoders.py
+++ b/src/il_representations/algos/encoders.py
@@ -399,7 +399,6 @@ def forward_with_stddev(self, x, traj_info):
 
     def forward_deterministic(self, x, traj_info):
         features = self.network(x)
-        return features
         return independent_multivariate_normal(mean=features,
                                                stddev=self.scale_constant)
 
diff --git a/src/il_representations/algos/losses.py b/src/il_representations/algos/losses.py
index b412cd30..c5014f12 100644
--- a/src/il_representations/algos/losses.py
+++ b/src/il_representations/algos/losses.py
@@ -17,7 +17,6 @@ def __call__(self, decoded_context_dist, target_dist, encoded_context_dist):
         pass
 
     def get_vector_forms(self, *args):
-        return [el.rsample() if self.sample else el for el in args]
         return [el.rsample() if self.sample else el.mean for el in args]
 
 
diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 45b2bee1..c709d828 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -149,6 +149,7 @@ def __init__(self, *,
 
         self.encoder = encoder(self.observation_space, representation_dim, **encoder_kwargs).to(self.device)
         self.decoder = decoder(representation_dim, projection_dim, **decoder_kwargs).to(self.device)
+        breakpoint()
 
         if batch_extender is QueueBatchExtender:
             # TODO maybe clean this up?
@@ -382,8 +383,8 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 # Use an algorithm-specific loss function. Typically this only requires decoded_contexts and
                 # decoded_targets, but VAE requires encoded_contexts, so we pass it in here
 
-                loss = self.loss_calculator(encoded_contexts, encoded_targets, encoded_contexts)
-                #loss = self.loss_calculator(decoded_contexts, decoded_targets, encoded_contexts)
+                # loss = self.loss_calculator(encoded_contexts, encoded_targets, encoded_contexts)
+                loss = self.loss_calculator(decoded_contexts, decoded_targets, encoded_contexts)
                 if batches_trained % self.calc_log_interval == 0:
                     loss_item = loss.item()
                     assert not np.isnan(loss_item), "Loss is NaN"
diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index c412eb1f..5099443d 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -319,6 +319,7 @@ def __getitem__(self, index):
 
         return pos_1, pos_2, target
 
+
 @cifar_ex.config
 def default_config():
     seed = 1
@@ -327,14 +328,14 @@ def default_config():
     pretrain_epochs = 1000
     pretrain_batches_per_epoch = 390
     finetune_epochs = 100
-    representation_dim = 2048 ## TODO change back
+    representation_dim = 2048 # TODO change back
     projection_dim = 128
     pretrain_lr = 3e-4
     pretrain_weight_decay = 1e-4
     pretrain_momentum = 0.9
     pretrain_batch_size = 512
     pretrain_save_interval = 100
-    pretrain_temperature = 0.5
+    pretrain_temperature = 0.1
     pretrained_model = None
     use_repo_loss = False
     _ = locals()

From 31d04e563a97a18e50b49d38534b9027f3762036 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <cyn0531@hku.hk>
Date: Tue, 27 Apr 2021 01:38:09 -0700
Subject: [PATCH 088/123] test linear head

---
 src/il_representations/scripts/run_cifar.py | 94 +++++++++++++--------
 1 file changed, 59 insertions(+), 35 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index 5099443d..aea703d4 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -35,32 +35,53 @@ class LinearHead(nn.Module):
     def __init__(self, encoder, encoder_dim, output_dim):
         super().__init__()
         self.encoder = encoder
-        self.encoder.fc = nn.Linear(2048, output_dim)
+        self.fc = nn.Linear(2048, output_dim, bias=True)
+        # self.encoder.fc = nn.Linear(2048, output_dim)
+        breakpoint()
 
     def forward(self, x):
-        return self.encoder(x)
+        x = self.encoder(x)
+        feature = torch.flatten(x, start_dim=1)
+        out = self.fc(feature)
+        return out
 
 
 def train_classifier(classifier, data_dir, num_epochs, device):
-    transform = transforms.Compose([
-        transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
-        transforms.RandomHorizontalFlip(),
-        # No color jitter or grayscale for finetuning
-        # SimCLR doesn't use blur for CIFAR-10
-        transforms.ToTensor(),
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-    ])
-    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True, download=True, transform=transform)
+    # transform = transforms.Compose([
+    #     transforms.RandomResizedCrop(32, interpolation=PIL.Image.BICUBIC),
+    #     transforms.RandomHorizontalFlip(),
+    #     # No color jitter or grayscale for finetuning
+    #     # SimCLR doesn't use blur for CIFAR-10
+    #     transforms.ToTensor(),
+    #     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+    # ])
+    train_transform = transforms.Compose([
+            transforms.RandomResizedCrop(32),
+            transforms.RandomHorizontalFlip(p=0.5),
+            transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4,
+                                                           0.1)], p=0.8),
+            transforms.RandomGrayscale(p=0.2),
+            transforms.ToTensor(),
+            transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994,
+                                                            0.2010])])
+
+    test_transform = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994,
+                                                            0.2010])])
+    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=True,
+                                            download=True, transform=train_transform)
     trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
     criterion = nn.CrossEntropyLoss().to(device)
-    optimizer = optim.Adam(classifier.encoder.parameters(), lr=3e-4)
-    # optimizer = optim.Adam(classifier.encoder.fc.parameters(), lr=3e-4, momentum=0.9, weight_decay=0.0, nesterov=True)
+    optimizer = optim.Adam(classifier.fc.parameters(), lr=1e-3,
+                           weight_decay=1e-6)
+    # # optimizer = optim.Adam(classifier.encoder.fc.parameters(), lr=3e-4, momentum=0.9, weight_decay=0.0, nesterov=True)
     #scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
 
-    test_transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-    ])
+    # test_transform = transforms.Compose([
+    #     transforms.ToTensor(),
+    #     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+    # ])
     testset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True, transform=test_transform)
     testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=False)
 
@@ -71,6 +92,7 @@ def train_classifier(classifier, data_dir, num_epochs, device):
     for epoch in range(num_epochs):
         loss_meter = AverageMeter()
         train_acc_meter = AverageMeter()
+        classifier.train()
 
         print(f"Epoch {epoch}/{num_epochs} with lr {optimizer.param_groups[0]['lr']}")
         running_loss = 0.0
@@ -108,9 +130,10 @@ def train_classifier(classifier, data_dir, num_epochs, device):
             json.dump(progress_dict, f)
 
 
-def evaluate_classifier(classifier, data_dir, device):
-    trainset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True)
-    testloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
+def evaluate_classifier(testloader, classifier, device):
+    # trainset = torchvision.datasets.CIFAR10(root=data_dir, train=False, download=True)
+    # testloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
+    classifier.eval()
     total = 0
     test_acc_meter = AverageMeter()
     with torch.no_grad():
@@ -120,6 +143,7 @@ def evaluate_classifier(classifier, data_dir, device):
             _, predicted = torch.max(outputs.data, 1)
             total += labels.size(0)
             test_acc_meter.update(accuracy(outputs, labels)[0].item())
+    print(f"Test acc: {test_acc_meter.avg}")
 
     return test_acc_meter.avg
 
@@ -346,35 +370,35 @@ def default_config():
 def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_dim,
         pretrained_model, pretrain_batch_size, _config):
     # TODO fix this hacky nonsense
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if pretrained_model is None:
         log_dir = os.path.join(cifar_ex.observers[0].dir, 'training_logs')
         os.mkdir(log_dir)
         os.makedirs(data_dir, exist_ok=True)
 
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model = representation_learning(algo, device, log_dir, _config)
 
     else:
         model = torch.load(pretrained_model)
 
-    test_transform = transforms.Compose([
-    transforms.ToTensor(),
-    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
-
-    memory_data = CIFAR10Pair(root='data', train=True, transform=test_transform, download=True)
-    memory_loader = torch.utils.data.DataLoader(memory_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
-    test_data = CIFAR10Pair(root='data', train=False, transform=test_transform, download=True)
-    test_loader = torch.utils.data.DataLoader(test_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
+    # test_transform = transforms.Compose([
+    # transforms.ToTensor(),
+    # transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
 
+    # memory_data = CIFAR10Pair(root='data', train=True, transform=test_transform, download=True)
+    # memory_loader = torch.utils.data.DataLoader(memory_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
+    # test_data = CIFAR10Pair(root='data', train=False, transform=test_transform, download=True)
+    # test_loader = torch.utils.data.DataLoader(test_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
 
     # KNN testing from SimCLR repo for comparison
-    test(model.network, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
-    # print('Train linear head')
-    # classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
-    # train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
+    # test(model.network, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
+    print('Train linear head')
+    classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
+    train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 
-    # print('Evaluate accuracy on test set')
-    # evaluate_classifier(classifier, data_dir, device=device)
+    print('Evaluate accuracy on test set')
+    evaluate_classifier(classifier, data_dir, device=device)
 
 
 if __name__ == '__main__':

From 77afc23d203cd54d28d28b3e77c81056c23e3570 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Tue, 27 Apr 2021 17:13:09 +0800
Subject: [PATCH 089/123] Try to fully use SimCLR repo's linear evaluation code

---
 src/il_representations/scripts/run_cifar.py | 89 ++++++++++++++++++++-
 1 file changed, 85 insertions(+), 4 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index aea703d4..de14fff9 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -6,6 +6,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
+from torch.utils.data import DataLoader
 import torchvision
 import torchvision.transforms as transforms
 from torchvision.models.resnet import resnet50
@@ -14,6 +15,8 @@
 
 from tqdm import tqdm
 from math import ceil
+import pandas as pd
+
 import time
 from sacred import Experiment
 from sacred.observers import FileStorageObserver
@@ -75,8 +78,8 @@ def train_classifier(classifier, data_dir, num_epochs, device):
     criterion = nn.CrossEntropyLoss().to(device)
     optimizer = optim.Adam(classifier.fc.parameters(), lr=1e-3,
                            weight_decay=1e-6)
-    # # optimizer = optim.Adam(classifier.encoder.fc.parameters(), lr=3e-4, momentum=0.9, weight_decay=0.0, nesterov=True)
-    #scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
+    # optimizer = optim.Adam(classifier.encoder.fc.parameters(), lr=3e-4, momentum=0.9, weight_decay=0.0, nesterov=True)
+    # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
 
     # test_transform = transforms.Compose([
     #     transforms.ToTensor(),
@@ -148,6 +151,82 @@ def evaluate_classifier(testloader, classifier, device):
     return test_acc_meter.avg
 
 
+def train_from_simclr_repo(model, batch_size, epochs):
+    train_transform = transforms.Compose([
+        transforms.RandomResizedCrop(32),
+        transforms.RandomHorizontalFlip(p=0.5),
+        transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
+        transforms.RandomGrayscale(p=0.2),
+        transforms.ToTensor(),
+        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
+
+    test_transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
+
+    train_data = CIFAR10(root='data', train=True, transform=train_transform, download=True)
+    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True)
+    test_data = CIFAR10(root='data', train=False, transform=test_transform, download=True)
+    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=16, pin_memory=True)
+
+    # flops, params = profile(model, inputs=(torch.randn(1, 3, 32, 32).cuda(),))
+    # flops, params = clever_format([flops, params])
+    # print('# Model Params: {} FLOPs: {}'.format(params, flops))
+    optimizer = optim.Adam(model.fc.parameters(), lr=1e-3, weight_decay=1e-6)
+    loss_criterion = nn.CrossEntropyLoss()
+    results = {'train_loss': [], 'train_acc@1': [], 'train_acc@5': [],
+               'test_loss': [], 'test_acc@1': [], 'test_acc@5': []}
+
+    best_acc = 0.0
+    for epoch in range(1, epochs + 1):
+        train_loss, train_acc_1, train_acc_5 = train_val(model, train_loader, optimizer, loss_criterion,
+                                                         epoch, epochs)
+        results['train_loss'].append(train_loss)
+        results['train_acc@1'].append(train_acc_1)
+        results['train_acc@5'].append(train_acc_5)
+        test_loss, test_acc_1, test_acc_5 = train_val(model, test_loader, None, loss_criterion,
+                                                      epoch, epochs)
+        results['test_loss'].append(test_loss)
+        results['test_acc@1'].append(test_acc_1)
+        results['test_acc@5'].append(test_acc_5)
+        # save statistics
+        data_frame = pd.DataFrame(data=results, index=range(1, epoch + 1))
+        data_frame.to_csv('results/linear_statistics.csv', index_label='epoch')
+        if test_acc_1 > best_acc:
+            best_acc = test_acc_1
+            torch.save(model.state_dict(), 'results/linear_model.pth')
+
+
+# train or test for one epoch
+def train_val(net, data_loader, train_optimizer, loss_criterion, epoch, epochs):
+    is_train = train_optimizer is not None
+    net.train() if is_train else net.eval()
+
+    total_loss, total_correct_1, total_correct_5, total_num, data_bar = 0.0, 0.0, 0.0, 0, tqdm(data_loader)
+    with (torch.enable_grad() if is_train else torch.no_grad()):
+        for data, target in data_bar:
+            data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
+            out = net(data)
+            loss = loss_criterion(out, target)
+
+            if is_train:
+                train_optimizer.zero_grad()
+                loss.backward()
+                train_optimizer.step()
+
+            total_num += data.size(0)
+            total_loss += loss.item() * data.size(0)
+            prediction = torch.argsort(out, dim=-1, descending=True)
+            total_correct_1 += torch.sum((prediction[:, 0:1] == target.unsqueeze(dim=-1)).any(dim=-1).float()).item()
+            total_correct_5 += torch.sum((prediction[:, 0:5] == target.unsqueeze(dim=-1)).any(dim=-1).float()).item()
+
+            data_bar.set_description('{} Epoch: [{}/{}] Loss: {:.4f} ACC@1: {:.2f}% ACC@5: {:.2f}%'
+                                     .format('Train' if is_train else 'Test', epoch, epochs, total_loss / total_num,
+                                             total_correct_1 / total_num * 100, total_correct_5 / total_num * 100))
+
+    return total_loss / total_num, total_correct_1 / total_num * 100, total_correct_5 / total_num * 100
+
+
 class SimCLRModel(nn.Module):
     def __init__(self, feature_dim=128):
         super(SimCLRModel, self).__init__()
@@ -352,6 +431,7 @@ def default_config():
     pretrain_epochs = 1000
     pretrain_batches_per_epoch = 390
     finetune_epochs = 100
+    finetune_batch_size = 512
     representation_dim = 2048 # TODO change back
     projection_dim = 128
     pretrain_lr = 3e-4
@@ -368,7 +448,7 @@ def default_config():
 
 @cifar_ex.main
 def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_dim,
-        pretrained_model, pretrain_batch_size, _config):
+        pretrained_model, pretrain_batch_size, finetune_batch_size, _config):
     # TODO fix this hacky nonsense
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -395,7 +475,8 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     # test(model.network, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
     print('Train linear head')
     classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
-    train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
+    train_from_simclr_repo(classifier, finetune_batch_size, finetune_epochs)
+    # train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
 
     print('Evaluate accuracy on test set')
     evaluate_classifier(classifier, data_dir, device=device)

From 130c127a1d960d3cb6b64e8d7f306983db225bfb Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Tue, 27 Apr 2021 17:22:37 +0800
Subject: [PATCH 090/123] select test method

---
 src/il_representations/scripts/run_cifar.py | 48 ++++++++++++---------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index de14fff9..e56c6c81 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -191,10 +191,10 @@ def train_from_simclr_repo(model, batch_size, epochs):
         results['test_acc@5'].append(test_acc_5)
         # save statistics
         data_frame = pd.DataFrame(data=results, index=range(1, epoch + 1))
-        data_frame.to_csv('results/linear_statistics.csv', index_label='epoch')
+        data_frame.to_csv('./linear_statistics.csv', index_label='epoch')
         if test_acc_1 > best_acc:
             best_acc = test_acc_1
-            torch.save(model.state_dict(), 'results/linear_model.pth')
+            torch.save(model.state_dict(), './linear_model.pth')
 
 
 # train or test for one epoch
@@ -442,13 +442,14 @@ def default_config():
     pretrain_temperature = 0.1
     pretrained_model = None
     use_repo_loss = False
+    eval_knn = True
     _ = locals()
     del _
 
 
 @cifar_ex.main
 def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_dim,
-        pretrained_model, pretrain_batch_size, finetune_batch_size, _config):
+        pretrained_model, pretrain_batch_size, finetune_batch_size, eval_knn, _config):
     # TODO fix this hacky nonsense
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -462,24 +463,29 @@ def run(seed, algo, data_dir, pretrain_epochs, finetune_epochs, representation_d
     else:
         model = torch.load(pretrained_model)
 
-    # test_transform = transforms.Compose([
-    # transforms.ToTensor(),
-    # transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
-
-    # memory_data = CIFAR10Pair(root='data', train=True, transform=test_transform, download=True)
-    # memory_loader = torch.utils.data.DataLoader(memory_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
-    # test_data = CIFAR10Pair(root='data', train=False, transform=test_transform, download=True)
-    # test_loader = torch.utils.data.DataLoader(test_data, batch_size=pretrain_batch_size, shuffle=False, num_workers=16, pin_memory=True)
-
-    # KNN testing from SimCLR repo for comparison
-    # test(model.network, memory_loader, test_loader, k=200, num_classes=10, temperature=_config['pretrain_temperature'], epoch=-1)
-    print('Train linear head')
-    classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
-    train_from_simclr_repo(classifier, finetune_batch_size, finetune_epochs)
-    # train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
-
-    print('Evaluate accuracy on test set')
-    evaluate_classifier(classifier, data_dir, device=device)
+    if eval_knn:
+        test_transform = transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
+
+        memory_data = CIFAR10Pair(root='data', train=True, transform=test_transform, download=True)
+        memory_loader = torch.utils.data.DataLoader(memory_data, batch_size=pretrain_batch_size, shuffle=False,
+                                                    num_workers=16, pin_memory=True)
+        test_data = CIFAR10Pair(root='data', train=False, transform=test_transform, download=True)
+        test_loader = torch.utils.data.DataLoader(test_data, batch_size=pretrain_batch_size, shuffle=False,
+                                                  num_workers=16, pin_memory=True)
+
+        # KNN testing from SimCLR repo for comparison
+        test(model.network, memory_loader, test_loader, k=200, num_classes=10,
+             temperature=_config['pretrain_temperature'], epoch=-1)
+    else:
+        print('Train linear head')
+        classifier = LinearHead(model.network, representation_dim, output_dim=10).to(device)
+        train_from_simclr_repo(classifier, finetune_batch_size, finetune_epochs)
+        # train_classifier(classifier, data_dir, num_epochs=finetune_epochs, device=device)
+
+        print('Evaluate accuracy on test set')
+        evaluate_classifier(classifier, data_dir, device=device)
 
 
 if __name__ == '__main__':

From 869f3bd386fcd1d88b79aa2fe6c5e9b81ea39f1a Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Tue, 27 Apr 2021 11:25:40 -0700
Subject: [PATCH 091/123] Add TODO about possible double normalization on CIFAR10

---
 src/il_representations/algos/representation_learner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 45b2bee1..9f882300 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -346,6 +346,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 extra_context = self._prep_tensors(extra_context)
                 traj_ts_info = self._prep_tensors(traj_ts_info)
                 # Note: preprocessing might be better to do on CPU if, in future, we can parallelize doing so
+                # TODO this may not make sense for CIFAR10, maybe double normalizing
                 # contexts = self._preprocess(contexts)
                 # if self.preprocess_target:
                 #     targets = self._preprocess(targets)

From 1a537bf6d635c3fa9cf7bfc61f01e1529b513fab Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Tue, 27 Apr 2021 12:03:38 -0700
Subject: [PATCH 092/123] Specifically ablate change to decoder

---
 src/il_representations/algos/decoders.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/il_representations/algos/decoders.py b/src/il_representations/algos/decoders.py
index 087e0336..d9cca44a 100644
--- a/src/il_representations/algos/decoders.py
+++ b/src/il_representations/algos/decoders.py
@@ -64,8 +64,9 @@ def get_sequential_from_architecture(architecture, representation_dim, projectio
     input_dim = representation_dim
     for layer_def in architecture:
         layers.append(nn.Linear(input_dim, layer_def['output_dim']))
+        layers.append(nn.ReLU(inplace=True)) # TODO change this back after testing
         layers.append(nn.BatchNorm1d(num_features=layer_def['output_dim']))
-        layers.append(nn.ReLU(inplace=True))
+        # layers.append(nn.ReLU(inplace=True))
         input_dim = layer_def['output_dim']
     layers.append(nn.Linear(input_dim, projection_dim))
     return nn.Sequential(*layers)

From f769190d667416e58b920a25847d2edd316386e5 Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Tue, 27 Apr 2021 12:06:59 -0700
Subject: [PATCH 093/123] Switch ReLU back to be after BatchNorm

---
 src/il_representations/algos/decoders.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/il_representations/algos/decoders.py b/src/il_representations/algos/decoders.py
index d9cca44a..087e0336 100644
--- a/src/il_representations/algos/decoders.py
+++ b/src/il_representations/algos/decoders.py
@@ -64,9 +64,8 @@ def get_sequential_from_architecture(architecture, representation_dim, projectio
     input_dim = representation_dim
     for layer_def in architecture:
         layers.append(nn.Linear(input_dim, layer_def['output_dim']))
-        layers.append(nn.ReLU(inplace=True)) # TODO change this back after testing
         layers.append(nn.BatchNorm1d(num_features=layer_def['output_dim']))
-        # layers.append(nn.ReLU(inplace=True))
+        layers.append(nn.ReLU(inplace=True))
         input_dim = layer_def['output_dim']
     layers.append(nn.Linear(input_dim, projection_dim))
     return nn.Sequential(*layers)

From 5d2ff3f07693854d34c74b31e7442af022e350ae Mon Sep 17 00:00:00 2001
From: Cody Wild <codywild@berkeley.edu>
Date: Tue, 27 Apr 2021 12:11:48 -0700
Subject: [PATCH 094/123] Remove breakpoint on Github

---
 src/il_representations/algos/representation_learner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 375af1fb..b35797b7 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -149,7 +149,6 @@ def __init__(self, *,
 
         self.encoder = encoder(self.observation_space, representation_dim, **encoder_kwargs).to(self.device)
         self.decoder = decoder(representation_dim, projection_dim, **decoder_kwargs).to(self.device)
-        breakpoint()
 
         if batch_extender is QueueBatchExtender:
             # TODO maybe clean this up?

From ea8deaeb5f4523f2f063884c995f177514d60a67 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Fri, 30 Apr 2021 14:51:21 +0800
Subject: [PATCH 095/123] config for running few trajs

---
 src/il_representations/scripts/chain_configs.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/il_representations/scripts/chain_configs.py b/src/il_representations/scripts/chain_configs.py
index ad3d51bb..47b2b040 100644
--- a/src/il_representations/scripts/chain_configs.py
+++ b/src/il_representations/scripts/chain_configs.py
@@ -206,6 +206,20 @@ def cfg_bench_micro_sweep_dm_control():
         _ = locals()
         del _
 
+    @experiment_obj.named_config
+    def cfg_run_few_trajs_long_dm_control():
+        """For experiments running very few BC trajs"""
+        spec = dict(il_train={
+            'bc': {
+                'n_batches': 10000000,
+                'n_trajs': tune.grid_search([1, 10, 30]),
+                'save_every_n_batches': 1e6
+            }
+        })
+
+        _ = locals()
+        del _
+
     @experiment_obj.named_config
     def cfg_bench_one_task_magical():
         """Just one simple MAGICAL config."""

From 6a2842efd20ef351700ba2fe83cbcea72daa89f1 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Fri, 30 Apr 2021 16:09:35 +0800
Subject: [PATCH 096/123] Comment out n_trajs sweep; save BC every 5e4 batches

---
 src/il_representations/scripts/chain_configs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/chain_configs.py b/src/il_representations/scripts/chain_configs.py
index 47b2b040..44e2e38d 100644
--- a/src/il_representations/scripts/chain_configs.py
+++ b/src/il_representations/scripts/chain_configs.py
@@ -212,8 +212,8 @@ def cfg_run_few_trajs_long_dm_control():
         spec = dict(il_train={
             'bc': {
                 'n_batches': 10000000,
-                'n_trajs': tune.grid_search([1, 10, 30]),
-                'save_every_n_batches': 1e6
+                # 'n_trajs': tune.grid_search([1, 10, 30]),
+                'save_every_n_batches': 5e4
             }
         })
 

From 93783a12a3d86b233e7bf3d71cd1360898154c43 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Fri, 30 Apr 2021 16:32:38 +0800
Subject: [PATCH 097/123] Finding a good gpu number balance

---
 src/il_representations/scripts/chain_configs.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/il_representations/scripts/chain_configs.py b/src/il_representations/scripts/chain_configs.py
index 44e2e38d..58f2bfbe 100644
--- a/src/il_representations/scripts/chain_configs.py
+++ b/src/il_representations/scripts/chain_configs.py
@@ -52,6 +52,23 @@ def cfg_base_3seed_4cpu_pt3gpu():
         _ = locals()
         del _
 
+    @experiment_obj.named_config
+    def cfg_base_3seed_1cpu_pt2gpu():
+        """Basic config that does three samples per config, using 1 CPU cores and
+        0.2 of a GPU."""
+        use_skopt = False
+        tune_run_kwargs = dict(num_samples=3,
+                               # retry on (node) failure
+                               max_failures=2,
+                               fail_fast=False,
+                               resources_per_trial=dict(
+                                   cpu=1,
+                                   gpu=0.2,
+                               ))
+
+        _ = locals()
+        del _
+
     @experiment_obj.named_config
     def cfg_base_3seed_1cpu_pt2gpu_2envs():
         """Another config that uses only one CPU per run, and .2 of a GPU. Good for

From 4c1c2b80819042207e456f92765700612f2b5717 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Thu, 6 May 2021 12:54:08 +0800
Subject: [PATCH 098/123] Add SimCLR model to default SimCLR settings

---
 src/il_representations/algos/__init__.py    |  3 ++-
 src/il_representations/algos/encoders.py    | 27 +++++++++++++++++++++
 src/il_representations/scripts/run_cifar.py | 25 +------------------
 3 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/src/il_representations/algos/__init__.py b/src/il_representations/algos/__init__.py
index 5f29bff0..08f58f55 100644
--- a/src/il_representations/algos/__init__.py
+++ b/src/il_representations/algos/__init__.py
@@ -1,7 +1,7 @@
 from il_representations.algos.representation_learner import RepresentationLearner, DEFAULT_HARDCODED_PARAMS
 from il_representations.algos.encoders import MomentumEncoder, InverseDynamicsEncoder, TargetStoringActionEncoder, \
     RecurrentEncoder, BaseEncoder, VAEEncoder, ActionEncodingEncoder, ActionEncodingInverseDynamicsEncoder, \
-    infer_action_shape_info
+    infer_action_shape_info, SimCLRModel
 from il_representations.algos.decoders import NoOp, MomentumProjectionHead, \
     BYOLProjectionHead, ActionConditionedVectorDecoder, ContrastiveInverseDynamicsConcatenationHead, \
     ActionPredictionHead, PixelDecoder, SymmetricProjectionHead, AsymmetricProjectionHead
@@ -54,6 +54,7 @@ class SimCLR(RepresentationLearner):
     # TODO note: not made to use momentum because not being used in experiments
     def __init__(self, **kwargs):
         algo_hardcoded_kwargs = dict(encoder=BaseEncoder,
+                                     encoder_kwargs={'obs_encoder_cls': lambda *args: SimCLRModel()},
                                      decoder=SymmetricProjectionHead,
                                      loss_calculator=SymmetricContrastiveLoss,
                                      augmenter=AugmentContextAndTarget,
diff --git a/src/il_representations/algos/encoders.py b/src/il_representations/algos/encoders.py
index 3a630ebc..8cf5e9c9 100644
--- a/src/il_representations/algos/encoders.py
+++ b/src/il_representations/algos/encoders.py
@@ -10,6 +10,8 @@
 from torchvision.models.resnet import BasicBlock as BasicResidualBlock
 import torch
 from torch import nn
+from torchvision.models.resnet import resnet50
+import torch.nn.functional as F
 from pyro.distributions import Delta
 
 from gym import spaces
@@ -267,11 +269,36 @@ def forward(self, x):
         warn_on_non_image_tensor(x)
         return self.shared_network(x)
 
+
+class SimCLRModel(nn.Module):
+    def __init__(self, feature_dim=128):
+        super(SimCLRModel, self).__init__()
+
+        self.f = []
+        for name, module in resnet50().named_children():
+            if name == 'conv1':
+                module = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+            if not isinstance(module, nn.Linear) and not isinstance(module, nn.MaxPool2d):
+                self.f.append(module)
+        # encoder
+        # Temporarily add an extra layer to be closer to our model implementation
+        self.f = nn.Sequential(*self.f)
+        # # projection head
+        # self.g = nn.Sequential(nn.Linear(2048, 512, bias=False), nn.BatchNorm1d(512),
+        #                        nn.ReLU(inplace=True), nn.Linear(512, feature_dim, bias=True))
+
+    def forward(self, x):
+        x = self.f(x)
+        feature = torch.flatten(x, start_dim=1)
+        return F.normalize(feature, dim=-1)
+
+
 # string names for convolutional networks; this makes it easier to choose
 # between them from the command line
 NETWORK_SHORT_NAMES = {
     'BasicCNN': BasicCNN,
     'MAGICALCNN': MAGICALCNN,
+    'SimCLRModel': SimCLRModel
 }
 
 
diff --git a/src/il_representations/scripts/run_cifar.py b/src/il_representations/scripts/run_cifar.py
index e56c6c81..c1464c52 100644
--- a/src/il_representations/scripts/run_cifar.py
+++ b/src/il_representations/scripts/run_cifar.py
@@ -23,6 +23,7 @@
 from il_representations import algos
 from il_representations.algos.utils import LinearWarmupCosine
 from il_representations.envs.auto import load_wds_datasets
+from il_representations.algos.encoders import SimCLRModel
 from il_representations.envs.config import (env_cfg_ingredient,
                                             env_data_ingredient,
                                             venv_opts_ingredient)
@@ -227,30 +228,6 @@ def train_val(net, data_loader, train_optimizer, loss_criterion, epoch, epochs):
     return total_loss / total_num, total_correct_1 / total_num * 100, total_correct_5 / total_num * 100
 
 
-class SimCLRModel(nn.Module):
-    def __init__(self, feature_dim=128):
-        super(SimCLRModel, self).__init__()
-
-        self.f = []
-        for name, module in resnet50().named_children():
-            if name == 'conv1':
-                module = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
-            if not isinstance(module, nn.Linear) and not isinstance(module, nn.MaxPool2d):
-                self.f.append(module)
-        # encoder
-        # Temporarily add an extra layer to be closer to our model implementation
-        self.f = nn.Sequential(*self.f)
-        # # projection head
-        # self.g = nn.Sequential(nn.Linear(2048, 512, bias=False), nn.BatchNorm1d(512),
-        #                        nn.ReLU(inplace=True), nn.Linear(512, feature_dim, bias=True))
-
-    def forward(self, x):
-        x = self.f(x)
-        feature = torch.flatten(x, start_dim=1)
-        #out = self.g(feature)
-        return F.normalize(feature, dim=-1) #, F.normalize(out, dim=-1)
-
-
 def representation_learning(algo, device, log_dir, config):
     print('Train representation learner')
     if isinstance(algo, str):

From 01473d80ebb7529e317a46a939a8694b24f1fbd6 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Thu, 6 May 2021 13:01:20 +0800
Subject: [PATCH 099/123] Try to use 3e-4 lr for SimCLR repl

---
 src/il_representations/scripts/chain_configs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/scripts/chain_configs.py b/src/il_representations/scripts/chain_configs.py
index ad3d51bb..55dee68f 100644
--- a/src/il_representations/scripts/chain_configs.py
+++ b/src/il_representations/scripts/chain_configs.py
@@ -302,6 +302,7 @@ def cfg_repl_simclr():
         stages_to_run = StagesToRun.REPL_AND_IL
         repl = {
             'algo': 'SimCLR',
+            'optimizer_kwargs': {'lr': 3e-4},
         }
 
         _ = locals()

From 701c69128988cbd654bb148bd2227eb746ce461d Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Thu, 6 May 2021 13:27:21 +0800
Subject: [PATCH 100/123] Nest SimCLR optimizer_kwargs under repl algo_params

---
 src/il_representations/scripts/chain_configs.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/chain_configs.py b/src/il_representations/scripts/chain_configs.py
index 61d25a37..0ece1f29 100644
--- a/src/il_representations/scripts/chain_configs.py
+++ b/src/il_representations/scripts/chain_configs.py
@@ -333,9 +333,10 @@ def cfg_repl_simclr():
         stages_to_run = StagesToRun.REPL_AND_IL
         repl = {
             'algo': 'SimCLR',
-            'optimizer_kwargs': {'lr': 3e-4},
+            'algo_params': {
+                'optimizer_kwargs': {'lr': 3e-4},
+            }
         }
-
         _ = locals()
         del _
 

From 17af15fbd8667178db113db3bac5f4a917ea91fc Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Thu, 6 May 2021 15:19:20 +0800
Subject: [PATCH 101/123] comment out context saving code

---
 .../algos/representation_learner.py           | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index b35797b7..85ba2f5d 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -336,10 +336,10 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 contexts, targets, traj_ts_info, extra_context = self.unpack_batch(batch)
                 # contexts, targets, _ = train_loader.next()
 
-                if step == 0:
-                    for i in range(10):
-                        save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_from_disk_{i}.png'))
-                        save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_from_disk_{i}.png'))
+                # if step == 0:
+                #     for i in range(10):
+                #         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_from_disk_{i}.png'))
+                #         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_from_disk_{i}.png'))
                 # Use an algorithm-specific augmentation strategy to augment either
                 # just context, or both context and targets
                 contexts, targets = self._prep_tensors(contexts), self._prep_tensors(targets)
@@ -350,16 +350,16 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 # contexts = self._preprocess(contexts)
                 # if self.preprocess_target:
                 #     targets = self._preprocess(targets)
-                if step == 0:
-                    for i in range(10):
-                        save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_pre_aug_{i}.png'))
-                        save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_pre_aug_{i}.png'))
+                # if step == 0:
+                #     for i in range(10):
+                #         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_pre_aug_{i}.png'))
+                #         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_pre_aug_{i}.png'))
                 # TODO put back in when done with "swap their data in" test
                 contexts, targets = self.augmenter(contexts, targets)
-                if step == 0:
-                    for i in range(10):
-                        save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_{i}.png'))
-                        save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_{i}.png'))
+                # if step == 0:
+                #     for i in range(10):
+                #         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_{i}.png'))
+                #         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_{i}.png'))
                 extra_context = self._preprocess_extra_context(extra_context)
                 # This is typically a noop, but sometimes we also augment the extra context
                 extra_context = self.augmenter.augment_extra_context(extra_context)

From 766b1605858b15b901b49bd71179896b774e2749 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Thu, 6 May 2021 15:21:58 +0800
Subject: [PATCH 102/123] Try augmenting with SimCLR default

---
 .../algos/representation_learner.py             | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 85ba2f5d..86c238a2 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -137,6 +137,15 @@ def __init__(self, *,
             # This doesn't have any meaningful effect unless you specify a projection head.
             projection_dim = representation_dim
 
+        train_transform = transforms.Compose([
+            transforms.RandomResizedCrop(32),
+            transforms.RandomHorizontalFlip(p=0.5),
+            transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
+            transforms.RandomGrayscale(p=0.2),
+            transforms.ToTensor(),
+            transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
+
+        augmenter_kwargs["augment_func"] = train_transform
         self.augmenter = augmenter(**augmenter_kwargs)
         self.target_pair_constructor = target_pair_constructor(**to_dict(target_pair_constructor_kwargs))
 
@@ -314,13 +323,7 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
             f"Training for {n_epochs} epochs, each of {batches_per_epoch} "
             f"batches (batch size {self.batch_size})")
         # TODO add transform back in, and probably comment out our augmenter line?
-        # train_transform = transforms.Compose([
-        #     transforms.RandomResizedCrop(32),
-        #     transforms.RandomHorizontalFlip(p=0.5),
-        #     transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
-        #     transforms.RandomGrayscale(p=0.2),
-        #     transforms.ToTensor(),
-        #     transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
+
         # train_data = CIFAR10Pair(root='data', train=True, transform=train_transform, download=True)
         # train_loader = iter(DataLoader(train_data, batch_size=self.batch_size, shuffle=True, num_workers=16, pin_memory=True,
         #                           drop_last=True))

From a398ed89d7adf3d74239a026f20a7234eefd58a0 Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Thu, 6 May 2021 01:21:33 -0700
Subject: [PATCH 103/123] adjust augmenter

---
 src/il_representations/algos/augmenters.py    |  8 +++++
 .../algos/representation_learner.py           | 29 ++++++++++---------
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/il_representations/algos/augmenters.py b/src/il_representations/algos/augmenters.py
index 37d201f4..18da951b 100644
--- a/src/il_representations/algos/augmenters.py
+++ b/src/il_representations/algos/augmenters.py
@@ -39,9 +39,17 @@ def __call__(self, contexts, targets):
 
 class AugmentContextAndTarget(Augmenter):
     def __call__(self, contexts, targets):
+        pil_process_func = transforms.Compose([
+            transforms.ToPILImage()
+        ])
         if self.augment_func:
             context_ret, target_ret = [], []
             for context, target in zip(contexts, targets):
+                if isinstance(context, torch.Tensor) and \
+                   isinstance(self.augment_op.transforms[0],
+                              transforms.RandomResizedCrop):
+                    context, target = pil_process_func(context.cpu()), \
+                                      pil_process_func(target.cpu())
                 context_ret.append(self.augment_op(context))
                 target_ret.append(self.augment_op(target))
             return torch.stack(context_ret, dim=0).to(device), \
diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 86c238a2..de2a4c67 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -138,7 +138,7 @@ def __init__(self, *,
             projection_dim = representation_dim
 
         train_transform = transforms.Compose([
-            transforms.RandomResizedCrop(32),
+            transforms.RandomResizedCrop(90),
             transforms.RandomHorizontalFlip(p=0.5),
             transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
             transforms.RandomGrayscale(p=0.2),
@@ -341,8 +341,9 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
 
                 # if step == 0:
                 #     for i in range(10):
-                #         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_from_disk_{i}.png'))
-                #         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_from_disk_{i}.png'))
+                #         breakpoint()
+                #         save_rgb_tensor(contexts[i][:3], os.path.join(self.log_dir, 'saved_images', f'contexts_from_disk_{i}.png'))
+                #         save_rgb_tensor(targets[i][:3], os.path.join(self.log_dir, 'saved_images', f'targets_from_disk_{i}.png'))
                 # Use an algorithm-specific augmentation strategy to augment either
                 # just context, or both context and targets
                 contexts, targets = self._prep_tensors(contexts), self._prep_tensors(targets)
@@ -350,19 +351,19 @@ def learn(self, datasets, batches_per_epoch, n_epochs, n_trajs=None, callbacks=(
                 traj_ts_info = self._prep_tensors(traj_ts_info)
                 # Note: preprocessing might be better to do on CPU if, in future, we can parallelize doing so
                 # TODO this may not make sense for CIFAR10, maybe double normalizing
-                # contexts = self._preprocess(contexts)
-                # if self.preprocess_target:
-                #     targets = self._preprocess(targets)
-                # if step == 0:
-                #     for i in range(10):
-                #         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_pre_aug_{i}.png'))
-                #         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_pre_aug_{i}.png'))
+                contexts = self._preprocess(contexts)
+                if self.preprocess_target:
+                    targets = self._preprocess(targets)
+                if step == 0:
+                    for i in range(10):
+                        save_rgb_tensor(contexts[i][:3], os.path.join(self.log_dir, 'saved_images', f'contexts_pre_aug_{i}.png'))
+                        save_rgb_tensor(targets[i][:3], os.path.join(self.log_dir, 'saved_images', f'targets_pre_aug_{i}.png'))
                 # TODO put back in when done with "swap their data in" test
                 contexts, targets = self.augmenter(contexts, targets)
-                # if step == 0:
-                #     for i in range(10):
-                #         save_rgb_tensor(contexts[i], os.path.join(self.log_dir, 'saved_images', f'contexts_{i}.png'))
-                #         save_rgb_tensor(targets[i], os.path.join(self.log_dir, 'saved_images', f'targets_{i}.png'))
+                if step == 0:
+                    for i in range(10):
+                        save_rgb_tensor(contexts[i][:3], os.path.join(self.log_dir, 'saved_images', f'contexts_{i}.png'))
+                        save_rgb_tensor(targets[i][:3], os.path.join(self.log_dir, 'saved_images', f'targets_{i}.png'))
                 extra_context = self._preprocess_extra_context(extra_context)
                 # This is typically a noop, but sometimes we also augment the extra context
                 extra_context = self.augmenter.augment_extra_context(extra_context)

From ae814c3d33415915a3ea6010b84d3ccdca27210e Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Thu, 6 May 2021 16:26:42 +0800
Subject: [PATCH 104/123] Try to use multiple GPUs

---
 src/il_representations/algos/encoders.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/il_representations/algos/encoders.py b/src/il_representations/algos/encoders.py
index 8cf5e9c9..f5e7d6a9 100644
--- a/src/il_representations/algos/encoders.py
+++ b/src/il_representations/algos/encoders.py
@@ -409,6 +409,13 @@ def __init__(self, obs_space, representation_dim, obs_encoder_cls=None,
             self.network = obs_encoder_cls(obs_space, representation_dim, **obs_encoder_cls_kwargs)
             self.scale_constant = scale_constant
 
+        if torch.cuda.device_count() > 1:
+            print("Using", torch.cuda.device_count(), "GPUs!")
+            self.network = nn.DataParallel(self.network)
+
+        self.network.to(self.device)
+
+
     def forward(self, x, traj_info):
         if self.learn_scale:
             return self.forward_with_stddev(x, traj_info)

From 2491d70f2ae3b49d41924c2f75a3432d3d0f9ba2 Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Thu, 6 May 2021 01:46:04 -0700
Subject: [PATCH 105/123] Add a script for running simclr

---
 src/il_representations/scripts/run_simclr.sh | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100755 src/il_representations/scripts/run_simclr.sh

diff --git a/src/il_representations/scripts/run_simclr.sh b/src/il_representations/scripts/run_simclr.sh
new file mode 100755
index 00000000..facc6685
--- /dev/null
+++ b/src/il_representations/scripts/run_simclr.sh
@@ -0,0 +1,17 @@
+repl_epochs=100
+bc_trajs=30
+bc_batches=4000000
+
+CUDA_VISIBLE_DEVICES=0,2 python src/il_representations/scripts/pretrain_n_adapt.py with \
+  cfg_repl_simclr \
+  cfg_il_bc_nofreeze \
+  tune_run_kwargs.num_samples=1 \
+  tune_run_kwargs.resources_per_trial.gpu=1 \
+  env_cfg.benchmark_name=dm_control \
+  env_cfg.task_name=finger-spin \
+  repl.n_epochs=$repl_epochs \
+  repl.algo_params.batch_size=32 \
+  il_train.bc.n_trajs=$bc_trajs \
+  il_train.bc.n_batches=$bc_batches \
+  exp_ident=repl_epoch_${repl_epochs}_bc_${bc_trajs}_trajs_${bc_batches}_batches
+

From d992c52166121c4ec80c7143a7681cb22a22a105 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Thu, 6 May 2021 19:34:33 +0800
Subject: [PATCH 106/123] adjust decoder input dim

---
 src/il_representations/algos/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/il_representations/algos/__init__.py b/src/il_representations/algos/__init__.py
index 08f58f55..f76e41d6 100644
--- a/src/il_representations/algos/__init__.py
+++ b/src/il_representations/algos/__init__.py
@@ -56,6 +56,7 @@ def __init__(self, **kwargs):
         algo_hardcoded_kwargs = dict(encoder=BaseEncoder,
                                      encoder_kwargs={'obs_encoder_cls': lambda *args: SimCLRModel()},
                                      decoder=SymmetricProjectionHead,
+                                     decoder_kwargs={'representation_dim': 2048},
                                      loss_calculator=SymmetricContrastiveLoss,
                                      augmenter=AugmentContextAndTarget,
                                      target_pair_constructor=IdentityPairConstructor,

From 8d4ebd5de2eaa088cfc6b257eeb71070962a932f Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Thu, 6 May 2021 19:44:41 +0800
Subject: [PATCH 107/123] Adjust decoder shape and normalization

---
 src/il_representations/algos/__init__.py               | 1 -
 src/il_representations/algos/representation_learner.py | 4 ++--
 src/il_representations/scripts/run_simclr.sh           | 1 +
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/il_representations/algos/__init__.py b/src/il_representations/algos/__init__.py
index f76e41d6..08f58f55 100644
--- a/src/il_representations/algos/__init__.py
+++ b/src/il_representations/algos/__init__.py
@@ -56,7 +56,6 @@ def __init__(self, **kwargs):
         algo_hardcoded_kwargs = dict(encoder=BaseEncoder,
                                      encoder_kwargs={'obs_encoder_cls': lambda *args: SimCLRModel()},
                                      decoder=SymmetricProjectionHead,
-                                     decoder_kwargs={'representation_dim': 2048},
                                      loss_calculator=SymmetricContrastiveLoss,
                                      augmenter=AugmentContextAndTarget,
                                      target_pair_constructor=IdentityPairConstructor,
diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index de2a4c67..7dc66e6d 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -142,8 +142,8 @@ def __init__(self, *,
             transforms.RandomHorizontalFlip(p=0.5),
             transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
             transforms.RandomGrayscale(p=0.2),
-            transforms.ToTensor(),
-            transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
+            transforms.ToTensor()])
+            # transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
 
         augmenter_kwargs["augment_func"] = train_transform
         self.augmenter = augmenter(**augmenter_kwargs)
diff --git a/src/il_representations/scripts/run_simclr.sh b/src/il_representations/scripts/run_simclr.sh
index facc6685..c0c8ad80 100755
--- a/src/il_representations/scripts/run_simclr.sh
+++ b/src/il_representations/scripts/run_simclr.sh
@@ -11,6 +11,7 @@ CUDA_VISIBLE_DEVICES=0,2 python src/il_representations/scripts/pretrain_n_adapt.
   env_cfg.task_name=finger-spin \
   repl.n_epochs=$repl_epochs \
   repl.algo_params.batch_size=32 \
+  repl.algo_params.representation_dim=2048 \
   il_train.bc.n_trajs=$bc_trajs \
   il_train.bc.n_batches=$bc_batches \
   exp_ident=repl_epoch_${repl_epochs}_bc_${bc_trajs}_trajs_${bc_batches}_batches

From 872101b4589145e4349634b44786441a71029768 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <cyn0531@hku.hk>
Date: Thu, 6 May 2021 06:15:35 -0700
Subject: [PATCH 108/123] Update run_il.sh for long dmc runs with few trajs

---
 src/il_representations/scripts/run_il.sh | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/il_representations/scripts/run_il.sh b/src/il_representations/scripts/run_il.sh
index ac9fef06..bfad3935 100755
--- a/src/il_representations/scripts/run_il.sh
+++ b/src/il_representations/scripts/run_il.sh
@@ -1,16 +1,16 @@
 #!/usr/bin/env bash
 
-CUDA_VISIBLE_DEVICES=3 xvfb-run -a python src/il_representations/scripts/pretrain_n_adapt.py with \
- cfg_base_3seed_1cpu_pt2gpu_2envs \
+CUDA_VISIBLE_DEVICES=1 xvfb-run -a python src/il_representations/scripts/pretrain_n_adapt.py with \
  cfg_repl_none \
  cfg_il_bc_nofreeze \
- tune_run_kwargs.num_samples=2 \
- tune_run_kwargs.resources_per_trial.gpu=0.5 \
- exp_ident=magical-small \
- il_train.bc.n_batches=400000 \
- il_train.bc.batch_size=512 \
- il_train.encoder_kwargs.obs_encoder_cls=MAGICALCNN \
- il_train.encoder_kwargs.obs_encoder_cls_kwargs.arch_str=MAGICALCNN-small \
- env_cfg.benchmark_name=dm_control \
- env_cfg.task_name=finger-spin
+ cfg_bench_micro_sweep_dm_control \
+ cfg_run_few_trajs_long_dm_control \
+ il_train.bc.n_trajs=10 \
+ exp_ident=dmc_long_ntrajs_10 \
+ tune_run_kwargs.num_samples=1 \
+ tune_run_kwargs.resources_per_trial.gpu=0.3 \
+ # il_train.bc.n_batches=400000 \
+ # il_train.bc.nominal_length=10000 \
+ # env_cfg.benchmark_name=dm_control \
+ # env_cfg.task_name=finger-spin
 

From 3834b8812dc33fa621d0297f1e1fdae6c5537f4d Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Fri, 7 May 2021 11:03:48 +0800
Subject: [PATCH 109/123] Setting up loading procgen dataset

---
 src/il_representations/envs/config.py       |  7 +++
 src/il_representations/envs/procgen_envs.py | 52 +++++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 src/il_representations/envs/procgen_envs.py

diff --git a/src/il_representations/envs/config.py b/src/il_representations/envs/config.py
index 6a6c3bff..d207412e 100644
--- a/src/il_representations/envs/config.py
+++ b/src/il_representations/envs/config.py
@@ -171,5 +171,12 @@ def env_data_defaults():
         'data/atari/PongNoFrameskip-v4_rollouts_500_ts_100_traj.npz',
     }
 
+    # ###########################
+    # ProcGen config variables
+    # ###########################
+    procgen_demo_paths = {
+        'procgen/demo_coinrun.pickle'
+    }
+
     _ = locals()
     del _
diff --git a/src/il_representations/envs/procgen_envs.py b/src/il_representations/envs/procgen_envs.py
new file mode 100644
index 00000000..5243cd0d
--- /dev/null
+++ b/src/il_representations/envs/procgen_envs.py
@@ -0,0 +1,52 @@
+"""Utilities for working with Atari environments and demonstrations."""
+import os
+import random
+
+import numpy as np
+
+from il_representations.envs.config import (env_cfg_ingredient,
+                                            env_data_ingredient)
+
+
+@env_data_ingredient.capture
+def _get_procgen_data_opts(data_root, procgen_demo_paths):
+    # workaround for Sacred issue #206
+    return data_root, procgen_demo_paths
+
+
+@env_cfg_ingredient.capture
+def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
+    data_root, procgen_demo_paths = _get_procgen_data_opts()
+
+    # load trajectories from disk
+    full_rollouts_path = os.path.join(data_root, procgen_demo_paths[task_name])
+    trajs_or_file = np.load(full_rollouts_path, allow_pickle=True)
+    if isinstance(trajs_or_file, np.lib.npyio.NpzFile):
+        # handle .npz files (several arrays, maybe compressed, but we assume
+        # there's only one)
+        trajectories, = trajs_or_file.values()
+
+    trajectories = list(trajectories)
+    random.shuffle(trajectories)
+    if n_traj is not None:
+        trajectories = trajectories[:n_traj]
+
+    # merge stats/actions/dones from all trajectories into one big dataset
+    # (we use same naming convention as `imitation` here)
+    merged_trajectories = {'obs': [], 'next_obs': [], 'acts': [], 'dones': []}
+    for traj in trajectories:
+        # we slice to :-1 so that we can have a meaningful next_obs
+        merged_trajectories['obs'] += traj['states'][:-1]
+        merged_trajectories['next_obs'] += traj['states'][1:]
+        merged_trajectories['acts'] += traj['actions'][:-1]
+        merged_trajectories['dones'] += traj['dones'][:-1]
+    dataset_dict = {
+        key: np.stack(values, axis=0)
+        for key, values in merged_trajectories.items()
+    }
+
+    if chans_first:
+        for key in ('obs', 'next_obs'):
+            dataset_dict[key] = np.transpose(dataset_dict[key], (0, 3, 1, 2))
+
+    return dataset_dict

From 2c706457581bca118f9f5f9938bb93af0e912af9 Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Thu, 6 May 2021 23:39:10 -0700
Subject: [PATCH 110/123] Adding support for procgen (loading env)

---
 src/il_representations/envs/auto.py         | 17 ++++-
 src/il_representations/envs/config.py       |  4 +-
 src/il_representations/envs/procgen_envs.py | 69 ++++++++++++++-------
 src/il_representations/scripts/il_test.py   |  3 +-
 4 files changed, 66 insertions(+), 27 deletions(-)

diff --git a/src/il_representations/envs/auto.py b/src/il_representations/envs/auto.py
index bd819426..00c990f8 100644
--- a/src/il_representations/envs/auto.py
+++ b/src/il_representations/envs/auto.py
@@ -21,6 +21,7 @@
 from il_representations.envs.minecraft_envs import (MinecraftVectorWrapper,
                                                     get_env_name_minecraft,
                                                     load_dataset_minecraft)
+from il_representations.envs.procgen_envs import load_dataset_procgen
 from il_representations.scripts.utils import update as dict_update
 
 ERROR_MESSAGE = "no support for benchmark_name={benchmark_name!r}"
@@ -74,6 +75,8 @@ def load_dict_dataset(benchmark_name, n_traj=None):
         dataset_dict = load_dataset_atari(n_traj=n_traj)
     elif benchmark_name == 'minecraft':
         dataset_dict = load_dataset_minecraft(n_traj=n_traj)
+    elif benchmark_name == 'procgen':
+        dataset_dict = load_dataset_procgen(n_traj=n_traj)
     else:
         raise NotImplementedError(ERROR_MESSAGE.format(**locals()))
 
@@ -100,6 +103,8 @@ def get_gym_env_name(benchmark_name, dm_control_full_env_names, task_name):
         return task_name
     elif benchmark_name == 'minecraft':
         return get_env_name_minecraft()  # uses task_name implicitly through config param
+    elif benchmark_name == 'procgen':
+        return task_name
     raise NotImplementedError(ERROR_MESSAGE.format(**locals()))
 
 
@@ -163,6 +168,15 @@ def load_vec_env(benchmark_name, dm_control_full_env_names,
                             parallel=venv_parallel,
                             wrapper_class=MinecraftVectorWrapper,
                             max_episode_steps=minecraft_max_env_steps)
+    elif benchmark_name == 'procgen':
+        raw_procgen_env = make_vec_env(gym_env_name,
+                                       n_envs=n_envs,
+                                       parallel=venv_parallel,
+                                       parallel_workers=parallel_workers,
+                                       wrapper_class=ProcgenWrapper)
+        final_env = VecFrameStack(VecTransposeImage(raw_procgen_env), 4)
+        assert final_env.observation_space.shape == (12, 64, 64), \
+            final_env.observation_space.shape
     raise NotImplementedError(ERROR_MESSAGE.format(**locals()))
 
 
@@ -266,7 +280,8 @@ def load_color_space(benchmark_name):
         'magical': ColorSpace.RGB,
         'dm_control': ColorSpace.RGB,
         'atari': ColorSpace.GRAY,
-        'minecraft': ColorSpace.RGB
+        'minecraft': ColorSpace.RGB,
+        'procgen': ColorSpace.RGB
     }
     try:
         return color_spaces[benchmark_name]
diff --git a/src/il_representations/envs/config.py b/src/il_representations/envs/config.py
index d207412e..cfbfed8c 100644
--- a/src/il_representations/envs/config.py
+++ b/src/il_representations/envs/config.py
@@ -6,7 +6,7 @@
 
 from sacred import Ingredient
 
-ALL_BENCHMARK_NAMES = {"atari", "magical", "dm_control", "minecraft"}
+ALL_BENCHMARK_NAMES = {"atari", "magical", "dm_control", "minecraft", "procgen"}
 
 # see env_cfg_defaults docstring for description of this ingredient
 env_cfg_ingredient = Ingredient('env_cfg')
@@ -175,7 +175,7 @@ def env_data_defaults():
     # ProcGen config variables
     # ###########################
     procgen_demo_paths = {
-        'procgen/demo_coinrun.pickle'
+        'coinrun': 'procgen/demo_coinrun.pickle'
     }
 
     _ = locals()
diff --git a/src/il_representations/envs/procgen_envs.py b/src/il_representations/envs/procgen_envs.py
index 5243cd0d..5f9fe2ed 100644
--- a/src/il_representations/envs/procgen_envs.py
+++ b/src/il_representations/envs/procgen_envs.py
@@ -1,9 +1,16 @@
 """Utilities for working with Atari environments and demonstrations."""
 import os
 import random
-
 import numpy as np
 
+from baselines.common.vec_env import (
+    VecExtractDictObs,
+    VecMonitor,
+    VecFrameStack,
+    VecNormalize
+)
+from procgen import ProcgenEnv
+
 from il_representations.envs.config import (env_cfg_ingredient,
                                             env_data_ingredient)
 
@@ -20,33 +27,49 @@ def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
 
     # load trajectories from disk
     full_rollouts_path = os.path.join(data_root, procgen_demo_paths[task_name])
-    trajs_or_file = np.load(full_rollouts_path, allow_pickle=True)
-    if isinstance(trajs_or_file, np.lib.npyio.NpzFile):
-        # handle .npz files (several arrays, maybe compressed, but we assume
-        # there's only one)
-        trajectories, = trajs_or_file.values()
-
-    trajectories = list(trajectories)
-    random.shuffle(trajectories)
-    if n_traj is not None:
-        trajectories = trajectories[:n_traj]
-
-    # merge stats/actions/dones from all trajectories into one big dataset
-    # (we use same naming convention as `imitation` here)
-    merged_trajectories = {'obs': [], 'next_obs': [], 'acts': [], 'dones': []}
-    for traj in trajectories:
-        # we slice to :-1 so that we can have a meaningful next_obs
-        merged_trajectories['obs'] += traj['states'][:-1]
-        merged_trajectories['next_obs'] += traj['states'][1:]
-        merged_trajectories['acts'] += traj['actions'][:-1]
-        merged_trajectories['dones'] += traj['dones'][:-1]
+    trajectories = np.load(full_rollouts_path, allow_pickle=True)
+
+    # do frame stacking on observations in each loaded trajectory sequence,
+    # then concatenate the frame-stacked trajectories together to make one big
+    # dataset
+    cat_obs = np.concatenate(trajectories['obs'][:-1], axis=0)
+    cat_nobs = np.concatenate(trajectories['obs'][1:], axis=0)
+    # the remaining entries don't need any special stacking, so we just
+    # concatenate them
+    cat_acts = np.concatenate(trajectories['acts'], axis=0)
+    cat_infos = np.concatenate(trajectories['infos'], axis=0)
+    cat_rews = np.concatenate(trajectories['rews'], axis=0)
+    cat_dones = np.concatenate(trajectories['dones'], axis=0)
+
     dataset_dict = {
-        key: np.stack(values, axis=0)
-        for key, values in merged_trajectories.items()
+        'obs': cat_obs,
+        'next_obs': cat_nobs,
+        'acts': cat_acts,
+        'infos': cat_infos,
+        'rews': cat_rews,
+        'dones': cat_dones,
     }
 
+    # TODO: Figure out whether we need chans first for procgen
     if chans_first:
         for key in ('obs', 'next_obs'):
             dataset_dict[key] = np.transpose(dataset_dict[key], (0, 3, 1, 2))
 
     return dataset_dict
+
+
+@env_cfg_ingredient.capture
+def ProcgenWrapper(task_name, num_envs=1, num_levels=0, start_level=0,
+                   distribution_mode='easy'):
+    # TODO: Check start level
+    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode)
+    venv = VecExtractDictObs(venv, "rgb")
+
+    venv = VecMonitor(
+        venv=venv, filename=None, keep_buf=100,
+    )
+
+    venv = VecNormalize(venv=venv, ob=False)
+
+
+
diff --git a/src/il_representations/scripts/il_test.py b/src/il_representations/scripts/il_test.py
index a3bbe374..756b28f8 100644
--- a/src/il_representations/scripts/il_test.py
+++ b/src/il_representations/scripts/il_test.py
@@ -106,7 +106,8 @@ def run(policy_path, env_cfg, venv_opts, seed, n_rollouts, device_name, run_id,
             'return_mean': eval_data_frame['mean_score'].mean(),
         }
 
-    elif (env_cfg['benchmark_name'] in ('dm_control', 'atari', 'minecraft')):
+    elif (env_cfg['benchmark_name'] in ('dm_control', 'atari', 'minecraft',
+                                        'procgen')):
         # must import this to register envs
         from il_representations.envs import dm_control_envs  # noqa: F401
 

From 2f0659a1934ecd48f441a3d3a948401c99492809 Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Fri, 7 May 2021 00:18:32 -0700
Subject: [PATCH 111/123] Set Procgen env names

---
 requirements.txt                            |  1 +
 src/il_representations/envs/auto.py         |  6 ++++--
 src/il_representations/envs/procgen_envs.py | 17 ++++++++++-------
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 369306da..f02aaf9e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,6 +25,7 @@ torchsummary~=1.5.1
 #webdataset introduces breaking changes in 0.1.49, so setting this to an exact equality
 webdataset==0.1.40
 tqdm~=4.48.0
+procgen==0.10.4
 
 # Jupyter Lab is used for our experiment analysis notebook
 jupyterlab~=2.2.6
diff --git a/src/il_representations/envs/auto.py b/src/il_representations/envs/auto.py
index 00c990f8..93a1dd6b 100644
--- a/src/il_representations/envs/auto.py
+++ b/src/il_representations/envs/auto.py
@@ -21,7 +21,9 @@
 from il_representations.envs.minecraft_envs import (MinecraftVectorWrapper,
                                                     get_env_name_minecraft,
                                                     load_dataset_minecraft)
-from il_representations.envs.procgen_envs import load_dataset_procgen
+from il_representations.envs.procgen_envs import (load_dataset_procgen,
+                                                  ProcgenWrapper,
+                                                  get_procgen_env_name)
 from il_representations.scripts.utils import update as dict_update
 
 ERROR_MESSAGE = "no support for benchmark_name={benchmark_name!r}"
@@ -104,7 +106,7 @@ def get_gym_env_name(benchmark_name, dm_control_full_env_names, task_name):
     elif benchmark_name == 'minecraft':
         return get_env_name_minecraft()  # uses task_name implicitly through config param
     elif benchmark_name == 'procgen':
-        return task_name
+        return get_procgen_env_name()
     raise NotImplementedError(ERROR_MESSAGE.format(**locals()))
 
 
diff --git a/src/il_representations/envs/procgen_envs.py b/src/il_representations/envs/procgen_envs.py
index 5f9fe2ed..cf318254 100644
--- a/src/il_representations/envs/procgen_envs.py
+++ b/src/il_representations/envs/procgen_envs.py
@@ -3,13 +3,14 @@
 import random
 import numpy as np
 
-from baselines.common.vec_env import (
-    VecExtractDictObs,
-    VecMonitor,
-    VecFrameStack,
-    VecNormalize
-)
+# from baselines.common.vec_env import (
+#     VecExtractDictObs,
+#     VecMonitor,
+#     VecFrameStack,
+#     VecNormalize
+# )
 from procgen import ProcgenEnv
+from procgen.gym_registration import make_env, register_environments
 
 from il_representations.envs.config import (env_cfg_ingredient,
                                             env_data_ingredient)
@@ -72,4 +73,6 @@ def ProcgenWrapper(task_name, num_envs=1, num_levels=0, start_level=0,
     venv = VecNormalize(venv=venv, ob=False)
 
 
-
+@env_cfg_ingredient.capture
+def get_procgen_env_name(task_name):
+    return f'procgen-{task_name}-v0'

From fe9558ece674a3021a704ce0d37bf92daef102dc Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Mon, 10 May 2021 20:38:07 -0700
Subject: [PATCH 112/123] Update loading procgen envs

---
 src/il_representations/envs/procgen_envs.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/il_representations/envs/procgen_envs.py b/src/il_representations/envs/procgen_envs.py
index cf318254..72dc41d9 100644
--- a/src/il_representations/envs/procgen_envs.py
+++ b/src/il_representations/envs/procgen_envs.py
@@ -30,15 +30,9 @@ def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
     full_rollouts_path = os.path.join(data_root, procgen_demo_paths[task_name])
     trajectories = np.load(full_rollouts_path, allow_pickle=True)
 
-    # do frame stacking on observations in each loaded trajectory sequence,
-    # then concatenate the frame-stacked trajectories together to make one big
-    # dataset
-    cat_obs = np.concatenate(trajectories['obs'][:-1], axis=0)
-    cat_nobs = np.concatenate(trajectories['obs'][1:], axis=0)
-    # the remaining entries don't need any special stacking, so we just
-    # concatenate them
+    cat_obs = np.concatenate(trajectories['obs'], axis=0)[:-1]
+    cat_nobs = np.concatenate(trajectories['obs'], axis=0)[1:]
     cat_acts = np.concatenate(trajectories['acts'], axis=0)
-    cat_infos = np.concatenate(trajectories['infos'], axis=0)
     cat_rews = np.concatenate(trajectories['rews'], axis=0)
     cat_dones = np.concatenate(trajectories['dones'], axis=0)
 
@@ -46,7 +40,6 @@ def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
         'obs': cat_obs,
         'next_obs': cat_nobs,
         'acts': cat_acts,
-        'infos': cat_infos,
         'rews': cat_rews,
         'dones': cat_dones,
     }

From dcd44ca9160e8ce89e87dcdfc59ba9ebebffdbf3 Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Mon, 10 May 2021 23:48:44 -0700
Subject: [PATCH 113/123] Drop next_obs from Procgen dataset; maybe not needed?

---
 src/il_representations/envs/procgen_envs.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/il_representations/envs/procgen_envs.py b/src/il_representations/envs/procgen_envs.py
index 72dc41d9..1a430a73 100644
--- a/src/il_representations/envs/procgen_envs.py
+++ b/src/il_representations/envs/procgen_envs.py
@@ -30,15 +30,13 @@ def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
     full_rollouts_path = os.path.join(data_root, procgen_demo_paths[task_name])
     trajectories = np.load(full_rollouts_path, allow_pickle=True)
 
-    cat_obs = np.concatenate(trajectories['obs'], axis=0)[:-1]
-    cat_nobs = np.concatenate(trajectories['obs'], axis=0)[1:]
+    cat_obs = np.concatenate(trajectories['obs'], axis=0)
     cat_acts = np.concatenate(trajectories['acts'], axis=0)
     cat_rews = np.concatenate(trajectories['rews'], axis=0)
     cat_dones = np.concatenate(trajectories['dones'], axis=0)
 
     dataset_dict = {
         'obs': cat_obs,
-        'next_obs': cat_nobs,
         'acts': cat_acts,
         'rews': cat_rews,
         'dones': cat_dones,
@@ -46,7 +44,7 @@ def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
 
     # TODO: Figure out whether we need chans first for procgen
     if chans_first:
-        for key in ('obs', 'next_obs'):
+        for key in ('obs', ):
             dataset_dict[key] = np.transpose(dataset_dict[key], (0, 3, 1, 2))
 
     return dataset_dict

From e236df281cc54405146b6a824b13b8d05c4b61a8 Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Mon, 10 May 2021 23:55:36 -0700
Subject: [PATCH 114/123] Env wrapper is already handled by Procgen

---
 src/il_representations/envs/procgen_envs.py | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/src/il_representations/envs/procgen_envs.py b/src/il_representations/envs/procgen_envs.py
index 1a430a73..59b2b470 100644
--- a/src/il_representations/envs/procgen_envs.py
+++ b/src/il_representations/envs/procgen_envs.py
@@ -3,12 +3,6 @@
 import random
 import numpy as np
 
-# from baselines.common.vec_env import (
-#     VecExtractDictObs,
-#     VecMonitor,
-#     VecFrameStack,
-#     VecNormalize
-# )
 from procgen import ProcgenEnv
 from procgen.gym_registration import make_env, register_environments
 
@@ -42,7 +36,6 @@ def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
         'dones': cat_dones,
     }
 
-    # TODO: Figure out whether we need chans first for procgen
     if chans_first:
         for key in ('obs', ):
             dataset_dict[key] = np.transpose(dataset_dict[key], (0, 3, 1, 2))
@@ -50,20 +43,6 @@ def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
     return dataset_dict
 
 
-@env_cfg_ingredient.capture
-def ProcgenWrapper(task_name, num_envs=1, num_levels=0, start_level=0,
-                   distribution_mode='easy'):
-    # TODO: Check start level
-    venv = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode)
-    venv = VecExtractDictObs(venv, "rgb")
-
-    venv = VecMonitor(
-        venv=venv, filename=None, keep_buf=100,
-    )
-
-    venv = VecNormalize(venv=venv, ob=False)
-
-
 @env_cfg_ingredient.capture
 def get_procgen_env_name(task_name):
     return f'procgen-{task_name}-v0'

From 9faea2df70fb7ecfc21b01ec4a38971f42f75e9a Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Mon, 10 May 2021 23:57:59 -0700
Subject: [PATCH 115/123] More cleanup: drop ProcgenWrapper refs, return Procgen vec env

---
 src/il_representations/envs/auto.py         | 5 ++---
 src/il_representations/envs/procgen_envs.py | 1 -
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/il_representations/envs/auto.py b/src/il_representations/envs/auto.py
index 93a1dd6b..cd7bce73 100644
--- a/src/il_representations/envs/auto.py
+++ b/src/il_representations/envs/auto.py
@@ -22,7 +22,6 @@
                                                     get_env_name_minecraft,
                                                     load_dataset_minecraft)
 from il_representations.envs.procgen_envs import (load_dataset_procgen,
-                                                  ProcgenWrapper,
                                                   get_procgen_env_name)
 from il_representations.scripts.utils import update as dict_update
 
@@ -174,11 +173,11 @@ def load_vec_env(benchmark_name, dm_control_full_env_names,
         raw_procgen_env = make_vec_env(gym_env_name,
                                        n_envs=n_envs,
                                        parallel=venv_parallel,
-                                       parallel_workers=parallel_workers,
-                                       wrapper_class=ProcgenWrapper)
+                                       parallel_workers=parallel_workers)
         final_env = VecFrameStack(VecTransposeImage(raw_procgen_env), 4)
         assert final_env.observation_space.shape == (12, 64, 64), \
             final_env.observation_space.shape
+        return final_env
     raise NotImplementedError(ERROR_MESSAGE.format(**locals()))
 
 
diff --git a/src/il_representations/envs/procgen_envs.py b/src/il_representations/envs/procgen_envs.py
index 59b2b470..e5fc6167 100644
--- a/src/il_representations/envs/procgen_envs.py
+++ b/src/il_representations/envs/procgen_envs.py
@@ -3,7 +3,6 @@
 import random
 import numpy as np
 
-from procgen import ProcgenEnv
 from procgen.gym_registration import make_env, register_environments
 
 from il_representations.envs.config import (env_cfg_ingredient,

From 19d1dc3e475093d79f4d99838711055838d6c781 Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Tue, 11 May 2021 01:04:19 -0700
Subject: [PATCH 116/123] Add framestack

---
 src/il_representations/envs/config.py       |  5 +++++
 src/il_representations/envs/procgen_envs.py | 20 ++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/envs/config.py b/src/il_representations/envs/config.py
index cfbfed8c..66b5572a 100644
--- a/src/il_representations/envs/config.py
+++ b/src/il_representations/envs/config.py
@@ -70,6 +70,11 @@ def env_cfg_defaults():
     # ###############################
     minecraft_max_env_steps = None
 
+    # ###############################
+    # Procgen-specific config variables
+    # ###############################
+    procgen_frame_stack = 4
+
     _ = locals()
     del _
 
diff --git a/src/il_representations/envs/procgen_envs.py b/src/il_representations/envs/procgen_envs.py
index e5fc6167..9062b1ac 100644
--- a/src/il_representations/envs/procgen_envs.py
+++ b/src/il_representations/envs/procgen_envs.py
@@ -1,4 +1,3 @@
-"""Utilities for working with Atari environments and demonstrations."""
 import os
 import random
 import numpy as np
@@ -16,7 +15,8 @@ def _get_procgen_data_opts(data_root, procgen_demo_paths):
 
 
 @env_cfg_ingredient.capture
-def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
+def load_dataset_procgen(task_name, procgen_frame_stack, n_traj=None,
+                         chans_first=True):
     data_root, procgen_demo_paths = _get_procgen_data_opts()
 
     # load trajectories from disk
@@ -38,6 +38,8 @@ def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
     if chans_first:
         for key in ('obs', ):
             dataset_dict[key] = np.transpose(dataset_dict[key], (0, 3, 1, 2))
+    dataset_dict['obs'] = _stack_obs_oldest_first(dataset_dict['obs'],
+                                                  procgen_frame_stack)
 
     return dataset_dict
 
@@ -45,3 +47,17 @@ def load_dataset_procgen(task_name, n_traj=None, chans_first=True):
 @env_cfg_ingredient.capture
 def get_procgen_env_name(task_name):
     return f'procgen-{task_name}-v0'
+
+
+@env_cfg_ingredient.capture
+def _stack_obs_oldest_first(obs_arr, procgen_frame_stack):
+    """Stack each frame with its k-1 predecessors on axis 1, oldest first.
+
+    obs_arr[0] is repeated to pad the earliest stacks; empty input yields an
+    empty result instead of raising IndexError.
+    NOTE(review): no reset between trajectories happens here -- confirm."""
+    obs_arr = np.asarray(obs_arr)
+    n, k = len(obs_arr), procgen_frame_stack
+    pad = np.repeat(obs_arr[:1], k - 1, axis=0)
+    padded = np.concatenate([pad, obs_arr], axis=0)
+    return np.concatenate([padded[i:i + n] for i in range(k)], axis=1)

From 7e67b54821c256b434d4b53a4ae250acc8a2217e Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Wed, 12 May 2021 09:36:01 +0800
Subject: [PATCH 117/123] Adjust encoder network channel

---
 src/il_representations/algos/encoders.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/il_representations/algos/encoders.py b/src/il_representations/algos/encoders.py
index f5e7d6a9..a6c04023 100644
--- a/src/il_representations/algos/encoders.py
+++ b/src/il_representations/algos/encoders.py
@@ -271,21 +271,20 @@ def forward(self, x):
 
 
 class SimCLRModel(nn.Module):
-    def __init__(self, feature_dim=128):
+    def __init__(self, observation_space, representation_dim=128):
         super(SimCLRModel, self).__init__()
 
         self.f = []
+        in_channel = observation_space.shape[0]
         for name, module in resnet50().named_children():
             if name == 'conv1':
-                module = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
+                module = nn.Conv2d(in_channel, 64, kernel_size=3, stride=1, padding=1, bias=False)
             if not isinstance(module, nn.Linear) and not isinstance(module, nn.MaxPool2d):
                 self.f.append(module)
         # encoder
         # Temporarily add an extra layer to be closer to our model implementation
         self.f = nn.Sequential(*self.f)
-        # # projection head
-        # self.g = nn.Sequential(nn.Linear(2048, 512, bias=False), nn.BatchNorm1d(512),
-        #                        nn.ReLU(inplace=True), nn.Linear(512, feature_dim, bias=True))
+
 
     def forward(self, x):
         x = self.f(x)

From f2c9be2af6470edfc3064f5aefa7a9798605835f Mon Sep 17 00:00:00 2001
From: Cynthia Chen <cyn0531@hku.hk>
Date: Tue, 11 May 2021 18:47:22 -0700
Subject: [PATCH 118/123] Update simclr running script

---
 src/il_representations/scripts/run_simclr.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/il_representations/scripts/run_simclr.sh b/src/il_representations/scripts/run_simclr.sh
index c0c8ad80..366b7d19 100755
--- a/src/il_representations/scripts/run_simclr.sh
+++ b/src/il_representations/scripts/run_simclr.sh
@@ -1,8 +1,8 @@
-repl_epochs=100
-bc_trajs=30
+repl_epochs=1
+bc_trajs=10
 bc_batches=4000000
 
-CUDA_VISIBLE_DEVICES=0,2 python src/il_representations/scripts/pretrain_n_adapt.py with \
+CUDA_VISIBLE_DEVICES=2 python src/il_representations/scripts/pretrain_n_adapt.py with \
   cfg_repl_simclr \
   cfg_il_bc_nofreeze \
   tune_run_kwargs.num_samples=1 \

From ad9a9d1fb7978b326797185abb473d68d7f4b36c Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Wed, 12 May 2021 09:57:24 +0800
Subject: [PATCH 119/123] Try a smaller network

---
 src/il_representations/algos/encoders.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/algos/encoders.py b/src/il_representations/algos/encoders.py
index a6c04023..b6152d9f 100644
--- a/src/il_representations/algos/encoders.py
+++ b/src/il_representations/algos/encoders.py
@@ -10,7 +10,7 @@
 from torchvision.models.resnet import BasicBlock as BasicResidualBlock
 import torch
 from torch import nn
-from torchvision.models.resnet import resnet50
+from torchvision.models.resnet import resnet50, resnet34
 import torch.nn.functional as F
 from pyro.distributions import Delta
 
@@ -276,7 +276,7 @@ def __init__(self, observation_space, representation_dim=128):
 
         self.f = []
         in_channel = observation_space.shape[0]
-        for name, module in resnet50().named_children():
+        for name, module in resnet34().named_children():
             if name == 'conv1':
                 module = nn.Conv2d(in_channel, 64, kernel_size=3, stride=1, padding=1, bias=False)
             if not isinstance(module, nn.Linear) and not isinstance(module, nn.MaxPool2d):
@@ -289,6 +289,7 @@ def __init__(self, observation_space, representation_dim=128):
     def forward(self, x):
         x = self.f(x)
         feature = torch.flatten(x, start_dim=1)
+        breakpoint()
         return F.normalize(feature, dim=-1)
 
 

From 4365e02fa368d1c6d34666765130d0d91dddfb4e Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Wed, 12 May 2021 10:13:02 +0800
Subject: [PATCH 120/123] See if it can run end to end

---
 src/il_representations/scripts/run_simclr.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/il_representations/scripts/run_simclr.sh b/src/il_representations/scripts/run_simclr.sh
index 366b7d19..aad8d717 100755
--- a/src/il_representations/scripts/run_simclr.sh
+++ b/src/il_representations/scripts/run_simclr.sh
@@ -1,8 +1,8 @@
 repl_epochs=1
 bc_trajs=10
-bc_batches=4000000
+bc_batches=1
 
-CUDA_VISIBLE_DEVICES=2 python src/il_representations/scripts/pretrain_n_adapt.py with \
+CUDA_VISIBLE_DEVICES=0 python src/il_representations/scripts/pretrain_n_adapt.py with \
   cfg_repl_simclr \
   cfg_il_bc_nofreeze \
   tune_run_kwargs.num_samples=1 \

From 5eb92830d2862f8b6e3e672404f3b08d510a8067 Mon Sep 17 00:00:00 2001
From: Cynthia Chen <RPC2@users.noreply.github.com>
Date: Wed, 12 May 2021 10:16:24 +0800
Subject: [PATCH 121/123] Update encoder kwargs

---
 src/il_representations/algos/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/il_representations/algos/__init__.py b/src/il_representations/algos/__init__.py
index 08f58f55..30c74c93 100644
--- a/src/il_representations/algos/__init__.py
+++ b/src/il_representations/algos/__init__.py
@@ -54,7 +54,7 @@ class SimCLR(RepresentationLearner):
     # TODO note: not made to use momentum because not being used in experiments
     def __init__(self, **kwargs):
         algo_hardcoded_kwargs = dict(encoder=BaseEncoder,
-                                     encoder_kwargs={'obs_encoder_cls': lambda *args: SimCLRModel()},
+                                     encoder_kwargs={'obs_encoder_cls': 'SimCLRModel'},
                                      decoder=SymmetricProjectionHead,
                                      loss_calculator=SymmetricContrastiveLoss,
                                      augmenter=AugmentContextAndTarget,

From 5fdf57a18ed012ebae431217a1bc6e81a5344900 Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Tue, 11 May 2021 19:50:11 -0700
Subject: [PATCH 122/123] Current script to train simclr as repl

---
 src/il_representations/algos/__init__.py     |  1 -
 src/il_representations/algos/encoders.py     |  1 -
 src/il_representations/scripts/run_simclr.sh | 10 +++++-----
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/il_representations/algos/__init__.py b/src/il_representations/algos/__init__.py
index 30c74c93..4283a685 100644
--- a/src/il_representations/algos/__init__.py
+++ b/src/il_representations/algos/__init__.py
@@ -54,7 +54,6 @@ class SimCLR(RepresentationLearner):
     # TODO note: not made to use momentum because not being used in experiments
     def __init__(self, **kwargs):
         algo_hardcoded_kwargs = dict(encoder=BaseEncoder,
-                                     encoder_kwargs={'obs_encoder_cls': 'SimCLRModel'},
                                      decoder=SymmetricProjectionHead,
                                      loss_calculator=SymmetricContrastiveLoss,
                                      augmenter=AugmentContextAndTarget,
diff --git a/src/il_representations/algos/encoders.py b/src/il_representations/algos/encoders.py
index b6152d9f..f927eaf1 100644
--- a/src/il_representations/algos/encoders.py
+++ b/src/il_representations/algos/encoders.py
@@ -289,7 +289,6 @@ def __init__(self, observation_space, representation_dim=128):
     def forward(self, x):
         x = self.f(x)
         feature = torch.flatten(x, start_dim=1)
-        breakpoint()
         return F.normalize(feature, dim=-1)
 
 
diff --git a/src/il_representations/scripts/run_simclr.sh b/src/il_representations/scripts/run_simclr.sh
index aad8d717..66cb3a37 100755
--- a/src/il_representations/scripts/run_simclr.sh
+++ b/src/il_representations/scripts/run_simclr.sh
@@ -1,8 +1,8 @@
-repl_epochs=1
+repl_epochs=100
 bc_trajs=10
-bc_batches=1
+bc_batches=4000000
 
-CUDA_VISIBLE_DEVICES=0 python src/il_representations/scripts/pretrain_n_adapt.py with \
+CUDA_VISIBLE_DEVICES=2 python src/il_representations/scripts/pretrain_n_adapt.py with \
   cfg_repl_simclr \
   cfg_il_bc_nofreeze \
   tune_run_kwargs.num_samples=1 \
@@ -10,9 +10,9 @@ CUDA_VISIBLE_DEVICES=0 python src/il_representations/scripts/pretrain_n_adapt.py
   env_cfg.benchmark_name=dm_control \
   env_cfg.task_name=finger-spin \
   repl.n_epochs=$repl_epochs \
-  repl.algo_params.batch_size=32 \
-  repl.algo_params.representation_dim=2048 \
+  repl.algo_params.batch_size=256 \
   il_train.bc.n_trajs=$bc_trajs \
   il_train.bc.n_batches=$bc_batches \
   exp_ident=repl_epoch_${repl_epochs}_bc_${bc_trajs}_trajs_${bc_batches}_batches
 
+  # repl.algo_params.representation_dim=512 \

From 99831232218abdf357aa0a360e9124a36a9c4edc Mon Sep 17 00:00:00 2001
From: Cynthia <cyn0531@hku.hk>
Date: Wed, 12 May 2021 11:53:55 +0800
Subject: [PATCH 123/123] Use default augmentation

---
 src/il_representations/algos/representation_learner.py | 2 +-
 src/il_representations/scripts/run_simclr.sh           | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/il_representations/algos/representation_learner.py b/src/il_representations/algos/representation_learner.py
index 7dc66e6d..4b75dea1 100644
--- a/src/il_representations/algos/representation_learner.py
+++ b/src/il_representations/algos/representation_learner.py
@@ -145,7 +145,7 @@ def __init__(self, *,
             transforms.ToTensor()])
             # transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])])
 
-        augmenter_kwargs["augment_func"] = train_transform
+        # augmenter_kwargs["augment_func"] = train_transform
         self.augmenter = augmenter(**augmenter_kwargs)
         self.target_pair_constructor = target_pair_constructor(**to_dict(target_pair_constructor_kwargs))
 
diff --git a/src/il_representations/scripts/run_simclr.sh b/src/il_representations/scripts/run_simclr.sh
index 66cb3a37..61e45bba 100755
--- a/src/il_representations/scripts/run_simclr.sh
+++ b/src/il_representations/scripts/run_simclr.sh
@@ -2,14 +2,14 @@ repl_epochs=100
 bc_trajs=10
 bc_batches=4000000
 
-CUDA_VISIBLE_DEVICES=2 python src/il_representations/scripts/pretrain_n_adapt.py with \
+CUDA_VISIBLE_DEVICES=0 python src/il_representations/scripts/pretrain_n_adapt.py with \
   cfg_repl_simclr \
   cfg_il_bc_nofreeze \
+  cfg_bench_micro_sweep_dm_control \
   tune_run_kwargs.num_samples=1 \
-  tune_run_kwargs.resources_per_trial.gpu=1 \
-  env_cfg.benchmark_name=dm_control \
-  env_cfg.task_name=finger-spin \
+  tune_run_kwargs.resources_per_trial.gpu=0.3 \
   repl.n_epochs=$repl_epochs \
+  repl.n_trajs=$bc_trajs \
   repl.algo_params.batch_size=256 \
   il_train.bc.n_trajs=$bc_trajs \
   il_train.bc.n_batches=$bc_batches \