From 6fe118090099e638ca781569469ef2fdc05e1b47 Mon Sep 17 00:00:00 2001 From: students Date: Mon, 6 Apr 2020 14:10:00 +0000 Subject: [PATCH 1/9] add batches generator for office-31 --- data_loader.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 data_loader.py diff --git a/data_loader.py b/data_loader.py new file mode 100644 index 0000000..1e9ab1b --- /dev/null +++ b/data_loader.py @@ -0,0 +1,75 @@ +from torchvision.datasets import ImageFolder +from torch.utils.data import DataLoader, random_split +from torchvision import transforms + +class DataGenerator(DataLoader): + + def get_classes_to_idx(self): + return self.dataset.dataset.class_to_idx + + def get_classes(self): + return self.dataset.dataset.classes + +def create_data_generators(dataset_name, domain, data_path = "data", batch_size = 16, + transformations = transforms.ToTensor(), num_workers = 1, split_ratios = [0.8, 0.1, 0.1]): + """ + Args: + dataset_name (string) + domain (string) - valid domain of the dataset dataset_name + data_path (string) - valid path, which contains dataset_name folder + batch_size (int) + transformations (callable) - optional transform to be applied on an image sample + num_workers (int) - multi-process data loading + split_ratios (list of ints, len(split_ratios) = 3) - ratios of train, validation and test parts + + Return: + 3 data generators - for train, validation and test data + + """ + + dataset = create_dataset(dataset_name, domain, data_path, transformations) + + len_dataset = len(dataset) + train_size = int(len_dataset * split_ratios[0]) + val_size = int(len_dataset * split_ratios[1]) + test_size = len_dataset - train_size - val_size + + train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size]) + + train_dataloader = DataGenerator(train_dataset, batch_size = batch_size, + shuffle=True, num_workers=num_workers) + + val_dataloader = DataGenerator(val_dataset, batch_size = batch_size, + shuffle=False, num_workers=num_workers) + + test_dataloader = DataGenerator(test_dataset, batch_size = batch_size, + shuffle=False, num_workers=num_workers) + + return train_dataloader, val_dataloader, test_dataloader + + + +def create_dataset(dataset_name, domain, data_path, transformations): + """ + Args: + dataset_name (string) + domain (string) - valid domain of the dataset dataset_name + data_path (string) - valid path, which contains dataset_name folder + transformations (callable) - optional transform to be applied on an image sample + + Return: + torchvision.dataset object + + """ + + assert dataset_name in ["office-31"], f"Dataset {dataset_name} is not implemented" + + if dataset_name == "office-31": + + dataset_domains = ["amazon", "dslr", "webcam"] + + assert domain in dataset_domains, f"Incorrect domain {domain}: dataset {dataset_name} domains: {dataset_domains}" + + dataset = ImageFolder(f"{data_path}/{dataset_name}/{domain}/images", transform=transformations) + + return dataset \ No newline at end of file From 85466a4eafe7336919a1be2ea5ca606478ae5862 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=BA=D0=B0=D1=82=D0=B5=D1=80=D0=B8=D0=BD=D0=B0=20?= =?UTF-8?q?=D0=93=D0=BB=D0=B0=D0=B7=D0=BA=D0=BE=D0=B2=D0=B0?= Date: Wed, 8 Apr 2020 18:42:24 +0300 Subject: [PATCH 2/9] pep8 code style --- data_loader.py | 98 +++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 44 deletions(-) diff --git a/data_loader.py b/data_loader.py index 1e9ab1b..5408954 100644 --- a/data_loader.py +++ 
b/data_loader.py @@ -2,74 +2,84 @@ from torch.utils.data import DataLoader, random_split from torchvision import transforms + class DataGenerator(DataLoader): - def get_classes_to_idx(self): return self.dataset.dataset.class_to_idx - + def get_classes(self): return self.dataset.dataset.classes - -def create_data_generators(dataset_name, domain, data_path = "data", batch_size = 16, - transformations = transforms.ToTensor(), num_workers = 1, split_ratios = [0.8, 0.1, 0.1]): + + +def create_data_generators(dataset_name, domain, data_path="data", + batch_size=16, + transformations=transforms.ToTensor(), + num_workers=1, split_ratios=[0.8, 0.1, 0.1]): """ Args: dataset_name (string) - domain (string) - valid domain of the dataset dataset_name - data_path (string) - valid path, which contains dataset_name folder + domain (string) + - valid domain of the dataset dataset_name + data_path (string) + - valid path, which contains dataset_name folder batch_size (int) - transformations (callable) - optional transform to be applied on an image sample - num_workers (int) - multi-process data loading - split_ratios (list of ints, len(split_ratios) = 3) - ratios of train, validation and test parts - + transformations (callable) + - optional transform applied on image sample + num_workers (int) + - multi-process data loading + split_ratios (list of ints, len(split_ratios) = 3) + - ratios of train, validation and test parts + Return: 3 data generators - for train, validation and test data - """ - + dataset = create_dataset(dataset_name, domain, data_path, transformations) - + len_dataset = len(dataset) train_size = int(len_dataset * split_ratios[0]) val_size = int(len_dataset * split_ratios[1]) test_size = len_dataset - train_size - val_size - - train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size]) - - train_dataloader = DataGenerator(train_dataset, batch_size = batch_size, - shuffle=True, num_workers=num_workers) - - val_dataloader = DataGenerator(val_dataset, batch_size = batch_size, - shuffle=False, num_workers=num_workers) - - test_dataloader = DataGenerator(test_dataset, batch_size = batch_size, - shuffle=False, num_workers=num_workers) - + + train_dataset, val_dataset, test_dataset = \ + random_split(dataset, [train_size, val_size, test_size]) + + train_dataloader = DataGenerator(train_dataset, batch_size=batch_size, + shuffle=True, num_workers=num_workers) + val_dataloader = DataGenerator(val_dataset, batch_size=batch_size, + shuffle=False, num_workers=num_workers) + test_dataloader = DataGenerator(test_dataset, batch_size=batch_size, + shuffle=False, num_workers=num_workers) + return train_dataloader, val_dataloader, test_dataloader - - + def create_dataset(dataset_name, domain, data_path, transformations): """ Args: dataset_name (string) - domain (string) - valid domain of the dataset dataset_name - data_path (string) - valid path, which contains dataset_name folder - transformations (callable) - optional transform to be applied on an image sample - + domain (string) + - valid domain of the dataset dataset_name + data_path (string) + - valid path, which contains dataset_name folder + transformations (callable) + - optional transform to be applied on an image sample + Return: - torchvision.dataset object - + torchvision.dataset object """ - - assert dataset_name in ["office-31"], f"Dataset {dataset_name} is not implemented" - + + assert dataset_name in ["office-31"], \ + f"Dataset {dataset_name} is not implemented" + if dataset_name == "office-31": - + 
dataset_domains = ["amazon", "dslr", "webcam"] - - assert domain in dataset_domains, f"Incorrect domain {domain}: dataset {dataset_name} domains: {dataset_domains}" - - dataset = ImageFolder(f"{data_path}/{dataset_name}/{domain}/images", transform=transformations) - - return dataset \ No newline at end of file + + assert domain in dataset_domains, f"Incorrect domain {domain}: " + \ + f"dataset {dataset_name} domains: {dataset_domains}" + + dataset = ImageFolder(f"{data_path}/{dataset_name}/{domain}/images", + transform=transformations) + + return dataset From 2596c74c6e110ecf6d60cfb9fbc886198e89284d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=BA=D0=B0=D1=82=D0=B5=D1=80=D0=B8=D0=BD=D0=B0=20?= =?UTF-8?q?=D0=93=D0=BB=D0=B0=D0=B7=D0=BA=D0=BE=D0=B2=D0=B0?= Date: Thu, 9 Apr 2020 01:08:48 +0300 Subject: [PATCH 3/9] added infinite generation feature --- data_loader.py | 49 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/data_loader.py b/data_loader.py index 5408954..da62117 100644 --- a/data_loader.py +++ b/data_loader.py @@ -4,6 +4,26 @@ class DataGenerator(DataLoader): + def __init__(self, is_infinite=False, *args, **kwargs): + super().__init__(*args, **kwargs) + self.is_infinite = is_infinite + self.reload_iterator() + + def reload_iterator(self): + self.dataset_iterator = super().__iter__() + + def __iter__(self): + return self + + def __next__(self): + try: + batch = next(self.dataset_iterator) + except StopIteration: + if self.is_infinite: + self.reload_iterator() + batch = next(self.dataset_iterator) + return batch + def get_classes_to_idx(self): return self.dataset.dataset.class_to_idx @@ -11,10 +31,9 @@ def get_classes(self): return self.dataset.dataset.classes -def create_data_generators(dataset_name, domain, data_path="data", - batch_size=16, - transformations=transforms.ToTensor(), - num_workers=1, split_ratios=[0.8, 0.1, 0.1]): +def create_data_generators(dataset_name, domain, data_path="data", batch_size=16, + transformations=None, num_workers=1, split_ratios=[0.8, 0.1, 0.1], + image_size=500, infinite_train=False): """ Args: dataset_name (string) @@ -33,6 +52,11 @@ def create_data_generators(dataset_name, domain, data_path="data", Return: 3 data generators - for train, validation and test data """ + if transformations is None: + transformations = transforms.Compose([ + transforms.Resize(image_size), + transforms.ToTensor(), + ]) dataset = create_dataset(dataset_name, domain, data_path, transformations) @@ -41,14 +65,13 @@ def create_data_generators(dataset_name, domain, data_path="data", val_size = int(len_dataset * split_ratios[1]) test_size = len_dataset - train_size - val_size - train_dataset, val_dataset, test_dataset = \ - random_split(dataset, [train_size, val_size, test_size]) + train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size]) - train_dataloader = DataGenerator(train_dataset, batch_size=batch_size, - shuffle=True, num_workers=num_workers) - val_dataloader = DataGenerator(val_dataset, batch_size=batch_size, + train_dataloader = DataGenerator(is_infinite=infinite_train, dataset=train_dataset, batch_size=batch_size, + shuffle=True, num_workers=num_workers, drop_last=True) + val_dataloader = DataGenerator(is_infinite=False, dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers) - test_dataloader = DataGenerator(test_dataset, batch_size=batch_size, + test_dataloader = DataGenerator(is_infinite=False, dataset=test_dataset, 
batch_size=batch_size, shuffle=False, num_workers=num_workers) return train_dataloader, val_dataloader, test_dataloader @@ -69,8 +92,7 @@ def create_dataset(dataset_name, domain, data_path, transformations): torchvision.dataset object """ - assert dataset_name in ["office-31"], \ - f"Dataset {dataset_name} is not implemented" + assert dataset_name in ["office-31"], f"Dataset {dataset_name} is not implemented" if dataset_name == "office-31": @@ -79,7 +101,6 @@ def create_dataset(dataset_name, domain, data_path, transformations): assert domain in dataset_domains, f"Incorrect domain {domain}: " + \ f"dataset {dataset_name} domains: {dataset_domains}" - dataset = ImageFolder(f"{data_path}/{dataset_name}/{domain}/images", - transform=transformations) + dataset = ImageFolder(f"{data_path}/{dataset_name}/{domain}/images", transform=transformations) return dataset From 8670ae1300ced7fa7588ecb9540c1c45307172c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=BA=D0=B0=D1=82=D0=B5=D1=80=D0=B8=D0=BD=D0=B0=20?= =?UTF-8?q?=D0=93=D0=BB=D0=B0=D0=B7=D0=BA=D0=BE=D0=B2=D0=B0?= Date: Sat, 11 Apr 2020 20:27:12 +0300 Subject: [PATCH 4/9] added device for dataloader --- data_loader.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/data_loader.py b/data_loader.py index da62117..58f925b 100644 --- a/data_loader.py +++ b/data_loader.py @@ -1,6 +1,8 @@ from torchvision.datasets import ImageFolder from torch.utils.data import DataLoader, random_split from torchvision import transforms +from torch.utils.data.dataloader import default_collate +import torch class DataGenerator(DataLoader): @@ -33,7 +35,7 @@ def get_classes(self): def create_data_generators(dataset_name, domain, data_path="data", batch_size=16, transformations=None, num_workers=1, split_ratios=[0.8, 0.1, 0.1], - image_size=500, infinite_train=False): + image_size=500, infinite_train=False, device=torch.device('cpu')): """ Args: dataset_name (string) @@ -58,7 +60,7 @@ def create_data_generators(dataset_name, domain, data_path="data", batch_size=16 transforms.ToTensor(), ]) - dataset = create_dataset(dataset_name, domain, data_path, transformations) + dataset = create_dataset(dataset_name, domain, data_path, transformations, device) len_dataset = len(dataset) train_size = int(len_dataset * split_ratios[0]) @@ -68,16 +70,19 @@ def create_data_generators(dataset_name, domain, data_path="data", batch_size=16 train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size]) train_dataloader = DataGenerator(is_infinite=infinite_train, dataset=train_dataset, batch_size=batch_size, - shuffle=True, num_workers=num_workers, drop_last=True) + shuffle=True, num_workers=num_workers, drop_last=True, + collate_fn=lambda x: default_collate(x).to(device)) val_dataloader = DataGenerator(is_infinite=False, dataset=val_dataset, batch_size=batch_size, - shuffle=False, num_workers=num_workers) + shuffle=False, num_workers=num_workers, + collate_fn=lambda x: default_collate(x).to(device)) test_dataloader = DataGenerator(is_infinite=False, dataset=test_dataset, batch_size=batch_size, - shuffle=False, num_workers=num_workers) + shuffle=False, num_workers=num_workers, + collate_fn=lambda x: default_collate(x).to(device)) return train_dataloader, val_dataloader, test_dataloader -def create_dataset(dataset_name, domain, data_path, transformations): +def create_dataset(dataset_name, domain, data_path, transformations, device): """ Args: dataset_name (string) From 63d6c7471f19bf52ed39f523837ce3879904b43b Mon Sep 
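
Everything so far is driven through create_data_generators. One caveat about the change just above: for ImageFolder samples, default_collate returns a plain list [images, labels], which has no .to() method, so the collate_fn lambdas appear unable to move batches to the device as written; the DataGenerator variant carried into the later patches performs the transfer inside __next__ instead. A usage sketch that sticks to the pre-device signature and moves tensors explicitly (it assumes the data/office-31/<domain>/images layout expected by create_dataset):

    import itertools
    import torch
    from data_loader import create_data_generators

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 80/10/10 split of the "amazon" domain of Office-31; only the train
    # loader is infinite, so the caller has to bound the epoch length.
    train_gen, val_gen, test_gen = create_data_generators(
        "office-31", "amazon",
        data_path="data",
        batch_size=16,
        image_size=224,
        num_workers=1,
        infinite_train=True,
    )

    for images, labels in itertools.islice(train_gen, 100):  # 100 batches as one "epoch"
        images, labels = images.to(device), labels.to(device)
        # forward/backward pass goes here

    print(train_gen.get_classes()[:5])  # class names come from the ImageFolder subfolders
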
17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=BA=D0=B0=D1=82=D0=B5=D1=80=D0=B8=D0=BD=D0=B0=20?= =?UTF-8?q?=D0=93=D0=BB=D0=B0=D0=B7=D0=BA=D0=BE=D0=B2=D0=B0?= Date: Tue, 21 Apr 2020 18:31:33 +0300 Subject: [PATCH 5/9] added semi-supevised dataloader --- data_loader.py | 110 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 4 deletions(-) diff --git a/data_loader.py b/data_loader.py index 4dbb8da..f2a8a41 100644 --- a/data_loader.py +++ b/data_loader.py @@ -1,9 +1,64 @@ +import numpy as np + from torchvision.datasets import ImageFolder from torch.utils.data import DataLoader, random_split from torchvision import transforms from torch.utils.data.dataloader import default_collate import torch +import configs.dann_config as dann_config + + +class CustomDataset(ImageFolder): + """ + Interface for datasets. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_semi_supervised_indexes_for_subset(self, labeled_ratio, subset_indices): + """ + Used for data separation in semi-supervised approaches + Input: + labeled_ratio - ratio of labeled images + subset_indices - indexes of considered subset in dataset + Output: list[int], list[int] - indexes of labeled and unlabeled items in considered subset + + """ + return NotImplemented + + +class Office31Dataset(CustomDataset): + """ + Office31 Dataset class. + More info about the dataset: https://people.eecs.berkeley.edu/~jhoffman/domainadapt/ + Data link: https://drive.google.com/file/d/0B4IapRTv9pJ1WGZVd1VDMmhwdlE/view + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_semi_supervised_indexes_for_subset(self, labeled_ratio, subset_indices): + subset_images_with_classes = np.array(self.imgs)[subset_indices] + class_name_to_id = self.class_to_idx + data_classes = [int(x[1]) for x in subset_images_with_classes] + unique_classes, classes_counts = np.unique(data_classes, return_counts=True) + labeled_indexes = [] + unlabeled_indexes = [] + last_included = False + + for class_id in unique_classes: + all_class_indexes = np.where(np.array(data_classes) == class_id)[0] + labeled_num = labeled_ratio*classes_counts[class_id] + if (labeled_num % 1 > 0): + labeled_num = int(labeled_num + last_included) + last_included = not last_included + else: + labeled_num = int(labeled_num) + labeled_indexes.extend(all_class_indexes[:labeled_num]) + unlabeled_indexes.extend(all_class_indexes[labeled_num:]) + + return labeled_indexes, unlabeled_indexes + class DataGenerator(DataLoader): def __init__(self, is_infinite=False, device=torch.device('cpu'), *args, **kwargs): @@ -35,9 +90,49 @@ def get_classes(self): return self.dataset.dataset.classes +class SemiSupervisedDataGenerator: + def __init__(self, dataset, is_infinite, labeled_ratio, batch_size, + unk_value=dann_config.UNK_VALUE, *args, **kwargs): + assert labeled_ratio is not None, "labeled ratio argument should be provided for semi-supervised dataset" + # need dataset.dataset as dataaset - dataset subset created because of splitting to train, val, test + labeled_indexes, unlabeled_indexes = dataset.dataset.get_semi_supervised_indexes_for_subset(labeled_ratio, + dataset.indices) + self.batch_size = batch_size + self.labeled_batch_size = int(labeled_ratio*batch_size) + self.unlabeled_batch_size = self.batch_size - self.labeled_batch_size + self.labeled_generator = DataGenerator(is_infinite, batch_size=self.labeled_batch_size, + dataset=torch.utils.data.Subset(dataset, labeled_indexes), + *args, **kwargs) + 
self.unlabeled_generator = DataGenerator(is_infinite, batch_size=self.unlabeled_batch_size, + dataset=torch.utils.data.Subset(dataset, unlabeled_indexes), + *args, **kwargs) + self.unk_class = unk_value + + def __next__(self): + labeled_batch = next(self.labeled_generator) + unlabeled_batch = next(self.unlabeled_generator) + return (torch.cat([labeled_batch[0], unlabeled_batch[0]]), torch.cat([labeled_batch[1], + -1*torch.ones_like(unlabeled_batch[1])])) + + def __iter__(self): + return self + + def __len__(self): + return min(len(self.unlabeled_generator), len(self.labeled_generator)) + + def get_classes_to_idx(self): + labeled_classes_to_idx = self.labeled_generator.get_classes_to_idx() + labeled_classes_to_idx[self.unk_class] = -1 + return labeled_classes_to_idx + + def get_classes(self): + return self.labeled_generator.dataset.dataset.classes + + def create_data_generators(dataset_name, domain, data_path="data", batch_size=16, transformations=None, num_workers=1, split_ratios=[0.8, 0.1, 0.1], - image_size=500, infinite_train=False, device=torch.device('cpu')): + image_size=500, infinite_train=False, device=torch.device('cpu'), + semi_supervised=False, semi_supervised_labeled_ratio=None): """ Args: dataset_name (string) @@ -72,8 +167,15 @@ def create_data_generators(dataset_name, domain, data_path="data", batch_size=16 torch.manual_seed(42) train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size]) - train_dataloader = DataGenerator(is_infinite=infinite_train, device=device, dataset=train_dataset, - batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True) + if semi_supervised: + train_dataloader = SemiSupervisedDataGenerator(is_infinite=infinite_train, + labeled_ratio=semi_supervised_labeled_ratio, + device=device, dataset=train_dataset, + batch_size=batch_size, shuffle=True, + num_workers=num_workers, drop_last=True) + else: + train_dataloader = DataGenerator(is_infinite=infinite_train, device=device, dataset=train_dataset, + batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True) val_dataloader = DataGenerator(is_infinite=False, device=device, dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers) test_dataloader = DataGenerator(is_infinite=False, device=device, dataset=test_dataset, @@ -106,6 +208,6 @@ def create_dataset(dataset_name, domain, data_path, transformations, device): assert domain in dataset_domains, f"Incorrect domain {domain}: " + \ f"dataset {dataset_name} domains: {dataset_domains}" - dataset = ImageFolder(f"{data_path}/{dataset_name}/{domain}/images", transform=transformations) + dataset = Office31Dataset(f"{data_path}/{dataset_name}/{domain}/images", transform=transformations) return dataset From 2ed2f876507ae23e44f2c880e20d567028d3c775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=BA=D0=B0=D1=82=D0=B5=D1=80=D0=B8=D0=BD=D0=B0=20?= =?UTF-8?q?=D0=93=D0=BB=D0=B0=D0=B7=D0=BA=D0=BE=D0=B2=D0=B0?= Date: Thu, 7 May 2020 13:42:32 +0300 Subject: [PATCH 6/9] rm data_loader.py for merge --- data_loader.py | 213 ------------------------------------------------- 1 file changed, 213 deletions(-) delete mode 100644 data_loader.py diff --git a/data_loader.py b/data_loader.py deleted file mode 100644 index f2a8a41..0000000 --- a/data_loader.py +++ /dev/null @@ -1,213 +0,0 @@ -import numpy as np - -from torchvision.datasets import ImageFolder -from torch.utils.data import DataLoader, random_split -from torchvision import transforms -from torch.utils.data.dataloader 
import default_collate -import torch - -import configs.dann_config as dann_config - - -class CustomDataset(ImageFolder): - """ - Interface for datasets. - """ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_semi_supervised_indexes_for_subset(self, labeled_ratio, subset_indices): - """ - Used for data separation in semi-supervised approaches - Input: - labeled_ratio - ratio of labeled images - subset_indices - indexes of considered subset in dataset - Output: list[int], list[int] - indexes of labeled and unlabeled items in considered subset - - """ - return NotImplemented - - -class Office31Dataset(CustomDataset): - """ - Office31 Dataset class. - More info about the dataset: https://people.eecs.berkeley.edu/~jhoffman/domainadapt/ - Data link: https://drive.google.com/file/d/0B4IapRTv9pJ1WGZVd1VDMmhwdlE/view - """ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def get_semi_supervised_indexes_for_subset(self, labeled_ratio, subset_indices): - subset_images_with_classes = np.array(self.imgs)[subset_indices] - class_name_to_id = self.class_to_idx - data_classes = [int(x[1]) for x in subset_images_with_classes] - unique_classes, classes_counts = np.unique(data_classes, return_counts=True) - labeled_indexes = [] - unlabeled_indexes = [] - last_included = False - - for class_id in unique_classes: - all_class_indexes = np.where(np.array(data_classes) == class_id)[0] - labeled_num = labeled_ratio*classes_counts[class_id] - if (labeled_num % 1 > 0): - labeled_num = int(labeled_num + last_included) - last_included = not last_included - else: - labeled_num = int(labeled_num) - labeled_indexes.extend(all_class_indexes[:labeled_num]) - unlabeled_indexes.extend(all_class_indexes[labeled_num:]) - - return labeled_indexes, unlabeled_indexes - - -class DataGenerator(DataLoader): - def __init__(self, is_infinite=False, device=torch.device('cpu'), *args, **kwargs): - super().__init__(*args, **kwargs) - self.device = device - self.is_infinite = is_infinite - self.reload_iterator() - - def reload_iterator(self): - self.dataset_iterator = super().__iter__() - - def __iter__(self): - return self - - def __next__(self): - try: - batch = next(self.dataset_iterator) - except StopIteration: - if self.is_infinite: - self.reload_iterator() - batch = next(self.dataset_iterator) - batch = [elem.to(self.device) for elem in batch] - return batch - - def get_classes_to_idx(self): - return self.dataset.dataset.class_to_idx - - def get_classes(self): - return self.dataset.dataset.classes - - -class SemiSupervisedDataGenerator: - def __init__(self, dataset, is_infinite, labeled_ratio, batch_size, - unk_value=dann_config.UNK_VALUE, *args, **kwargs): - assert labeled_ratio is not None, "labeled ratio argument should be provided for semi-supervised dataset" - # need dataset.dataset as dataaset - dataset subset created because of splitting to train, val, test - labeled_indexes, unlabeled_indexes = dataset.dataset.get_semi_supervised_indexes_for_subset(labeled_ratio, - dataset.indices) - self.batch_size = batch_size - self.labeled_batch_size = int(labeled_ratio*batch_size) - self.unlabeled_batch_size = self.batch_size - self.labeled_batch_size - self.labeled_generator = DataGenerator(is_infinite, batch_size=self.labeled_batch_size, - dataset=torch.utils.data.Subset(dataset, labeled_indexes), - *args, **kwargs) - self.unlabeled_generator = DataGenerator(is_infinite, batch_size=self.unlabeled_batch_size, - dataset=torch.utils.data.Subset(dataset, 
unlabeled_indexes), - *args, **kwargs) - self.unk_class = unk_value - - def __next__(self): - labeled_batch = next(self.labeled_generator) - unlabeled_batch = next(self.unlabeled_generator) - return (torch.cat([labeled_batch[0], unlabeled_batch[0]]), torch.cat([labeled_batch[1], - -1*torch.ones_like(unlabeled_batch[1])])) - - def __iter__(self): - return self - - def __len__(self): - return min(len(self.unlabeled_generator), len(self.labeled_generator)) - - def get_classes_to_idx(self): - labeled_classes_to_idx = self.labeled_generator.get_classes_to_idx() - labeled_classes_to_idx[self.unk_class] = -1 - return labeled_classes_to_idx - - def get_classes(self): - return self.labeled_generator.dataset.dataset.classes - - -def create_data_generators(dataset_name, domain, data_path="data", batch_size=16, - transformations=None, num_workers=1, split_ratios=[0.8, 0.1, 0.1], - image_size=500, infinite_train=False, device=torch.device('cpu'), - semi_supervised=False, semi_supervised_labeled_ratio=None): - """ - Args: - dataset_name (string) - domain (string) - - valid domain of the dataset dataset_name - data_path (string) - - valid path, which contains dataset_name folder - batch_size (int) - transformations (callable) - - optional transform applied on image sample - num_workers (int) - - multi-process data loading - split_ratios (list of ints, len(split_ratios) = 3) - - ratios of train, validation and test parts - - Return: - 3 data generators - for train, validation and test data - """ - if transformations is None: - transformations = transforms.Compose([ - transforms.Resize(image_size), - transforms.ToTensor(), - ]) - - dataset = create_dataset(dataset_name, domain, data_path, transformations, device) - - len_dataset = len(dataset) - train_size = int(len_dataset * split_ratios[0]) - val_size = int(len_dataset * split_ratios[1]) - test_size = len_dataset - train_size - val_size - - torch.manual_seed(42) - train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size]) - - if semi_supervised: - train_dataloader = SemiSupervisedDataGenerator(is_infinite=infinite_train, - labeled_ratio=semi_supervised_labeled_ratio, - device=device, dataset=train_dataset, - batch_size=batch_size, shuffle=True, - num_workers=num_workers, drop_last=True) - else: - train_dataloader = DataGenerator(is_infinite=infinite_train, device=device, dataset=train_dataset, - batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True) - val_dataloader = DataGenerator(is_infinite=False, device=device, dataset=val_dataset, - batch_size=batch_size, shuffle=False, num_workers=num_workers) - test_dataloader = DataGenerator(is_infinite=False, device=device, dataset=test_dataset, - batch_size=batch_size, shuffle=False, num_workers=num_workers) - - return train_dataloader, val_dataloader, test_dataloader - - -def create_dataset(dataset_name, domain, data_path, transformations, device): - """ - Args: - dataset_name (string) - domain (string) - - valid domain of the dataset dataset_name - data_path (string) - - valid path, which contains dataset_name folder - transformations (callable) - - optional transform to be applied on an image sample - - Return: - torchvision.dataset object - """ - - assert dataset_name in ["office-31"], f"Dataset {dataset_name} is not implemented" - - if dataset_name == "office-31": - - dataset_domains = ["amazon", "dslr", "webcam"] - - assert domain in dataset_domains, f"Incorrect domain {domain}: " + \ - f"dataset {dataset_name} domains: {dataset_domains}" - - 
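
The file removed here carries the semi-supervised machinery onward (the commit message says it is deleted for a merge, and train.py in the next patch imports it from dataloader.data_loader). Two behaviours are worth spelling out: get_semi_supervised_indexes_for_subset rounds the per-class labeled count labeled_ratio * count down or up on alternating classes via last_included, which keeps the overall labeled fraction close to the target, and SemiSupervisedDataGenerator then builds every batch from int(labeled_ratio * batch_size) labeled plus the remaining unlabeled samples, overwriting the unlabeled targets with -1. A standalone toy re-implementation of the rounding step (for illustration only, not the dataset class itself):

    def labeled_counts_per_class(class_counts, labeled_ratio):
        # Mirrors the alternating floor/ceil rounding used above, on raw counts only.
        counts, last_included = [], False
        for count in class_counts:
            labeled_num = labeled_ratio * count
            if labeled_num % 1 > 0:  # fractional part: alternate rounding down and up
                labeled_num = int(labeled_num + last_included)
                last_included = not last_included
            else:
                labeled_num = int(labeled_num)
            counts.append(labeled_num)
        return counts

    # Three classes with 7, 7 and 10 images and labeled_ratio = 0.5:
    print(labeled_counts_per_class([7, 7, 10], 0.5))  # -> [3, 4, 5]
    # With batch_size = 16 and labeled_ratio = 0.25, each training batch holds
    # int(0.25 * 16) = 4 labeled and 12 unlabeled samples.

Note that the unlabeled targets are hard-coded to -1, while unk_value (defaulting to dann_config.UNK_VALUE, shown as -100 in patch 8) is only used as a key in get_classes_to_idx; if the loss masks unlabeled samples by UNK_VALUE, the two constants would have to agree.
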
dataset = Office31Dataset(f"{data_path}/{dataset_name}/{domain}/images", transform=transformations) - - return dataset From da0b971400ff2cf3d6347fa962dceb3a2d8323fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=95=D0=BA=D0=B0=D1=82=D0=B5=D1=80=D0=B8=D0=BD=D0=B0=20?= =?UTF-8?q?=D0=93=D0=BB=D0=B0=D0=B7=D0=BA=D0=BE=D0=B2=D0=B0?= Date: Thu, 7 May 2020 16:55:46 +0300 Subject: [PATCH 7/9] added more logs and training script --- train.py | 41 +++++++++++++++++++++++++++++++++++++++++ trainer/trainer.py | 19 +++++++++++-------- 2 files changed, 52 insertions(+), 8 deletions(-) create mode 100644 train.py diff --git a/train.py b/train.py new file mode 100644 index 0000000..ad0eef7 --- /dev/null +++ b/train.py @@ -0,0 +1,41 @@ +import torch + +from trainer import Trainer +from loss import loss_DANN +from models import DANNModel +from dataloader.data_loader import create_data_generators +from metrics import AccuracyScoreFromLogits +from utils.callbacks import simple_callback, ModelSaver +import configs.dann_config as dann_config + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +if __name__ == '__main__': + print("Creating datasets") + train_gen_s, val_gen_s, test_gen_s = create_data_generators(dann_config.DATASET, + dann_config.SOURCE_DOMAIN, + batch_size=dann_config.BATCH_SIZE, + infinite_train=True, + image_size=dann_config.IMAGE_SIZE, + num_workers=dann_config.NUM_WORKERS, + device=device) + + train_gen_t, val_gen_t, test_gen_t = create_data_generators(dann_config.DATASET, + dann_config.TARGET_DOMAIN, + batch_size=dann_config.BATCH_SIZE, + infinite_train=True, + image_size=dann_config.IMAGE_SIZE, + num_workers=dann_config.NUM_WORKERS, + device=device) + print("Creating model") + model = DANNModel().to(device) + acc = AccuracyScoreFromLogits() + + tr = Trainer(model, loss_DANN) + print("Starting training") + tr.fit(train_gen_s, train_gen_t, + n_epochs=1, + validation_data=[val_gen_s, val_gen_t], + metrics=[acc], + steps_per_epoch=1, + callbacks=[simple_callback, ModelSaver("DANN")]) diff --git a/trainer/trainer.py b/trainer/trainer.py index be332e0..17d44a7 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -49,12 +49,12 @@ def fit(self, src_data, trg_data, n_epochs=1000, steps_per_epoch=100, val_freq=1 elif opt == 'sgd': parameters = self.model.parameters() if hasattr(self.model, "adaptation_block"): - parameters = [{ "params": self.model.features.parameters(), "lr": 0.1 * opt_kwargs["lr"] }, - { "params": self.model.class_classifier[:-1].parameters(), "lr": 0.1 * opt_kwargs["lr"] }, - { "params": self.model.class_classifier[-1].parameters() }, - { "params": self.model.domain_classifier.parameters() }, - { "params": self.model.adaptation_block.parameters() }, - ] + parameters = [{"params": self.model.features.parameters(), "lr": 0.1 * opt_kwargs["lr"]}, + {"params": self.model.class_classifier[:-1].parameters(), "lr": 0.1 * opt_kwargs["lr"]}, + {"params": self.model.class_classifier[-1].parameters()}, + {"params": self.model.domain_classifier.parameters()}, + {"params": self.model.adaptation_block.parameters()}, + ] opt = torch.optim.SGD(parameters, **opt_kwargs) else: raise NotImplementedError @@ -63,12 +63,13 @@ def fit(self, src_data, trg_data, n_epochs=1000, steps_per_epoch=100, val_freq=1 src_val_data, trg_val_data = validation_data for self.epoch in range(self.epoch, n_epochs): + print(f"Starting epoch {self.epoch}/{n_epochs}") self.loss_logger.reset_history() + print(f"Starting training") for step, (src_batch, trg_batch) in enumerate(zip(src_data, 
trg_data)): if step == steps_per_epoch: break self.train_on_batch(src_batch, trg_batch, opt) - # validation src_metrics = None trg_metrics = None @@ -76,13 +77,15 @@ def fit(self, src_data, trg_data, n_epochs=1000, steps_per_epoch=100, val_freq=1 self.model.eval() # calculating metrics on validation + print(f"Starting metrics calculation") if metrics is not None: if src_val_data is not None: src_metrics = self.score(src_val_data, metrics) if trg_val_data is not None: trg_metrics = self.score(trg_val_data, metrics) - + # calculating loss on validation + print(f"Starting loss on validation calculation") if src_val_data is not None and trg_val_data is not None: for val_step, (src_batch, trg_batch) in enumerate(zip(src_val_data, trg_val_data)): loss, loss_info = self.calc_loss(src_batch, trg_batch) From a906f0ed41018c4f75978e5692ce3a2dc4bc4422 Mon Sep 17 00:00:00 2001 From: EkaterinaGlazkova Date: Wed, 13 May 2020 14:12:58 +0000 Subject: [PATCH 8/9] added learning with ALL data and dropout head parematers --- configs/dann_config.py | 7 +++++-- example.py | 43 +++++++++++++++++++++------------------ models/backbone_models.py | 8 +++++++- models/models.py | 16 ++++++++++++--- trainer/trainer.py | 27 +++++++++++++++--------- utils/callbacks.py | 11 +++++++--- 6 files changed, 73 insertions(+), 39 deletions(-) diff --git a/configs/dann_config.py b/configs/dann_config.py index f1791a8..033215f 100644 --- a/configs/dann_config.py +++ b/configs/dann_config.py @@ -2,6 +2,7 @@ LOSS_NEED_INTERMEDIATE_LAYERS = False UNK_VALUE = -100 # torch default IS_UNSUPERVISED = True +LOG_PATH = "_log/exp4" GRADIENT_REVERSAL_LAYER_ALPHA = 1.0 FREZE_BACKBONE_FEATURES = True @@ -18,14 +19,16 @@ CLASSES_CNT = 31 MODEL_BACKBONE = "alexnet" # alexnet resnet50 vanilla_dann -DOMAIN_HEAD = "vanilla_dann" +DOMAIN_HEAD = "vanilla_dann" # "vanilla_dann", "dropout_dann", "mnist_dann" BACKBONE_PRETRAINED = True -NEED_ADAPTATION_BLOCK = True # ="True" only for alexnet, ="False" for other types +ALEXNET_NEED_ADAPTATION_BLOCK = True # ="True" only for alexnet, ="False" for other types +ALEXNET_USE_DROPOUT_IN_CLASS_HEAD_AFTER_ADAPTATION_BLOCK = True # used only if NEED_ADAPTATION_BLOCK == True BLOCKS_WITH_SMALLER_LR = 2 # ="2" only for alexnet, ="0" for other types IMAGE_SIZE = 224 DATASET = "office-31" SOURCE_DOMAIN = "amazon" TARGET_DOMAIN = "webcam" +RESNET50_USE_DROPOUT_IN_CLASS_HEAD = True # CLASSES_CNT = 10 # MODEL_BACKBONE = "mnist_dann" diff --git a/example.py b/example.py index 62dab53..fbdadf1 100644 --- a/example.py +++ b/example.py @@ -16,41 +16,44 @@ if __name__ == '__main__': - train_gen_s, val_gen_s, test_gen_s = create_data_generators(dann_config.DATASET, - dann_config.SOURCE_DOMAIN, - batch_size=dann_config.BATCH_SIZE, - infinite_train=True, - image_size=dann_config.IMAGE_SIZE, - num_workers=dann_config.NUM_WORKERS, - device=device) + train_gen_s, _, _ = create_data_generators(dann_config.DATASET, + dann_config.SOURCE_DOMAIN, + batch_size=dann_config.BATCH_SIZE, + infinite_train=True, + image_size=dann_config.IMAGE_SIZE, + num_workers=dann_config.NUM_WORKERS, + device=device, + split_ratios=[1.0, 0., 0.]) - train_gen_t, val_gen_t, test_gen_t = create_data_generators(dann_config.DATASET, - dann_config.TARGET_DOMAIN, - batch_size=dann_config.BATCH_SIZE, - infinite_train=True, - image_size=dann_config.IMAGE_SIZE, - num_workers=dann_config.NUM_WORKERS, - device=device) + train_gen_t, _, _ = create_data_generators(dann_config.DATASET, + dann_config.TARGET_DOMAIN, + batch_size=dann_config.BATCH_SIZE, + infinite_train=True, 
+ image_size=dann_config.IMAGE_SIZE, + num_workers=dann_config.NUM_WORKERS, + device=device, + split_ratios=[1.0, 0., 0.]) model = DANNModel().to(device) + print(model) acc = AccuracyScoreFromLogits() scheduler = LRSchedulerSGD(blocks_with_smaller_lr=dann_config.BLOCKS_WITH_SMALLER_LR) tr = Trainer(model, loss_DANN) tr.fit(train_gen_s, train_gen_t, n_epochs=dann_config.N_EPOCHS, - validation_data=[val_gen_s, val_gen_t], + validation_data=[train_gen_s, train_gen_t], metrics=[acc], steps_per_epoch=dann_config.STEPS_PER_EPOCH, val_freq=dann_config.VAL_FREQ, opt='sgd', opt_kwargs={'lr': 0.01, 'momentum': 0.9}, lr_scheduler=scheduler, - callbacks=[print_callback(watch=["loss", "domain_loss", "val_loss", - "val_domain_loss", 'trg_metrics', 'src_metrics']), + callbacks=[print_callback(watch=["loss", "domain_loss",# "val_loss", "val_domain_loss", + 'trg_metrics', 'src_metrics']), ModelSaver('DANN', dann_config.SAVE_MODEL_FREQ), - WandbCallback(), - HistorySaver('log_with_sgd', dann_config.VAL_FREQ, path='_log/DANN_Resnet_sgd', - extra_losses={'domain_loss': ['domain_loss', 'val_domain_loss'], + #WandbCallback(), + HistorySaver('log_with_sgd', dann_config.VAL_FREQ, path=dann_config.LOG_PATH, + extra_losses={'domain_loss': ['domain_loss'],#, 'val_domain_loss'], 'train_domain_loss': ['domain_loss_on_src', 'domain_loss_on_trg']})]) wandb.join() diff --git a/models/backbone_models.py b/models/backbone_models.py index a718b6e..ba585bf 100644 --- a/models/backbone_models.py +++ b/models/backbone_models.py @@ -97,7 +97,13 @@ def get_resnet50(): param.requires_grad = False pooling = model.avgpool - classifier = nn.Sequential(nn.Linear(2048, dann_config.CLASSES_CNT)) + if dann_config.RESNET50_USE_DROPOUT_IN_CLASS_HEAD: + classifier = nn.Sequential(nn.Linear(2048, 1024), + nn.ReLU(), + nn.Dropout(0.5), + nn.Linear(1024, dann_config.CLASSES_CNT)) + else: + classifier = nn.Sequential(nn.Linear(2048, dann_config.CLASSES_CNT)) classifier_layer_ids = [0] pooling_ftrs = 2048 pooling_output_side = 1 diff --git a/models/models.py b/models/models.py index 05437c5..d3a55c5 100644 --- a/models/models.py +++ b/models/models.py @@ -19,7 +19,7 @@ def __init__(self): self.features, self.pooling, self.class_classifier, \ domain_input_len, self.classifier_before_domain_cnt = backbone_models.get_backbone_model() - if dann_config.NEED_ADAPTATION_BLOCK: + if dann_config.ALEXNET_NEED_ADAPTATION_BLOCK: self.adaptation_block = nn.Sequential( nn.ReLU(), nn.Linear(domain_input_len, 2048), @@ -27,7 +27,17 @@ def __init__(self): ) domain_input_len = 2048 classifier_start_output_len = self.class_classifier[self.classifier_before_domain_cnt][-1].out_features - self.class_classifier[self.classifier_before_domain_cnt][-1] = nn.Linear(2048, classifier_start_output_len) + if dann_config.ALEXNET_USE_DROPOUT_IN_CLASS_HEAD_AFTER_ADAPTATION_BLOCK: + self.class_classifier[self.classifier_before_domain_cnt][-1] = nn.Sequential( + nn.Linear(2048, 2048), + nn.ReLU(), + nn.Dropout(0.5), + nn.Linear(2048, 1024), + nn.ReLU(), + nn.Dropout(0.5), + nn.Linear(1024, classifier_start_output_len)) + else: + self.class_classifier[self.classifier_before_domain_cnt][-1] = nn.Linear(2048, classifier_start_output_len) self.domain_classifier = domain_heads.get_domain_head(domain_input_len) @@ -48,7 +58,7 @@ def forward(self, input_data, rev_grad_alpha=dann_config.GRADIENT_REVERSAL_LAYER output_classifier = self.class_classifier[i](output_classifier) classifier_layers_outputs.append(output_classifier) - if dann_config.NEED_ADAPTATION_BLOCK: + if 
dann_config.ALEXNET_NEED_ADAPTATION_BLOCK: output_classifier = self.adaptation_block(output_classifier) reversed_features = blocks.GradientReversalLayer.apply(output_classifier, rev_grad_alpha) diff --git a/trainer/trainer.py b/trainer/trainer.py index 17d44a7..149bef8 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -80,17 +80,18 @@ def fit(self, src_data, trg_data, n_epochs=1000, steps_per_epoch=100, val_freq=1 print(f"Starting metrics calculation") if metrics is not None: if src_val_data is not None: - src_metrics = self.score(src_val_data, metrics) + src_metrics = self.score(src_val_data, metrics, len(src_val_data)) if trg_val_data is not None: - trg_metrics = self.score(trg_val_data, metrics) + trg_metrics = self.score(trg_val_data, metrics, len(trg_val_data)) # calculating loss on validation - print(f"Starting loss on validation calculation") - if src_val_data is not None and trg_val_data is not None: - for val_step, (src_batch, trg_batch) in enumerate(zip(src_val_data, trg_val_data)): - loss, loss_info = self.calc_loss(src_batch, trg_batch) - self.loss_logger.store(prefix="val", loss=loss.data.cpu().item(), **loss_info) - + #commented - not working with training on ALL source and target data + #print(f"Starting loss on validation calculation") + #if src_val_data is not None and trg_val_data is not None: + # for val_step, (src_batch, trg_batch) in enumerate(zip(src_val_data, trg_val_data)): + # loss, loss_info = self.calc_loss(src_batch, trg_batch) + # self.loss_logger.store(prefix="val", loss=loss.data.cpu().item(), **loss_info) + if callbacks is not None: epoch_log = dict(**self.loss_logger.get_info()) if src_metrics is not None: @@ -103,15 +104,21 @@ def fit(self, src_data, trg_data, n_epochs=1000, steps_per_epoch=100, val_freq=1 if lr_scheduler: lr_scheduler.step(opt, self.epoch, n_epochs) - def score(self, data, metrics): + def score(self, data, metrics, steps_num): for metric in metrics: metric.reset() - + + # not to iterate infinitely + cur_step = 0 + data.reload_iterator() for images, true_classes in data: pred_classes = self.model.predict(images) for metric in metrics: metric(true_classes, pred_classes) + cur_step += 1 + if cur_step == steps_num: + break data.reload_iterator() return {metric.name: metric.score for metric in metrics} diff --git a/utils/callbacks.py b/utils/callbacks.py index 4291ccb..56afe6c 100644 --- a/utils/callbacks.py +++ b/utils/callbacks.py @@ -5,11 +5,15 @@ def simple_callback(model, epoch_log, current_epoch, total_epoch): train_loss = epoch_log['loss'] - val_loss = epoch_log['val_loss'] + if 'val_loss' in epoch_log: + val_loss = epoch_log['val_loss'] trg_metrics = epoch_log['trg_metrics'] src_metrics = epoch_log['src_metrics'] message_head = f'Epoch {current_epoch+1}/{total_epoch}\n' - message_loss = 'loss: {:<10}\t val_loss: {:<10}\t'.format(train_loss, val_loss) + if 'val_loss' in epoch_log: + message_loss = 'loss: {:<10}\t val_loss: {:<10}\t'.format(train_loss, val_loss) + else: + message_loss = 'loss: {:<10}\t'.format(train_loss) message_src_metrics = ' '.join(['val_src_{}: {:<10}\t'.format(k, v) for k, v in src_metrics.items()]) message_trg_metrics = ' '.join(['val_trg_{}: {:<10}\t'.format(k, v) for k, v in trg_metrics.items()]) print(message_head + message_loss + message_src_metrics + message_trg_metrics) @@ -115,7 +119,8 @@ def _save_to_json(self, data, name=None): def __call__(self, model, epoch_log, current_epoch, total_epoch): if current_epoch % self.val_freq == 0: - self.loss_history['val_loss'].append(epoch_log['val_loss']) + 
if "val_loss" in epoch_log: + self.loss_history['val_loss'].append(epoch_log['val_loss']) for metric in epoch_log['trg_metrics']: self.trg_metrics_history[metric].append(epoch_log['trg_metrics'][metric]) From cc066292f204b6fa86ac4365be1c02c2a1c7d146 Mon Sep 17 00:00:00 2001 From: EkaterinaGlazkova Date: Wed, 13 May 2020 14:20:36 +0000 Subject: [PATCH 9/9] train.py deleted --- train.py | 41 ----------------------------------------- 1 file changed, 41 deletions(-) delete mode 100644 train.py diff --git a/train.py b/train.py deleted file mode 100644 index ad0eef7..0000000 --- a/train.py +++ /dev/null @@ -1,41 +0,0 @@ -import torch - -from trainer import Trainer -from loss import loss_DANN -from models import DANNModel -from dataloader.data_loader import create_data_generators -from metrics import AccuracyScoreFromLogits -from utils.callbacks import simple_callback, ModelSaver -import configs.dann_config as dann_config - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -if __name__ == '__main__': - print("Creating datasets") - train_gen_s, val_gen_s, test_gen_s = create_data_generators(dann_config.DATASET, - dann_config.SOURCE_DOMAIN, - batch_size=dann_config.BATCH_SIZE, - infinite_train=True, - image_size=dann_config.IMAGE_SIZE, - num_workers=dann_config.NUM_WORKERS, - device=device) - - train_gen_t, val_gen_t, test_gen_t = create_data_generators(dann_config.DATASET, - dann_config.TARGET_DOMAIN, - batch_size=dann_config.BATCH_SIZE, - infinite_train=True, - image_size=dann_config.IMAGE_SIZE, - num_workers=dann_config.NUM_WORKERS, - device=device) - print("Creating model") - model = DANNModel().to(device) - acc = AccuracyScoreFromLogits() - - tr = Trainer(model, loss_DANN) - print("Starting training") - tr.fit(train_gen_s, train_gen_t, - n_epochs=1, - validation_data=[val_gen_s, val_gen_t], - metrics=[acc], - steps_per_epoch=1, - callbacks=[simple_callback, ModelSaver("DANN")])