From eab0b0801de8201368f3a56f1947dfe00466e26f Mon Sep 17 00:00:00 2001 From: salomaestro Date: Thu, 20 Feb 2025 12:04:54 +0100 Subject: [PATCH 1/8] Update to handle nr_channels arg --- CollaborativeCoding/dataloaders/usps_0_6.py | 14 ++++++++++++-- CollaborativeCoding/load_data.py | 6 +++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/CollaborativeCoding/dataloaders/usps_0_6.py b/CollaborativeCoding/dataloaders/usps_0_6.py index 70286dc..c729b4b 100644 --- a/CollaborativeCoding/dataloaders/usps_0_6.py +++ b/CollaborativeCoding/dataloaders/usps_0_6.py @@ -8,6 +8,7 @@ import h5py as h5 import numpy as np +import torch from PIL import Image from torch.utils.data import Dataset @@ -83,6 +84,7 @@ def __init__( sample_ids: list, train: bool = False, transform=None, + nr_channels=1, ): super().__init__() @@ -91,6 +93,7 @@ def __init__( self.transform = transform self.mode = "train" if train else "test" self.sample_ids = sample_ids + self.nr_channels = nr_channels def __len__(self): return len(self.sample_ids) @@ -100,11 +103,18 @@ def __getitem__(self, id): with h5.File(self.filepath, "r") as f: data = f[self.mode]["data"][index].astype(np.uint8) - label = f[self.mode]["target"][index] + label = int(f[self.mode]["target"][index]) - data = Image.fromarray(data, mode="L") + if self.nr_channels == 1: + data = Image.fromarray(data, mode="L") + elif self.nr_channels == 3: + data = Image.fromarray(data, mode="RGB") + else: + raise ValueError("Invalid number of channels") if self.transform: data = self.transform(data) + # label = torch.tensor(label).long() + return data, label diff --git a/CollaborativeCoding/load_data.py b/CollaborativeCoding/load_data.py index b4a247b..6102c87 100644 --- a/CollaborativeCoding/load_data.py +++ b/CollaborativeCoding/load_data.py @@ -89,7 +89,7 @@ def load_data(dataset: str, *args, **kwargs) -> tuple: sample_ids=train_samples, train=True, transform=transform, - nr_channels=kwargs.get("nr_channels"), + nr_channels=kwargs.get("nr_channels", 1), ) val = dataset( @@ -97,7 +97,7 @@ def load_data(dataset: str, *args, **kwargs) -> tuple: sample_ids=val_samples, train=True, transform=transform, - nr_channels=kwargs.get("nr_channels"), + nr_channels=kwargs.get("nr_channels", 1), ) test = dataset( @@ -105,7 +105,7 @@ def load_data(dataset: str, *args, **kwargs) -> tuple: sample_ids=test_samples, train=False, transform=transform, - nr_channels=kwargs.get("nr_channels"), + nr_channels=kwargs.get("nr_channels", 1), ) return train, val, test From 2be6ccf5395d3dd6c82257deb7cb8aa7a3db2051 Mon Sep 17 00:00:00 2001 From: salomaestro Date: Thu, 20 Feb 2025 12:05:29 +0100 Subject: [PATCH 2/8] Update recall to store metrics on the go --- CollaborativeCoding/metrics/recall.py | 42 +++++++++++++++++++++------ 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/CollaborativeCoding/metrics/recall.py b/CollaborativeCoding/metrics/recall.py index 80a1b72..385f974 100644 --- a/CollaborativeCoding/metrics/recall.py +++ b/CollaborativeCoding/metrics/recall.py @@ -1,3 +1,4 @@ +import numpy as np import torch import torch.nn as nn @@ -57,26 +58,49 @@ def __init__(self, num_classes, macro_averaging=False): self.num_classes = num_classes self.macro_averaging = macro_averaging + self.__y_true = [] + self.__y_pred = [] + def forward(self, true, logits): pred = logits.argmax(dim=-1) y_true = one_hot_encode(true, self.num_classes) y_pred = one_hot_encode(pred, self.num_classes) + self.__y_true.append(y_true) + self.__y_pred.append(y_pred) + + def compute(self, y_true, y_pred): if self.macro_averaging: - recall = 0 - for i in range(self.num_classes): - tp = (y_true[:, i] * y_pred[:, i]).sum() - fn = torch.sum(~y_pred[y_true[:, i].bool()].bool()) - recall += tp / (tp + fn) - recall /= self.num_classes - else: - recall = self.__compute(y_true, y_pred) + return self.__compute_macro_averaging(y_true, y_pred) + + return self.__compute_micro_averaging(y_true, y_pred) + + def __compute_macro_averaging(self, y_true, y_pred): + recall = 0 + for i in range(self.num_classes): + tp = (y_true[:, i] * y_pred[:, i]).sum() + fn = torch.sum(~y_pred[y_true[:, i].bool()].bool()) + recall += tp / (tp + fn) + recall /= self.num_classes return recall - def __compute(self, y_true, y_pred): + def __compute_micro_averaging(self, y_true, y_pred): true_positives = (y_true * y_pred).sum() false_negatives = torch.sum(~y_pred[y_true.bool()].bool()) recall = true_positives / (true_positives + false_negatives) return recall + + def __returnmetric__(self): + if len(self.__y_true) == 0 and len(self.__y_pred) == 0: + return np.nan + + y_true = torch.cat(self.__y_true, dim=0) + y_pred = torch.cat(self.__y_pred, dim=0) + + return self.compute(y_true, y_pred) + + def __reset__(self): + self.__y_true = [] + self.__y_pred = [] From 2accbe4532b6dc78de45f1600b7a5569825cb59a Mon Sep 17 00:00:00 2001 From: salomaestro Date: Thu, 20 Feb 2025 12:06:09 +0100 Subject: [PATCH 3/8] Update recall test for new structure --- tests/test_metrics.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 67db356..61ae00c 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -59,8 +59,11 @@ def test_recall(): recall_micro = Recall(7) recall_macro = Recall(7, macro_averaging=True) - recall_micro_score = recall_micro(y_true, logits) - recall_macro_score = recall_macro(y_true, logits) + recall_micro(y_true, logits) + recall_macro(y_true, logits) + + recall_micro_score = recall_micro.__returnmetric__() + recall_macro_score = recall_macro.__returnmetric__() assert isinstance(recall_micro_score, torch.Tensor), "Expected a tensor output." assert isinstance(recall_macro_score, torch.Tensor), "Expected a tensor output." From f31e4e725ee4ab9f4d5f32ce9ddf13027a7e6119 Mon Sep 17 00:00:00 2001 From: salomaestro Date: Thu, 20 Feb 2025 12:06:51 +0100 Subject: [PATCH 4/8] Move load_data_test to test_wrappers.py --- tests/test_dataloaders.py | 30 ------------------ tests/test_wrappers.py | 67 +++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 60 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index dbdb14a..eddb116 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -1,9 +1,6 @@ from pathlib import Path import numpy as np -import pytest -import torch -from PIL import Image from torchvision import transforms from CollaborativeCoding.dataloaders import ( @@ -11,39 +8,12 @@ USPSDataset0_6, USPSH5_Digit_7_9_Dataset, ) -from CollaborativeCoding.load_data import load_data - - -@pytest.mark.parametrize( - "data_name, expected", - [ - ("usps_0-6", USPSDataset0_6), - ("usps_7-9", USPSH5_Digit_7_9_Dataset), - ("mnist_0-3", MNISTDataset0_3), - # TODO: Add more datasets here - ], -) -def test_load_data(data_name, expected): - dataset = load_data( - data_name, - data_dir=Path("data"), - transform=transforms.ToTensor(), - ) - assert isinstance(dataset, expected) - assert len(dataset) > 0 - assert isinstance(dataset[0], tuple) - assert isinstance(dataset[0][0], torch.Tensor) - assert isinstance( - dataset[0][1], (int, torch.Tensor, np.ndarray) - ) # Should probably restrict this to only int or one-hot encoded tensor or array for consistency. def test_uspsdataset0_6(): from tempfile import TemporaryDirectory import h5py - import numpy as np - from torchvision import transforms # Create a temporary directory (deleted after the test) with TemporaryDirectory() as tempdir: diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index f30176b..4d4d05d 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -1,6 +1,17 @@ from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest +import torch +from torchvision import transforms from CollaborativeCoding import load_data, load_metric, load_model +from CollaborativeCoding.dataloaders import ( + MNISTDataset0_3, + SVHNDataset, + USPSDataset0_6, + USPSH5_Digit_7_9_Dataset, +) def test_load_model(): @@ -30,38 +41,34 @@ def test_load_model(): ) -def test_load_data(): - from tempfile import TemporaryDirectory - - import torch as th - from torchvision import transforms - - dataset_names = [ - "usps_0-6", - "mnist_0-3", - "usps_7-9", - "svhn", - # 'mnist_4-9' #Uncomment when implemented - ] - - trans = transforms.Compose( - [ - transforms.Resize((16, 16)), - transforms.ToTensor(), - ] - ) - - with TemporaryDirectory() as tmppath: - for name in dataset_names: - dataset = load_data( - name, train=False, data_dir=Path(tmppath), transform=trans +@pytest.mark.parametrize( + "data_name, expected", + [ + ("usps_0-6", USPSDataset0_6), + ("usps_7-9", USPSH5_Digit_7_9_Dataset), + ("mnist_0-3", MNISTDataset0_3), + ("svhn", SVHNDataset), + ], +) +def test_load_data(data_name, expected): + with TemporaryDirectory() as tempdir: + tempdir = Path(tempdir) + + train, val, test = load_data( + data_name, + data_dir=tempdir, + transform=transforms.ToTensor(), + ) + + for dataset in [train, val, test]: + assert isinstance(dataset, expected) + assert len(dataset) > 0 + assert isinstance(dataset[0], tuple) + assert isinstance(dataset[0][0], torch.Tensor) + assert isinstance( + dataset[0][1], int ) - im, _ = dataset.__getitem__(0) - - assert dataset.__len__() != 0 - assert type(im) == th.Tensor and len(im.size()) == 3 - def test_load_metric(): pass From 0de568ee6ccbf262c1ec884655d2be5dd35b4bdb Mon Sep 17 00:00:00 2001 From: salomaestro Date: Thu, 20 Feb 2025 12:07:01 +0100 Subject: [PATCH 5/8] Had to add this to not get a error --- CollaborativeCoding/dataloaders/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CollaborativeCoding/dataloaders/download.py b/CollaborativeCoding/dataloaders/download.py index 5e90beb..cd809cc 100644 --- a/CollaborativeCoding/dataloaders/download.py +++ b/CollaborativeCoding/dataloaders/download.py @@ -87,7 +87,7 @@ def _get_labels(path: Path) -> np.ndarray: def svhn(self, data_dir: Path) -> tuple[np.ndarray, np.ndarray]: def download_svhn(path, train: bool = True): - SVHN() + SVHN(path) parent_path = data_dir / "SVHN" From ffe105fb91fe321f7c30750032efa0621a184b2c Mon Sep 17 00:00:00 2001 From: salomaestro Date: Thu, 20 Feb 2025 12:11:03 +0100 Subject: [PATCH 6/8] Ruff + isort --- CollaborativeCoding/dataloaders/uspsh5_7_9.py | 7 ++-- CollaborativeCoding/metrics/F1.py | 8 +++-- CollaborativeCoding/models/solveig_model.py | 36 +++++++++---------- main.py | 3 +- tests/test_metrics.py | 8 +++-- tests/test_wrappers.py | 4 +-- 6 files changed, 36 insertions(+), 30 deletions(-) diff --git a/CollaborativeCoding/dataloaders/uspsh5_7_9.py b/CollaborativeCoding/dataloaders/uspsh5_7_9.py index 6808ad3..3a933db 100644 --- a/CollaborativeCoding/dataloaders/uspsh5_7_9.py +++ b/CollaborativeCoding/dataloaders/uspsh5_7_9.py @@ -32,7 +32,9 @@ class USPSH5_Digit_7_9_Dataset(Dataset): A transform function to apply to the images. """ - def __init__(self, data_path, sample_ids, train=False, transform=None, nr_channels=1): + def __init__( + self, data_path, sample_ids, train=False, transform=None, nr_channels=1 + ): super().__init__() """ Initializes the USPS dataset by loading images and labels from the given `.h5` file. @@ -112,7 +114,8 @@ def main(): indices = np.array([7, 8, 9]) # Load the dataset dataset = USPSH5_Digit_7_9_Dataset( - data_path="C:/Users/Solveig/OneDrive/Dokumente/UiT PhD/Courses/Git", sample_ids=indices, + data_path="C:/Users/Solveig/OneDrive/Dokumente/UiT PhD/Courses/Git", + sample_ids=indices, train=False, transform=transform, ) diff --git a/CollaborativeCoding/metrics/F1.py b/CollaborativeCoding/metrics/F1.py index 33c0a4d..483d8eb 100644 --- a/CollaborativeCoding/metrics/F1.py +++ b/CollaborativeCoding/metrics/F1.py @@ -159,11 +159,13 @@ def __returnmetric__(self): else: self.y_true = torch.cat(self.y_true) self.y_pred = torch.cat(self.y_pred) - return self._micro_F1(self.y_true, self.y_pred) if not self.macro_averaging else self._macro_F1(self.y_true, self.y_pred) + return ( + self._micro_F1(self.y_true, self.y_pred) + if not self.macro_averaging + else self._macro_F1(self.y_true, self.y_pred) + ) def __reset__(self): self.y_true = [] self.y_pred = [] return None - - diff --git a/CollaborativeCoding/models/solveig_model.py b/CollaborativeCoding/models/solveig_model.py index 442ab0e..96407b0 100644 --- a/CollaborativeCoding/models/solveig_model.py +++ b/CollaborativeCoding/models/solveig_model.py @@ -4,24 +4,24 @@ def find_fc_input_shape(image_shape, model): """ - Find the shape of the input to the fully connected layer after passing through the convolutional layers. - - Code inspired by @Seilmast (https://github.com/SFI-Visual-Intelligence/Collaborative-Coding-Exam/issues/67#issuecomment-2651212254) - - Args - ---- - image_shape : tuple(int, int, int) - Shape of the input image (C, H, W), where C is the number of channels, - H is the height, and W is the width of the image. - model : nn.Module - The CNN model containing the convolutional layers, whose output size is used to - determine the number of input features for the fully connected layer. - - Returns - ------- - int - The number of elements in the input to the fully connected layer. - """ + Find the shape of the input to the fully connected layer after passing through the convolutional layers. + + Code inspired by @Seilmast (https://github.com/SFI-Visual-Intelligence/Collaborative-Coding-Exam/issues/67#issuecomment-2651212254) + + Args + ---- + image_shape : tuple(int, int, int) + Shape of the input image (C, H, W), where C is the number of channels, + H is the height, and W is the width of the image. + model : nn.Module + The CNN model containing the convolutional layers, whose output size is used to + determine the number of input features for the fully connected layer. + + Returns + ------- + int + The number of elements in the input to the fully connected layer. + """ dummy_img = torch.randn(1, *image_shape) with torch.no_grad(): diff --git a/main.py b/main.py index 0f2a5d7..06f7277 100644 --- a/main.py +++ b/main.py @@ -1,11 +1,11 @@ import numpy as np import torch as th import torch.nn as nn +import wandb from torch.utils.data import DataLoader from torchvision import transforms from tqdm import tqdm -import wandb from CollaborativeCoding import ( MetricWrapper, createfolders, @@ -17,7 +17,6 @@ # from wandb_api import WANDB_API - def main(): """ diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 04c724f..6b225c3 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -91,8 +91,12 @@ def test_f1score(): macro_f1_score = f1_macro.__returnmetric__() # Check if outputs are tensors - assert isinstance(micro_f1_score, torch.Tensor), "Micro F1 score should be a tensor." - assert isinstance(macro_f1_score, torch.Tensor), "Macro F1 score should be a tensor." + assert isinstance(micro_f1_score, torch.Tensor), ( + "Micro F1 score should be a tensor." + ) + assert isinstance(macro_f1_score, torch.Tensor), ( + "Macro F1 score should be a tensor." + ) # Check that F1 scores are between 0 and 1 assert 0 <= micro_f1_score.item() <= 1, "Micro F1 score should be between 0 and 1." diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index 4d4d05d..fabe47b 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -65,9 +65,7 @@ def test_load_data(data_name, expected): assert len(dataset) > 0 assert isinstance(dataset[0], tuple) assert isinstance(dataset[0][0], torch.Tensor) - assert isinstance( - dataset[0][1], int - ) + assert isinstance(dataset[0][1], int) def test_load_metric(): From e9fa5335e25590bf0ca2e5e70f923a69a342c32e Mon Sep 17 00:00:00 2001 From: salomaestro Date: Thu, 20 Feb 2025 12:20:20 +0100 Subject: [PATCH 7/8] Remove unused import --- CollaborativeCoding/dataloaders/usps_0_6.py | 1 - 1 file changed, 1 deletion(-) diff --git a/CollaborativeCoding/dataloaders/usps_0_6.py b/CollaborativeCoding/dataloaders/usps_0_6.py index c729b4b..ac44dd6 100644 --- a/CollaborativeCoding/dataloaders/usps_0_6.py +++ b/CollaborativeCoding/dataloaders/usps_0_6.py @@ -8,7 +8,6 @@ import h5py as h5 import numpy as np -import torch from PIL import Image from torch.utils.data import Dataset From bfb895c94478bb8cd75e048e9c6a47f20f041e78 Mon Sep 17 00:00:00 2001 From: seilmast Date: Thu, 20 Feb 2025 12:28:22 +0100 Subject: [PATCH 8/8] Commiting before merging christian branch --- .gitignore | 5 ++-- CollaborativeCoding/dataloaders/download.py | 2 +- CollaborativeCoding/load_data.py | 6 +++- CollaborativeCoding/load_metric.py | 25 ++++++++++++++++ tests/test_dataloaders.py | 2 ++ tests/test_wrappers.py | 33 +++++++++++++++------ 6 files changed, 59 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 04f9680..dce22ec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ __pycache__/ .ipynb_checkpoints/ Data/* +data/* Results/* Experiments/* _build/ @@ -14,9 +15,7 @@ doc/autoapi #Magnus specific job* -env2/* -ruffian.sh -localtest.sh +local* # Johanthings formatting.x diff --git a/CollaborativeCoding/dataloaders/download.py b/CollaborativeCoding/dataloaders/download.py index 5e90beb..52d65cd 100644 --- a/CollaborativeCoding/dataloaders/download.py +++ b/CollaborativeCoding/dataloaders/download.py @@ -87,7 +87,7 @@ def _get_labels(path: Path) -> np.ndarray: def svhn(self, data_dir: Path) -> tuple[np.ndarray, np.ndarray]: def download_svhn(path, train: bool = True): - SVHN() + SVHN(path, split="train" if train else "test", download=True) parent_path = data_dir / "SVHN" diff --git a/CollaborativeCoding/load_data.py b/CollaborativeCoding/load_data.py index b4a247b..cc24daa 100644 --- a/CollaborativeCoding/load_data.py +++ b/CollaborativeCoding/load_data.py @@ -64,7 +64,7 @@ def load_data(dataset: str, *args, **kwargs) -> tuple: case "svhn": dataset = SVHNDataset train_labels, test_labels = downloader.svhn(data_dir=data_dir) - labels = np.arange(10) + labels = np.unique(train_labels) case "mnist_4-9": dataset = MNISTDataset4_9 train_labels, test_labels = downloader.mnist(data_dir=data_dir) @@ -78,6 +78,10 @@ def load_data(dataset: str, *args, **kwargs) -> tuple: train_indices = np.arange(len(train_labels)) test_indices = np.arange(len(test_labels)) + print(train_indices.shape) + print(np.asarray(train_labels).shape) + print(labels.shape) + # Filter the labels to only get indices of the wanted labels train_samples = train_indices[np.isin(train_labels, labels)] test_samples = test_indices[np.isin(test_labels, labels)] diff --git a/CollaborativeCoding/load_metric.py b/CollaborativeCoding/load_metric.py index 49a60f6..3420808 100644 --- a/CollaborativeCoding/load_metric.py +++ b/CollaborativeCoding/load_metric.py @@ -94,3 +94,28 @@ def getmetrics(self, str_prefix: str = None): def resetmetric(self): for key in self.metrics: self.metrics[key].__reset__() + + +if __name__ == "__main__": + import torch as th + + metrics = ["entropy", "f1", "recall", "precision", "accuracy"] + + class_sizes = [3, 6, 10] + for class_size in class_sizes: + y_true = th.rand((5, class_size)).argmax(dim=1) + y_pred = th.rand((5, class_size)) + + metricwrapper = MetricWrapper( + metric, + num_classes=class_size, + macro_averaging=True if class_size % 2 == 0 else False, + ) + + metricwrapper(y_true, y_pred) + metric = metricwrapper.getmetrics() + assert metric is not None + + metricwrapper.resetmetric() + metric2 = metricwrapper.getmetrics() + assert metric != metric2 diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index dbdb14a..893a6bf 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -8,6 +8,7 @@ from CollaborativeCoding.dataloaders import ( MNISTDataset0_3, + SVHNDataset, USPSDataset0_6, USPSH5_Digit_7_9_Dataset, ) @@ -20,6 +21,7 @@ ("usps_0-6", USPSDataset0_6), ("usps_7-9", USPSH5_Digit_7_9_Dataset), ("mnist_0-3", MNISTDataset0_3), + ("svhn", SVHNDataset), # TODO: Add more datasets here ], ) diff --git a/tests/test_wrappers.py b/tests/test_wrappers.py index f30176b..7ab5bdd 100644 --- a/tests/test_wrappers.py +++ b/tests/test_wrappers.py @@ -1,6 +1,6 @@ from pathlib import Path -from CollaborativeCoding import load_data, load_metric, load_model +from CollaborativeCoding import MetricWrapper, load_data, load_model def test_load_model(): @@ -36,13 +36,7 @@ def test_load_data(): import torch as th from torchvision import transforms - dataset_names = [ - "usps_0-6", - "mnist_0-3", - "usps_7-9", - "svhn", - # 'mnist_4-9' #Uncomment when implemented - ] + dataset_names = ["usps_0-6", "mnist_0-3", "usps_7-9", "svhn", "mnist_4-9"] trans = transforms.Compose( [ @@ -64,4 +58,25 @@ def test_load_data(): def test_load_metric(): - pass + import torch as th + + metrics = ("entropy", "f1", "recall", "precision", "accuracy") + + class_sizes = [3, 6, 10] + for class_size in class_sizes: + y_true = th.rand((5, class_size)).argmax(dim=1) + y_pred = th.rand((5, class_size)) + + metricwrapper = MetricWrapper( + *metrics, + num_classes=class_size, + macro_averaging=True if class_size % 2 == 0 else False, + ) + + metricwrapper(y_true, y_pred) + metric = metricwrapper.getmetrics() + assert metric is not None + + metricwrapper.resetmetric() + metric2 = metricwrapper.getmetrics() + assert metric != metric2