Add 20 datasets from UCI repository (#103)

* Add new datasets
* [github-action] formatting fixes
* Add tqdm dependency

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: mikeheddes <[email protected]>
hyperdimensional-computing · Dec 30, 2022 · 0ac7881 · 0ac7881
1 parent 996a71d
commit 0ac7881
Show file tree

Hide file tree

Showing 21 changed files with 667 additions and 1 deletion.
diff --git a/docs/datasets.rst b/docs/datasets.rst
@@ -21,7 +21,24 @@ The Torchhd library provides many popular built-in datasets to work with.
     CyclePowerPlant
     Abalone
     Adult
-
+    AcuteInflammation
+    AcuteNephritis
+    Annealing
+    Arrhythmia
+    AudiologyStd
+    BalanceScale
+    Balloons
+    Bank
+    Blood
+    BreastCancer
+    BreastCancerWisc
+    BreastCancerWiscDiag
+    BreastCancerWiscProg
+    BreastTissue
+    Car
+    Cardiotocography3Clases
+    Cardiotocography10Clases
+    ChessKrvk
 
 Base classes
 ------------------------

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -2,6 +2,7 @@ torch
 torchvision
 pandas
 requests
+tqdm
 numpy
 sphinx
 sphinx-rtd-theme
diff --git a/torchhd/datasets/__init__.py b/torchhd/datasets/__init__.py
@@ -11,6 +11,24 @@
 from torchhd.datasets.dataset import DatasetTrainTest
 from torchhd.datasets.abalone import Abalone
 from torchhd.datasets.adult import Adult
+from torchhd.datasets.acute_inflammation import AcuteInflammation
+from torchhd.datasets.acute_nephritis import AcuteNephritis
+from torchhd.datasets.annealing import Annealing
+from torchhd.datasets.arrhythmia import Arrhythmia
+from torchhd.datasets.audiology_std import AudiologyStd
+from torchhd.datasets.balance_scale import BalanceScale
+from torchhd.datasets.balloons import Balloons
+from torchhd.datasets.bank import Bank
+from torchhd.datasets.blood import Blood
+from torchhd.datasets.breast_cancer import BreastCancer
+from torchhd.datasets.breast_cancer_wisc import BreastCancerWisc
+from torchhd.datasets.breast_cancer_wisc_diag import BreastCancerWiscDiag
+from torchhd.datasets.breast_cancer_wisc_prog import BreastCancerWiscProg
+from torchhd.datasets.breast_tissue import BreastTissue
+from torchhd.datasets.car import Car
+from torchhd.datasets.cardiotocography_3clases import Cardiotocography3Clases
+from torchhd.datasets.cardiotocography_10clases import Cardiotocography10Clases
+from torchhd.datasets.chess_krvk import ChessKrvk
 
 
 __all__ = [
@@ -27,4 +45,22 @@
     "DatasetTrainTest",
     "Abalone",
     "Adult",
+    "AcuteInflammation",
+    "AcuteNephritis",
+    "Annealing",
+    "Arrhythmia",
+    "AudiologyStd",
+    "BalanceScale",
+    "Balloons",
+    "Bank",
+    "Blood",
+    "BreastCancer",
+    "BreastCancerWisc",
+    "BreastCancerWiscDiag",
+    "BreastCancerWiscProg",
+    "BreastTissue",
+    "Car",
+    "Cardiotocography3Clases",
+    "Cardiotocography10Clases",
+    "ChessKrvk",
 ]
diff --git a/torchhd/datasets/acute_inflammation.py b/torchhd/datasets/acute_inflammation.py
@@ -0,0 +1,31 @@
+from typing import List
+from torchhd.datasets import DatasetFourFold
+
+
+class AcuteInflammation(DatasetFourFold):
+    """`Acute Inflammation of urinary bladder <https://archive.ics.uci.edu/ml/datasets/Acute+Inflammations>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
+            If False and a hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if False and ``hyper_search = False``, returns the subset of the training dataset
+            specified in ``conxuntos_kfold.dat``, provided the fold number is correct. Otherwise issues an error.
+        fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
+            Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
+            Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in root directory. If dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "acute-inflammation"
+    classes: List[str] = [
+        "yes",
+        "no",
+    ]
diff --git a/torchhd/datasets/acute_nephritis.py b/torchhd/datasets/acute_nephritis.py
@@ -0,0 +1,31 @@
+from typing import List
+from torchhd.datasets import DatasetFourFold
+
+
+class AcuteNephritis(DatasetFourFold):
+    """`Acute Nephritis of renal pelvis origin <https://archive.ics.uci.edu/ml/datasets/Acute+Inflammations>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
+            If False and a hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if False and ``hyper_search = False``, returns the subset of the training dataset
+            specified in ``conxuntos_kfold.dat``, provided the fold number is correct. Otherwise issues an error.
+        fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
+            Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
+            Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in root directory. If dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "acute-nephritis"
+    classes: List[str] = [
+        "yes",
+        "no",
+    ]
diff --git a/torchhd/datasets/annealing.py b/torchhd/datasets/annealing.py
@@ -0,0 +1,30 @@
+from typing import List
+from torchhd.datasets import DatasetTrainTest
+
+
+class Annealing(DatasetTrainTest):
+    """`Annealing <https://archive.ics.uci.edu/ml/datasets/Annealing>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``hyper_search`` argument.
+            If False, returns a subset of the training dataset when a hyperparameter search is performed (``hyper_search = True``); otherwise (``hyper_search = False``) returns the test set.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in root directory. If dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "annealing"
+    classes: List[str] = [
+        "1",
+        "2",
+        "3",
+        "4",
+        "5",
+    ]
diff --git a/torchhd/datasets/arrhythmia.py b/torchhd/datasets/arrhythmia.py
@@ -0,0 +1,42 @@
+from typing import List
+from torchhd.datasets import DatasetFourFold
+
+
+class Arrhythmia(DatasetFourFold):
+    """`Arrhythmia <https://archive.ics.uci.edu/ml/datasets/arrhythmia>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
+            If False and a hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if False and ``hyper_search = False``, returns the subset of the training dataset
+            specified in ``conxuntos_kfold.dat``, provided the fold number is correct. Otherwise issues an error.
+        fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
+            Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
+            Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in root directory. If dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "arrhythmia"
+    classes: List[str] = [
+        "1 - normal",
+        "2",
+        "3",
+        "4",
+        "5",
+        "6",
+        "7",
+        "8",
+        "9",
+        "10",
+        "14",
+        "15",
+        "16 - unclassified",
+    ]
diff --git a/torchhd/datasets/audiology_std.py b/torchhd/datasets/audiology_std.py
@@ -0,0 +1,43 @@
+from typing import List
+from torchhd.datasets import DatasetTrainTest
+
+
+class AudiologyStd(DatasetTrainTest):
+    """`Audiology (Standardized) <https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``hyper_search`` argument.
+            If False, returns a subset of the training dataset when a hyperparameter search is performed (``hyper_search = True``); otherwise (``hyper_search = False``) returns the test set.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in root directory. If dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "audiology-std"
+    classes: List[str] = [
+        "cochlear_age",
+        "cochlear_age_and_noise",
+        "cochlear_noise_and_heredity",
+        "cochlear_poss_noise",
+        "cochlear_unknown",
+        "conductive_discontinuity",
+        "conductive_fixation",
+        "mixed_cochlear_age_otitis_media",
+        "mixed_cochlear_age_s_om",
+        "mixed_cochlear_unk_discontinuity",
+        "mixed_cochlear_unk_fixation",
+        "mixed_cochlear_unk_ser_om",
+        "mixed_poss_noise_om",
+        "normal_ear",
+        "otitis_media",
+        "possible_brainstem_disorder",
+        "possible_menieres",
+        "retrocochlear_unknown",
+    ]
diff --git a/torchhd/datasets/balance_scale.py b/torchhd/datasets/balance_scale.py
@@ -0,0 +1,32 @@
+from typing import List
+from torchhd.datasets import DatasetFourFold
+
+
+class BalanceScale(DatasetFourFold):
+    """`Balance Scale <https://archive.ics.uci.edu/ml/datasets/balance+scale>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
+            If False and a hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if False and ``hyper_search = False``, returns the subset of the training dataset
+            specified in ``conxuntos_kfold.dat``, provided the fold number is correct. Otherwise issues an error.
+        fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
+            Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
+            Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in root directory. If dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "balance-scale"
+    classes: List[str] = [
+        "B",
+        "L",
+        "R",
+    ]
diff --git a/torchhd/datasets/balloons.py b/torchhd/datasets/balloons.py
@@ -0,0 +1,31 @@
+from typing import List
+from torchhd.datasets import DatasetFourFold
+
+
+class Balloons(DatasetFourFold):
+    """`Balloons <https://archive.ics.uci.edu/ml/datasets/balloons>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
+            If False and a hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if False and ``hyper_search = False``, returns the subset of the training dataset
+            specified in ``conxuntos_kfold.dat``, provided the fold number is correct. Otherwise issues an error.
+        fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
+            Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
+            Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in root directory. If dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "balloons"
+    classes: List[str] = [
+        "inflated - F",
+        "inflated - T",
+    ]
diff --git a/torchhd/datasets/bank.py b/torchhd/datasets/bank.py
@@ -0,0 +1,31 @@
+from typing import List
+from torchhd.datasets import DatasetFourFold
+
+
+class Bank(DatasetFourFold):
+    """`Bank Marketing <https://archive.ics.uci.edu/ml/datasets/Bank+Marketing>`_ dataset.
+
+    Args:
+        root (string): Root directory containing the files of the dataset.
+        train (bool, optional): If True, returns the training (sub)set from the file storing training data, as further determined by the ``fold`` and ``hyper_search`` arguments.
+            If False and a hyperparameter search is performed (``hyper_search = True``), returns a subset of the training dataset; if False and ``hyper_search = False``, returns the subset of the training dataset
+            specified in ``conxuntos_kfold.dat``, provided the fold number is correct. Otherwise issues an error.
+        fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
+            Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if ``hyper_search`` is set to False and ``0 <= fold <= 3``.
+            Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
+        hyper_search (bool, optional): If True, creates the dataset using the indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``),
+            while the second row corresponds to test indices (used if ``train = False``).
+        transform (callable, optional): A function/transform that takes in a torch.FloatTensor
+            and returns a transformed version.
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        download (bool, optional): If True, downloads the dataset from the internet and
+            puts it in root directory. If dataset is already downloaded, it is not
+            downloaded again.
+    """
+
+    name = "bank"
+    classes: List[str] = [
+        "no",
+        "yes",
+    ]