Skip to content

Commit

Permalink
Add 20 datasets from UCI repository (#103)
Browse files Browse the repository at this point in the history
* Add new datasets

* [github-action] formatting fixes

* Add tqdm dependency

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: mikeheddes <[email protected]>
  • Loading branch information
3 people authored Dec 30, 2022
1 parent 996a71d commit 0ac7881
Show file tree
Hide file tree
Showing 21 changed files with 667 additions and 1 deletion.
19 changes: 18 additions & 1 deletion docs/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,24 @@ The Torchhd library provides many popular built-in datasets to work with.
CyclePowerPlant
Abalone
Adult

AcuteInflammation
AcuteNephritis
Annealing
Arrhythmia
AudiologyStd
BalanceScale
Balloons
Bank
Blood
BreastCancer
BreastCancerWisc
BreastCancerWiscDiag
BreastCancerWiscProg
BreastTissue
Car
Cardiotocography3Clases
Cardiotocography10Clases
ChessKrvk

Base classes
------------------------
Expand Down
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ torch
torchvision
pandas
requests
tqdm
numpy
sphinx
sphinx-rtd-theme
36 changes: 36 additions & 0 deletions torchhd/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,24 @@
from torchhd.datasets.dataset import DatasetTrainTest
from torchhd.datasets.abalone import Abalone
from torchhd.datasets.adult import Adult
from torchhd.datasets.acute_inflammation import AcuteInflammation
from torchhd.datasets.acute_nephritis import AcuteNephritis
from torchhd.datasets.annealing import Annealing
from torchhd.datasets.arrhythmia import Arrhythmia
from torchhd.datasets.audiology_std import AudiologyStd
from torchhd.datasets.balance_scale import BalanceScale
from torchhd.datasets.balloons import Balloons
from torchhd.datasets.bank import Bank
from torchhd.datasets.blood import Blood
from torchhd.datasets.breast_cancer import BreastCancer
from torchhd.datasets.breast_cancer_wisc import BreastCancerWisc
from torchhd.datasets.breast_cancer_wisc_diag import BreastCancerWiscDiag
from torchhd.datasets.breast_cancer_wisc_prog import BreastCancerWiscProg
from torchhd.datasets.breast_tissue import BreastTissue
from torchhd.datasets.car import Car
from torchhd.datasets.cardiotocography_3clases import Cardiotocography3Clases
from torchhd.datasets.cardiotocography_10clases import Cardiotocography10Clases
from torchhd.datasets.chess_krvk import ChessKrvk


__all__ = [
Expand All @@ -27,4 +45,22 @@
"DatasetTrainTest",
"Abalone",
"Adult",
"AcuteInflammation",
"AcuteNephritis",
"Annealing",
"Arrhythmia",
"AudiologyStd",
"BalanceScale",
"Balloons",
"Bank",
"Blood",
"BreastCancer",
"BreastCancerWisc",
"BreastCancerWiscDiag",
"BreastCancerWiscProg",
"BreastTissue",
"Car",
"Cardiotocography3Clases",
"Cardiotocography10Clases",
"ChessKrvk",
]
31 changes: 31 additions & 0 deletions torchhd/datasets/acute_inflammation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class AcuteInflammation(DatasetFourFold):
    """`Acute Inflammation of urinary bladder <https://archive.ics.uci.edu/ml/datasets/Acute+Inflammations>`_ dataset.

    Args:
        root (string): Directory in which the dataset files are stored.
        train (bool, optional): Selects which side of a split is returned.
            When True, samples come from the training-data file, narrowed
            according to ``fold`` and ``hyper_search``. When False, a held-out
            subset of that training data is returned: the test indices of
            ``conxuntos.dat`` if ``hyper_search = True``, otherwise the test
            indices of the requested fold in ``conxuntos_kfold.dat``. An error
            is issued if the fold number is not valid.
        fold (int, optional): Fold number to use. The default value of -1
            returns all the training data from the corresponding file. Values
            between 0 and 3 select a fold in ``conxuntos_kfold.dat`` and are
            relevant only when ``hyper_search = False``. In that file, even
            rows (zero indexing) hold train indices and odd rows hold test
            indices.
        hyper_search (bool, optional): When True, the dataset is built from
            the indices in ``conxuntos.dat``, the split used for
            hyperparameter search. Its first row lists the train indices
            (used if ``train = True``) and its second row the test indices
            (used if ``train = False``).
        transform (callable, optional): A function/transform that receives a
            torch.FloatTensor and returns a transformed version.
        target_transform (callable, optional): A function/transform that
            receives the target and transforms it.
        download (bool, optional): When True, downloads the dataset from the
            internet into the root directory. A dataset that is already
            downloaded is not downloaded again.
    """

    name = "acute-inflammation"
    classes: List[str] = ["yes", "no"]
31 changes: 31 additions & 0 deletions torchhd/datasets/acute_nephritis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class AcuteNephritis(DatasetFourFold):
    """`Acute Nephritis of renal pelvis origin <https://archive.ics.uci.edu/ml/datasets/Acute+Inflammations>`_ dataset.

    Args:
        root (string): Directory in which the dataset files are stored.
        train (bool, optional): Selects which side of a split is returned.
            When True, samples come from the training-data file, narrowed
            according to ``fold`` and ``hyper_search``. When False, a held-out
            subset of that training data is returned: the test indices of
            ``conxuntos.dat`` if ``hyper_search = True``, otherwise the test
            indices of the requested fold in ``conxuntos_kfold.dat``. An error
            is issued if the fold number is not valid.
        fold (int, optional): Fold number to use. The default value of -1
            returns all the training data from the corresponding file. Values
            between 0 and 3 select a fold in ``conxuntos_kfold.dat`` and are
            relevant only when ``hyper_search = False``. In that file, even
            rows (zero indexing) hold train indices and odd rows hold test
            indices.
        hyper_search (bool, optional): When True, the dataset is built from
            the indices in ``conxuntos.dat``, the split used for
            hyperparameter search. Its first row lists the train indices
            (used if ``train = True``) and its second row the test indices
            (used if ``train = False``).
        transform (callable, optional): A function/transform that receives a
            torch.FloatTensor and returns a transformed version.
        target_transform (callable, optional): A function/transform that
            receives the target and transforms it.
        download (bool, optional): When True, downloads the dataset from the
            internet into the root directory. A dataset that is already
            downloaded is not downloaded again.
    """

    name = "acute-nephritis"
    classes: List[str] = ["yes", "no"]
30 changes: 30 additions & 0 deletions torchhd/datasets/annealing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from typing import List
from torchhd.datasets import DatasetTrainTest


class Annealing(DatasetTrainTest):
    """`Annealing <https://archive.ics.uci.edu/ml/datasets/Annealing>`_ dataset.

    Args:
        root (string): Directory in which the dataset files are stored.
        train (bool, optional): Selects which part of the data is returned.
            When True, samples come from the training-data file, narrowed
            according to ``hyper_search``. When False, the test set is
            returned if ``hyper_search = False``, or a held-out subset of the
            training data if ``hyper_search = True``.
        hyper_search (bool, optional): When True, the dataset is built from
            the indices in ``conxuntos.dat``, the split used for
            hyperparameter search. Its first row lists the train indices
            (used if ``train = True``) and its second row the test indices
            (used if ``train = False``).
        transform (callable, optional): A function/transform that receives a
            torch.FloatTensor and returns a transformed version.
        target_transform (callable, optional): A function/transform that
            receives the target and transforms it.
        download (bool, optional): When True, downloads the dataset from the
            internet into the root directory. A dataset that is already
            downloaded is not downloaded again.
    """

    name = "annealing"
    classes: List[str] = ["1", "2", "3", "4", "5"]
42 changes: 42 additions & 0 deletions torchhd/datasets/arrhythmia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class Arrhythmia(DatasetFourFold):
    """`Arrhythmia <https://archive.ics.uci.edu/ml/datasets/arrhythmia>`_ dataset.

    Args:
        root (string): Directory in which the dataset files are stored.
        train (bool, optional): Selects which side of a split is returned.
            When True, samples come from the training-data file, narrowed
            according to ``fold`` and ``hyper_search``. When False, a held-out
            subset of that training data is returned: the test indices of
            ``conxuntos.dat`` if ``hyper_search = True``, otherwise the test
            indices of the requested fold in ``conxuntos_kfold.dat``. An error
            is issued if the fold number is not valid.
        fold (int, optional): Fold number to use. The default value of -1
            returns all the training data from the corresponding file. Values
            between 0 and 3 select a fold in ``conxuntos_kfold.dat`` and are
            relevant only when ``hyper_search = False``. In that file, even
            rows (zero indexing) hold train indices and odd rows hold test
            indices.
        hyper_search (bool, optional): When True, the dataset is built from
            the indices in ``conxuntos.dat``, the split used for
            hyperparameter search. Its first row lists the train indices
            (used if ``train = True``) and its second row the test indices
            (used if ``train = False``).
        transform (callable, optional): A function/transform that receives a
            torch.FloatTensor and returns a transformed version.
        target_transform (callable, optional): A function/transform that
            receives the target and transforms it.
        download (bool, optional): When True, downloads the dataset from the
            internet into the root directory. A dataset that is already
            downloaded is not downloaded again.
    """

    name = "arrhythmia"
    # Labels are listed explicitly because the codes present are not
    # contiguous (there is no 11, 12 or 13).
    classes: List[str] = [
        "1 - normal",
        "2",
        "3",
        "4",
        "5",
        "6",
        "7",
        "8",
        "9",
        "10",
        "14",
        "15",
        "16 - unclassified",
    ]
43 changes: 43 additions & 0 deletions torchhd/datasets/audiology_std.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import List
from torchhd.datasets import DatasetTrainTest


class AudiologyStd(DatasetTrainTest):
    """`Audiology (Standardized) <https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29>`_ dataset.

    Args:
        root (string): Directory in which the dataset files are stored.
        train (bool, optional): Selects which part of the data is returned.
            When True, samples come from the training-data file, narrowed
            according to ``hyper_search``. When False, the test set is
            returned if ``hyper_search = False``, or a held-out subset of the
            training data if ``hyper_search = True``.
        hyper_search (bool, optional): When True, the dataset is built from
            the indices in ``conxuntos.dat``, the split used for
            hyperparameter search. Its first row lists the train indices
            (used if ``train = True``) and its second row the test indices
            (used if ``train = False``).
        transform (callable, optional): A function/transform that receives a
            torch.FloatTensor and returns a transformed version.
        target_transform (callable, optional): A function/transform that
            receives the target and transforms it.
        download (bool, optional): When True, downloads the dataset from the
            internet into the root directory. A dataset that is already
            downloaded is not downloaded again.
    """

    name = "audiology-std"
    classes: List[str] = [
        "cochlear_age",
        "cochlear_age_and_noise",
        "cochlear_noise_and_heredity",
        "cochlear_poss_noise",
        "cochlear_unknown",
        "conductive_discontinuity",
        "conductive_fixation",
        "mixed_cochlear_age_otitis_media",
        "mixed_cochlear_age_s_om",
        "mixed_cochlear_unk_discontinuity",
        "mixed_cochlear_unk_fixation",
        "mixed_cochlear_unk_ser_om",
        "mixed_poss_noise_om",
        "normal_ear",
        "otitis_media",
        "possible_brainstem_disorder",
        "possible_menieres",
        "retrocochlear_unknown",
    ]
32 changes: 32 additions & 0 deletions torchhd/datasets/balance_scale.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class BalanceScale(DatasetFourFold):
    """`Balance Scale <https://archive.ics.uci.edu/ml/datasets/balance+scale>`_ dataset.

    Args:
        root (string): Directory in which the dataset files are stored.
        train (bool, optional): Selects which side of a split is returned.
            When True, samples come from the training-data file, narrowed
            according to ``fold`` and ``hyper_search``. When False, a held-out
            subset of that training data is returned: the test indices of
            ``conxuntos.dat`` if ``hyper_search = True``, otherwise the test
            indices of the requested fold in ``conxuntos_kfold.dat``. An error
            is issued if the fold number is not valid.
        fold (int, optional): Fold number to use. The default value of -1
            returns all the training data from the corresponding file. Values
            between 0 and 3 select a fold in ``conxuntos_kfold.dat`` and are
            relevant only when ``hyper_search = False``. In that file, even
            rows (zero indexing) hold train indices and odd rows hold test
            indices.
        hyper_search (bool, optional): When True, the dataset is built from
            the indices in ``conxuntos.dat``, the split used for
            hyperparameter search. Its first row lists the train indices
            (used if ``train = True``) and its second row the test indices
            (used if ``train = False``).
        transform (callable, optional): A function/transform that receives a
            torch.FloatTensor and returns a transformed version.
        target_transform (callable, optional): A function/transform that
            receives the target and transforms it.
        download (bool, optional): When True, downloads the dataset from the
            internet into the root directory. A dataset that is already
            downloaded is not downloaded again.
    """

    name = "balance-scale"
    classes: List[str] = ["B", "L", "R"]
31 changes: 31 additions & 0 deletions torchhd/datasets/balloons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class Balloons(DatasetFourFold):
    """`Balloons <https://archive.ics.uci.edu/ml/datasets/balloons>`_ dataset.

    Args:
        root (string): Directory in which the dataset files are stored.
        train (bool, optional): Selects which side of a split is returned.
            When True, samples come from the training-data file, narrowed
            according to ``fold`` and ``hyper_search``. When False, a held-out
            subset of that training data is returned: the test indices of
            ``conxuntos.dat`` if ``hyper_search = True``, otherwise the test
            indices of the requested fold in ``conxuntos_kfold.dat``. An error
            is issued if the fold number is not valid.
        fold (int, optional): Fold number to use. The default value of -1
            returns all the training data from the corresponding file. Values
            between 0 and 3 select a fold in ``conxuntos_kfold.dat`` and are
            relevant only when ``hyper_search = False``. In that file, even
            rows (zero indexing) hold train indices and odd rows hold test
            indices.
        hyper_search (bool, optional): When True, the dataset is built from
            the indices in ``conxuntos.dat``, the split used for
            hyperparameter search. Its first row lists the train indices
            (used if ``train = True``) and its second row the test indices
            (used if ``train = False``).
        transform (callable, optional): A function/transform that receives a
            torch.FloatTensor and returns a transformed version.
        target_transform (callable, optional): A function/transform that
            receives the target and transforms it.
        download (bool, optional): When True, downloads the dataset from the
            internet into the root directory. A dataset that is already
            downloaded is not downloaded again.
    """

    name = "balloons"
    classes: List[str] = ["inflated - F", "inflated - T"]
31 changes: 31 additions & 0 deletions torchhd/datasets/bank.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import List
from torchhd.datasets import DatasetFourFold


class Bank(DatasetFourFold):
    """`Bank Marketing <https://archive.ics.uci.edu/ml/datasets/Bank+Marketing>`_ dataset.

    Args:
        root (string): Directory in which the dataset files are stored.
        train (bool, optional): Selects which side of a split is returned.
            When True, samples come from the training-data file, narrowed
            according to ``fold`` and ``hyper_search``. When False, a held-out
            subset of that training data is returned: the test indices of
            ``conxuntos.dat`` if ``hyper_search = True``, otherwise the test
            indices of the requested fold in ``conxuntos_kfold.dat``. An error
            is issued if the fold number is not valid.
        fold (int, optional): Fold number to use. The default value of -1
            returns all the training data from the corresponding file. Values
            between 0 and 3 select a fold in ``conxuntos_kfold.dat`` and are
            relevant only when ``hyper_search = False``. In that file, even
            rows (zero indexing) hold train indices and odd rows hold test
            indices.
        hyper_search (bool, optional): When True, the dataset is built from
            the indices in ``conxuntos.dat``, the split used for
            hyperparameter search. Its first row lists the train indices
            (used if ``train = True``) and its second row the test indices
            (used if ``train = False``).
        transform (callable, optional): A function/transform that receives a
            torch.FloatTensor and returns a transformed version.
        target_transform (callable, optional): A function/transform that
            receives the target and transforms it.
        download (bool, optional): When True, downloads the dataset from the
            internet into the root directory. A dataset that is already
            downloaded is not downloaded again.
    """

    name = "bank"
    classes: List[str] = ["no", "yes"]
Loading

0 comments on commit 0ac7881

Please sign in to comment.