Merge pull request #28 from JuliaTrustworthyAI/subsampling-needs-to-be-seeded

Subsampling needs to be seeded
JuliaTrustworthyAI · Jan 10, 2025 · 2da3f52 · 2da3f52 · pat-alt · Jan 10, 2025
2 parents d6dc1d3 + 5cb6a3a
commit 2da3f52
Show file tree

Hide file tree

Showing 12 changed files with 74 additions and 48 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,7 +6,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 
 *Note*: We try to adhere to these practices as of version [v1.0.1].
 
-## Version [1.0.1] - 2025-01-09
+## Version [1.1.1] - 2025-01-10
+
+### Changed
+
+- Improved seeding behaviour for tabular and vision datasets: subsampling now draws from an RNG derived from the new `seed` keyword argument.
+
+## Version [1.1.0] - 2025-01-09
 
 ### Changed
 

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TaijaData"
 uuid = "9d524318-b4e6-4a65-86d2-b2b72d07866c"
 authors = ["Patrick Altmeyer"]
-version = "1.1.0"
+version = "1.1.1"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"

diff --git a/src/synthetic/blobs.jl b/src/synthetic/blobs.jl
@@ -1,12 +1,10 @@
 """
-    load_blobs(n=250; seed=Random.GLOBAL_RNG, kwrgs...)
+    load_blobs(n=250; seed=data_seed, k=2, centers=2, kwrgs...)
 
 Loads overlapping synthetic data.
 """
 function load_blobs(n=250; seed=data_seed, k=2, centers=2, kwrgs...)
-    if isa(seed, Int)
-        seed = Xoshiro(seed)
-    end
+    seed = get_rng(seed)
 
     X, y = MLJBase.make_blobs(n, k; centers=centers, rng=seed, kwrgs...)
     X = permutedims(MLJBase.matrix(X))

diff --git a/src/tabular/adult.jl b/src/tabular/adult.jl
@@ -1,9 +1,9 @@
 """
-    load_uci_adult(n::Union{Nothing, Int}=1000)
+    load_uci_adult(n::Union{Nothing,Int}=1000; seed=data_seed)
 
 Loads data from the UCI 'Adult' dataset.
 """
-function load_uci_adult(n::Union{Nothing,Int}=1000)
+function load_uci_adult(n::Union{Nothing,Int}=1000; seed=data_seed)
     # Throw an exception if n < 1:
     if !isnothing(n) && n < 1
         throw(ArgumentError("n must be >= 1"))
@@ -44,9 +44,10 @@ function load_uci_adult(n::Union{Nothing,Int}=1000)
 
     y = df.target
 
-    # Undersample:
-    if !isnothing(n)
-        X, y = subsample(X, y, n)
+    # Randomly under-/over-sample:
+    rng = get_rng(seed)
+    if !isnothing(n) && n != size(X)[2]
+        X, y = subsample(rng, X, y, n)
     end
 
     return (X, y)

diff --git a/src/tabular/california_housing.jl b/src/tabular/california_housing.jl
@@ -1,9 +1,9 @@
 """
-    load_california_housing(n::Union{Nothing,Int}=5000)
+    load_california_housing(n::Union{Nothing,Int}=5000; seed=data_seed)
 
 Loads California Housing data.
 """
-function load_california_housing(n::Union{Nothing,Int}=5000)
+function load_california_housing(n::Union{Nothing,Int}=5000; seed=data_seed)
 
     # check that n is > 0
     if !isnothing(n) && n <= 0
@@ -22,9 +22,10 @@ function load_california_housing(n::Union{Nothing,Int}=5000)
     # Counterfactual data:
     y = Int.(df.target)
 
-    # Undersample:
-    if !isnothing(n)
-        X, y = subsample(X, y, n)
+    # Randomly under-/over-sample:
+    rng = get_rng(seed)
+    if !isnothing(n) && n != size(X)[2]
+        X, y = subsample(rng, X, y, n)
     end
 
     return (X, y)

diff --git a/src/tabular/credit_default.jl b/src/tabular/credit_default.jl
@@ -1,9 +1,9 @@
 """
-    load_credit_default(n::Union{Nothing,Int}=5000)
+    load_credit_default(n::Union{Nothing,Int}=5000; seed=data_seed)
 
 Loads UCI Credit Default data.
 """
-function load_credit_default(n::Union{Nothing,Int}=5000)
+function load_credit_default(n::Union{Nothing,Int}=5000; seed=data_seed)
 
     # Load:
     df = CSV.read(joinpath(data_dir, "credit_default.csv"), DataFrames.DataFrame)
@@ -28,9 +28,10 @@ function load_credit_default(n::Union{Nothing,Int}=5000)
     #     X, y; features_categorical=features_categorical
     # )
 
-    # Undersample:
-    if !isnothing(n)
-        X, y = subsample(X, y, n)
+    # Randomly under-/over-sample:
+    rng = get_rng(seed)
+    if !isnothing(n) && n != size(X)[2]
+        X, y = subsample(rng, X, y, n)
     end
 
     return (X, y)

diff --git a/src/tabular/german_credit.jl b/src/tabular/german_credit.jl
@@ -1,9 +1,9 @@
 """
-    load_german_credit(n::Union{Nothing, Int}=nothing)
+    load_german_credit(n::Union{Nothing,Int}=nothing; seed=data_seed)
 
 Loads UCI German Credit data.
 """
-function load_german_credit(n::Union{Nothing,Int}=nothing)
+function load_german_credit(n::Union{Nothing,Int}=nothing; seed=data_seed)
     # Throw an exception if n > 1000:
     if !isnothing(n) && n > 1000
         throw(ArgumentError("n must be <= 1000"))
@@ -27,9 +27,10 @@ function load_german_credit(n::Union{Nothing,Int}=nothing)
     # Counterfactual data:
     y = convert(Vector, df.target)
 
-    # Undersample:
-    if !isnothing(n)
-        X, y = subsample(X, y, n)
+    # Randomly under-/over-sample:
+    rng = get_rng(seed)
+    if !isnothing(n) && n != size(X)[2]
+        X, y = subsample(rng, X, y, n)
     end
 
     return (X, y)

diff --git a/src/tabular/gmsc.jl b/src/tabular/gmsc.jl
@@ -1,9 +1,9 @@
 """
-    load_gmsc(n::Union{Nothing,Int}=5000)
+    load_gmsc(n::Union{Nothing,Int}=5000; seed=data_seed)
 
 Loads Give Me Some Credit (GMSC) data.
 """
-function load_gmsc(n::Union{Nothing,Int}=5000)
+function load_gmsc(n::Union{Nothing,Int}=5000; seed=data_seed)
 
     # Load:
     df = CSV.read(joinpath(data_dir, "gmsc.csv"), DataFrames.DataFrame)
@@ -18,9 +18,10 @@ function load_gmsc(n::Union{Nothing,Int}=5000)
     # Counterfactual data:
     y = df.target
 
-    # Undersample:
-    if !isnothing(n)
-        X, y = subsample(X, y, n)
+    # Randomly under-/over-sample:
+    rng = get_rng(seed)
+    if !isnothing(n) && n != size(X)[2]
+        X, y = subsample(rng, X, y, n)
     end
 
     return (X, y)

diff --git a/src/utils.jl b/src/utils.jl
@@ -1,4 +1,18 @@
-function subsample(X::AbstractMatrix, y::AbstractVector, n::Int)
+using Random
+
+"""
+    get_rng(seed::Union{Int,AbstractRNG})
+
+Returns a random number generator for `seed`: an `Int` is used to seed a new `Xoshiro`, whereas an `AbstractRNG` is returned as is.
+"""
+function get_rng(seed::Union{Int,AbstractRNG})
+    if isa(seed, Int) # only integer seeds need converting; RNGs skip this branch
+        seed = Xoshiro(seed) # rebind `seed` to a new generator built from the integer
+    end
+    return seed # either the new `Xoshiro` or the caller's own RNG object, not a copy
+end
+
+function subsample(rng::AbstractRNG, X::AbstractMatrix, y::AbstractVector, n::Int)
     # Get the unique classes in `y`.
     classes_ = unique(y)
 
@@ -13,7 +27,7 @@ function subsample(X::AbstractMatrix, y::AbstractVector, n::Int)
         reduce(
             vcat,
             [
-                StatsBase.sample(findall(y .== cls), n_per_class; replace=true) for
+                StatsBase.sample(rng, findall(y .== cls), n_per_class; replace=true) for
                 cls in classes_
             ],
         ),

diff --git a/src/vision/cifar_10.jl b/src/vision/cifar_10.jl
@@ -1,9 +1,9 @@
 """
-    load_cifar_10(n::Union{Nothing, Int}=nothing)
+    load_cifar_10(n::Union{Nothing,Int}=nothing; seed=data_seed)
 
 Loads data from the CIFAR-10 dataset.
 """
-function load_cifar_10(n::Union{Nothing,Int}=nothing)
+function load_cifar_10(n::Union{Nothing,Int}=nothing; seed=data_seed)
     X, y = MLDatasets.CIFAR10()[:] # [:] gives us X, y
     X = Flux.flatten(X)
     X = X .* 2 .- 1 # normalization between [-1, 1]
@@ -13,9 +13,10 @@ function load_cifar_10(n::Union{Nothing,Int}=nothing)
     #     X, y; domain=(-1.0, 1.0), standardize=false
     # )
 
-    # Undersample:
-    if !isnothing(n)
-        X, y = subsample(X, y, n)
+    # Randomly under-/over-sample:
+    rng = get_rng(seed)
+    if !isnothing(n) && n != size(X)[2]
+        X, y = subsample(rng, X, y, n)
     end
 
     return (X, y)

diff --git a/src/vision/fashion_mnist.jl b/src/vision/fashion_mnist.jl
@@ -1,9 +1,9 @@
 """
-    load_fashion_mnist(n::Union{Nothing,Int}=nothing)
+    load_fashion_mnist(n::Union{Nothing,Int}=nothing; seed=data_seed)
 
 Loads FashionMNIST data.
 """
-function load_fashion_mnist(n::Union{Nothing,Int}=nothing)
+function load_fashion_mnist(n::Union{Nothing,Int}=nothing; seed=data_seed)
     X, y = MLDatasets.FashionMNIST(:train)[:]
     X = Flux.flatten(X)
     X = X .* 2.0f0 .- 1.0f0
@@ -13,9 +13,10 @@ function load_fashion_mnist(n::Union{Nothing,Int}=nothing)
     #     X, y; domain=(-1.0, 1.0), standardize=false
     # )
 
-    # Undersample:
-    if !isnothing(n)
-        X, y = subsample(X, y, n)
+    # Randomly under-/over-sample:
+    rng = get_rng(seed)
+    if !isnothing(n) && n != size(X)[2]
+        X, y = subsample(rng, X, y, n)
     end
 
     return (X, y)

diff --git a/src/vision/mnist.jl b/src/vision/mnist.jl
@@ -1,9 +1,9 @@
 """
-    load_mnist(n::Union{Nothing,Int}=nothing)
+    load_mnist(n::Union{Nothing,Int}=nothing; seed=data_seed)
 
 Loads MNIST data.
 """
-function load_mnist(n::Union{Nothing,Int}=nothing)
+function load_mnist(n::Union{Nothing,Int}=nothing; seed=data_seed)
     X, y = MLDatasets.MNIST(:train)[:]
     X = Flux.flatten(X)
     X = X .* 2.0f0 .- 1.0f0
@@ -13,9 +13,10 @@ function load_mnist(n::Union{Nothing,Int}=nothing)
     #     X, y; domain=(-1.0, 1.0), standardize=false
     # )
 
-    # Undersample:
-    if !isnothing(n)
-        X, y = subsample(X, y, n)
+    # Randomly under-/over-sample:
+    rng = get_rng(seed)
+    if !isnothing(n) && n != size(X)[2]
+        X, y = subsample(rng, X, y, n)
     end
 
     return (X, y)