diff --git a/.gitignore b/.gitignore index 0aa90f2..f6b2bb6 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,5 @@ Manifest.toml # Ignore directories in repo test/data/ + +/.luarc.json diff --git a/Project.toml b/Project.toml index 442f008..31f125f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BarBay" uuid = "3e4a4fdd-3c42-44bf-ab95-e818c96fb7a6" authors = ["Manuel Razo-Mejia"] -version = "0.0.2" +version = "0.0.3" [deps] AdvancedVI = "b5ca4192-6429-45e5-a2d9-87aec30a685c" @@ -14,6 +14,7 @@ DynamicHMC = "bbc10e6e-7c05-544b-b16e-64fede858acb" DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" MCMCChains = "c7f686f2-ff18-58e9-bc7b-31028e88f75d" Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -23,20 +24,21 @@ Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" [compat] -AdvancedVI = "^0.2" -Bijectors = "^0.13" -CSV = "^0.10" -ComponentArrays = "^0.15" -DataFrames = "^1.6" -Distributions = "^0.25" -DynamicHMC = "^3.4" -DynamicPPL = "^0.24" +AdvancedVI = "^0" +Bijectors = "^0" +CSV = "^0" +ComponentArrays = "^0" +DataFrames = "^1" +Distributions = "^0" +DynamicHMC = "^3" +DynamicPPL = "^0" JLD2 = "^0.4" +Logging = "1" MCMCChains = "^6" -Memoization = "^0.2" -ReverseDiff = "^1.15" -StatsBase = "^0.34" -Turing = "^0.30" +Memoization = "^0" +ReverseDiff = "^1" +StatsBase = "^0" +Turing = "^0" UnPack = "^1" julia = "1" diff --git a/src/utils.jl b/src/utils.jl index f28f2ec..9e59534 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1036,45 +1036,6 @@ end # function # Convert ADVI output to dataframe # ------------------------------------------------------------------------------ -@doc raw""" - ADVIOutput - -Struct to store processed output from Automatic Differentiation Variational -Inference (ADVI). - -# Fields -- `df::DataFrame`: DataFrame containing parameter estimates and metadata -- `var_groups::Vector{String}`: Names of variable groups in the model -- `var_range::Vector{UnitRange{Int}}`: Index ranges for each variable group -- `n_time::Union{Int,Vector{Int}}`: Number of time points -- `n_rep::Int`: Number of experimental replicates -- `n_env::Int`: Number of unique environments -- `n_neutral::Int`: Number of neutral barcodes -- `n_bc::Int`: Number of non-neutral barcodes -- `n_geno::Int`: Number of genotypes -- `neutral_ids::Vector`: Identifiers for neutral barcodes -- `bc_ids::Vector`: Identifiers for non-neutral barcodes -- `envs::Union{String,Vector{<:Any},Vector{Vector{<:Any}}}`: Environment - information, can be: - - `String`: When no environment information is given ("N/A") - - `Vector{<:Any}`: Vector of environment identifiers - - `Vector{Vector{<:Any}}`: Nested vector for multiple environment conditions -""" -struct ADVIOutput - df::DF.DataFrame # Parameter dataframe - var_groups::Vector{String} # Variable groups - var_range::Vector{UnitRange{Int}} # Ranges for each variable - n_time::Union{Int,Vector{Int}} # Number of time points - n_rep::Int # Number of replicates - n_env::Int # Number of environments - n_neutral::Int # Number of neutral barcodes - n_bc::Int # Number of non-neutral barcodes - n_geno::Int # Number of genotypes - neutral_ids::Vector # Neutral barcode IDs - bc_ids::Vector # Non-neutral barcode IDs - envs::Union{String,Vector{<:Any},Vector{Vector{<:Any}}} # Environment info -end - """ Extract variable groups and ranges from ADVI output. """ @@ -1140,7 +1101,7 @@ function add_replicate_info!( df::DF.DataFrame, var_groups::Vector{String}, var_range::Vector{UnitRange{Int}}, - output::ADVIOutput, + output::DataArrays, rep_col::Symbol, ) # Initialize replicate column @@ -1204,7 +1165,7 @@ function add_environment_info!( df::DF.DataFrame, var_groups::Vector{String}, var_range::Vector{UnitRange{Int}}, - output::ADVIOutput, + output::DataArrays, env_col::Symbol ) df[!, env_col] = Vector{Any}(undef, size(df, 1)) @@ -1232,7 +1193,8 @@ function add_barcode_info!( df::DF.DataFrame, var_groups::Vector{String}, var_range::Vector{UnitRange{Int}}, - output::ADVIOutput, + output::DataArrays, + genotype_col::Union{Nothing,Symbol}=nothing, ) df[!, :id] = Vector{Any}(undef, size(df, 1)) @@ -1240,7 +1202,8 @@ function add_barcode_info!( if occursin("̲ₜ", var) # Population mean fitness variables not associated with any barcode df[var_range[i], :id] .= "N/A" - elseif var == "θ̲⁽ᵐ⁾" + # Hyper fitness for replicates + elseif (var == "θ̲⁽ᵐ⁾") && (typeof(genotype_col) <: Nothing) if output.n_env == 1 # Single environment case df[var_range[i], :id] = output.bc_ids @@ -1252,6 +1215,10 @@ function add_barcode_info!( [repeat([bc], output.n_env) for bc in output.bc_ids] ) end # if + # Hyper fitness for genotypes + elseif (var == "θ̲⁽ᵐ⁾") && (typeof(genotype_col) <: Symbol) + # Hyper fitness for genotypes + df[var_range[i], :id] = unique(output.genotypes) elseif var == "logΛ̲̲" # Get all barcode IDs all_ids = [output.neutral_ids; output.bc_ids] @@ -1316,7 +1283,7 @@ Process hierarchical model samples for replicates. """ function process_hierarchical_samples!( df::DF.DataFrame, - output::ADVIOutput, + output::DataArrays, n_samples::Int ) # Sample θ̲ variables @@ -1453,7 +1420,7 @@ function advi_to_df( n_samples::Int=10_000 )::DF.DataFrame # Process data arrays - data_arrays = data_to_arrays( + output = data_to_arrays( data; id_col=id_col, time_col=time_col, @@ -1464,21 +1431,6 @@ function advi_to_df( genotype_col=genotype_col ) - # Create output container - output = ADVIOutput( - DF.DataFrame(), # Will be filled later - String[], # Will be filled later - UnitRange{Int}[], # Will be filled later - data_arrays.n_time, - data_arrays.n_rep, - data_arrays.n_env, - data_arrays.n_neutral, - data_arrays.n_bc, - data_arrays.neutral_ids, - data_arrays.bc_ids, - data_arrays.envs - ) - # Extract variable information var_groups, var_range = extract_variable_info(dist, vars) @@ -1499,7 +1451,7 @@ function advi_to_df( end # Add barcode information - add_barcode_info!(df, var_groups, var_range, output) + add_barcode_info!(df, var_groups, var_range, output, genotype_col) # Process hierarchical model if needed if length(var_groups) == 7 && (output.n_rep > 1 || typeof(genotype_col) <: Symbol) @@ -1507,538 +1459,4 @@ function advi_to_df( end return df -end - -# ------------------------------------------------------------------------------ -# @doc raw""" -# advi_to_df( -# data::DataFrames.AbstractDataFrame, dist::Distribution.Sampleable, -# vars::Vector{<:Any}; kwargs -# ) - -# Convert the output of automatic differentiation variational inference (ADVI) to -# a tidy dataframe. - -# # Arguments -# - `data::DataFrames.AbstractDataFrame`: Tidy dataframe used to perform the ADVI -# inference. See `BarBay.vi` module for the dataframe requirements. -# - `dist::Distributions.Sampleable`: The ADVI posterior sampleable distribution -# object. -# - `vars::Vector{<:Any}`: Vector of variable/parameter names from the ADVI run. - -# ## Optional Keyword Arguments -# - `id_col::Symbol=:barcode`: Name of the column in `data` containing the barcode -# identifier. The column may contain any type of entry. -# - `time_col::Symbol=:time`: Name of the column in `data` defining the time point -# at which measurements were done. The column may contain any type of entry as -# long as `sort` will resulted in time-ordered names. -# - `count_col::Symbol=:count`: Name of the column in `data` containing the raw -# barcode count. The column must contain entries of type `Int64`. -# - `neutral_col::Symbol=:neutral`: Name of the column in `data` defining whether -# the barcode belongs to a neutral lineage or not. The column must contain -# entries of type `Bool`. -# - `n_samples::Int=10_000`: Number of posterior samples to draw used for -# hierarchical models. Default is 10,000. - -# # Returns -# - `df::DataFrames.DataFrame`: DataFrame containing summary statistics of -# posterior samples for each parameter. Columns include: -# - `mean, std`: posterior mean and standard deviation for each variable. -# - `varname`: parameter name from the ADVI posterior distribution. -# - `vartype`: Description of the type of parameter. The types are: -# - `pop_mean_fitness`: Population mean fitness value `s̲ₜ`. -# - `pop_std`: (Nuisance parameter) Log of standard deviation in the -# likelihood function for the neutral lineages. -# - `bc_fitness`: Mutant relative fitness `s⁽ᵐ⁾`. -# - `bc_hyperfitness`: For hierarchical models, mutant hyperparameter -# that connects the fitness over multiple experimental replicates or -# multiple genotypes `θ⁽ᵐ⁾`. -# - `bc_noncenter`: (Nuisance parameter) For hierarchical models, -# non-centered samples used to connect the experimental replicates to -# the hyperparameter `θ̃⁽ᵐ⁾`. -# - `bc_deviations`: (Nuisance parameter) For hierarchical models, -# samples that define the log of the deviation from the hyperparameter -# fitness value `logτ⁽ᵐ⁾`. -# - `bc_std`: (Nuisance parameter) Log of standard deviation in the -# likelihood function for the mutant lineages. -# - `freq`: (Nuisance parameter) Log of the Poisson parameter used to -# define the frequency of each lineage. -# - rep: Experimental replicate number. -# - env: Environment for each parameter. -# - id: Mutant or neutral strain ID. - -# # Notes -# - Converts multivariate posterior into summarized dataframe format. -# - Adds metadata like parameter type, replicate, strain ID, etc. -# - Can handle models with multiple replicates and environments. -# - Can handle models with hierarchical structure on genotypes. -# - Useful for post-processing ADVI results for further analysis and plotting. -# """ -# function advi_to_df( -# data::DF.AbstractDataFrame, -# dist::Distributions.Sampleable, -# vars::Vector{<:Any}; -# id_col::Symbol=:barcode, -# time_col::Symbol=:time, -# count_col::Symbol=:count, -# neutral_col::Symbol=:neutral, -# rep_col::Union{Nothing,Symbol}=nothing, -# env_col::Union{Nothing,Symbol}=nothing, -# genotype_col::Union{Nothing,Symbol}=nothing, -# n_samples::Int=10_000 -# ) -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # -# # Extract information from data -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # - -# # Convert data to arrays to obtain information -# data_arrays = data_to_arrays( -# data; -# id_col=id_col, -# time_col=time_col, -# count_col=count_col, -# neutral_col=neutral_col, -# rep_col=rep_col, -# env_col=env_col, -# genotype_col=genotype_col -# ) - -# # Extract number of neutral and mutant lineages -# n_neutral = data_arrays[:n_neutral] -# n_bc = data_arrays[:n_bc] - -# # Extract bc lineages IDs -# bc_ids = data_arrays[:bc_ids] -# # Extract neutral lineages IDs -# neutral_ids = data_arrays[:neutral_ids] - -# # Extract number of replicates -# n_rep = data_arrays[:n_rep] -# # Extract number of time points per replicate -# n_time = data_arrays[:n_time] - -# # Extract number of environments -# n_env = data_arrays[:n_env] -# # Extract list of environments -# envs = data_arrays[:envs] -# # For multi-replicate inferences replicate the list of environments when -# # necessary -# if (n_env > 1) & (n_rep > 1) & !(typeof(envs) <: Vector{<:Vector}) -# envs = repeat([envs], n_rep) -# end # if - -# # Extract genotype information -# genotypes = data_arrays[:genotypes] -# n_geno = data_arrays[:n_geno] - -# # Extract unique replicates -# if typeof(rep_col) <: Symbol -# reps = sort(unique(data[:, rep_col])) -# end # if -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # -# # Convert distribution parameters into tidy dataframe -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # - -# # Extract parameters and convert to dataframe -# df_par = DF.DataFrame(hcat(dist.dist.m, dist.dist.σ), ["mean", "std"]) - -# # Add variable name. Notice that we use deepcopy to avoid the modification -# # of the original variable -# df_par[!, :varname] = deepcopy(vars) - -# # Locate variable groups by identifying the first variable of each group -# var_groups = replace.(vars[occursin.("[1]", string.(vars))], "[1]" => "") - -# # Define ranges for each variable -# var_range = dist.transform.ranges_out - -# # Count how many variables exist per group -# var_count = length.(var_range) - -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # -# # Add vartype information -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # - -# # Define dictionary from varname to vartype -# varname_to_vartype = Dict( -# "s̲ₜ" => "pop_mean_fitness", -# "logσ̲ₜ" => "pop_std", -# "s̲⁽ᵐ⁾" => "bc_fitness", -# "logσ̲⁽ᵐ⁾" => "bc_std", -# "θ̲⁽ᵐ⁾" => "bc_hyperfitness", -# "θ̲̃⁽ᵐ⁾" => "bc_noncenter", -# "logτ̲⁽ᵐ⁾" => "bc_deviations", -# "logΛ̲̲" => "log_poisson", -# ) - -# # Add column to be modified with vartype -# df_par[!, :vartype] .= "tmp" -# # Loop through variables -# for (i, var) in enumerate(var_groups) -# df_par[var_range[i], :vartype] .= varname_to_vartype[var] -# end # for - -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # -# # Add replicate number information -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # - -# if (n_rep == 1) & (typeof(genotype_col) <: Nothing) & -# (length(var_groups) > 5) -# # Report error if no env or rep info is given when needed -# error("The distribution seems to match a hierarchical model but no genotype or replicate information was provided") -# elseif (n_rep == 1) & (length(var_groups) == 5) & (n_env == 1) -# println("Single replicate non-hierarchical model") -# # Add replicate for single-replicate model for consistency -# df_par[!, :rep] .= "R1" -# elseif (n_rep == 1) & (length(var_groups) == 7) & -# (typeof(genotype_col) <: Symbol) -# println("Single replicate genotype hierarchical model") -# # Add replicate for single-replicate model for consistency -# df_par[!, :rep] .= "R1" -# elseif (n_rep > 1) -# # Add replicate column to be modified -# df_par[!, rep_col] = Vector{Any}(undef, size(df_par, 1)) -# # Loop through var groups -# for (i, var) in enumerate(var_groups) -# if occursin("̲ₜ", var) -# # Add replicate for population mean fitness variables -# df_par[var_range[i], rep_col] = reduce( -# vcat, -# [ -# repeat([reps[j]], n_time[j] - 1) for j = 1:n_rep -# ] -# ) -# elseif var == "θ̲⁽ᵐ⁾" -# # Add information for hyperparameter that does not have -# # replicate information -# df_par[var_range[i], rep_col] .= "N/A" -# elseif var == "logΛ̲̲" -# df_par[var_range[i], rep_col] = reduce( -# vcat, -# [ -# repeat( -# [reps[j]], -# (n_bc + n_neutral) * (n_time[j]) -# ) for j = 1:n_rep -# ] -# ) -# else -# # Information on the other variables depends on environments -# if n_env == 1 -# # Add information for bc-related parameters when no -# # environment information is provided -# df_par[var_range[i], rep_col] = reduce( -# vcat, -# [ -# repeat([reps[j]], n_bc) for j = 1:n_rep -# ] -# ) -# else -# df_par[var_range[i], rep_col] = reduce( -# vcat, -# [ -# repeat([reps[j]], n_bc * n_env) for j = 1:n_rep -# ] -# ) -# end # if -# end # if -# end # for -# end # if - -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # -# # Add environment information -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # - -# if (n_env == 1) -# # Add environment information for consistency -# df_par[!, :env] .= "env1" -# elseif (n_env > 1) -# # Add replicate column to be modified -# df_par[!, env_col] = Vector{Any}(undef, size(df_par, 1)) -# # Loop through var groups -# for (i, var) in enumerate(var_groups) -# if occursin("̲ₜ", var) -# if n_rep == 1 -# # Add environment information for single replicate -# df_par[var_range[i], env_col] = envs[2:end] -# elseif n_rep > 1 -# # Add environment information for multiple replicates -# df_par[var_range[i], env_col] = reduce( -# vcat, [env[2:end] for env in envs] -# ) -# end # if -# elseif var == "θ̲⁽ᵐ⁾" -# # Add environment information for hyperparameter -# df_par[var_range[i], env_col] = reduce( -# vcat, repeat([unique(reduce(vcat, envs))], n_bc) -# ) -# elseif var == "logΛ̲̲" -# if n_rep == 1 -# # Add environment informatin for each Poisson parameter for -# # single replicate -# df_par[var_range[i], env_col] = repeat( -# envs, (n_bc + n_neutral) -# ) -# elseif n_rep > 1 -# # Add environment information for each Poisson parameter for -# # multiple replicates -# df_par[var_range[i], env_col] = reduce( -# vcat, -# [repeat(env, (n_bc + n_neutral)) for env in envs] -# ) -# end # if -# else -# if n_rep == 1 -# # Add environment information for each bc-related -# # variable for single replicate -# df_par[var_range[i], env_col] = reduce( -# vcat, -# repeat(unique(envs), n_bc) -# ) -# elseif n_rep > 1 -# # Add environment information for each bc-related -# # variable for multiple replicates -# df_par[var_range[i], env_col] = reduce( -# vcat, -# repeat(unique(reduce(vcat, envs)), n_bc * n_rep) -# ) -# end #if -# end # if -# end # for -# end # if - -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # -# # Add mutant id information -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # - -# # Add mutant id column. To be modified -# df_par[!, :id] = Vector{Any}(undef, size(df_par, 1)) - -# # Loop through var groups -# for (i, var) in enumerate(var_groups) -# if occursin("̲ₜ", var) -# # Population mean fitness variables are not associated with any -# # particular barcode. -# df_par[var_range[i], :id] .= "N/A" -# elseif var == "θ̲⁽ᵐ⁾" -# if (n_env == 1) & (typeof(genotype_col) <: Nothing) -# # Add ID information to hyperparameter for single environment -# # and no genotypes -# df_par[var_range[i], :id] = bc_ids -# elseif (n_env == 1) & (typeof(genotype_col) <: Symbol) -# # Add ID information to hyperparameter fitness for genotypes -# df_par[var_range[i], :id] = unique(genotypes) -# elseif n_env > 1 -# # Add ID information to hyperparameter for multiple environments -# df_par[var_range[i], :id] = reduce( -# vcat, -# [repeat([bc], n_env) for bc in bc_ids] -# ) -# end # if -# elseif var == "logΛ̲̲" -# if n_rep == 1 -# # Add ID information to Poisson parameter for single replicate -# df_par[var_range[i], :id] = reduce( -# vcat, -# [repeat([bc], n_time) for bc in [neutral_ids; bc_ids]] -# ) -# elseif n_rep > 1 -# df_par[var_range[i], :id] = reduce( -# vcat, -# [ -# repeat([bc], n_time[rep]) -# for rep = 1:n_rep -# for bc in [neutral_ids; bc_ids] -# ] -# ) -# end # if -# else -# if (n_rep == 1) & (n_env == 1) -# df_par[var_range[i], :id] = bc_ids -# elseif (n_rep == 1) & (n_env > 1) -# df_par[var_range[i], :id] = reduce( -# vcat, -# [repeat([bc], n_env) for bc in bc_ids] -# ) -# elseif (n_rep > 1) & (n_env == 1) -# df_par[var_range[i], :id] = reduce( -# vcat, -# repeat(bc_ids, n_rep) -# ) -# elseif (n_rep > 1) & (n_env > 1) -# df_par[var_range[i], :id] = reduce( -# vcat, -# [ -# repeat([bc], n_env) -# for rep = 1:n_rep -# for bc in bc_ids -# ] -# ) - -# end # if -# end # if -# end # for - -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # -# # Add per-replicate fitness effect variables -# # (for hierarchical models on experimental replicates) -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # - -# if (n_rep > 1) & (length(var_groups) == 7) -# # Sample θ̲ variables -# θ_mat = hcat( -# [ -# Random.rand(Distributions.Normal(x...), n_samples) -# for x in eachrow( -# df_par[ -# df_par.vartype.=="bc_hyperfitness", -# [:mean, :std] -# ] -# ) -# ]... -# ) - -# # Sample τ̲ variables -# τ_mat = exp.( -# hcat( -# [ -# Random.rand(Distributions.Normal(x...), n_samples) -# for x in eachrow( -# df_par[ -# df_par.vartype.=="bc_deviations", -# [:mean, :std] -# ] -# ) -# ]... -# ) -# ) - -# # Sample θ̲̃ variables -# θ_tilde_mat = hcat( -# [ -# Random.rand(Distributions.Normal(x...), n_samples) -# for x in eachrow( -# df_par[ -# df_par.vartype.=="bc_noncenter", -# [:mean, :std] -# ] -# ) -# ]... -# ) - - -# # Compute individual strains fitness values -# s_mat = hcat(repeat([θ_mat], n_rep)...) .+ (τ_mat .* θ_tilde_mat) - -# # Compute mean and standard deviation and turn into dataframe -# df_s = DF.DataFrame( -# hcat( -# [StatsBase.median.(eachcol(s_mat)), -# StatsBase.std.(eachcol(s_mat))]... -# ), -# ["mean", "std"] -# ) - -# # To add the rest of the columns, we will use the τ variables that have -# # the same length. Let's locate such variables -# τ_idx = occursin.("τ", string.(vars)) - -# # Add extra columns -# DF.insertcols!( -# df_s, -# :varname => replace.(df_par[τ_idx, :varname], "logτ" => "s"), -# :vartype .=> "bc_fitness", -# rep_col => df_par[τ_idx, rep_col], -# :env => df_par[τ_idx, :env], -# :id => df_par[τ_idx, :id] -# ) - -# # Append dataframes -# DF.append!(df_par, df_s) -# end # if - -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # -# # Add per strain fitness effect variables -# # (for hierarchical models on genotypes) -# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # - -# if (n_rep == 1) & (length(var_groups) == 7) -# # Find unique genotypes -# geno_unique = unique(genotypes) -# # Define genotype indexes -# geno_idx = indexin(genotypes, geno_unique) - -# # Sample θ̲ variables -# θ_mat = hcat( -# [ -# Random.rand(Distributions.Normal(x...), n_samples) -# for x in eachrow( -# df_par[ -# df_par.vartype.=="bc_hyperfitness", -# [:mean, :std] -# ] -# ) -# ]... -# ) - -# # Sample τ̲ variables -# τ_mat = exp.( -# hcat( -# [ -# Random.rand(Distributions.Normal(x...), n_samples) -# for x in eachrow( -# df_par[ -# df_par.vartype.=="bc_deviations", -# [:mean, :std] -# ] -# ) -# ]... -# ) -# ) - -# # Sample θ̲̃ variables -# θ_tilde_mat = hcat( -# [ -# Random.rand(Distributions.Normal(x...), n_samples) -# for x in eachrow( -# df_par[ -# df_par.vartype.=="bc_noncenter", -# [:mean, :std] -# ] -# ) -# ]... -# ) - - -# # Compute individual strains fitness values -# s_mat = θ_mat[:, geno_idx] .+ (τ_mat .* θ_tilde_mat) - -# # Compute mean and standard deviation and turn into dataframe -# df_s = DF.DataFrame( -# hcat( -# [StatsBase.median.(eachcol(s_mat)), -# StatsBase.std.(eachcol(s_mat))]... -# ), -# ["mean", "std"] -# ) - -# # To add the rest of the columns, we will use the τ variables that have -# # the same length. Let's locate such variables -# τ_idx = occursin.("τ", string.(vars)) - -# # Add extra columns -# DF.insertcols!( -# df_s, -# :varname => replace.(df_par[τ_idx, :varname], "logτ" => "s"), -# :vartype .=> "bc_fitness", -# :rep => df_par[τ_idx, :rep], -# :env => df_par[τ_idx, :env], -# :id => df_par[τ_idx, :id] -# ) - -# # Append dataframes -# DF.append!(df_par, df_s) -# end # if - -# return df_par -# end # function \ No newline at end of file +end \ No newline at end of file diff --git a/src/vi.jl b/src/vi.jl index dda4563..c5bfa21 100644 --- a/src/vi.jl +++ b/src/vi.jl @@ -1,3 +1,6 @@ +# Import logging for progress updates +import Logging + # Import libraries relevant for Bayesian inference import Turing import DynamicPPL @@ -16,6 +19,20 @@ using ..utils: data_to_arrays, advi_to_df # Running MCMC for full joint fitness inference π(s̲⁽ᵐ⁾, s̲ₜ | data) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # +# Import libraries relevant for Bayesian inference +import Turing +import DynamicPPL + +# Import package to handle DataFrames +import DataFrames as DF +import CSV + +# Import needed function from the stats.jl module +using ..stats: build_getq + +# Import needed function from the utils module +using ..utils: data_to_arrays, advi_to_df + @doc raw""" advi(; kwargs) @@ -34,7 +51,7 @@ respective keyword arguments: - `time_col`: Indicates the measurement time point. - `count_col`: Contains the raw barcode count. - `neutral_col`: Indicates if the barcode is from a neutral lineage. Additional -optional columns include `rep_col`, `env_col`, and `genotype_col`. + optional columns include `rep_col`, `env_col`, and `genotype_col`. # Keyword Arguments - `data::DF.AbstractDataFrame`: Tidy dataframe with data for sampling the @@ -51,7 +68,6 @@ optional columns include `rep_col`, `env_col`, and `genotype_col`. - `rep_col::Union{Nothing,Symbol}=nothing`: Column for experimental replicate. - `env_col::Union{Nothing,Symbol}=nothing`: Column for environment. - `genotype_col::Union{Nothing,Symbol}=nothing`: Column for genotype. -- `rm_T0::Bool=false`: Option to remove the first time point from inference. - `advi::Turing.AdvancedVI.VariationalInference=Turing.ADVI{Turing.AutoReverseDiff(true)}(1, 10_000)`, A default instance of `Turing.AdvancedVI.VariationalInference` with the following parameters: @@ -85,16 +101,15 @@ function advi(; rep_col::Union{Nothing,Symbol}=nothing, env_col::Union{Nothing,Symbol}=nothing, genotype_col::Union{Nothing,Symbol}=nothing, - rm_T0::Bool=false, advi::Turing.AdvancedVI.VariationalInference=Turing.ADVI{Turing.AutoReverseDiff(true)}(1, 10_000), opt::Union{Turing.AdvancedVI.TruncatedADAGrad,Turing.AdvancedVI.DecayedADAGrad}=Turing.Variational.TruncatedADAGrad(), verbose::Bool=true ) # Define output filename - fname = "$(outputname).csv" + fname = isnothing(outputname) ? nothing : "$(outputname).csv" # Check if file has been processed before - if isfile(fname) + if !isnothing(fname) && isfile(fname) error("$(fname) was already processed") end # if @@ -110,9 +125,12 @@ function advi(; ## %%%%%%%%%%% Preprocessing data %%%%%%%%%%% ## - println("Pre-processing data...") - # Convert from tidy dataframe to model inputs - data_dict = data_to_arrays( + if verbose + Logging.@info "Pre-processing data..." + end # if + + # Convert from tidy dataframe to model inputs using DataArrays struct + data_arrays = data_to_arrays( data; id_col=id_col, time_col=time_col, @@ -120,22 +138,19 @@ function advi(; neutral_col=neutral_col, rep_col=rep_col, env_col=env_col, - genotype_col=genotype_col, - rm_T0=rm_T0, - verbose=verbose + genotype_col=genotype_col ) ## %%%%%%%%%%% Variational Inference with ADVI %%%%%%%%%%% ## + if verbose - println("Initialize Variational Inference Optimization...\n") + Logging.@info "Initialize Variational Inference Optimization..." end # if - - # Check if model is multi-environment to manually add the list of - # environments + # Check if model is multi-environment to manually add the list of environments if occursin("multienv", "$(model)") # Initialize empty dictionary that accepts any type - mk = Dict{Symbol,Any}(:envs => data_dict[:envs]) + mk = Dict{Symbol,Any}(:envs => data_arrays.envs) # Loop through elements of model_kwargs for (key, item) in model_kwargs # Add element to flexible dictionary @@ -143,12 +158,12 @@ function advi(; end # for # Change mk name to model_kwargs model_kwargs = mk - end + end # if # Check if model is hierarchical on genotypes to manually add genotype list if occursin("genotype", "$(model)") # Initialize empty dictionary that accepts any type - mk = Dict{Symbol,Any}(:genotypes => data_dict[:genotypes]) + mk = Dict{Symbol,Any}(:genotypes => data_arrays.genotypes) # Loop through elements of model_kwargs for (key, item) in model_kwargs # Add element to flexible dictionary @@ -160,10 +175,10 @@ function advi(; # Define model bayes_model = model( - data_dict[:bc_count], - data_dict[:bc_total], - data_dict[:n_neutral], - data_dict[:n_bc]; + data_arrays.bc_count, + data_arrays.bc_total, + data_arrays.n_neutral, + data_arrays.n_bc; model_kwargs... ) @@ -190,7 +205,7 @@ function advi(; # Optimize meanfield variational distribution q = Turing.vi(bayes_model, advi; optimizer=opt) - if typeof(outputname) <: Nothing + if isnothing(outputname) return advi_to_df( data, q, @@ -201,8 +216,7 @@ function advi(; neutral_col=neutral_col, rep_col=rep_col, env_col=env_col, - genotype_col=genotype_col, - rm_T0=rm_T0 + genotype_col=genotype_col ) else # Convert and save output as tidy dataframe @@ -218,10 +232,9 @@ function advi(; neutral_col=neutral_col, rep_col=rep_col, env_col=env_col, - genotype_col=genotype_col, - rm_T0=rm_T0 + genotype_col=genotype_col ) ) + return nothing end # if - end # function \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 475455c..60c5fc1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,5 +1,6 @@ @testset "BarBay.jl" begin include("utils_tests.jl") + include("vi_tests.jl") end # ---------------------------------------------------------------------------- # diff --git a/test/vi_tests.jl b/test/vi_tests.jl new file mode 100644 index 0000000..62f6f9b --- /dev/null +++ b/test/vi_tests.jl @@ -0,0 +1,237 @@ +## ============================================================================= + +using BarBay +using Test +using CSV +using DataFrames +using StatsBase +using Turing +using AdvancedVI +using ReverseDiff + +## ============================================================================= + +@testset "vi tests" begin + # ======================================================================== + # Test ADVI with data001_single.csv (single condition) + # ======================================================================== + @testset "Single Condition Data (data001)" begin + # Load test data + data = CSV.read("data/data001_single.csv", DataFrame) + + # Define test parameters + test_advi = Turing.ADVI(1, 1, Turing.AutoReverseDiff(true)) + + # Test basic functionality + result = BarBay.vi.advi( + data=data, + model=BarBay.model.fitness_normal, + advi=test_advi + ) + + # Test result properties + @test result isa DataFrame + @test :mean in propertynames(result) + @test :std in propertynames(result) + @test :vartype in propertynames(result) + @test :varname in propertynames(result) + + # Test that all required variable types are present + required_vartypes = [ + "pop_mean_fitness", + "pop_std", + "bc_fitness", + "bc_std", + "log_poisson" + ] + @test all(vt in result.vartype for vt in required_vartypes) + + # Test with ForwardDiffAD + test_advi_forward = Turing.ADVI(1, 1, Turing.AutoForwardDiff()) + result_forward = BarBay.vi.advi( + data=data, + model=BarBay.model.fitness_normal, + advi=test_advi_forward + ) + @test result_forward isa DataFrame + end + + # ======================================================================== + # Test ADVI with data002_hier-rep.csv (hierarchical replicates) + # ======================================================================== + @testset "Hierarchical Replicates Data (data002)" begin + # Load test data + data = CSV.read("data/data002_hier-rep.csv", DataFrame) + + # Define test parameters + test_advi = Turing.ADVI(1, 1, Turing.AutoReverseDiff(true)) + + # Test with replicates + result = BarBay.vi.advi( + data=data, + model=BarBay.model.replicate_fitness_normal, + rep_col=:rep, + advi=test_advi + ) + + # Test result properties + @test result isa DataFrame + @test :rep in propertynames(result) + @test :id in propertynames(result) + @test :vartype in propertynames(result) + + # Test that hierarchical model variables are present + hierarchical_vartypes = [ + "bc_hyperfitness", + "bc_noncenter", + "bc_deviations" + ] + @test all(vt in result.vartype for vt in hierarchical_vartypes) + + # Test with ForwardDiffAD + test_advi_forward = Turing.ADVI(1, 1, Turing.AutoForwardDiff()) + result_forward = BarBay.vi.advi( + data=data, + model=BarBay.model.replicate_fitness_normal, + rep_col=:rep, + advi=test_advi_forward + ) + @test result_forward isa DataFrame + + # Test with uneven timepoints + data_uneven = data[ + (data.rep.!=maximum(data.rep)).|(data.time.!=maximum(data.time)), + :] + result_uneven = BarBay.vi.advi( + data=data_uneven, + model=BarBay.model.replicate_fitness_normal, + rep_col=:rep, + advi=test_advi + ) + @test result_uneven isa DataFrame + end + + # ======================================================================== + # Test ADVI with data003_multienv.csv (multiple environments) + # ======================================================================== + @testset "Multiple Environments Data (data003)" begin + # Load test data + data = CSV.read("data/data003_multienv.csv", DataFrame) + + # Define test parameters + test_advi = Turing.ADVI(1, 1, Turing.AutoReverseDiff(true)) + + # Test with environments + result = BarBay.vi.advi( + data=data, + model=BarBay.model.multienv_fitness_normal, + env_col=:env, + advi=test_advi + ) + + # Test result properties + @test result isa DataFrame + @test :env in propertynames(result) + + # Test with ForwardDiffAD + test_advi_forward = Turing.ADVI(1, 1, Turing.AutoForwardDiff()) + result_forward = BarBay.vi.advi( + data=data, + model=BarBay.model.multienv_fitness_normal, + env_col=:env, + advi=test_advi_forward + ) + @test result_forward isa DataFrame + end + + # ======================================================================== + # Test ADVI with data004_multigen.csv (multiple genotypes) + # ======================================================================== + @testset "Multiple Genotypes Data (data004)" begin + # Load test data + data = CSV.read("data/data004_multigen.csv", DataFrame) + + # Define test parameters + test_advi = Turing.ADVI(1, 1, Turing.AutoReverseDiff(true)) + + # Test with genotypes + result = BarBay.vi.advi( + data=data, + model=BarBay.model.genotype_fitness_normal, + genotype_col=:genotype, + advi=test_advi + ) + + # Test result properties + @test result isa DataFrame + + # Test that hierarchical model variables are present + hierarchical_vartypes = [ + "bc_hyperfitness", + "bc_noncenter", + "bc_deviations" + ] + @test all(vt in result.vartype for vt in hierarchical_vartypes) + + # Test with ForwardDiffAD + test_advi_forward = Turing.ADVI(1, 1, Turing.AutoForwardDiff()) + result_forward = BarBay.vi.advi( + data=data, + model=BarBay.model.genotype_fitness_normal, + genotype_col=:genotype, + advi=test_advi_forward + ) + @test result_forward isa DataFrame + end + + # ======================================================================== + # Test error handling + # ======================================================================== + @testset "Error Handling" begin + data = CSV.read("data/data001_single.csv", DataFrame) + test_advi = Turing.ADVI(1, 1, Turing.AutoReverseDiff(true)) + + # Test error when trying to use hierarchical model without replicate + # info + @test_throws ErrorException BarBay.vi.advi( + data=data, + model=BarBay.model.replicate_fitness_normal, + advi=test_advi + ) + + # Test error when trying to use multienv model without environment info + @test_throws ErrorException BarBay.vi.advi( + data=data, + model=BarBay.model.multienv_fitness_normal, + advi=test_advi + ) + end + + # ======================================================================== + # Test output file handling + # ======================================================================== + @testset "Output File Handling" begin + data = CSV.read("data/data001_single.csv", DataFrame) + test_advi = Turing.ADVI(1, 1, Turing.AutoReverseDiff(true)) + + # Test file output + output_file = tempname() + result = BarBay.vi.advi( + data=data, + model=BarBay.model.fitness_normal, + outputname=output_file, + advi=test_advi + ) + + # Test that result is nothing (since saving to file) + @test isnothing(result) + + # Test that file exists and is readable + @test isfile("$(output_file).csv") + df = CSV.read("$(output_file).csv", DataFrame) + @test df isa DataFrame + + # Clean up + rm("$(output_file).csv") + end +end \ No newline at end of file