diff --git a/.github/workflows/Tier1.yml b/.github/workflows/Tier1.yml index d9481f6b..7639b18b 100644 --- a/.github/workflows/Tier1.yml +++ b/.github/workflows/Tier1.yml @@ -30,7 +30,6 @@ jobs: strategy: matrix: version: - - '1.6' - '1.8' - '1' os: diff --git a/Project.toml b/Project.toml index c91d01e5..86d81023 100644 --- a/Project.toml +++ b/Project.toml @@ -3,7 +3,7 @@ uuid = "a1dec852-9fe5-11e9-361f-8d9fde67cfa2" keywords = ["lenearmodel", "mixedmodel"] desc = "Mixed-effects models with flexible covariance structure." authors = ["Vladimir Arnautov "] -version = "0.15.0" +version = "0.15.1" [deps] DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" diff --git a/docs/src/api.md b/docs/src/api.md index 04a525ed..b2c824bd 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -400,6 +400,13 @@ Metida.typeiii Metida.MILMM ``` +### Metida.RawCoding + +```@docs +Metida.RawCoding +``` + + ## Not API functions ### Metida.contrast @@ -443,3 +450,10 @@ Metida.tname ```@docs Metida.raneflenv ``` + +### Metida.edistance + +```@docs +Metida.edistance +``` + diff --git a/docs/src/custom.md b/docs/src/custom.md index 2dcfac16..7789d090 100644 --- a/docs/src/custom.md +++ b/docs/src/custom.md @@ -90,6 +90,7 @@ Example: ```@example lmmexample using Metida, DataFrames, CSV, CategoricalArrays +spatdf = CSV.File(joinpath(dirname(pathof(Metida)), "..", "test", "csv", "spatialdata.csv"); types = [Int, Int, String, Float64, Float64, Float64, Float64, Float64]) |> DataFrame ftdf = CSV.File(joinpath(dirname(pathof(Metida)), "..", "test", "csv", "1fptime.csv"); types = [String, String, Float64, Float64]) |> DataFrame df0 = CSV.File(joinpath(dirname(pathof(Metida)), "..", "test", "csv", "df0.csv"); types = [String, String, String, String,Float64, Float64, Float64]) |> DataFrame @@ -143,3 +144,30 @@ repeated = Metida.VarEffect(Metida.@covstr(period|subject), CustomCovarianceStru ) Metida.fit!(lmm) ``` + +### Custom distance estimation for spatial structures + +If you want to use coordinates or some other structures for distance estimation you can define method [`Metida.edistance`](@ref) to calculate distance: + +```@example lmmexample +function Metida.edistance(mx::AbstractMatrix{<:CartesianIndex}, i::Int, j::Int) + return sqrt((mx[i, 1][1] - mx[j, 1][1])^2 + (mx[i, 1][2] - mx[j, 1][2])^2) +end +``` + +For example this method returns distance between two vectors represented as `CartesianIndex`. + +Make vector of `CartesianIndex`: + +```@example lmmexample +spatdf.ci = map(x -> CartesianIndex(x[:x], x[:y]), eachrow(spatdf)) +``` + +Then use new column as "raw" variable with [`Metida.RawCoding`](@ref) contrast and fit the model: + +```@example lmmexample +lmm = Metida.LMM(@formula(r2 ~ f), spatdf; + repeated = Metida.VarEffect(Metida.@covstr(ci|1), Metida.SPEXP; coding = Dict(:ci => Metida.RawCoding())), + ) +Metida.fit!(lmm) +``` \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index c38faa25..a364593e 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -38,6 +38,10 @@ Implemented covariance structures: Actually Metida can fit datasets with wore than 160k observation and 40k subjects levels on PC with 64 GB RAM. This is not "hard-coded" limitation, but depends on your model and data structure. Fitting of big datasets can take a lot of time. Optimal dataset size is less than 100k observations with maximum length of block less than 400. +!!! note + + Julia v1.8 or higher required. + ## Contents ```@contents diff --git a/src/Metida.jl b/src/Metida.jl index ca1059a6..a86b2143 100644 --- a/src/Metida.jl +++ b/src/Metida.jl @@ -3,7 +3,6 @@ __precompile__() module Metida - using ProgressMeter, LinearAlgebra, ForwardDiff, DiffResults, Random, Optim, LineSearches, MetidaBase#, SparseArrays#, Polyester#, LoopVectorization import StatsBase, StatsModels, Distributions @@ -32,7 +31,7 @@ TOEPH, HeterogeneousToeplitz, TOEPHP, HeterogeneousToeplitzParameterized, SPEXP, SpatialExponential, SPPOW, SpatialPower, -SPGAU, SpatialGaussian, +SPGAU, SpatialGaussian, RawCoding, UN, Unstructured, CovarianceType, fit, fit!, LMM, VarEffect, theta, logreml, m2logreml, thetalength, dof_satter, dof_contain, rankx, caic, lcontrast, typeiii, estimate, contrast, diff --git a/src/fit.jl b/src/fit.jl index ec3b85b2..a6d07882 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -63,7 +63,7 @@ Fit LMM model. * `refitinit` - true/false - if `true` - use last values for initial condition (`false` by default) * `optmethod` - Optimization method. Look at Optim.jl documentation. (Newton by default) * `singtol` - singular tolerance = 1e-8 -* `maxthreads` - maximum threads = num_cores() +* `maxthreads` - maximum threads = min(num_cores(), Threads.nthreads()) """ function fit!(lmm::LMM{T}; kwargs...) where T @@ -87,7 +87,7 @@ function fit!(lmm::LMM{T}; kwargs...) where T :refitinit ∈ kwkeys ? refitinit = kwargs[:refitinit] : refitinit = false :optmethod ∈ kwkeys ? optmethod = kwargs[:optmethod] : optmethod = :default :singtol ∈ kwkeys ? singtol = kwargs[:singtol] : singtol = 1e-8 - :maxthreads ∈ kwkeys ? maxthreads = kwargs[:maxthreads] : maxthreads = num_cores() + :maxthreads ∈ kwkeys ? maxthreads = kwargs[:maxthreads] : maxthreads = min(num_cores(), Threads.nthreads()) # If model was fitted, previous results can be used if `refitinit` == true # Before fitting clear log diff --git a/src/gmat.jl b/src/gmat.jl index c6346c95..452cf9f1 100644 --- a/src/gmat.jl +++ b/src/gmat.jl @@ -148,7 +148,7 @@ function gmat!(mx, θ, ::TOEP_) if s > 1 for n = 2:s @inbounds @simd for m = 1:n-1 - mx[m, n] = de * θ[n-m+1] + mx[m, n] = de * θ[n - m + 1] end end end @@ -177,8 +177,9 @@ function gmat!(mx, θ, ::TOEPH_) end if s > 1 for n = 2:s + @inbounds mxnn = mx[n, n] @inbounds @simd for m = 1:n-1 - mx[m, n] = mx[m, m] * mx[n, n] * θ[n-m+s] + mx[m, n] = mx[m, m] * mxnn * θ[n - m + s] end end end @@ -195,8 +196,9 @@ function gmat!(mx, θ, ct::TOEPHP_) end if s > 1 && ct.p > 1 for m = 1:s - 1 + @inbounds mxmm = mx[m, m] for n = m + 1:(m + ct.p - 1 > s ? s : m + ct.p - 1) - @inbounds mx[m, n] = mx[m, m] * mx[n, n] * θ[n - m + s] + @inbounds mx[m, n] = mxmm * mx[n, n] * θ[n - m + s] end end end @@ -213,8 +215,9 @@ function gmat!(mx, θ, ::UN_) end if s > 1 for n = 2:s + @inbounds mxnn = mx[n, n] @inbounds @simd for m = 1:n - 1 - mx[m, n] = mx[m, m] * mx[n, n] * θ[s + tpnum(m, n, s)] + mx[m, n] = mx[m, m] * mxnn * θ[s + tpnum(m, n, s)] end end end diff --git a/src/reml.jl b/src/reml.jl index 5b8500c9..10c29e51 100644 --- a/src/reml.jl +++ b/src/reml.jl @@ -33,7 +33,7 @@ end ################################################################################ # REML without provided β ################################################################################ -function reml_sweep_β(lmm, data, θ::Vector{T}; maxthreads::Int = 16) where T # Main optimization way - make gradient / hessian analytical / semi-analytical functions +function reml_sweep_β(lmm, data, θ::Vector{T}; maxthreads::Int = 4) where T # Main optimization way - make gradient / hessian analytical / semi-analytical functions n = length(lmm.covstr.vcovblock) N = length(lmm.data.yv) c = (N - lmm.rankx)*log(2π) diff --git a/src/rmat.jl b/src/rmat.jl index 15b01543..7681c7e0 100644 --- a/src/rmat.jl +++ b/src/rmat.jl @@ -167,6 +167,11 @@ end return sqrt(sum) end =# +""" + edistance(mx::AbstractMatrix{T}, i::Int, j::Int) where T + +Distance between vector mx[i, :] and mx[j, :]. +""" function edistance(mx::AbstractMatrix{T}, i::Int, j::Int) where T sum = zero(T) @inbounds for c = 1:size(mx, 2) @@ -304,8 +309,9 @@ function unrmat(θ::AbstractVector{T}, rz) where T end if rm > 1 for m = 1:rm - 1 + @inbounds mxmm = mx[m, m] @inbounds @simd for n = m + 1:rm - mx[m, n] += mx[m, m] * mx[n, n] * θ[rm+tpnum(m, n, rm)] + mx[m, n] += mxmm * mx[n, n] * θ[rm + tpnum(m, n, rm)] end end end diff --git a/src/utils.jl b/src/utils.jl index 64ca46e1..3a8efad4 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -31,7 +31,7 @@ function nterms(rhs::Union{Tuple{Vararg{AbstractTerm}}, Nothing, AbstractTerm}) p end """ - Rerm name. + Term name. """ tname(t::AbstractTerm) = "$(t.sym)" tname(t::InteractionTerm) = join(tname.(t.terms), " & ") @@ -204,8 +204,8 @@ end function logreml(lmm) -m2logreml(lmm)/2 end -function m2logreml(lmm, theta) - reml_sweep_β(lmm, LMMDataViews(lmm), theta)[1] +function m2logreml(lmm, theta; maxthreads::Int = num_cores()) + reml_sweep_β(lmm, LMMDataViews(lmm), theta; maxthreads = maxthreads)[1] end ################################################################################ diff --git a/src/varstruct.jl b/src/varstruct.jl index d87dd76d..49a426a3 100644 --- a/src/varstruct.jl +++ b/src/varstruct.jl @@ -1,5 +1,26 @@ const CType = Union{FunctionTerm{typeof(+), Vector{Term}}, FunctionTerm{typeof(*), Vector{Term}}, FunctionTerm{typeof(&), Vector{Term}}} +import StatsModels: ContrastsMatrix, AbstractContrasts, modelcols + +""" + mutable struct RawCoding <: AbstractContrasts + +Contrast for CategoricalTerm to get column "as it is" for model matrix. +""" +mutable struct RawCoding <: AbstractContrasts +end +function StatsModels.ContrastsMatrix(contrasts::RawCoding, levels::AbstractVector{T}) where T + ContrastsMatrix(ones(1,1), + ["levels"], + levels, + contrasts) +end +function StatsModels.modelcols(t::CategoricalTerm{RawCoding, T, N}, d::NamedTuple) where T where N + #v = d[t.sym] + #reshape(v, length(v), 1) + d[t.sym] +end + ################################################################################ # @covstr macro ################################################################################ @@ -154,7 +175,7 @@ end """ Covarince structure. """ -struct CovStructure{T} <: AbstractCovarianceStructure +struct CovStructure{T, T2} <: AbstractCovarianceStructure # Random effects random::Vector{VarEffect} # Repearted effects @@ -180,7 +201,7 @@ struct CovStructure{T} <: AbstractCovarianceStructure # unit range z column range for each random effect zrndur::Vector{UnitRange{Int}} # repeated effect parametrization matrix - rz::Vector{Matrix{T}} + rz::Vector{Matrix{T2}} # size 2 of z/rz matrix q::Vector{Int} # total number of parameters in each effect @@ -209,7 +230,7 @@ struct CovStructure{T} <: AbstractCovarianceStructure z = Matrix{Float64}(undef, rown, 0) #subjz = Vector{BitMatrix}(undef, alleffl) dicts = Vector{Dict}(undef, alleffl) - # + # unit range z column range for each random effect zrndur = Vector{UnitRange{Int}}(undef, length(random)) # Number of random effects rn = length(random) @@ -218,7 +239,7 @@ struct CovStructure{T} <: AbstractCovarianceStructure # Number of repeated effects rpn = length(repeated) # Z Matrix for repeated effect - rz = Vector{Matrix{Float64}}(undef, rpn) + # rz = Vector{Matrix{Float64}}(undef, rpn) # #Theta parameter type ct = Vector{Symbol}(undef, 0) @@ -239,17 +260,30 @@ struct CovStructure{T} <: AbstractCovarianceStructure if length(random[i].coding) == 0 fill_coding_dict!(random[i].model, random[i].coding, data) end - schema[i] = apply_schema(random[i].model, StatsModels.schema(data, random[i].coding)) - ztemp = modelcols(MatrixTerm(schema[i]), data) - q[i] = size(ztemp, 2) + if isa(random[i].model, ConstantTerm) # if only ConstantTerm in the model - data_ - first is collumn (responce) + data_ = data[[first(keys(data))]] + else + data_ = data[StatsModels.termvars(random[i].model)] # only collumns for model + end + if isa(random[i].covtype.s, ZERO) + schema[i] = InterceptTerm{false}() + zsize = 0 + else + schema[i] = apply_schema(random[i].model, StatsModels.schema(data_, random[i].coding)) + ztemp = modelcols(MatrixTerm(schema[i]), data_) + z = hcat(z, ztemp) + zsize = size(ztemp, 2) + end + + q[i] = zsize csp = covstrparam(random[i].covtype.s, q[i]) t[i] = sum(csp) - z = hcat(z, ztemp) + fillur!(zrndur, i, q) fillur!(tr, i, t) symbs = StatsModels.termvars(random[i].subj) if length(symbs) > 0 - cdata = tabcols(data, symbs) # Tuple(Tables.getcolumn(Tables.columns(data), x) for x in symbs) + cdata = tabcols(data, symbs) # Tuple(Tables.getcolumn(Tables.columns(data_), x) for x in symbs) dicts[i] = Dict{Tuple{eltype.(cdata)...}, Vector{Int}}() indsdict!(dicts[i], cdata) else @@ -261,35 +295,44 @@ struct CovStructure{T} <: AbstractCovarianceStructure append!(emap, fill(i, t[i])) rtn += t[i] end - + + rz_ = Vector{Matrix}(undef, rpn) # REPEATED EFFECTS for i = 1:length(repeated) if length(repeated[i].coding) == 0 fill_coding_dict!(repeated[i].model, repeated[i].coding, data) end - - schema[rn+i] = apply_schema(repeated[i].model, StatsModels.schema(data, repeated[i].coding)) - rz[i] = modelcols(MatrixTerm(schema[rn+i]), data) + if isa(repeated[i].model, ConstantTerm) # if only ConstantTerm in the model - data_ - first is collumn (responce) + data_ = data[[first(keys(data))]] + else + data_ = data[StatsModels.termvars(repeated[i].model)] # only collumns for model + end + + schema[rn + i] = apply_schema(repeated[i].model, StatsModels.schema(data_, repeated[i].coding)) + #rz_[i] = reduce(hcat, modelcols(schema[rn+i], data)) + rz_[i] = modelcols(MatrixTerm(schema[rn+i]), data_) symbs = StatsModels.termvars(repeated[i].subj) if length(symbs) > 0 - cdata = tabcols(data, symbs) # Tuple(Tables.getcolumn(Tables.columns(data), x) for x in symbs) - dicts[rn+i] = Dict{Tuple{eltype.(cdata)...}, Vector{Int}}() - indsdict!(dicts[rn+i], cdata) + cdata = tabcols(data, symbs) # Tuple(Tables.getcolumn(Tables.columns(data), x) for x in symbs) + dicts[rn + i] = Dict{Tuple{eltype.(cdata)...}, Vector{Int}}() + indsdict!(dicts[rn + i], cdata) else dicts[rn+i] = Dict(1 => collect(1:rown)) #changed to range end - sn[rn+i] = length(dicts[rn+i]) - q[rn+i] = size(rz[i], 2) + sn[rn + i] = length(dicts[rn+i]) + q[rn + i] = size(rz_[i], 2) csp = covstrparam(repeated[i].covtype.s, q[rn+i]) - t[rn+i] = sum(csp) - tr[rn+i] = UnitRange(sum(t[1:rn+i-1]) + 1, sum(t[1:rn+i-1]) + t[rn+i]) + t[rn + i] = sum(csp) + tr[rn + i] = UnitRange(sum(t[1:rn+i-1]) + 1, sum(t[1:rn+i-1]) + t[rn+i]) updatenametype!(ct, rcnames, csp, schema[rn+i], repeated[i].covtype.s) - # emap append!(emap, fill(rn+i, t[rn+i])) end + T2 = typejoin(eltype.(rz_)...) + rz = Vector{Matrix{T2}}(undef, rpn) + rz .= rz_ # Theta length tl = sum(t) ######################################################################## @@ -351,7 +394,7 @@ struct CovStructure{T} <: AbstractCovarianceStructure end esb = EffectSubjectBlock(sblock, nblock) ####################################################################### - new{eltype(z)}(random, repeated, schema, rcnames, blocks, rn, rtn, rpn, z, esb, zrndur, rz, q, t, tr, tl, ct, emap, sn, maxn) + new{eltype(z), T2}(random, repeated, schema, rcnames, blocks, rn, rtn, rpn, z, esb, zrndur, rz, q, t, tr, tl, ct, emap, sn, maxn) end end ############################################################################### diff --git a/test/devtest.jl b/test/devtest.jl index d0650e47..d70c1149 100644 --- a/test/devtest.jl +++ b/test/devtest.jl @@ -7,7 +7,7 @@ using DataFrames, CSV, StatsModels, LinearAlgebra, ForwardDiff, ForwardDiff, Opt using BenchmarkTools path = dirname(@__FILE__) cd(path) -df0 = CSV.File(path*"/csv/df0.csv"; types = [String, String, String, String, Float64, Float64]) |> DataFrame +df0 = CSV.File(path*"/csv/df0.csv"; types = [String, String, String, String, Float64, Float64, Float64]) |> DataFrame df1 = CSV.File(path*"/csv/df1.csv"; types = [String, String, String, String, Float64, Float64]) |> DataFrame ftdf = CSV.File(path*"/csv/1fptime.csv"; types = [String, String, Float64, Float64]) |> DataFrame ftdf2 = CSV.File(path*"/csv/1freparma.csv"; types = [String, String, Float64, Float64]) |> DataFrame @@ -342,3 +342,20 @@ BenchmarkTools.Trial: 2 samples with 1 evaluation. Memory estimate: 8.31 GiB, allocs estimate: 365031. =# + +#= +lmm = Metida.LMM(@formula(r2 ~ f), spatdf; +repeated = Metida.VarEffect(Metida.@covstr(x+y|1), Metida.SPEXP), +) +@benchmark Metida.fit!($lmm, hes = false; maxthreads = 16) seconds = 15 + + +spatdf.ci = map(x -> CartesianIndex(x[:x], x[:y]), eachrow(spatdf)) +function Metida.edistance(mx::AbstractMatrix{<:CartesianIndex}, i::Int, j::Int) + return sqrt((mx[i, 1][1] - mx[j, 1][1])^2 + (mx[i, 1][2] - mx[j, 1][2])^2) +end +lmm = Metida.LMM(@formula(r2 ~ f), spatdf; +repeated = Metida.VarEffect(Metida.@covstr(ci|1), Metida.SPEXP; coding = Dict(:ci => Metida.RawCoding())), +) +@benchmark Metida.fit!($lmm, hes = false; maxthreads = 16) seconds = 15 +=# \ No newline at end of file diff --git a/test/profile.pb.gz b/test/profile.pb.gz new file mode 100644 index 00000000..fd252a96 Binary files /dev/null and b/test/profile.pb.gz differ diff --git a/test/test.jl b/test/test.jl index 54bc17cb..afcf3bea 100644 --- a/test/test.jl +++ b/test/test.jl @@ -818,7 +818,6 @@ end @testset " Experimental " begin - io = IOBuffer(); lmm = Metida.LMM(@formula(r2 ~ f), spatdf; repeated = Metida.VarEffect(Metida.@covstr(x+y|1), Metida.SPEXP), @@ -827,6 +826,19 @@ end @test Metida.m2logreml(lmm) ≈ 1985.3417397854946 atol=1E-6 @test Metida.dof_satter(lmm)[1] ≈ 10.261390893063432 atol=1E-2 + + spatdf.ci = map(x -> CartesianIndex(x[:x], x[:y]), eachrow(spatdf)) + function Metida.edistance(mx::AbstractMatrix{<:CartesianIndex}, i::Int, j::Int) + return sqrt((mx[i, 1][1] - mx[j, 1][1])^2 + (mx[i, 1][2] - mx[j, 1][2])^2) + end + lmm = Metida.LMM(@formula(r2 ~ f), spatdf; + repeated = Metida.VarEffect(Metida.@covstr(ci|1), Metida.SPEXP; coding = Dict(:ci => Metida.RawCoding())), + ) + Metida.fit!(lmm) + @test Metida.m2logreml(lmm) ≈ 1985.3417397854946 atol=1E-6 + @test Metida.dof_satter(lmm)[1] ≈ 10.261390893063432 atol=1E-2 + + lmm = Metida.LMM(@formula(r2 ~ f), spatdf; repeated = Metida.VarEffect(Metida.@covstr(x+y|1), Metida.SPPOW), )