From c265c4ea5f67ee6c1b284512df483f6db48cb76a Mon Sep 17 00:00:00 2001
From: PharmCat <v.s.arnautov@yandex.ru>
Date: Fri, 23 Dec 2022 00:57:59 +0300
Subject: [PATCH 1/2] minor optimizations

---
 src/fit.jl           |   7 ++-
 src/gmat.jl          |   6 +++
 src/linearalgebra.jl | 124 +++++++++++++++++++++++++++++++++++++------
 src/reml.jl          |  45 +++++++++-------
 src/rmat.jl          |   4 ++
 src/sweep.jl         |  18 ++++---
 6 files changed, 159 insertions(+), 45 deletions(-)

diff --git a/src/fit.jl b/src/fit.jl
index 789523c0..23a9765d 100644
--- a/src/fit.jl
+++ b/src/fit.jl
@@ -59,9 +59,10 @@ Fit LMM model.
 * `io` - output IO
 * `time_limit` - time limit = 120 sec
 * `iterations` - maximum iterations = 300
-* `refitinit` - true/false - if true - use last values for initial condition
+* `refitinit` - true/false - if `true` - use last values for initial condition (`false` by default)
 * `optmethod` - Optimization method. Look at Optim.jl documentation. (Newton by default)
 * `singtol` - singular tolerance = 1e-8
+* `maxthreads` - maximum threads = num_cores()
 
 """
 function fit!(lmm::LMM{T}; kwargs...) where T
@@ -85,6 +86,7 @@ function fit!(lmm::LMM{T}; kwargs...) where T
     :refitinit ∈ kwkeys ? refitinit = kwargs[:refitinit] : refitinit = false
     :optmethod ∈ kwkeys ? optmethod = kwargs[:optmethod] : optmethod = :default
     :singtol ∈ kwkeys ? singtol = kwargs[:singtol] : singtol = 1e-8
+    :maxthreads ∈ kwkeys ? maxthreads = kwargs[:maxthreads] : maxthreads = num_cores()
 
     # If model was fitted, previous results can be used if `refitinit` == true
     # Before fitting clear log
@@ -159,7 +161,8 @@ function fit!(lmm::LMM{T}; kwargs...) where T
     varlinkrvecapply!(θ, lmm.covstr.ct; varlinkf = varlinkf, rholinkf = rholinkf)
 
     # Twice differentiable object
-    vloptf(x) = optfunc(lmm, lmm.dv, varlinkvecapply(x, lmm.covstr.ct; varlinkf = varlinkf, rholinkf = rholinkf))[1]
+    vloptf(x) = optfunc(lmm, lmm.dv, varlinkvecapply(x, lmm.covstr.ct; varlinkf = varlinkf, rholinkf = rholinkf); syrkblas = false, maxthreads = maxthreads)[1]
+    #vloptfd(x) = optfunc(lmm, lmm.dv, varlinkvecapply(x, lmm.covstr.ct; varlinkf = varlinkf, rholinkf = rholinkf); maxthreads = maxthreads)[1]
 
     gcfg   = ForwardDiff.GradientConfig(vloptf, θ, chunk)
     hcfg   = ForwardDiff.HessianConfig(vloptf, θ, chunk)
diff --git a/src/gmat.jl b/src/gmat.jl
index 6571c4dd..c1a7e973 100644
--- a/src/gmat.jl
+++ b/src/gmat.jl
@@ -248,6 +248,12 @@ function tpnum(m, n, s)
     end
     b -= s - n
 end
+#=
+function tpnum(m, n, s)
+    div(m*(2s - 1 - m), 2) - s + n 
+end
+=#
+
 
 ################################################################################
 
diff --git a/src/linearalgebra.jl b/src/linearalgebra.jl
index 02fb437a..11841b63 100644
--- a/src/linearalgebra.jl
+++ b/src/linearalgebra.jl
@@ -26,6 +26,23 @@ end
 
 Change θ (only upper triangle). B is symmetric.
 """
+function mulαβαtinc!(θ::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix)
+    # Validate shapes before the `@inbounds` update below (same rule as the method taking `alpha`).
+    if  !(size(B, 1) == size(B, 2) == size(A, 2)) || !(size(A, 1) == size(θ, 1) == size(θ, 2)) throw(ArgumentError("Wrong dimensions!")) end
+    axb  = axes(B, 1)
+    sa   = size(A, 1)
+    for j ∈ axb, i ∈ axb # `j` is the outer loop, `i` the inner one
+        Bij = B[i, j]
+        for n ∈ 1:sa
+            Anj = A[n, j]
+            for m ∈ 1:n # m ≤ n: only the upper triangle of θ is updated
+                @inbounds θ[m, n] +=  A[m, i] * Bij * Anj
+            end
+        end
+    end
+    θ
+end
+#=
 function mulαβαtinc!(θ::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix)
     axb  = axes(B, 1)
     sa   = size(A, 1)
@@ -40,12 +57,30 @@ function mulαβαtinc!(θ::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix
     end
     θ
 end
-
+=#
 """
 θ + A * B * A' * alpha
 
 Change θ (only upper triangle). B is symmetric.
 """
+function mulαβαtinc!(θ::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, alpha)
+    # B must be square with as many rows as A has columns; θ must be square with as many rows as A.
+    (size(B, 1) == size(B, 2) == size(A, 2) && size(A, 1) == size(θ, 1) == size(θ, 2)) || throw(ArgumentError("Wrong dimentions!"))
+    nrow = size(A, 1)
+    # Accumulate the products A[m, row] * B[row, col] * A[n, col] * alpha term by term.
+    for col ∈ axes(B, 1), row ∈ axes(B, 1)
+        @inbounds bval = B[row, col]
+        for n ∈ 1:nrow
+            @inbounds aval = A[n, col]
+            # Only entries with m ≤ n (the upper triangle of θ) are touched.
+            for m ∈ 1:n
+                @inbounds θ[m, n] +=  A[m, row] * bval * aval * alpha
+            end
+        end
+    end
+    θ
+end
+#=
 function mulαβαtinc!(θ::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, alpha)
     if  !(size(B, 1) == size(B, 2) == size(A, 2)) || !(size(A, 1) == size(θ, 1) == size(θ, 2)) throw(ArgumentError("Wrong dimentions!")) end
     axb  = axes(B, 1)
@@ -61,26 +96,42 @@ function mulαβαtinc!(θ::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix
     end
     θ
 end
-
+=#
 """
 θ + A * B * (a - b) * alpha
 
 Change θ (only upper triangle). B is symmetric.
 """
+function mulαβαtinc!(θ::AbstractVector{T}, A::AbstractMatrix, B::AbstractMatrix, a::AbstractVector, b::AbstractVector, alpha) where T
+    if !(size(B, 2) == length(a) == length(b)) || size(B, 1) != size(A, 2) || size(A, 1) != length(θ) throw(ArgumentError("Wrong dimentions.")) end
+    sa   = size(A, 1)
+    # `i` indexes columns of B (and entries of a, b); `j` indexes rows of B (columns of A).
+    for i ∈ axes(B, 2)
+        abi = a[i] - b[i]
+        for j ∈ axes(B, 1)
+            for m ∈ 1:sa
+                @inbounds θ[m] +=  A[m, j] * B[j, i] * abi * alpha
+            end
+        end
+    end
+    θ
+end
+#=
 function mulαβαtinc!(θ::AbstractVector, A::AbstractMatrix, B::AbstractMatrix, a::AbstractVector, b::AbstractVector, alpha)
     if !(size(B, 2) == length(a) == length(b)) || size(B, 1) != size(A, 2) || size(A, 1) != length(θ) throw(ArgumentError("Wrong dimentions.")) end
     axb  = axes(B, 1)
     sa   = size(A, 1)
     for m ∈ 1:sa
         for j ∈ axb
+            Amj = A[m, j]
             for i ∈ axb
-                @inbounds θ[m] +=  A[m, j] * B[j, i] * (a[i] - b[i]) * alpha
+                @inbounds θ[m] +=  Amj * B[j, i] * (a[i] - b[i]) * alpha
             end
         end
     end
     θ
 end
-
+=#
 """
     mulθ₃(y, X, β, V::AbstractMatrix{T})::T where T
 
@@ -95,40 +146,78 @@ function mulθ₃(y, X, β, V::AbstractArray{T}) where T # check for optimizatio
 
     if q == 1
         cs = zero(T)
-        #=@turbo=# @inbounds  for m in 1:p
+        for m in 1:p
+            @inbounds cs += X[1, m] * β[m]
+        end
+        return -V[1, 1] * (y[1] - cs)^2
+    end
+    c = zeros(T, q)
+    @simd for m = 1:p
+        βm = β[m]
+        @simd for n = 1:q
+            @inbounds c[n] += X[n, m] * βm
+        end
+    end
+    @simd for m = 2:q
+        ycm = y[m] - c[m]
+        @simd for n = 1:m-1
+            @inbounds θ -= V[n, m] * (y[n] - c[n]) * ycm * 2
+        end
+    end
+    @simd for m = 1:q
+        @inbounds θ -= V[m, m] * (y[m] - c[m]) ^ 2
+    end
+    return θ
+end
+#=
+function mulθ₃(y, X, β, V::AbstractArray{T}) where T # check for optimization
+    q = size(V, 1)
+    p = size(X, 2)
+    θ = zero(T)
+
+    if q == 1
+        cs = zero(T)
+        @inbounds  for m in 1:p
             cs += X[1, m] * β[m]
         end
         return -V[1, 1] * (y[1] - cs)^2
     end
 
     c = zeros(T, q)
-    #=@turbo=# @inbounds  for n = 1:q, m = 1:p
-        c[n] += X[n, m] * β[m]
+    for n = 1:q
+        for m = 1:p
+            c[n] += X[n, m] * β[m]
+        end
     end
 
-    @simd for n = 1:q
+    @simd for n = 1:q-1
+        ycn = y[n] - c[n]
         @simd for m = n+1:q
-            @inbounds θ -= V[n, m] * (y[n] - c[n]) * (y[m] - c[m]) * 2
+            @inbounds θ -= V[n, m] * ycn * (y[m] - c[m]) * 2
         end
     end
-    #=@turbo=# @inbounds  for m = 1:q
+    @inbounds  for m = 1:q
         θ -= V[m, m] * (y[m] - c[m]) ^ 2
     end
 
     return θ
 end
-
+=#
 """
 θ + A' * b
 
 Change θ.
 """
-function mulαtβinc!(θ::AbstractVector, A::AbstractMatrix, b::AbstractVector)
+function mulαtβinc!(θ::AbstractVector{T}, A::AbstractMatrix, b::AbstractVector) where T
     q = size(A, 1)
     if q != length(b) throw(DimensionMismatch("size(A, 1) should be equal length(b)")) end
     p = size(A, 2)
-    #=@turbo=# @inbounds  for m in 1:q, n in 1:p
-        θ[n] += b[m] * A[m, n]
+    for n in 1:p
+        θn = zero(T)
+        for m in 1:q
+            @inbounds θn += b[m] * A[m, n]
+        end
+        @inbounds θ[n] += θn
     end
     θ
 end
@@ -137,9 +226,10 @@ end
 
 @inline function tmul_unsafe(rz, θ::AbstractVector{T}) where T
     vec = zeros(T, size(rz, 1))
-    #=@turbo=# for r ∈ axes(rz, 1)
-        for i ∈ axes(rz, 2)
-            @inbounds vec[r] += rz[r, i] * θ[i]
+    for i ∈ axes(rz, 2)
+        θi = θ[i]
+        for r ∈ axes(rz, 1)
+            @inbounds vec[r] += rz[r, i] * θi
         end
     end
     vec
diff --git a/src/reml.jl b/src/reml.jl
index 9e6f0548..ab7be2c0 100644
--- a/src/reml.jl
+++ b/src/reml.jl
@@ -34,7 +34,7 @@ end
 #                     REML without provided β
 ################################################################################
 
-function reml_sweep_β(lmm, data, θ::Vector{T}; syrkblas::Bool = false) where T # Main optimization way - make gradient / hessian analytical / semi-analytical functions
+function reml_sweep_β(lmm, data, θ::Vector{T}; syrkblas::Bool = false, maxthreads::Int = 16) where T # Main optimization way - make gradient / hessian analytical / semi-analytical functions
     n             = length(lmm.covstr.vcovblock)
     N             = length(lmm.data.yv)
     c             = (N - lmm.rankx)*log(2π)
@@ -48,7 +48,7 @@ function reml_sweep_β(lmm, data, θ::Vector{T}; syrkblas::Bool = false) where T
     gvec          = gmatvec(θ, lmm.covstr)
     rθ            = θ[lmm.covstr.tr[end]] # R part of θ
     noerror       = true
-        ncore     = min(num_cores(), n, 16)
+        ncore     = min(num_cores(), n, maxthreads)
         accθ₁     = zeros(T, ncore)
         accθ₂     = Vector{Matrix{T}}(undef, ncore)
         accβm     = Vector{Vector{T}}(undef, ncore)
@@ -71,14 +71,16 @@ function reml_sweep_β(lmm, data, θ::Vector{T}; syrkblas::Bool = false) where T
                 i    =  offset + j
                 q    = length(lmm.covstr.vcovblock[i])
                 qswm = q + lmm.rankx
-                Vp   = Matrix{T}(undef, qswm, qswm)
+                Vp   = zeros(T, qswm, qswm)
+                #Vp   = Matrix{T}(undef, qswm, qswm)
+                #fillzeroutri!(Vp)
                 #Vp   = view(Vpt[t], qswm, qswm)
                 V    = view(Vp, 1:q, 1:q)
                 Vx   = view(Vp, 1:q, q+1:qswm)
                 Vc   = view(Vp, q+1:qswm, q+1:qswm)
-                fillzeroutri!(V)
+                #fillzeroutri!(V)
                 copyto!(Vx, data.xv[i])
-                fillzeroutri!(Vc)
+                #fillzeroutri!(Vc)
             #-------------------------------------------------------------------
             # Make V matrix
                 vmatrix!(V, gvec, rθ, lmm, i)
@@ -91,7 +93,8 @@ function reml_sweep_β(lmm, data, θ::Vector{T}; syrkblas::Bool = false) where T
             #-----------------------------------------------------------------------
                 if ne == false erroracc[t] = false end
                 accθ₁[t] += swr
-                subutri!(accθ₂[t], Vc)
+                #subutri!(accθ₂[t], Vc)
+                accθ₂[t] .-= Vc
                 mulαtβinc!(accβm[t], Vx, data.yv[i])
             end
             #-----------------------------------------------------------------------
@@ -126,8 +129,8 @@ end
 #                     REML with provided β
 ################################################################################
 
-function core_sweep_β(lmm, data, θ::Vector{T}, β, n) where T
-    ncore     = min(num_cores(), n, 16)
+function core_sweep_β(lmm, data, θ::Vector{T}, β, n; maxthreads::Int = 16) where T
+    ncore     = min(num_cores(), n, maxthreads)
     accθ₁     = zeros(T, ncore)
     accθ₂     = Vector{Matrix{T}}(undef, ncore)
     accθ₃     = zeros(T, ncore)
@@ -142,31 +145,33 @@ function core_sweep_β(lmm, data, θ::Vector{T}, β, n) where T
             i =  offset + j
             q    = length(lmm.covstr.vcovblock[i])
             qswm = q + lmm.rankx
-            Vp   = Matrix{T}(undef, qswm, qswm)
+            Vp   = zeros(T, qswm, qswm)
+            #Vp   = Matrix{T}(undef, qswm, qswm)
             V    = view(Vp, 1:q, 1:q)
             Vx   = view(Vp, 1:q, q+1:qswm)
             Vc   = view(Vp, q+1:qswm, q+1:qswm)
-            fillzeroutri!(V)
+            #fillzeroutri!(V)
             copyto!(Vx, data.xv[i])
-            fillzeroutri!(Vc)
+            #fillzeroutri!(Vc)
             vmatrix!(V, gvec, rθ, lmm, i)
             #-----------------------------------------------------------------------
             swm, swr, ne = sweepb!(Vector{T}(undef, qswm), Vp, 1:q; logdet = true)
             #-----------------------------------------------------------------------
             if ne == false erroracc[t] = false end
             accθ₁[t] += swr
-            subutri!(accθ₂[t], Vc)
+            #subutri!(accθ₂[t], Vc)
+            accθ₂[t] .-= Vc
             accθ₃[t]  += mulθ₃(data.yv[i], data.xv[i], β, V)
         end
     end
     sum(accθ₁), sum(accθ₂), sum(accθ₃), all(erroracc)
 end
 
-function reml_sweep_β(lmm, data, θ::Vector{T}, β) where T
+function reml_sweep_β(lmm, data, θ::Vector{T}, β; kwargs...) where T
     n             = length(lmm.covstr.vcovblock)
     N             = length(lmm.data.yv)
     c             = (N - lmm.rankx)*log(2π)
-    θ₁, θ₂, θ₃, noerror = core_sweep_β(lmm, data, θ::Vector{T}, β, n)
+    θ₁, θ₂, θ₃, noerror = core_sweep_β(lmm, data, θ::Vector{T}, β, n; kwargs...)
     θs₂      = Symmetric(θ₂)
     logdetθ₂ = logdet(θs₂)
     return   θ₁ + logdetθ₂ + θ₃ + c, θs₂, θ₃, noerror #REML, iC, θ₃
@@ -174,21 +179,21 @@ end
 ################################################################################
 #                     REML AI-like / scoring part
 ################################################################################
-function sweep_ai(lmm, data, θ, β)
+function sweep_ai(lmm, data, θ, β; kwargs...)
     n                   = length(lmm.covstr.vcovblock)
-    θ₁, θ₂, θ₃, noerror = core_sweep_β(lmm, data, θ, β, n)
+    θ₁, θ₂, θ₃, noerror = core_sweep_β(lmm, data, θ, β, n; kwargs...)
     return  θ₃
 end
-function sweep_score(lmm, data, θ, β)
+function sweep_score(lmm, data, θ, β; kwargs...)
     n                   = length(lmm.covstr.vcovblock)
-    θ₁, θ₂, θ₃, noerror = core_sweep_β(lmm, data, θ, β, n)
+    θ₁, θ₂, θ₃, noerror = core_sweep_β(lmm, data, θ, β, n; kwargs...)
     return -θ₁ + θ₃
 end
 ################################################################################
 #                     variance-covariance matrix of β
 ################################################################################
-function sweep_β_cov(lmm, data, θ, β)
+function sweep_β_cov(lmm, data, θ, β; kwargs...)
     n                   = length(lmm.covstr.vcovblock)
-    θ₁, θ₂, θ₃, noerror = core_sweep_β(lmm, data, θ, β, n)
+    θ₁, θ₂, θ₃, noerror = core_sweep_β(lmm, data, θ, β, n; kwargs...)
     return Symmetric(θ₂)
 end
diff --git a/src/rmat.jl b/src/rmat.jl
index ad06ed15..d4222888 100644
--- a/src/rmat.jl
+++ b/src/rmat.jl
@@ -3,11 +3,14 @@
 ################################################################################
 
 ################################################################################
+#=
 function rmat_base_inc_b!(mx, θ, zrv, covstr)
     rmat!(mx, θ, zrv, covstr.repeated.covtype.s)
 end
+=#
 ################################################################################
 ################################################################################
+#=
 function rmat_base_inc!(mx, θ, covstr, block, sblock)
     zblock    = view(covstr.rz, block, :)
     @simd for i ∈ axes(sblock[end], 1)
@@ -15,6 +18,7 @@ function rmat_base_inc!(mx, θ, covstr, block, sblock)
     end
     mx
 end
+=#
 @noinline function rmat_base_inc!(mx, θ, covstr, bi)
     en        = covstr.rn + 1
     block     = covstr.vcovblock[bi]
diff --git a/src/sweep.jl b/src/sweep.jl
index 4ac56e38..3e614053 100644
--- a/src/sweep.jl
+++ b/src/sweep.jl
@@ -4,11 +4,17 @@
 
 function nsyrk!(α, x, A)
     p = checksquare(A)
-    for i in 1:p, j in i:p
-        @inbounds A[i,j] += α * x[i] * x[j]
+    for j in 1:p
+        xj = x[j]
+        for i in 1:j 
+            @inbounds A[i, j] += α * x[i] * xj
+        end
     end
     A
 end
+function nsyrk!(α, x, A::StridedMatrix{T}) where T <: Union{Float32, Float64}
+    BLAS.syrk!('U', 'N', α, x, one(T), A) # BLAS path only for strided Float32/Float64; anything else falls back to the generic loop
+end
 #=
 function nsyrk!(α::T, x::AbstractArray{<:T}, A::StridedMatrix{T}) where {T<:Union{LinearAlgebra.BlasFloat, LinearAlgebra.BlasComplex}}
     nt = LinearAlgebra.BLAS.get_num_threads()
@@ -34,11 +40,11 @@ function sweepb!(akk::AbstractArray{T, 1}, A::AbstractArray{T, 2}, k::Integer, i
     #Rank-k update of the symmetric matrix C as alpha*A*transpose(A) + beta*C
     #or alpha*transpose(A)*A + beta*C according to trans.
     #Only the uplo triangle of C is used. Returns C.
-    if syrkblas
-        BLAS.syrk!('U', 'N', -d, akk, one(T), A)
-    else
+    #if syrkblas
+    #    BLAS.syrk!('U', 'N', -d, akk, one(T), A)
+    #else
         nsyrk!(-d, akk, A)
-    end
+    #end
 
     rmul!(akk, d * (-one(T)) ^ inv)
     @simd for i in 1:(k-1)

From 6ae8d1401b9a0a5d2a162c393dba72d8fd885527 Mon Sep 17 00:00:00 2001
From: PharmCat <v.s.arnautov@yandex.ru>
Date: Fri, 23 Dec 2022 00:59:36 +0300
Subject: [PATCH 2/2] update

---
 Project.toml | 2 +-
 change.log   | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index fb1427c9..c840e747 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,7 +3,7 @@ uuid = "a1dec852-9fe5-11e9-361f-8d9fde67cfa2"
 keywords = ["lenearmodel", "mixedmodel"]
 desc = "Mixed-effects models with flexible covariance structure."
 authors = ["Vladimir Arnautov <mail@pharmcat.net>"]
-version = "0.14.1"
+version = "0.14.2"
 
 [deps]
 DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
diff --git a/change.log b/change.log
index be96ee67..da98d265 100644
--- a/change.log
+++ b/change.log
@@ -1,3 +1,7 @@
+v0.14.2
+  * minor optimizations and changes
+  * potential bug fix
+
 v0.14.1
   * docs
   * test