From 2026064a89a5bfa567c981593dd661e206e76bc6 Mon Sep 17 00:00:00 2001 From: PetersBas <1.bas.peters@gmail.com> Date: Fri, 28 Jan 2022 12:54:40 -0800 Subject: [PATCH] improve parallel version, simplify some code, some memory reduction and speedup --- examples/projection_intersection_2D.jl | 32 +- examples/projection_intersection_3D.jl | 24 +- src/CDS_MVp.jl | 64 ++- src/CDS_MVp_MT.jl | 2 +- src/CDS_MVp_MT_subfunc.jl | 15 +- src/CDS_scaled_add!.jl | 4 +- src/PARSDMM.jl | 200 ++++----- src/PARSDMM_initialize.jl | 564 ++++++++++++------------- src/adapt_rho_gamma.jl | 210 ++++----- src/adapt_rho_gamma_parallel.jl | 190 ++++----- src/cg.jl | 16 +- src/compute_relative_feasibility.jl | 22 +- src/projectors/project_l1_Duchi!.jl | 10 +- src/rhs_compose.jl | 29 +- src/update_y_l.jl | 56 +-- src/update_y_l_parallel.jl | 49 +-- test/runtests.jl | 2 +- 17 files changed, 781 insertions(+), 708 deletions(-) diff --git a/examples/projection_intersection_2D.jl b/examples/projection_intersection_2D.jl index d47c47a..36119c4 100755 --- a/examples/projection_intersection_2D.jl +++ b/examples/projection_intersection_2D.jl @@ -36,14 +36,14 @@ elseif options.FL==Float32 end #load image to project -file = matopen(joinpath(dirname(pathof(SetIntersectionProjection)), "../examples/Data/compass_velocity.mat")) -m = read(file, "Data");close(file) -m = m[1:341,200:600] -m = permutedims(m,[2,1]) +file = matopen(joinpath(dirname(pathof(SetIntersectionProjection)), "../examples/Data/compass_velocity.mat")); +m = read(file, "Data");close(file); +m = m[1:341,200:600]; +m = permutedims(m,[2,1]); #set up computational grid (25 and 6 m are the original distances between grid points) comp_grid = compgrid((TF(25.0), TF(6.0)),(size(m,1), size(m,2))) -m = convert(Vector{TF},vec(m)) +m = convert(Vector{TF},vec(m)); #define axis limits and colorbar limits xmax = comp_grid.d[1]*comp_grid.n[1] @@ -55,13 +55,13 @@ vma = 4500 constraint = Vector{SetIntersectionProjection.set_definitions}() #bounds: -m_min = 1500.0 +m_min = 1480.0 m_max = 4500.0 set_type = "bounds" TD_OP = "identity" app_mode = ("matrix","") custom_TD_OP = ([],false) -push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)) +push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)); #slope constraints (vertical) m_min = 0.0 @@ -70,11 +70,11 @@ set_type = "bounds" TD_OP = "D_z" app_mode = ("matrix","") custom_TD_OP = ([],false) -push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)) +push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)); options.parallel = false -(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL) -(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options) +(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL); +(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options); println("") println("PARSDMM serial (bounds and bounds on D_z):") @@ -103,14 +103,17 @@ tight_layout() #tight_layout(pad=0.0, w_pad=0.0, h_pad=1.0) savefig("PARSDMM_logs.png",bbox_inches="tight") +#print timings in terminal +log_PARSDMM.timing + println("") println("PARSDMM parallel (bounds and bounds on D_z):") options.parallel = true -(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL) -(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options) +(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL); 
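#------------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): log_PARSDMM.timing
# holds the TimerOutputs.TimerOutput used by the @timeit sections inside PARSDMM
# ("form rhs for linear system", "argmin x", "argmin y and l update",
# "adjust rho and gamma", "Q-update"); the two "#print timings in terminal" lines
# added above display it after a run. A minimal usage sketch, assuming
# TimerOutputs.jl is available in the environment:
#------------------------------------------------------------------------------
using TimerOutputs                 # brings print_timer into scope
print_timer(log_PARSDMM.timing)    # pretty-printed table of PARSDMM's timed sections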
+(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options); @time (x2,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); -#@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); +@time (x2,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); #@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); #plot @@ -119,6 +122,9 @@ subplot(2,1,1);imshow(permutedims(reshape(m,(comp_grid.n[1],comp_grid.n[2])),[2, subplot(2,1,2);imshow(permutedims(reshape(x2,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("Projection (bounds and bounds on D_z)") savefig("projected_model_ParallelPARSDMM.png",bbox_inches="tight") +#print timings in terminal +log_PARSDMM.timing + # #use multilevel-serial (2-levels) # options.parallel = false diff --git a/examples/projection_intersection_3D.jl b/examples/projection_intersection_3D.jl index bbfa714..8e10726 100644 --- a/examples/projection_intersection_3D.jl +++ b/examples/projection_intersection_3D.jl @@ -3,7 +3,7 @@ # Bas Peters, 2017 using Distributed -using LinearAlgebra +@everywhere using LinearAlgebra @everywhere using SetIntersectionProjection using HDF5 using PyPlot @@ -17,7 +17,7 @@ end if ~isfile("overthrust_3D_true_model.h5") run(`wget ftp://slim.gatech.edu/data/SoftwareRelease/WaveformInversion.jl/3DFWI/overthrust_3D_true_model.h5`) end -n, d, o, m = read(h5open("overthrust_3D_true_model.h5","r"), "n", "d", "o", "m") +n, d, o, m = read(h5open("overthrust_3D_true_model.h5","r"), "n", "d", "o", "m"); m .= 1000.0 ./ sqrt.(m); m = m[50:200,50:200,:]; @@ -36,7 +36,7 @@ options.Blas_active = true options.maxit = 500 set_zero_subnormals(true) -BLAS.set_num_threads(4) +@everywhere BLAS.set_num_threads(4) #select working precision if options.FL==Float64 @@ -68,7 +68,7 @@ set_type = "bounds" TD_OP = "identity" app_mode = ("matrix","") custom_TD_OP = ([],false) -push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)) +push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)); #slope constraints (vertical) m_min = 0.0 @@ -77,11 +77,11 @@ set_type = "bounds" TD_OP = "D_z" app_mode = ("matrix","") custom_TD_OP = ([],false) -push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)) +push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)); options.parallel = false -(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL) -(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options) +(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL); +(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options); println("") println("PARSDMM serial (bounds and bounds on D_z):") @@ -133,12 +133,12 @@ println("PARSDMM multilevel-serial (bounds and bounds on D_z):") println("") println("PARSDMM parallel (bounds and bounds on D_z):") options.parallel = true -(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL) -(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options) +(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL); +(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options); -@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options) -@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options) -@time 
(x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options) +@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); +@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); +@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); #use multilevel-parallel (2-levels) options.parallel = true diff --git a/src/CDS_MVp.jl b/src/CDS_MVp.jl index 9299417..4f5c190 100755 --- a/src/CDS_MVp.jl +++ b/src/CDS_MVp.jl @@ -1,4 +1,4 @@ -export CDS_MVp +export CDS_MVp, CDS_MVp2, CDS_MVp3, CDS_MVp4 """ compute single-thread matrix vector product with vector x, output is vector y: y=A*x @@ -7,15 +7,15 @@ R is a tall matrix N by ndiagonals, corresponding to a square matrix A offsets indicate offset of diagonal compared to the main diagonal in A (which is 0) """ function CDS_MVp( - N::Integer, - ndiags::Integer, - R::Array{TF,2}, - offset::Vector{TI}, - x::Vector{TF}, - y::Vector{TF}) where {TF<:Real,TI<:Integer} + N ::Integer, + ndiags ::Integer, + R ::Array{TF,2}, + offset ::Vector{TI}, + x ::Vector{TF}, + y ::Vector{TF}) where {TF<:Real,TI<:Integer} for i = 1 : ndiags - d = offset[i] + d = offset[i] r0 = max(1, 1-d) r1 = min(N, N-d) c0 = max(1, 1+d) @@ -26,3 +26,51 @@ function CDS_MVp( end return y end + +# #the versions below is insignificantly faster than the original version above +# function CDS_MVp( +# N::Integer, +# ndiags::Integer, +# R::Array{TF,2}, +# offset::Vector{TI}, +# x::Vector{TF}, +# y::Vector{TF}) where {TF<:Real,TI<:Integer} + +# for i = 1 : ndiags +# d = offset[i] +# r0 = max(1, 1-d) +# r1 = min(N, N-d) +# c0 = max(1, 1+d) +# #r0_plus_c0 = r0 + c0 +# c = deepcopy(c0) +# for r = r0 : r1 +# @inbounds y[r] = y[r] + R[r,i] * x[c]#original +# c += 1 +# end +# end +# return y +# end + +# function CDS_MVp( +# N ::Integer, +# ndiags ::Integer, +# R ::Array{TF,2}, +# offset ::Vector{TI}, +# x ::Vector{TF}, +# y ::Vector{TF}) where {TF<:Real,TI<:Integer} + +# for i = 1 : ndiags +# d = offset[i] +# r0 = max(1, 1-d) +# r1 = min(N, N-d) +# c0 = max(1, 1+d) +# # for r = r0 : r1 +# # c = r - r0 + c0 #original +# # @inbounds y[r] = y[r] + R[r,i] * x[c]#original +# # end +# @inbounds y[r0 : r1] .= y[r0 : r1] .+ R[r0:r1,i].*x[r0 - r0 + c0:r1 - r0 + c0] +# end + +# return y +# end + diff --git a/src/CDS_MVp_MT.jl b/src/CDS_MVp_MT.jl index 654de77..53e4e35 100644 --- a/src/CDS_MVp_MT.jl +++ b/src/CDS_MVp_MT.jl @@ -15,7 +15,7 @@ function CDS_MVp_MT( y ::Vector{TF}) where {TF<:Real,TI<:Integer} for i = 1 : ndiags - d = offset[i] + d = offset[i] r0 = max(1, 1-d) r1 = min(N, N-d) c0 = max(1, 1+d) diff --git a/src/CDS_MVp_MT_subfunc.jl b/src/CDS_MVp_MT_subfunc.jl index 5b0cdc1..84da27a 100644 --- a/src/CDS_MVp_MT_subfunc.jl +++ b/src/CDS_MVp_MT_subfunc.jl @@ -4,15 +4,14 @@ wrapper around Julia threads to compute a multi-threaded matrix vector product i This is a subfunction of CDS_MVp_MT.jl """ function CDS_MVp_MT_subfunc( - R::Array{TF,2}, - x::Vector{TF}, - y::Vector{TF}, - r0::Int, - c0::Int, - r1::Int, - i::Int) where {TF<:Real} + R ::Array{TF,2}, + x ::Vector{TF}, + y ::Vector{TF}, + r0 ::Int, + c0 ::Int, + r1 ::Int, + i ::Int) where {TF<:Real} - #s=rind-1 @Threads.threads for r = r0 : r1 c = r - r0 + c0 #original @inbounds y[r] = y[r] + R[r,i] * x[c]#original diff --git a/src/CDS_scaled_add!.jl b/src/CDS_scaled_add!.jl index 70477ba..d0f60f6 100644 --- a/src/CDS_scaled_add!.jl +++ b/src/CDS_scaled_add!.jl @@ -3,7 +3,7 @@ export CDS_scaled_add! 
""" Computes A = A + alpha * B for A and B in the compressed diagonal storage format (CDS/DIA) - TO DO: make this function multi-threaded per column of A and B + TO DO: make this function multi-threaded per column of A and B, or use BLAS functions """ function CDS_scaled_add!( A ::Array{TF,2}, @@ -16,7 +16,7 @@ function CDS_scaled_add!( for k=1:length(B_offsets) A_update_col = findall((in)(B_offsets[k]),A_offsets) if isempty(A_update_col) == true - error("attempted to update a diagonal in A in CDS storage that does not excist. A and B need to have the same nonzero diagonals") + error("attempted to update a diagonal in A in CDS storage that does not exist. A and B need to have the same nonzero diagonals") end A[:,A_update_col] .= A[:,A_update_col] .+ alpha .* B[:,k]; end diff --git a/src/PARSDMM.jl b/src/PARSDMM.jl index d944f25..12d1a2e 100755 --- a/src/PARSDMM.jl +++ b/src/PARSDMM.jl @@ -95,57 +95,58 @@ x_solve_tol_ref = TF(1.0) #scalar required to determine tolerance for x-minimiza end #end initialization timer for i=1:maxit #main loop + #form right hand side for x-minimization @timeit to "form rhs for linear system" begin - rhs = rhs_compose(rhs,l,y,rho,TD_OP,p,Blas_active,parallel) + rhs = rhs_compose(rhs,l,y,rho,TD_OP,p,Blas_active,parallel) end #end timer for rhs # x-minimization @timeit to "argmin x" begin copy!(x_old,x); (x,iter,relres,x_solve_tol_ref) = argmin_x(Q,rhs,x,x_solve_tol_ref,i,log_PARSDMM,Q_offsets,Ax_out) - log_PARSDMM.cg_it[i] = iter;#cg_log.iters; - log_PARSDMM.cg_relres[i] = relres;#cg_log[:resnorm][end]; + log_PARSDMM.cg_it[i] = iter + log_PARSDMM.cg_relres[i] = relres end #end timer for argmin x # y-minimization & l-update @timeit to "argmin y and l update" begin if parallel==true - [ @spawnat pid update_y_l_parallel(x,i,Blas_active, - y[:L],y_old[:L],l[:L],l_old[:L],rho[:L],gamma[:L],prox[:L],TD_OP[:L],P_sub[:L], - x_hat[:L],r_pri[:L],s[:L],set_feas[:L],feasibility_only) for pid in y.pids] - - # [@spawnat pid update_y_l_parallel_exp(x,i,Blas_active, - # y[:L][1],y_old[:L][1],l[:L][1],l_old[:L][1],rho[:L][1],gamma[:L][1],prox[:L][1],TD_OP[:L][1],P_sub[:L], - # x_hat[:L][1],r_pri[:L][1],s[:L][1],set_feas[:L]) for pid in y.pids] - #logging distributed quantities - @sync for ii=1:p - @async log_PARSDMM.r_pri[i,ii]=@fetchfrom r_pri.pids[ii] norm(r_pri[:L][1]) - end - if mod(i,10)==0 #log every 10 it, or whatever number is suitable - for ii=1:pp - log_PARSDMM.set_feasibility[counter,ii]=@fetchfrom set_feas.pids[ii] set_feas[:L][1] - end - counter+=1 + + #using spawnat + [ @sync @spawnat pid update_y_l_parallel(x,i,Blas_active, + y[:L],y_old[:L],l[:L],l_old[:L],rho[:L],gamma[:L],prox[:L],TD_OP[:L],P_sub[:L], + x_hat[:L],r_pri[:L],s[:L],set_feas[:L],feasibility_only) for pid in y.pids] + + #logging distributed quantities + for ii=1:p + log_PARSDMM.r_pri[i,ii] = norm(r_pri[ii]) + #log_PARSDMM.r_dual[i,ii] = norm( rho[ii]*(TD_OP[ii]'*(y[ii]-y_old[ii])) ) + end + if mod(i,10)==0 #log every 10 it, or whatever number is suitable + for ii=1:pp + log_PARSDMM.set_feasibility[counter,ii] = set_feas[ii] end - else - (y,l,r_pri,s,log_PARSDMM,counter,y_old,l_old)=update_y_l(x,p,i,Blas_active,y,y_old,l,l_old,rho,gamma,prox,TD_OP,log_PARSDMM,P_sub,counter,x_hat,r_pri,s,feasibility_only); + counter+=1 + end + else #not distributed + (y,l,r_pri,s,log_PARSDMM,counter,y_old,l_old) = update_y_l(x,p,i,Blas_active,y,y_old,l,l_old,rho,gamma,prox,TD_OP,log_PARSDMM,P_sub,counter,x_hat,r_pri,s,feasibility_only); log_PARSDMM.r_dual_total[i] = sum(log_PARSDMM.r_dual[i,:]); end #some more logging - 
log_PARSDMM.r_pri_total[i] = sum(log_PARSDMM.r_pri[i,:]); + log_PARSDMM.r_pri_total[i] = sum(log_PARSDMM.r_pri[i,:]); if options.Minkowski == false - log_PARSDMM.obj[i] = TF(0.5) .* norm(x .- m)^2 + log_PARSDMM.obj[i] = TF(0.5) .* norm(x .- m)^2 else - log_PARSDMM.obj[i] = TF(0.5) .* norm(TD_OP[end]*x .- m)^2 + log_PARSDMM.obj[i] = TF(0.5) .* norm(TD_OP[end]*x .- m)^2 end - log_PARSDMM.evol_x[i] = norm(x_old .- x) ./ norm(x); - log_PARSDMM.rho[i,:] = rho; - log_PARSDMM.gamma[i,:] = gamma; + log_PARSDMM.evol_x[i] = norm(x_old .- x) ./ norm(x); + log_PARSDMM.rho[i,:] = rho; + log_PARSDMM.gamma[i,:] = gamma; - end #end timer for argmin y and l-update + end #end timer for argmin y and l-update # Stopping conditions @timeit to "stopping conditions check" begin @@ -160,7 +161,7 @@ for i=1:maxit #main loop # adjust penalty parameters rho and relaxation parameters gamma @timeit to "adjust rho and gamma" begin - if i==1 + if i==1 #only at the first iteration if parallel==true [@spawnat pid l_hat[:L][1] = l_old[:L][1] .+ rho[:L][1] .* ( -s[:L][1] .+ y_old[:L][1] ) for pid in l_hat.pids] [@spawnat pid copy!(l_hat_0[:L][1],l_hat[:L][1] ) for pid in l_hat.pids] @@ -178,76 +179,77 @@ for i=1:maxit #main loop end end - if (adjust_rho == true || adjust_gamma == true) && mod(i,rho_update_frequency)==TF(0) - if parallel==true - [ @spawnat pid adapt_rho_gamma_parallel(gamma[:L],rho[:L],adjust_gamma,adjust_rho,y[:L], - y_old[:L],s[:L],s_0[:L],l[:L],l_hat_0[:L],l_0[:L],l_old[:L],y_0[:L],l_hat[:L],d_l_hat[:L],d_H_hat[:L],d_l[:L],d_G_hat[:L]) for pid in y.pids] - #[ @spawnat pid adapt_rho_gamma_parallel_exp(gamma[:L][1],rho[:L][1],adjust_gamma,adjust_rho,y[:L][1], - #y_old[:L][1],s[:L][1],s_0[:L][1],l[:L][1],l_hat_0[:L][1],l_0[:L][1],l_old[:L][1],y_0[:L][1],l_hat[:L][1],d_l_hat[:L][1],d_H_hat[:L][1],d_l[:L][1],d_G_hat[:L][1]) for pid in y.pids] - else - (rho,gamma,l_hat,d_l_hat,d_H_hat,d_l,d_G_hat)=adapt_rho_gamma(i,gamma,rho,adjust_gamma,adjust_rho,y,y_old,s,s_0,l,l_hat_0,l_0,l_old,y_0,p,l_hat,d_l_hat,d_H_hat,d_l,d_G_hat); - end - if i>1 - if parallel==true - [@spawnat pid copy!(l_hat_0[:L][1],l_hat[:L][1] ) for pid in l_hat.pids] - [@spawnat pid copy!(y_0[:L][1] , y[:L][1] ) for pid in y.pids] - [@spawnat pid copy!(s_0[:L][1] , s[:L][1] ) for pid in s.pids] - [@spawnat pid copy!(l_0[:L][1] , l[:L][1] ) for pid in l.pids] - else - for ii=1:p - copy!(l_hat_0[ii],l_hat[ii]) - copy!(y_0[ii],y[ii]) - copy!(s_0[ii],s[ii]) - copy!(l_0[ii],l[ii]) - end - end #end if parallel - end - end #end adjust rho and gamma - - #adjust rho to set-feasibility estimates - if parallel==true - rho = convert(Vector{TF},rho); #gather rho - end - if adjust_feasibility_rho == true && mod(i,10)==TF(0) #&& norm(rho-log_PARSDMM.rho[i,:])<(10*eps(TF))#only update rho if it is not already updated - ## adjust rho feasibility - #if primal residual for a set is much larger than for the other sets - #and the feasibility error is also much larger, increase rho to lower - #primal residual and (hopefully) feasibility error - (max_set_feas,max_set_feas_ind) = findmax(log_PARSDMM.set_feasibility[counter-1,:]) - sort_feas = sort(log_PARSDMM.set_feasibility[counter-1,:]); - if i>10 - rho[max_set_feas_ind] = TF(2.0) .* rho[max_set_feas_ind] + if (adjust_rho == true || adjust_gamma == true) && mod(i,rho_update_frequency)==0#TF(0) + if parallel==true + #using spawnat + [@sync @spawnat pid adapt_rho_gamma_parallel(gamma[:L],rho[:L],adjust_gamma,adjust_rho,y[:L], + 
y_old[:L],s[:L],s_0[:L],l[:L],l_hat_0[:L],l_0[:L],l_old[:L],y_0[:L],l_hat[:L],d_l_hat[:L],d_H_hat[:L],d_l[:L],d_G_hat[:L]) for pid in y.pids] + + else + (rho,gamma,l_hat,d_l_hat,d_H_hat,d_l,d_G_hat) = adapt_rho_gamma(i,gamma,rho,adjust_gamma,adjust_rho,y,y_old,s,s_0,l,l_hat_0,l_0,l_old,y_0,p,l_hat,d_l_hat,d_H_hat,d_l,d_G_hat); + end + + if i>1 #after the first iteration + if parallel==true + [@spawnat pid copy!(l_hat_0[:L][1],l_hat[:L][1] ) for pid in l_hat.pids] + [@spawnat pid copy!(y_0[:L][1] , y[:L][1] ) for pid in y.pids] + [@spawnat pid copy!(s_0[:L][1] , s[:L][1] ) for pid in s.pids] + [@spawnat pid copy!(l_0[:L][1] , l[:L][1] ) for pid in l.pids] + else + for ii=1:p + copy!(l_hat_0[ii],l_hat[ii]) + copy!(y_0[ii],y[ii]) + copy!(s_0[ii],s[ii]) + copy!(l_0[ii],l[ii]) end - end #end adjust_feasibility_rho - - #enforce max and min values for rho, to prevent the condition number of Q -> inf - rho = max.(min.(rho,TF(1e4)),TF(1e-2)) #hardcoded bounds - end #end timer for rho and gamma update - - @timeit to "Q-update" begin - ind_updated = findall(rho .!= log_PARSDMM.rho[i,:]) # locate changed rho index - ind_updated = convert(Array{TI,1},ind_updated) - - #re-assemble total transform domain operator as a matrix - if isempty(findall((in)(ind_updated),p))==false - if parallel==true && feasibility_only==false - prox = convert(Vector{Any},prox); #gather rho - prox[p] = input -> prox_l2s!(input,rho[p],m) - prox = distribute(prox) - elseif feasibility_only==false - prox[p] = input -> prox_l2s!(input,rho[p],m) - end - end - Q = Q_update!(Q,AtA,set_Prop,rho,ind_updated,log_PARSDMM,i,Q_offsets) - if parallel==true - rho = distribute(rho) #distribute again (gather -> process -> distribute is a ugly hack, fix this later) - end - end #end Q-update timer - - if i==maxit - println("PARSDMM reached maxit") - (TD_OP,AtA,log_PARSDMM) = output_check_PARSDMM(x,TD_OP,AtA,log_PARSDMM,i,counter) - end + end #end if parallel + end + end #end adjust rho and gamma + + #adjust rho to set-feasibility estimates + if parallel==true + rho = convert(Vector{TF},rho); #gather rho + end + if adjust_feasibility_rho == true && mod(i,10)==TF(0) #&& norm(rho-log_PARSDMM.rho[i,:])<(10*eps(TF))#only update rho if it is not already updated + ## adjust rho feasibility + #if primal residual for a set is much larger than for the other sets + #and the feasibility error is also much larger, increase rho to lower + #primal residual and (hopefully) feasibility error + (max_set_feas,max_set_feas_ind) = findmax(log_PARSDMM.set_feasibility[counter-1,:]) + sort_feas = sort(log_PARSDMM.set_feasibility[counter-1,:]); + if i>10 + rho[max_set_feas_ind] = TF(2.0) .* rho[max_set_feas_ind] + end + end #end adjust_feasibility_rho + + #enforce max and min values for rho, to prevent the condition number of Q -> inf + rho = max.(min.(rho,TF(1e4)),TF(1e-2)) #hardcoded bounds + end #end timer for rho and gamma update + + @timeit to "Q-update" begin + ind_updated = findall(rho .!= log_PARSDMM.rho[i,:]) # locate changed rho index + ind_updated = convert(Array{TI,1},ind_updated) + + #re-assemble total transform domain operator as a matrix + if isempty(findall((in)(ind_updated),p))==false + if parallel==true && feasibility_only==false + prox = convert(Vector{Any},prox); #gather rho + prox[p] = input -> prox_l2s!(input,rho[p],m) + prox = distribute(prox) + elseif feasibility_only==false + prox[p] = input -> prox_l2s!(input,rho[p],m) + end + end + Q = Q_update!(Q,AtA,set_Prop,rho,ind_updated,log_PARSDMM,i,Q_offsets) + if parallel==true + rho = distribute(rho) 
#distribute again (gather -> process -> distribute is a ugly hack, fix this later) + end +end #end Q-update timer + + if i==maxit + println("PARSDMM reached maxit") + (TD_OP,AtA,log_PARSDMM) = output_check_PARSDMM(x,TD_OP,AtA,log_PARSDMM,i,counter) + end end #end main loop @@ -258,7 +260,7 @@ end #end funtion # output checks function output_check_PARSDMM(x,TD_OP,AtA,log_PARSDMM,i,counter) if isreal(x)==false - println("warning: Result of PARSDMM is not real") + @warn "Result of PARSDMM is not real" end log_PARSDMM.obj = log_PARSDMM.obj[1:i] log_PARSDMM.evol_x = log_PARSDMM.evol_x[1:i] diff --git a/src/PARSDMM_initialize.jl b/src/PARSDMM_initialize.jl index b2808f6..28efcc4 100755 --- a/src/PARSDMM_initialize.jl +++ b/src/PARSDMM_initialize.jl @@ -27,288 +27,288 @@ function PARSDMM_initialize( feasibility_only=false ::Bool ) where {TF<:Real,TI<:Integer} - ind_ref = maxit - if options.Minkowski == false - N = length(x) - elseif options.Minkowski == true && zero_ini_guess==true - N = length(x)*2 - elseif options.Minkowski == true && zero_ini_guess==false - @assert length(x)==(2*length(m)) - N = length(x) - end - # const N = size(TD_OP[1],2) #this line will give really weird errors later on for some reason.. - - - # if typeof(AtA[1])==SparseMatrixCSC{TF,TI} - # push!(AtA,speye(TF,N)); - # elseif typeof(AtA[1])==Array{TF,2} - # push!(AtA,ones(TF,N,1)) - # end - # push!(TD_OP,speye(TF,N)); - # - # push!(set_Prop.AtA_offsets,[0]) - # push!(set_Prop.banded,true) - # push!(set_Prop.AtA_diag,true) - # push!(set_Prop.dense,false) - - p = length(TD_OP); #number of terms in in the sum of functions of the projeciton problem - pp = p-1; - if feasibility_only==true; pp=p; end; - - rho = Vector{TF}(undef,p) - if length(rho_ini)==1 - fill!(rho,rho_ini[1]) - else - copy!(rho,rho_ini) - end - prox = copy(P_sub) - if feasibility_only==false - #define prox for all terms in the sum (projectors onto sets) - #add prox for the data fidelity term 0.5||m-x||_2^2 - m_orig = deepcopy(m) ::Vector{TF} - prox_data = input -> prox_l2s!(input,rho[end],m_orig) - push!(prox,prox_data); - end - if parallel==true - prox = distribute(prox) - P_sub = distribute(P_sub) - rho = distribute(rho) - end - - if parallel==true && nworkers()= 1 #at least one operator is a JOLI operator -> wrap all others as a JOLI matrix as well - Q_offsets = [] - if typeof(AtA[1]) <: joAbstractLinearOperator - Q = rho[1]*AtA[1] - else - Q = rho[1]*joMatrix(AtA[1]) - end - for i=2:p - if typeof(AtA[i]) <: joAbstractLinearOperator - Q = Q + rho[i]*AtA[i] - else - Q = Q + rho[i]*joMatrix(AtA[i]) - end - end - else #n_CDS_matrices == p should hold #all matrics in AtA are in compressed diagonal storage format - all_offsets=zeros(TI,999,99) - for i=1:length(AtA) #find all unique offset - all_offsets[1:length(set_Prop.AtA_offsets[i]),i]=set_Prop.AtA_offsets[i] - end - Q_offsets=convert(Vector{TI},unique(all_offsets)) - Q=zeros(TF,size(AtA[1],1),length(Q_offsets)) - for i=1:length(AtA) - for j=1:length(set_Prop.AtA_offsets[i]) - #Q_current_col = findin(Q_offsets,set_Prop.AtA_offsets[i][j]) - Q_current_col = findall((in)(set_Prop.AtA_offsets[i][j]),Q_offsets) - Q[:,Q_current_col] .= Q[:,Q_current_col] .+ rho[i] .* AtA[i][:,j] - end - end - end - - # log_PARSDMM = log_type_PARSDMM(zeros(maxit,pp),zeros(maxit,p),zeros(maxit,p),zeros(maxit),zeros(maxit),zeros(maxit), - # zeros(maxit),zeros(maxit,p),zeros(maxit,p),zeros(maxit),zeros(maxit),TF(0),TF(0),TF(0),TF(0),TF(0),TF(0),TF(0)); - log_PARSDMM = 
log_type_PARSDMM(zeros(maxit,pp),zeros(maxit,p),zeros(maxit,p),zeros(maxit),zeros(maxit),zeros(maxit), - zeros(maxit),zeros(maxit,p),zeros(maxit,p),zeros(maxit),zeros(maxit),TimerOutput()); - - log_PARSDMM.set_feasibility[1,:]=feasibility_initial - - if parallel==true - - gamma = distribute(gamma) - - #y = distribute(y) - y_0 = distribute(y_0) - y_old = distribute(y_old) - - #l = distribute(l) - l_0 = distribute(l_0) - l_old = distribute(l_old) - l_hat_0 = distribute(l_hat_0) - l_hat = distribute(l_hat) - - x_hat = distribute(x_hat) - s_0 = distribute(s_0) - s = distribute(s) - r_pri = distribute(r_pri) - - d_l_hat = distribute(d_l_hat) - d_H_hat = distribute(d_H_hat) - d_l = distribute(d_l) - d_G_hat = distribute(d_G_hat) - end - - #fill distributed vectors with zeros (from 1 entry to N entries, because this is faster than first fill and then dist. Should be able to do in one go ideally) - if parallel == true - [@spawnat pid y_0[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid y_old[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid l_0[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid l_old[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid l_hat_0[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid l_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid x_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid s_0[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid s[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid r_pri[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid d_l_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid d_H_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid d_l[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - [@spawnat pid d_G_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] - end - - #distribute y and l if they are not already distributed - if (parallel==true) && (typeof(l)<:DistributedArrays.DArray) == false - l=distribute(l) - end - if parallel==true && (typeof(y)<:DistributedArrays.DArray) == false - y=distribute(y) - end - if parallel==true - [ @spawnat pid adjust_gamma for pid in y.pids ] - [ @spawnat pid adjust_rho for pid in y.pids ] - [ @spawnat pid adjust_feasibility_rho for pid in y.pids ] - end - if parallel == true - [@spawnat pid comp_grid for pid in y.pids] - end - - if parallel==true - set_feas=zeros(TF,length(TD_OP)) #initialize a vector only required for parallel stuff - set_feas=distribute(set_feas) - else - set_feas=[] - end - - if zero_ini_guess==true - if parallel==false - for i=1:length(l); fill!(l[i],TF(0.0)); end - for i=1:length(y); fill!(y[i],TF(0.0)); end - else - [@spawnat pid fill!(l[:L][1],TF(0.0)) for pid in l.pids] - [@spawnat pid fill!(y[:L][1],TF(0.0)) for pid in y.pids] - end - fill!(x,TF(0.0)) - end + ind_ref = maxit + if options.Minkowski == false + N = length(x) + elseif options.Minkowski == true && zero_ini_guess==true + N = length(x)*2 + elseif options.Minkowski == true && zero_ini_guess==false + @assert length(x)==(2*length(m)) + N = length(x) + end + # const N = size(TD_OP[1],2) #this line will give really weird errors later on for some reason.. 
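#------------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the AtA blocks and the
# system matrix Q assembled further down in this function are kept in compressed
# diagonal storage (CDS/DIA): a tall N-by-ndiags array R plus an offset vector, where
# column i of R stores the diagonal of A with offset d = offset[i], i.e.
# R[r,i] = A[r,r+d]. That is the layout CDS_MVp (changed earlier in this patch)
# multiplies with: y[r] += R[r,i]*x[r+d]. A tiny, self-contained check of the
# convention against a dense matrix (values below are made up for illustration):
#------------------------------------------------------------------------------
using SetIntersectionProjection           # for CDS_MVp
A_dense = [2.0 1.0 0.0;
           0.0 2.0 1.0;
           0.0 0.0 2.0]                   # nonzero diagonals have offsets 0 and +1
offset  = [0, 1]
R       = [2.0 1.0;
           2.0 1.0;
           2.0 0.0]                       # R[r,1] = A[r,r]; R[r,2] = A[r,r+1]; out-of-range entries unused
x_test  = [1.0, 2.0, 3.0]
y_test  = zeros(3)
CDS_MVp(3, 2, R, offset, x_test, y_test)  # y_test should equal A_dense*x_test = [4.0, 7.0, 6.0]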
+ + + # if typeof(AtA[1])==SparseMatrixCSC{TF,TI} + # push!(AtA,speye(TF,N)); + # elseif typeof(AtA[1])==Array{TF,2} + # push!(AtA,ones(TF,N,1)) + # end + # push!(TD_OP,speye(TF,N)); + # + # push!(set_Prop.AtA_offsets,[0]) + # push!(set_Prop.banded,true) + # push!(set_Prop.AtA_diag,true) + # push!(set_Prop.dense,false) + + p = length(TD_OP); #number of terms in in the sum of functions of the projeciton problem + pp = p-1; + if feasibility_only==true; pp=p; end; + + rho = Vector{TF}(undef,p) + if length(rho_ini)==1 + fill!(rho,rho_ini[1]) + else + copy!(rho,rho_ini) + end + prox = copy(P_sub) + if feasibility_only==false + #define prox for all terms in the sum (projectors onto sets) + #add prox for the data fidelity term 0.5||m-x||_2^2 + m_orig = deepcopy(m) ::Vector{TF} + prox_data = input -> prox_l2s!(input,rho[end],m_orig) + push!(prox,prox_data); + end + if parallel==true + prox = distribute(prox) + P_sub = distribute(P_sub) + rho = distribute(rho) + end + + if parallel==true && nworkers() compute_relative_feasibility(m,feasibility_initial,TD_OP,P_sub) + #[@spawnat pid f_compute_rel_feas for pid in 2:nworkers()] + #feasibility_initial = pmap(f_compute_rel_feas, feasibility_initial,TD_OP,P_sub; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing) + feasibility_initial = pmap((feasibility_initial,TD_OP,P_sub) -> compute_relative_feasibility(m,feasibility_initial,TD_OP,P_sub) , feasibility_initial,TD_OP,P_sub; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing) + else + for ii = 1:length(P_sub) + feasibility_initial[ii] = norm(P_sub[ii](TD_OP[ii]*m) .- TD_OP[ii]*m) ./ (norm(TD_OP[ii]*m)+(100*eps(TF))); + end + end + if maximum(feasibility_initial)= 1 #at least one operator is a JOLI operator -> wrap all others as a JOLI matrix as well + Q_offsets = [] + if typeof(AtA[1]) <: joAbstractLinearOperator + Q = rho[1]*AtA[1] + else + Q = rho[1]*joMatrix(AtA[1]) + end + for i=2:p + if typeof(AtA[i]) <: joAbstractLinearOperator + Q = Q + rho[i]*AtA[i] + else + Q = Q + rho[i]*joMatrix(AtA[i]) + end + end + else #n_CDS_matrices == p should hold #all matrics in AtA are in compressed diagonal storage format + all_offsets=zeros(TI,999,99) + for i=1:length(AtA) #find all unique offset + all_offsets[1:length(set_Prop.AtA_offsets[i]),i]=set_Prop.AtA_offsets[i] + end + Q_offsets=convert(Vector{TI},unique(all_offsets)) + Q=zeros(TF,size(AtA[1],1),length(Q_offsets)) + for i=1:length(AtA) + for j=1:length(set_Prop.AtA_offsets[i]) + #Q_current_col = findin(Q_offsets,set_Prop.AtA_offsets[i][j]) + Q_current_col = findall((in)(set_Prop.AtA_offsets[i][j]),Q_offsets) + Q[:,Q_current_col] .= Q[:,Q_current_col] .+ rho[i] .* AtA[i][:,j] + end + end + end + + #initialize logs + log_PARSDMM = log_type_PARSDMM(zeros(maxit,pp),zeros(maxit,p),zeros(maxit,p),zeros(maxit),zeros(maxit),zeros(maxit), + zeros(maxit),zeros(maxit,p),zeros(maxit,p),zeros(maxit),zeros(maxit),TimerOutput()); + + log_PARSDMM.set_feasibility[1,:] = feasibility_initial + + if parallel == true + + gamma = distribute(gamma) + + #y = distribute(y) + y_0 = distribute(y_0) + y_old = distribute(y_old) + + #l = distribute(l) + l_0 = distribute(l_0) + l_old = distribute(l_old) + l_hat_0 = distribute(l_hat_0) + l_hat = distribute(l_hat) + + x_hat = distribute(x_hat) + s_0 = distribute(s_0) + s = distribute(s) + r_pri = distribute(r_pri) + + d_l_hat = distribute(d_l_hat) + d_H_hat = distribute(d_H_hat) + d_l = distribute(d_l) + d_G_hat = distribute(d_G_hat) + end + + #fill distributed vectors 
with zeros (from 1 entry to N entries, because this is faster than first fill and then dist. Should be able to do in one go ideally) + if parallel == true + [@spawnat pid y_0[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid y_old[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid l_0[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid l_old[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid l_hat_0[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid l_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid x_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid s_0[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid s[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid r_pri[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid d_l_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid d_H_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid d_l[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + [@spawnat pid d_G_hat[:L][1] = zeros(TF,size(TD_OP[:L][1],1)) for pid in y_0.pids] + end + + #distribute y and l if they are not already distributed + if (parallel==true) && (typeof(l)<:DistributedArrays.DArray) == false + l = distribute(l) + end + if parallel==true && (typeof(y)<:DistributedArrays.DArray) == false + y = distribute(y) + end + if parallel==true + [ @spawnat pid adjust_gamma for pid in y.pids ] + [ @spawnat pid adjust_rho for pid in y.pids ] + [ @spawnat pid adjust_feasibility_rho for pid in y.pids ] + end + if parallel == true + [@spawnat pid comp_grid for pid in y.pids] + end + + if parallel == true + set_feas = zeros(TF,length(TD_OP)) #initialize a vector only required for parallel stuff + set_feas = distribute(set_feas) + else + set_feas = [] + end + + if zero_ini_guess == true + if parallel==false + for i=1:length(l); fill!(l[i],TF(0.0)); end + for i=1:length(y); fill!(y[i],TF(0.0)); end + else + [@spawnat pid fill!(l[:L][1],TF(0.0)) for pid in l.pids] + [@spawnat pid fill!(y[:L][1],TF(0.0)) for pid in y.pids] + end + fill!(x,TF(0.0)) + end + return ind_ref,N,TD_OP,AtA,p,rho_update_frequency,adjust_gamma,adjust_rho,adjust_feasibility_rho,gamma_ini,rho,gamma,y,y_0,y_old, l,l_0,l_old,l_hat_0,x_0,x_old,r_dual,rhs,s,s_0,Q,prox,log_PARSDMM,l_hat,x_hat,r_pri, d_l_hat,d_H_hat,d_l,d_G_hat,P_sub,Q_offsets,stop,feasibility_initial,set_feas,Ax_out diff --git a/src/adapt_rho_gamma.jl b/src/adapt_rho_gamma.jl index 5f31d3a..aa3cb8c 100644 --- a/src/adapt_rho_gamma.jl +++ b/src/adapt_rho_gamma.jl @@ -6,127 +6,127 @@ Updates relaxation and augmented-Lagrangian penalty parameter. To be used in serial version of PARSDMM. 
""" function adapt_rho_gamma( - i ::Integer, - gamma ::Vector{TF}, - rho ::Vector{TF}, - adjust_gamma ::Bool, - adjust_rho ::Bool, - y ::Vector{Vector{TF}}, - y_old ::Vector{Vector{TF}}, - s ::Vector{Vector{TF}}, - s_0 ::Vector{Vector{TF}}, - l ::Vector{Vector{TF}}, - l_hat_0 ::Vector{Vector{TF}}, - l_0 ::Vector{Vector{TF}}, - l_old ::Vector{Vector{TF}}, - y_0 ::Vector{Vector{TF}}, - p ::Integer, - l_hat ::Vector{Vector{TF}}, - d_l_hat ::Vector{Vector{TF}}, - d_H_hat ::Vector{Vector{TF}}, - d_l ::Vector{Vector{TF}}, - d_G_hat ::Vector{Vector{TF}} - ) where {TF<:Real} + i ::Integer, + gamma ::Vector{TF}, + rho ::Vector{TF}, + adjust_gamma ::Bool, + adjust_rho ::Bool, + y ::Vector{Vector{TF}}, + y_old ::Vector{Vector{TF}}, + s ::Vector{Vector{TF}}, + s_0 ::Vector{Vector{TF}}, + l ::Vector{Vector{TF}}, + l_hat_0 ::Vector{Vector{TF}}, + l_0 ::Vector{Vector{TF}}, + l_old ::Vector{Vector{TF}}, + y_0 ::Vector{Vector{TF}}, + p ::Integer, + l_hat ::Vector{Vector{TF}}, + d_l_hat ::Vector{Vector{TF}}, + d_H_hat ::Vector{Vector{TF}}, + d_l ::Vector{Vector{TF}}, + d_G_hat ::Vector{Vector{TF}} + ) where {TF<:Real} - if TF==Float64 - safeguard = 1e-10 - elseif TF==Float32 - safeguard = 1f-6 - end + if TF==Float64 + safeguard = 1e-10 + elseif TF==Float32 + safeguard = 1f-6 + end eps_correlation = TF(0.3) #hardcoded and suggested value by the paper based on numerical evidence #Threads.@threads for ii = 1:p for ii = 1:p - l_hat[ii] .= l_old[ii] .+ rho[ii] .* ( -s[ii] .+ y_old[ii] ) - d_l_hat[ii] .= l_hat[ii] .- l_hat_0[ii] - d_H_hat[ii] .= s[ii] .- s_0[ii] + l_hat[ii] .= l_old[ii] .+ rho[ii] .* ( -s[ii] .+ y_old[ii] ) + d_l_hat[ii] .= l_hat[ii] .- l_hat_0[ii] + d_H_hat[ii] .= s[ii] .- s_0[ii] - d_dHh_dlh = dot(d_H_hat[ii],d_l_hat[ii]) - n_d_H_hat = norm(d_H_hat[ii]) - n_d_l_hat = norm(d_l_hat[ii]) - d_l[ii] .= l[ii] .- l_0[ii] - n_d_l = norm(d_l[ii]) - d_G_hat[ii].= -(y[ii] .- y_0[ii]) - n_d_G_hat = norm(d_G_hat[ii]) - d_dGh_dl = dot(d_G_hat[ii],d_l[ii]) + d_dHh_dlh = dot(d_H_hat[ii],d_l_hat[ii]) + n_d_H_hat = norm(d_H_hat[ii]) + n_d_l_hat = norm(d_l_hat[ii]) + d_l[ii] .= l[ii] .- l_0[ii] + n_d_l = norm(d_l[ii]) + d_G_hat[ii].= -(y[ii] .- y_0[ii]) + n_d_G_hat = norm(d_G_hat[ii]) + d_dGh_dl = dot(d_G_hat[ii],d_l[ii]) - alpha_reliable = false; - if (n_d_H_hat*n_d_l_hat) > safeguard && (n_d_H_hat.^2) > safeguard && d_dHh_dlh>safeguard - alpha_reliable = true - alpha_correlation = d_dHh_dlh./( n_d_H_hat*n_d_l_hat ); - end + alpha_reliable = false; + if (n_d_H_hat*n_d_l_hat) > safeguard && (n_d_H_hat.^2) > safeguard && d_dHh_dlh>safeguard + alpha_reliable = true + alpha_correlation = d_dHh_dlh./( n_d_H_hat*n_d_l_hat ); + end - beta_reliable = false; - if (n_d_G_hat*n_d_l) > safeguard && (n_d_G_hat^2) > safeguard && d_dGh_dl > safeguard - beta_reliable = true - beta_correlation = d_dGh_dl ./ ( n_d_G_hat*n_d_l ); - end + beta_reliable = false; + if (n_d_G_hat*n_d_l) > safeguard && (n_d_G_hat^2) > safeguard && d_dGh_dl > safeguard + beta_reliable = true + beta_correlation = d_dGh_dl ./ ( n_d_G_hat*n_d_l ); + end - alpha_comp = false - if (alpha_reliable==true) && (alpha_correlation > eps_correlation) - alpha_comp = true - alpha_hat_MG = d_dHh_dlh ./ (n_d_H_hat.^2); - alpha_hat_SD = (n_d_l_hat^2) ./ d_dHh_dlh; - if (TF(2.0)*alpha_hat_MG) > alpha_hat_SD - alpha_hat = alpha_hat_MG; - else - alpha_hat = alpha_hat_SD - alpha_hat_MG/TF(2.0); - end + alpha_comp = false + if (alpha_reliable==true) && (alpha_correlation > eps_correlation) + alpha_comp = true + alpha_hat_MG = d_dHh_dlh ./ (n_d_H_hat.^2); + alpha_hat_SD = 
(n_d_l_hat^2) ./ d_dHh_dlh; + if (TF(2.0)*alpha_hat_MG) > alpha_hat_SD + alpha_hat = alpha_hat_MG; + else + alpha_hat = alpha_hat_SD - alpha_hat_MG/TF(2.0); end + end - beta_comp = false - if (beta_reliable==1) && (beta_correlation > eps_correlation) - beta_comp = true - beta_hat_MG = d_dGh_dl ./ (n_d_G_hat^2); - beta_hat_SD = (n_d_l^2) ./ d_dGh_dl; - if (TF(2.0)*beta_hat_MG) > beta_hat_SD - beta_hat = beta_hat_MG; - else - beta_hat = beta_hat_SD - beta_hat_MG/TF(2.0); - end + beta_comp = false + if (beta_reliable==1) && (beta_correlation > eps_correlation) + beta_comp = true + beta_hat_MG = d_dGh_dl ./ (n_d_G_hat^2); + beta_hat_SD = (n_d_l^2) ./ d_dGh_dl; + if (TF(2.0)*beta_hat_MG) > beta_hat_SD + beta_hat = beta_hat_MG; + else + beta_hat = beta_hat_SD - beta_hat_MG/TF(2.0); end + end - #update rho and or gamma - if adjust_rho == true && adjust_gamma == false - if alpha_comp == true && beta_comp == true - rho[ii] = sqrt(alpha_hat*beta_hat); - elseif alpha_comp == true && beta_comp == false - rho[ii] = alpha_hat; - elseif alpha_comp == false && beta_comp == true - rho[ii] = beta_hat; - else - #rho = rho; #do nothing - end - elseif adjust_rho == true && adjust_gamma == true - if alpha_comp == true && beta_comp == true - rho[ii] = sqrt(alpha_hat*beta_hat); - gamma[ii]=TF(1.0)+(( TF(2.0)*sqrt(alpha_hat*beta_hat) ) ./ ( alpha_hat+beta_hat )); - elseif alpha_comp == true && beta_comp == false - rho[ii] = alpha_hat; - gamma[ii]=TF(1.9); - elseif alpha_comp == false && beta_comp == true - rho[ii] = beta_hat; - gamma[ii]=TF(1.1); - else - #rho = rho; #do nothing - gamma[ii]=TF(1.5); - end - elseif adjust_rho == false && adjust_gamma == true - if alpha_comp == true && beta_comp == true - gamma[ii]=TF(1.0)+(( TF(2.0)*sqrt(alpha_hat*beta_hat) ) ./ ( alpha_hat+beta_hat )); - elseif alpha_comp == true && beta_comp == false - gamma[ii]=TF(1.9); - elseif alpha_comp == false && beta_comp == true - gamma[ii]=TF(1.1); - else - gamma[ii]=TF(1.5); - end - end #end compute new rho and gamma - end #end for loop + #update rho and or gamma + if adjust_rho == true && adjust_gamma == false + if alpha_comp == true && beta_comp == true + rho[ii] = sqrt(alpha_hat*beta_hat); + elseif alpha_comp == true && beta_comp == false + rho[ii] = alpha_hat; + elseif alpha_comp == false && beta_comp == true + rho[ii] = beta_hat; + else + #rho = rho; #do nothing + end + elseif adjust_rho == true && adjust_gamma == true + if alpha_comp == true && beta_comp == true + rho[ii] = sqrt(alpha_hat*beta_hat); + gamma[ii]=TF(1.0)+(( TF(2.0)*sqrt(alpha_hat*beta_hat) ) ./ ( alpha_hat+beta_hat )); + elseif alpha_comp == true && beta_comp == false + rho[ii] = alpha_hat; + gamma[ii]=TF(1.9); + elseif alpha_comp == false && beta_comp == true + rho[ii] = beta_hat; + gamma[ii]=TF(1.1); + else + #rho = rho; #do nothing + gamma[ii]=TF(1.5); + end + elseif adjust_rho == false && adjust_gamma == true + if alpha_comp == true && beta_comp == true + gamma[ii]=TF(1.0)+(( TF(2.0)*sqrt(alpha_hat*beta_hat) ) ./ ( alpha_hat+beta_hat )); + elseif alpha_comp == true && beta_comp == false + gamma[ii]=TF(1.9); + elseif alpha_comp == false && beta_comp == true + gamma[ii]=TF(1.1); + else + gamma[ii]=TF(1.5); + end + end #end compute new rho and gamma + end #end for loop -return rho,gamma,l_hat,d_l_hat,d_H_hat,d_l ,d_G_hat +return rho ,gamma, l_hat, d_l_hat, d_H_hat, d_l, d_G_hat end #function adapt_rho_gamma diff --git a/src/adapt_rho_gamma_parallel.jl b/src/adapt_rho_gamma_parallel.jl index f351435..b73cb99 100644 --- a/src/adapt_rho_gamma_parallel.jl +++ 
b/src/adapt_rho_gamma_parallel.jl @@ -26,104 +26,104 @@ function adapt_rho_gamma_parallel( d_G_hat ::Vector{Vector{TF}} ) where {TF<:Real} -eps_correlation = TF(0.3) #hardcoded and suggested value by the paper based on numerical evidence - - - if TF==Float64 - safeguard = 1e-10 - elseif TF==Float32 - safeguard = 1f-6 - end - - #standard loop-fusion - @. l_hat[1] = l_old[1] + rho[1]* ( -s[1] + y_old[1] ) - # @. d_l_hat[1] = l_hat[1] - l_hat_0[1] - # @. d_H_hat[1] = s[1] - s_0[1] - # @. d_l[1] = l[1] - l_0[1] - # @. d_G_hat[1] = -(y[1] - y_0[1]) - - #wrap multi-theading inside a function - a_is_b_min_c_MT!(d_l_hat[1],l_hat[1],l_hat_0[1]) - a_is_b_min_c_MT!(d_H_hat[1],s[1],s_0[1]) - a_is_b_min_c_MT!(d_l[1],l[1],l_0[1]) - a_is_b_min_c_MT!(d_G_hat[1],y_0[1],y[1]) - - - d_dHh_dlh = dot(d_H_hat[1],d_l_hat[1]) - d_dGh_dl = dot(d_G_hat[1],d_l[1]) - - n_d_H_hat = norm(d_H_hat[1]) - n_d_l_hat = norm(d_l_hat[1]) - n_d_l = norm(d_l[1]) - n_d_G_hat = norm(d_G_hat[1]) - - alpha_comp = false - #print(alpha_comp) - if (n_d_H_hat*n_d_l_hat) > safeguard && (n_d_H_hat.^2) > safeguard && d_dHh_dlh>safeguard - alpha_correlation = d_dHh_dlh./( n_d_H_hat*n_d_l_hat ) - if alpha_correlation > eps_correlation - alpha_comp = true - alpha_hat_MG = d_dHh_dlh./(n_d_H_hat.^2); - alpha_hat_SD = (n_d_l_hat^2)./d_dHh_dlh; - if (TF(2.0)*alpha_hat_MG) > alpha_hat_SD - alpha_hat = alpha_hat_MG; - else - alpha_hat = alpha_hat_SD - alpha_hat_MG/TF(2.0); - end - end - end - - beta_comp = false - if (n_d_G_hat*n_d_l) > safeguard && (n_d_G_hat^2) > safeguard && d_dGh_dl > safeguard - beta_correlation = d_dGh_dl./( n_d_G_hat*n_d_l ) - if beta_correlation > eps_correlation - beta_comp = true - beta_hat_MG = d_dGh_dl./(n_d_G_hat^2); - beta_hat_SD = (n_d_l^2)./d_dGh_dl; - if (TF(2.0)*beta_hat_MG) > beta_hat_SD - beta_hat = beta_hat_MG; - else - beta_hat = beta_hat_SD - beta_hat_MG/TF(2.0); - end - end + eps_correlation = TF(0.3) #hardcoded and suggested value by the paper based on numerical evidence + + + if TF==Float64 + safeguard = 1e-10 + elseif TF==Float32 + safeguard = 1f-6 + end + + #standard loop-fusion + @. l_hat[1] = l_old[1] + rho[1]* ( -s[1] + y_old[1] ) + # @. d_l_hat[1] = l_hat[1] - l_hat_0[1] + # @. d_H_hat[1] = s[1] - s_0[1] + # @. d_l[1] = l[1] - l_0[1] + # @. 
d_G_hat[1] = -(y[1] - y_0[1]) + + #wrap multi-theading inside a function + a_is_b_min_c_MT!(d_l_hat[1],l_hat[1],l_hat_0[1]) + a_is_b_min_c_MT!(d_H_hat[1],s[1],s_0[1]) + a_is_b_min_c_MT!(d_l[1],l[1],l_0[1]) + a_is_b_min_c_MT!(d_G_hat[1],y_0[1],y[1]) + + + d_dHh_dlh = dot(d_H_hat[1],d_l_hat[1]) + d_dGh_dl = dot(d_G_hat[1],d_l[1]) + + n_d_H_hat = norm(d_H_hat[1]) + n_d_l_hat = norm(d_l_hat[1]) + n_d_l = norm(d_l[1]) + n_d_G_hat = norm(d_G_hat[1]) + + alpha_comp = false + #print(alpha_comp) + if (n_d_H_hat*n_d_l_hat) > safeguard && (n_d_H_hat.^2) > safeguard && d_dHh_dlh>safeguard + alpha_correlation = d_dHh_dlh./( n_d_H_hat*n_d_l_hat ) + if alpha_correlation > eps_correlation + alpha_comp = true + alpha_hat_MG = d_dHh_dlh./(n_d_H_hat.^2); + alpha_hat_SD = (n_d_l_hat^2)./d_dHh_dlh; + if (TF(2.0)*alpha_hat_MG) > alpha_hat_SD + alpha_hat = alpha_hat_MG; + else + alpha_hat = alpha_hat_SD - alpha_hat_MG/TF(2.0); end - - #update rho and or gamma - if adjust_rho == true && adjust_gamma == true - if alpha_comp == true && beta_comp == true - rho[1] = sqrt(alpha_hat*beta_hat); - gamma[1]=TF(1.0)+(( TF(2.0)*sqrt(alpha_hat*beta_hat) )./( alpha_hat+beta_hat )); - elseif alpha_comp == true && beta_comp == false - rho[1] = alpha_hat; - gamma[1]=TF(1.9); - elseif alpha_comp == false && beta_comp == true - rho[1] = beta_hat; - gamma[1]=TF(1.1); + end + end + + beta_comp = false + if (n_d_G_hat*n_d_l) > safeguard && (n_d_G_hat^2) > safeguard && d_dGh_dl > safeguard + beta_correlation = d_dGh_dl./( n_d_G_hat*n_d_l ) + if beta_correlation > eps_correlation + beta_comp = true + beta_hat_MG = d_dGh_dl./(n_d_G_hat^2); + beta_hat_SD = (n_d_l^2)./d_dGh_dl; + if (TF(2.0)*beta_hat_MG) > beta_hat_SD + beta_hat = beta_hat_MG; else - #rho = rho; #do nothing - gamma[1]=TF(1.5); + beta_hat = beta_hat_SD - beta_hat_MG/TF(2.0); end - elseif adjust_rho == true && adjust_gamma == false - if alpha_comp == true && beta_comp == true - rho[1] = sqrt(alpha_hat*beta_hat); - elseif alpha_comp == true && beta_comp == false - rho[1] = alpha_hat; - elseif alpha_comp == false && beta_comp == true - rho[1] = beta_hat; - else - #rho = rho; #do nothing - end - elseif adjust_rho == false && adjust_gamma == true - if alpha_comp == true && beta_comp == true - gamma[1]=TF(1.0)+(( TF(2.0)*sqrt(alpha_hat*beta_hat) )./( alpha_hat+beta_hat )); - elseif alpha_comp == true && beta_comp == false - gamma[1]=TF(1.9); - elseif alpha_comp == false && beta_comp == true - gamma[1]=TF(1.1); - else - gamma[1]=TF(1.5); - end - end #end compute new rho and gamma + end + end + + #update rho and or gamma + if adjust_rho == true && adjust_gamma == true + if alpha_comp == true && beta_comp == true + rho[1] = sqrt(alpha_hat*beta_hat); + gamma[1]=TF(1.0)+(( TF(2.0)*sqrt(alpha_hat*beta_hat) )./( alpha_hat+beta_hat )); + elseif alpha_comp == true && beta_comp == false + rho[1] = alpha_hat; + gamma[1]=TF(1.9); + elseif alpha_comp == false && beta_comp == true + rho[1] = beta_hat; + gamma[1]=TF(1.1); + else + #rho = rho; #do nothing + gamma[1]=TF(1.5); + end + elseif adjust_rho == true && adjust_gamma == false + if alpha_comp == true && beta_comp == true + rho[1] = sqrt(alpha_hat*beta_hat); + elseif alpha_comp == true && beta_comp == false + rho[1] = alpha_hat; + elseif alpha_comp == false && beta_comp == true + rho[1] = beta_hat; + else + #rho = rho; #do nothing + end + elseif adjust_rho == false && adjust_gamma == true + if alpha_comp == true && beta_comp == true + gamma[1]=TF(1.0)+(( TF(2.0)*sqrt(alpha_hat*beta_hat) )./( alpha_hat+beta_hat )); + elseif alpha_comp == 
true && beta_comp == false + gamma[1]=TF(1.9); + elseif alpha_comp == false && beta_comp == true + gamma[1]=TF(1.1); + else + gamma[1]=TF(1.5); + end + end #end compute new rho and gamma diff --git a/src/cg.jl b/src/cg.jl index ff0767c..eb92023 100644 --- a/src/cg.jl +++ b/src/cg.jl @@ -85,16 +85,20 @@ function cg(A::Function,b::Vector{TF}; tol::Real=1e-2,maxIter::Integer=100,M::Fu Ap = A(p)::Vector{TF} gamma = dot(r,z) + #gamma = BLAS.dot(n, r,1,z,1)# alpha = gamma / dot(p,Ap) + #alpha = gamma / BLAS.dot(n, p,1,Ap,1)#dot(p,Ap) + if alpha==Inf || alpha<0 flag = -2; break end BLAS.axpy!(n,alpha,p,1,x,1) # x = alpha*p+x - if storeInterm; X[:,iter] = x; end + #if storeInterm; X[:,iter] = x; end BLAS.axpy!(n,-alpha,Ap,1,r,1) # r -= alpha*Ap - resvec[iter] = norm(r)/nr0 + #resvec[iter] = BLAS.nrm2(n, r, 1) / nr0# + resvec[iter] = norm(r)/nr0 if out==2 println(iter,resvec[iter]) end @@ -103,10 +107,12 @@ function cg(A::Function,b::Vector{TF}; tol::Real=1e-2,maxIter::Integer=100,M::Fu end z = M(r)::Vector{TF} - beta = dot(z,r) / gamma + #beta = BLAS.dot(n, z,1,r,1) / gamma + beta = dot(z,r) / gamma # the following two lines are equivalent to p = z + beta*p - p = BLAS.scal!(n,beta,p,1)::Vector{TF} - p = BLAS.axpy!(n,TF(1.0),z,1,p,1)::Vector{TF} + #p = BLAS.scal!(n,beta,p,1)::Vector{TF} + #p = BLAS.axpy!(n,TF(1.0),z,1,p,1)::Vector{TF} + p = BLAS.axpby!(TF(1.0), z, beta, p)::Vector{TF} # Overwrite Y with X*a + Y*b, where a and b are scalars. Return Y end if out>=0 diff --git a/src/compute_relative_feasibility.jl b/src/compute_relative_feasibility.jl index 98101aa..45b8282 100644 --- a/src/compute_relative_feasibility.jl +++ b/src/compute_relative_feasibility.jl @@ -6,13 +6,19 @@ r_feas = ||P(A*x)-A*x||/||A*x|| Helper function that is intended for the parallel version only, where just a single operator is sent to a worker where this script runs """ -function compute_relative_feasibility(m::Vector{TF}, feasibility_initial::Vector{TF}, - TD_OP::Vector{Union{SparseMatrixCSC{TF,TI},JOLI.joAbstractLinearOperator{TF,TF}}}, - P_sub) where {TF<:Real,TI<:Integer} +# function compute_relative_feasibility(m::Vector{TF}, feasibility_initial::Vector{TF}, +# TD_OP::Vector{Union{SparseMatrixCSC{TF,TI},JOLI.joAbstractLinearOperator{TF,TF}}}, +# P_sub) where {TF<:Real,TI<:Integer} +function compute_relative_feasibility(m::Vector{TF},feasibility_initial::TF, + TD_OP::Union{SparseMatrixCSC{TF,TI},JOLI.joAbstractLinearOperator{TF,TF}}, + P_sub) where {TF<:Real,TI<:Integer} - #the two lines below don't work in this case, because P_sub mutates the input - # pm = TD_OP[1]*m - # feasibility_initial[1]=norm(P_sub[1](pm) .- pm) ./ (norm(pm)+(100*eps(TF))) - feasibility_initial[1]=norm(P_sub[1](TD_OP[1]*m) .- TD_OP[1]*m) ./ (norm(TD_OP[1]*m)+(100*eps(TF))) -#gc() check if we need this in julia >0.7 + #note that P_sub mutates the input in-place + s_temp = TD_OP*m + norm(P_sub(deepcopy(s_temp)) .- s_temp) + # feasibility_initial[1]=norm(P_sub[1](pm) .- pm) ./ (norm(pm)+(100*eps(TF))) + feasibility_initial = norm(P_sub(deepcopy(s_temp)) .- s_temp) ./ (norm(s_temp)+(100*eps(TF))) + s_temp = [] + #gc()# check if we need this in julia >0.7 + return feasibility_initial end diff --git a/src/projectors/project_l1_Duchi!.jl b/src/projectors/project_l1_Duchi!.jl index 509a31c..13a385d 100644 --- a/src/projectors/project_l1_Duchi!.jl +++ b/src/projectors/project_l1_Duchi!.jl @@ -1,4 +1,4 @@ -export project_l1_Duchi!, sa_old, sa_new! +export project_l1_Duchi! 
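#------------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the cg.jl hunk earlier
# in this patch replaces the two-call update
#     p = BLAS.scal!(n, beta, p, 1);  p = BLAS.axpy!(n, 1.0, z, 1, p, 1)
# with a single in-place BLAS.axpby!(1.0, z, beta, p), i.e. p .= z .+ beta .* p,
# which is exactly the p-update conjugate gradient needs and touches p only once.
# A small equivalence check (hypothetical values, Float32 to match options.FL==Float32):
#------------------------------------------------------------------------------
using LinearAlgebra
z     = rand(Float32, 5)
p     = rand(Float32, 5)
beta  = 0.7f0
p_ref = z .+ beta .* p                    # reference: p = z + beta*p
BLAS.axpby!(1.0f0, z, beta, p)            # overwrites p in place with 1*z + beta*p
p ≈ p_ref                                 # should be true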
""" project_l1_Duchi!(v::Union{Vector{TF},Vector{Complex{TF}}}, b::TF) @@ -29,7 +29,13 @@ function project_l1_Duchi!(v::Union{Vector{TF},Vector{Complex{TF}}}, b::TF) wher #use RadixSort for Float32 (short keywords) copyto!(u, v) u .= abs.(u) - sort!(u, rev=true, alg=RadixSort) + u = convert(Vector{TF},u) + if TF==Float32 + sort!(u, rev=true, alg=RadixSort) + else + u = sort!(u, rev=true, alg=QuickSort) + end + # if TF==Float32 # u = sort!(abs.(u), rev=true, alg=RadixSort) diff --git a/src/rhs_compose.jl b/src/rhs_compose.jl index 3f0677c..2cb02c5 100644 --- a/src/rhs_compose.jl +++ b/src/rhs_compose.jl @@ -4,25 +4,30 @@ export rhs_compose form the right-hand-side for the linear system in PARSDMM for the x-minimization """ function rhs_compose( - rhs ::Vector{TF}, - l ::Union{ Vector{Vector{TF}},DistributedArrays.DArray{Array{TF,1},1,Array{Array{TF,1},1}} }, - y ::Union{ Vector{Vector{TF}},DistributedArrays.DArray{Array{TF,1},1,Array{Array{TF,1},1}} }, - rho ::Union{ Vector{TF}, DistributedArrays.DArray{TF,1,Array{TF,1}} }, - TD_OP ::Union{Vector{Union{SparseMatrixCSC{TF,TI},JOLI.joAbstractLinearOperator{TF,TF}}},DistributedArrays.DArray{Union{JOLI.joAbstractLinearOperator{TF,TF}, SparseMatrixCSC{TF,TI}},1,Array{Union{JOLI.joAbstractLinearOperator{TF,TF}, SparseMatrixCSC{TF,TI}},1}} }, - p ::Integer, - Blas_active ::Bool, - parallel ::Bool - ) where {TF<:Real,TI<:Integer} + rhs ::Vector{TF}, + l ::Union{ Vector{Vector{TF}},DistributedArrays.DArray{Array{TF,1},1,Array{Array{TF,1},1}} }, + y ::Union{ Vector{Vector{TF}},DistributedArrays.DArray{Array{TF,1},1,Array{Array{TF,1},1}} }, + rho ::Union{ Vector{TF}, DistributedArrays.DArray{TF,1,Array{TF,1}} }, + TD_OP ::Union{Vector{Union{SparseMatrixCSC{TF,TI},JOLI.joAbstractLinearOperator{TF,TF}}},DistributedArrays.DArray{Union{JOLI.joAbstractLinearOperator{TF,TF}, SparseMatrixCSC{TF,TI}},1,Array{Union{JOLI.joAbstractLinearOperator{TF,TF}, SparseMatrixCSC{TF,TI}},1}} }, + p ::Integer, + Blas_active ::Bool, + parallel ::Bool + ) where {TF<:Real,TI<:Integer} -if parallel==true +if parallel == true rhs = @distributed (+) for ii = 1:p TD_OP[ii]'*(rho[ii] .* y[ii] .+ l[ii]) end + #mul!(rhs,TD_OP',rho .* y .+ l) #not possible + #rhs = TD_OP'*(rho .* y .+ l) possible but slow else #serial fill!(rhs,TF(0)); if Blas_active #serial with explicit BLAS calls + temp_array = zeros(TF,length(rhs)) for ii=1:p - BLAS.axpy!(TF(1.0), TD_OP[ii]'*(rho[ii] .* y[ii] .+ l[ii]), rhs) + mul!(temp_array,TD_OP[ii]',rho[ii] .* y[ii] .+ l[ii]) + #BLAS.axpy!(TF(1.0), TD_OP[ii]'*(rho[ii] .* y[ii] .+ l[ii]), rhs) + BLAS.axpy!(TF(1.0), temp_array, rhs) end else for ii=1:p #serial with loop fusion only @@ -32,4 +37,4 @@ else #serial end #end creating new rhs return rhs -end #function +end #function \ No newline at end of file diff --git a/src/update_y_l.jl b/src/update_y_l.jl index 86636c4..7ab6b28 100644 --- a/src/update_y_l.jl +++ b/src/update_y_l.jl @@ -39,7 +39,9 @@ for ii=1:p copy!(y_old[ii],y[ii]); copy!(l_old[ii],l[ii]); - s[ii] = TD_OP[ii]*x; + + mul!(s[ii],TD_OP[ii],x) #s[ii] = TD_OP[ii]*x; + if Blas_active #&& i>5 if gamma[ii]==1 #without relaxation copyto!(y[ii],s[ii]) @@ -55,49 +57,49 @@ for ii=1:p copyto!(y[ii],x_hat[ii]) #y[ii] = prox[ii]( BLAS.axpy!(-rho1[ii],l[ii],y[ii])) BLAS.axpy!(-rho1[ii],l[ii],y[ii]) - y[ii] = prox[ii](y[ii]) - r_pri[ii] .= -s[ii] .+ y[ii]; + y[ii] = prox[ii](y[ii]) + r_pri[ii] .= -s[ii] .+ y[ii]; BLAS.axpy!(rho[ii],y[ii] .- x_hat[ii],l[ii]); end - else + else #without explicit BLAS calls if gamma[ii]==1 #without relaxation #y[ii] = prox[ii]( 
s[ii].-l[ii].*rho1[ii] ); - y[ii] .= s[ii] .- l[ii].*rho1[ii] - y[ii] = prox[ii](y[ii]); + y[ii] .= s[ii] .- l[ii].*rho1[ii] + y[ii] = prox[ii](y[ii]); r_pri[ii] .= -s[ii] .+ y[ii]; l[ii] .= l[ii] .+ rho[ii] .* r_pri[ii]; else #relaxed iterations x_hat[ii] .= gamma[ii].*s[ii] .+ ( TF(1.0) .- gamma[ii] ) .* y[ii]; #y[ii] = prox[ii]( x_hat[ii].-l[ii].*rho1[ii] ); y[ii] .= x_hat[ii] .- l[ii] .* rho1[ii] - y[ii] = prox[ii]( y[ii] ); + y[ii] = prox[ii]( y[ii] ); r_pri[ii] .= -s[ii] .+ y[ii]; l[ii] .= l[ii] .+ rho[ii] .* ( -x_hat[ii] .+ y[ii] ); end end #end blas -log_PARSDMM.r_pri[i,ii] = norm(r_pri[ii])#./(norm(y[ii])+(100*eps(TF))); #add 1-14, because y may be 0 for certain initializations of x,y,l -if Blas_active - log_PARSDMM.r_dual[i,ii] = norm( BLAS.scal!(lx,rho[ii],TD_OP[ii]'*(y[ii] .- y_old[ii]),1 )) -else - log_PARSDMM.r_dual[i,ii] = norm( rho[ii] .* (TD_OP[ii]'*(y[ii] .- y_old[ii])) ) -end -#log feasibility -if feasibility_only==false - if mod(i,10)==0 && ii
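#------------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): several hunks in this
# patch (rhs_compose.jl and update_y_l.jl) replace out-of-place products such as
# s[ii] = TD_OP[ii]*x with the in-place mul!(s[ii], TD_OP[ii], x), so the preallocated
# output vector is reused on every PARSDMM iteration instead of being reallocated;
# this is part of the "memory reduction" mentioned in the commit message. A minimal,
# self-contained sketch of the pattern with hypothetical stand-in names:
#------------------------------------------------------------------------------
using LinearAlgebra, SparseArrays
A_op  = sprand(1_000, 1_000, 0.01)   # stand-in for one TD_OP block
x_in  = rand(1_000)
s_out = zeros(1_000)                 # preallocated once, outside the iteration loop
for it = 1:5
    mul!(s_out, A_op, x_in)          # overwrites s_out with A_op*x_in, no fresh allocation per iteration
end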