
Commit

improve parallel version, simplify some code, some memory reduction and speedup
PetersBas committed Jan 28, 2022
1 parent 6005194 commit 2026064
Showing 17 changed files with 781 additions and 708 deletions.
32 changes: 19 additions & 13 deletions examples/projection_intersection_2D.jl
@@ -36,14 +36,14 @@ elseif options.FL==Float32
end

#load image to project
file = matopen(joinpath(dirname(pathof(SetIntersectionProjection)), "../examples/Data/compass_velocity.mat"))
m = read(file, "Data");close(file)
m = m[1:341,200:600]
m = permutedims(m,[2,1])
file = matopen(joinpath(dirname(pathof(SetIntersectionProjection)), "../examples/Data/compass_velocity.mat"));
m = read(file, "Data");close(file);
m = m[1:341,200:600];
m = permutedims(m,[2,1]);

#set up computational grid (25 and 6 m are the original distances between grid points)
comp_grid = compgrid((TF(25.0), TF(6.0)),(size(m,1), size(m,2)))
m = convert(Vector{TF},vec(m))
m = convert(Vector{TF},vec(m));

#define axis limits and colorbar limits
xmax = comp_grid.d[1]*comp_grid.n[1]
@@ -55,13 +55,13 @@ vma = 4500
constraint = Vector{SetIntersectionProjection.set_definitions}()

#bounds:
m_min = 1500.0
m_min = 1480.0
m_max = 4500.0
set_type = "bounds"
TD_OP = "identity"
app_mode = ("matrix","")
custom_TD_OP = ([],false)
push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP))
push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP));

#slope constraints (vertical)
m_min = 0.0
@@ -70,11 +70,11 @@ set_type = "bounds"
TD_OP = "D_z"
app_mode = ("matrix","")
custom_TD_OP = ([],false)
push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP))
push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP));

options.parallel = false
(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL)
(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options)
(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL);
(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options);

println("")
println("PARSDMM serial (bounds and bounds on D_z):")
@@ -103,14 +103,17 @@ tight_layout()
#tight_layout(pad=0.0, w_pad=0.0, h_pad=1.0)
savefig("PARSDMM_logs.png",bbox_inches="tight")

#print timings in terminal
log_PARSDMM.timing

println("")
println("PARSDMM parallel (bounds and bounds on D_z):")
options.parallel = true
(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL)
(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options)
(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL);
(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options);

@time (x2,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
#@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
@time (x2,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
#@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);

#plot
@@ -119,6 +122,9 @@ subplot(2,1,1);imshow(permutedims(reshape(m,(comp_grid.n[1],comp_grid.n[2])),[2,
subplot(2,1,2);imshow(permutedims(reshape(x2,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("Projection (bounds and bounds on D_z)")
savefig("projected_model_ParallelPARSDMM.png",bbox_inches="tight")

#print timings in terminal
log_PARSDMM.timing

# #use multilevel-serial (2-levels)
# options.parallel = false

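The pattern in this example is worth spelling out: the serial and parallel runs differ only in the options.parallel flag, and setup_constraints plus PARSDMM_precompute_distribute are re-run so that the transform-domain operators and solver variables (TD_OP, AtA, l, y) can be distributed over the workers before PARSDMM is called. A minimal sketch of that toggle, reusing the variable names from the script above (so it is not self-contained on its own):

# serial projection
options.parallel = false
(P_sub, TD_OP, set_Prop) = setup_constraints(constraint, comp_grid, options.FL);
(TD_OP, AtA, l, y)       = PARSDMM_precompute_distribute(TD_OP, set_Prop, comp_grid, options);
@time (x1, log1) = PARSDMM(m, AtA, TD_OP, set_Prop, P_sub, comp_grid, options);

# parallel projection (requires worker processes with the package loaded; see the 3D example below)
options.parallel = true
(P_sub, TD_OP, set_Prop) = setup_constraints(constraint, comp_grid, options.FL);
(TD_OP, AtA, l, y)       = PARSDMM_precompute_distribute(TD_OP, set_Prop, comp_grid, options);
@time (x2, log2) = PARSDMM(m, AtA, TD_OP, set_Prop, P_sub, comp_grid, options);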
24 changes: 12 additions & 12 deletions examples/projection_intersection_3D.jl
@@ -3,7 +3,7 @@
# Bas Peters, 2017

using Distributed
using LinearAlgebra
@everywhere using LinearAlgebra
@everywhere using SetIntersectionProjection
using HDF5
using PyPlot
@@ -17,7 +17,7 @@ end
if ~isfile("overthrust_3D_true_model.h5")
run(`wget ftp://slim.gatech.edu/data/SoftwareRelease/WaveformInversion.jl/3DFWI/overthrust_3D_true_model.h5`)
end
n, d, o, m = read(h5open("overthrust_3D_true_model.h5","r"), "n", "d", "o", "m")
n, d, o, m = read(h5open("overthrust_3D_true_model.h5","r"), "n", "d", "o", "m");

m .= 1000.0 ./ sqrt.(m);
m = m[50:200,50:200,:];
@@ -36,7 +36,7 @@ options.Blas_active = true
options.maxit = 500

set_zero_subnormals(true)
BLAS.set_num_threads(4)
@everywhere BLAS.set_num_threads(4)

#select working precision
if options.FL==Float64
@@ -68,7 +68,7 @@ set_type = "bounds"
TD_OP = "identity"
app_mode = ("matrix","")
custom_TD_OP = ([],false)
push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP))
push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP));

#slope constraints (vertical)
m_min = 0.0
@@ -77,11 +77,11 @@ set_type = "bounds"
TD_OP = "D_z"
app_mode = ("matrix","")
custom_TD_OP = ([],false)
push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP))
push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP));

options.parallel = false
(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL)
(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options)
(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL);
(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options);

println("")
println("PARSDMM serial (bounds and bounds on D_z):")
@@ -133,12 +133,12 @@ println("PARSDMM multilevel-serial (bounds and bounds on D_z):")
println("")
println("PARSDMM parallel (bounds and bounds on D_z):")
options.parallel = true
(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL)
(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options)
(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL);
(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options);

@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options)
@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options)
@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options)
@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);
@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options);

#use multilevel-parallel (2-levels)
options.parallel = true
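One point the updated 3D example makes explicit is that the parallel version needs its dependencies and BLAS settings on every worker process, hence the @everywhere in front of using LinearAlgebra, using SetIntersectionProjection and BLAS.set_num_threads. A minimal sketch of that setup (the worker count is an illustrative assumption, not taken from the script):

using Distributed
addprocs(2)                              # illustrative; choose a worker count that fits your machine
@everywhere using LinearAlgebra
@everywhere using SetIntersectionProjection
@everywhere BLAS.set_num_threads(4)      # BLAS threads must be set on every worker, as in the script
set_zero_subnormals(true)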
64 changes: 56 additions & 8 deletions src/CDS_MVp.jl
@@ -1,4 +1,4 @@
export CDS_MVp
export CDS_MVp, CDS_MVp2, CDS_MVp3, CDS_MVp4

"""
compute single-thread matrix vector product with vector x, output is vector y: y=A*x
@@ -7,15 +7,15 @@ R is a tall matrix N by ndiagonals, corresponding to a square matrix A
offsets indicate offset of diagonal compared to the main diagonal in A (which is 0)
"""
function CDS_MVp(
N::Integer,
ndiags::Integer,
R::Array{TF,2},
offset::Vector{TI},
x::Vector{TF},
y::Vector{TF}) where {TF<:Real,TI<:Integer}
N ::Integer,
ndiags ::Integer,
R ::Array{TF,2},
offset ::Vector{TI},
x ::Vector{TF},
y ::Vector{TF}) where {TF<:Real,TI<:Integer}

for i = 1 : ndiags
d = offset[i]
d = offset[i]
r0 = max(1, 1-d)
r1 = min(N, N-d)
c0 = max(1, 1+d)
@@ -26,3 +26,51 @@ function CDS_MVp(
end
return y
end

# #the versions below is insignificantly faster than the original version above
# function CDS_MVp(
# N::Integer,
# ndiags::Integer,
# R::Array{TF,2},
# offset::Vector{TI},
# x::Vector{TF},
# y::Vector{TF}) where {TF<:Real,TI<:Integer}

# for i = 1 : ndiags
# d = offset[i]
# r0 = max(1, 1-d)
# r1 = min(N, N-d)
# c0 = max(1, 1+d)
# #r0_plus_c0 = r0 + c0
# c = deepcopy(c0)
# for r = r0 : r1
# @inbounds y[r] = y[r] + R[r,i] * x[c]#original
# c += 1
# end
# end
# return y
# end

# function CDS_MVp(
# N ::Integer,
# ndiags ::Integer,
# R ::Array{TF,2},
# offset ::Vector{TI},
# x ::Vector{TF},
# y ::Vector{TF}) where {TF<:Real,TI<:Integer}

# for i = 1 : ndiags
# d = offset[i]
# r0 = max(1, 1-d)
# r1 = min(N, N-d)
# c0 = max(1, 1+d)
# # for r = r0 : r1
# # c = r - r0 + c0 #original
# # @inbounds y[r] = y[r] + R[r,i] * x[c]#original
# # end
# @inbounds y[r0 : r1] .= y[r0 : r1] .+ R[r0:r1,i].*x[r0 - r0 + c0:r1 - r0 + c0]
# end

# return y
# end
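To make the CDS (compressed diagonal storage, also called DIA) convention in the docstring of CDS_MVp concrete: column i of R holds the diagonal whose offset from the main diagonal is offset[i], stored by row, so R[r,i] is the entry A[r, r+offset[i]]. A small worked sketch for a 4-by-4 tridiagonal matrix (values chosen purely for illustration; CDS_MVp is exported by the package, see the export line above):

using SetIntersectionProjection

N       = 4
offsets = [-1, 0, 1]                 # subdiagonal, main diagonal, superdiagonal
R       = zeros(Float64, N, length(offsets))
R[2:4, 1] .= -1.0                    # A[r, r-1] entries (rows 2:4 exist for offset -1)
R[:,   2] .=  2.0                    # A[r, r]   entries
R[1:3, 3] .= -1.0                    # A[r, r+1] entries (rows 1:3 exist for offset +1)

x = ones(Float64, N)
y = zeros(Float64, N)                # CDS_MVp accumulates into y, so start from zeros
CDS_MVp(N, length(offsets), R, offsets, x, y)
# y is now A*x = [1.0, 0.0, 0.0, 1.0] for the tridiagonal A with 2 on the diagonal and -1 next to it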

2 changes: 1 addition & 1 deletion src/CDS_MVp_MT.jl
@@ -15,7 +15,7 @@ function CDS_MVp_MT(
y ::Vector{TF}) where {TF<:Real,TI<:Integer}

for i = 1 : ndiags
d = offset[i]
d = offset[i]
r0 = max(1, 1-d)
r1 = min(N, N-d)
c0 = max(1, 1+d)
15 changes: 7 additions & 8 deletions src/CDS_MVp_MT_subfunc.jl
@@ -4,15 +4,14 @@ wrapper around Julia threads to compute a multi-threaded matrix vector product i
This is a subfunction of CDS_MVp_MT.jl
"""
function CDS_MVp_MT_subfunc(
R::Array{TF,2},
x::Vector{TF},
y::Vector{TF},
r0::Int,
c0::Int,
r1::Int,
i::Int) where {TF<:Real}
R ::Array{TF,2},
x ::Vector{TF},
y ::Vector{TF},
r0 ::Int,
c0 ::Int,
r1 ::Int,
i ::Int) where {TF<:Real}

#s=rind-1
@Threads.threads for r = r0 : r1
c = r - r0 + c0 #original
@inbounds y[r] = y[r] + R[r,i] * x[c]#original
4 changes: 2 additions & 2 deletions src/CDS_scaled_add!.jl
@@ -3,7 +3,7 @@ export CDS_scaled_add!

"""
Computes A = A + alpha * B for A and B in the compressed diagonal storage format (CDS/DIA)
TO DO: make this function multi-threaded per column of A and B
TO DO: make this function multi-threaded per column of A and B, or use BLAS functions
"""
function CDS_scaled_add!(
A ::Array{TF,2},
@@ -16,7 +16,7 @@ function CDS_scaled_add!(
for k=1:length(B_offsets)
A_update_col = findall((in)(B_offsets[k]),A_offsets)
if isempty(A_update_col) == true
error("attempted to update a diagonal in A in CDS storage that does not excist. A and B need to have the same nonzero diagonals")
error("attempted to update a diagonal in A in CDS storage that does not exist. A and B need to have the same nonzero diagonals")
end
A[:,A_update_col] .= A[:,A_update_col] .+ alpha .* B[:,k];
end
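The docstring above describes CDS_scaled_add! as A = A + alpha*B for matrices stored in CDS/DIA format, where each diagonal of B is added to the column of A that stores the same offset. A standalone sketch of that update rule on small illustrative arrays (it re-implements the visible loop rather than calling the function, since only part of its argument list is shown here):

A_offsets = [-1, 0, 1];  B_offsets = [0, 1]      # B's diagonals must all be present in A
A = ones(4, 3);          B = 2.0 .* ones(4, 2)
alpha = 0.5

for k = 1:length(B_offsets)
    col = findall(in(B_offsets[k]), A_offsets)   # column of A holding the same diagonal as B[:,k]
    isempty(col) && error("A and B need to have the same nonzero diagonals")
    A[:, col] .= A[:, col] .+ alpha .* B[:, k]
end
# A[:,2] and A[:,3] are now 2.0 (= 1 + 0.5*2); A[:,1], the subdiagonal of A, is untouched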
