Cleanup (#180)

--------- Co-authored-by: Stefan Krastanov <[email protected]>
QuantumSavory · Sep 8, 2023 · cbd5aa5 · cbd5aa5
1 parent 5c9822e
commit cbd5aa5
Show file tree

Hide file tree

Showing 11 changed files with 177 additions and 131 deletions.
diff --git a/benchmark/benchmark-surface-code-gpu.jl b/benchmark/benchmark-surface-code-gpu.jl
@@ -10,7 +10,25 @@ using CSV
 
 include("surface-code-circuit.jl")
 
+function pftrajectories_threads(frames, ccircuit)
+    trajectories = size(frames.frame, 1)
+    nthr = min(Threads.nthreads(),trajectories÷(100))
+    if nthr>1
+        batchsize = trajectories÷nthr
+        Threads.@threads for i in 1:nthr
+            b = (i-1)*batchsize+1
+            e = i==nthr ? trajectories : i*batchsize
+            pftrajectories((@view frames[b:e]), ccircuit)
+        end
+    else
+        pftrajectories(frames, ccircuit)
+    end
+    return frames
+end
+
 function get_stats()
+    println("Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREADS = $(Sys.CPU_THREADS)`...")
+
     # Powers of 2 to benchmark
     powers = reverse(3:19) # start from the hardest one first! (fail fast)
     n_values = 2 .^ powers
@@ -33,10 +51,10 @@ function get_stats()
         cpu_frames() = to_cpu(QuantumClifford._create_pauliframe(ccircuit; trajectories=n))
         gpu_frames() = to_gpu(QuantumClifford._create_pauliframe(ccircuit; trajectories=n))
 
-        cpu_row() = pftrajectories(fastrow(cpu_frames()), ccircuit)
-        cpu_column() = pftrajectories(fastcolumn(cpu_frames()), ccircuit)
-        gpu_row() = pftrajectories(fastrow(gpu_frames()), ccircuit)
-        gpu_column() = pftrajectories(fastcolumn(gpu_frames()), ccircuit)
+        cpu_row() = pftrajectories_threads(fastrow(cpu_frames()), ccircuit)
+        cpu_column() = pftrajectories_threads(fastcolumn(cpu_frames()), ccircuit)
+        gpu_row() = pftrajectories(fastrow(gpu_frames()), ccircuit), synchronize()
+        gpu_column() = pftrajectories(fastcolumn(gpu_frames()), ccircuit), synchronize()
 
         push!(cpu_row_times, @belapsed $cpu_row())
         push!(cpu_column_times, @belapsed $cpu_column())

diff --git a/ext/QuantumCliffordGPUExt/QuantumCliffordGPUExt.jl b/ext/QuantumCliffordGPUExt/QuantumCliffordGPUExt.jl
@@ -12,5 +12,4 @@ include("pauli_frames.jl")
 include("fastmemlayout.jl")
 include("apply_noise.jl")
 
-# todo hide _apply function later. only for internal use
 end
diff --git a/ext/QuantumCliffordGPUExt/adapters.jl b/ext/QuantumCliffordGPUExt/adapters.jl
@@ -1,5 +1,6 @@
 const InnerGPUType = UInt32;
 const InnerCPUType = UInt64;
+
 # todo. make the conversion types consiting with this...
 # also define a cpu default type?!
 

diff --git a/ext/QuantumCliffordGPUExt/apply.jl b/ext/QuantumCliffordGPUExt/apply.jl
@@ -1,9 +1,13 @@
 using QuantumClifford: getxbit, getzbit, setxbit, setzbit
 
+##############################
+# SingleQubitOperator Kernel
+##############################
+
 function single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
                                  phases::DeviceVector{Tmz},
                                  op::SingleQubitOperator,
-                                 rows::Unsigned,
+                                 rows::Int,
                                  compute_phases::Bool) where {Tme <: Unsigned, Tmz <: Unsigned}
     idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if idx > rows
@@ -12,10 +16,9 @@ function single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
     c = op.q
     r = idx
     sh = QuantumClifford.getshift(Tme, c)
-    xx,zx,xz,zz = Tme.((op.xx,op.zx,op.xz,op.zz)) .<< sh # maybe do this in parent class?
+    xx,zx,xz,zz = Tme.((op.xx,op.zx,op.xz,op.zz)) .<< sh # maybe putting this in parent function call will improve speed?
     anticom = ~iszero((~zz & xz & ~xx & zx) | ( zz & ~xz & xx & zx) | (zz &  xz & xx & ~zx))
 
-    # todo. in future each gpu core can be responsible for multiple rows
     x = getxbit(xzs, r, c)
     z = getzbit(xzs, r, c)
     setxbit(xzs, r, c, (x&xx)⊻(z&zx))
@@ -38,10 +41,14 @@ function single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
     return nothing
 end
 
+##############################
+# AbstractSingleQubitOperator Kernel
+##############################
+
 function abstract_single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
                                  phases::DeviceVector{Tmz},
                                  gate::QuantumClifford.AbstractSingleQubitOperator,
-                                 rows::Unsigned,
+                                 rows::Int,
                                  compute_phases::Bool) where {Tme <: Unsigned, Tmz <: Unsigned}
     idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if idx > rows
@@ -59,32 +66,14 @@ function abstract_single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
     return nothing
 end
 
-function _apply!(stab::StabilizerGPU{T},
-    op::QuantumClifford.SingleQubitOperator;
-    phases::Val{B}=Val(true)) where {B, T <: Unsigned}
-    # todo how to use phases similar to before in kernel functions??!
-    rows::Unsigned = size(stab, 1)
-    tab = QuantumClifford.tab(stab)
-    # todo. why can't I pass phases=compute_phases normally without function call?
-    (@run_cuda single_qubit_gpu_kernel(tab.xzs, tab.phases, op, rows, B) rows)
-    return stab
-end
-
-function _apply!(stab::StabilizerGPU{T},
-    op::QuantumClifford.AbstractSingleQubitOperator;
-    phases::Val{B}=Val(true)) where {B, T <: Unsigned}
-
-    rows::Unsigned = size(stab, 1)
-    tab = QuantumClifford.tab(stab)
-    # todo. why can't I pass phases=compute_phases normally without function call?
-    (@run_cuda abstract_single_qubit_gpu_kernel(tab.xzs, tab.phases, op, rows, B) rows)
-    return stab
-end
+##############################
+# AbstractTwoQubitOperator Kernel
+##############################
 
 function two_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
                               phases::DeviceVector{Tze},
-                              gate::QuantumClifford.AbstractTwoQubitOperator, # todo. change to two qubit operator instead os abstract version
-                              rows::Unsigned,
+                              gate::QuantumClifford.AbstractTwoQubitOperator,
+                              rows::Int,
                               compute_phases::Bool=true) where {Tme <: Unsigned, Tze <: Unsigned}
     idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if idx > rows
@@ -112,13 +101,36 @@ function two_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
     return nothing
 end
 
+##############################
+# Wrapper _apply! functions
+##############################
+
+function _apply!(stab::StabilizerGPU{T},
+    op::QuantumClifford.SingleQubitOperator;
+
+    phases::Val{B}=Val(true)) where {B, T <: Unsigned}
+    rows = size(stab, 1)
+    tab = QuantumClifford.tab(stab)
+    (@run_cuda single_qubit_gpu_kernel(tab.xzs, tab.phases, op, rows, B) rows)
+    return stab
+end
+
+function _apply!(stab::StabilizerGPU{T},
+    op::QuantumClifford.AbstractSingleQubitOperator;
+    phases::Val{B}=Val(true)) where {B, T <: Unsigned}
+
+    rows = size(stab, 1)
+    tab = QuantumClifford.tab(stab)
+    (@run_cuda abstract_single_qubit_gpu_kernel(tab.xzs, tab.phases, op, rows, B) rows)
+    return stab
+end
 
 function _apply!(stab::StabilizerGPU{T}, 
                  gate::G; 
-                 phases::Val{B}=Val(true)) where {B, G<:QuantumClifford.AbstractTwoQubitOperator, T <: Unsigned} # todo. change to two qubit operator instead os abstract version
-    rows::Unsigned = size(stab, 1)
+                 phases::Val{B}=Val(true)) where {B, G<:QuantumClifford.AbstractTwoQubitOperator, T <: Unsigned}
+
+    rows = size(stab, 1)
     tab = QuantumClifford.tab(stab)
-    # todo. why can't I pass compute_phases=compute_phases normally without function call?
     (@run_cuda two_qubit_gpu_kernel(tab.xzs, tab.phases, gate, rows, B) rows)
     return stab
 end
diff --git a/ext/QuantumCliffordGPUExt/apply_noise.jl b/ext/QuantumCliffordGPUExt/apply_noise.jl
@@ -1,5 +1,7 @@
 using QuantumClifford: _div, _mod
 
+#according to https://github.com/JuliaGPU/CUDA.jl/blob/ac1bc29a118e7be56d9edb084a4dea4224c1d707/test/core/device/random.jl#L33
+#CUDA.jl supports calling rand() inside kernel
 function applynoise!(frame::PauliFrameGPU{T},noise::UnbiasedUncorrelatedNoise,i::Int) where {T <: Unsigned}
     p = noise.errprobthird
     lowbit = T(1)
@@ -9,7 +11,7 @@ function applynoise!(frame::PauliFrameGPU{T},noise::UnbiasedUncorrelatedNoise,i:
 
     stab = frame.frame
     xzs = tab(stab).xzs
-    rows::Unsigned = size(stab, 1)
+    rows = size(stab, 1)
 
     @run_cuda applynoise_kernel(xzs, p, ibig, ismallm, rows) rows
     return frame
@@ -20,14 +22,13 @@ function applynoise_kernel(xzs::DeviceMatrix{Tme},
     p::Real,
     ibig::Int,
     ismallm::Tme,
-    rows::Unsigned) where {Tme <: Unsigned} 
+    rows::Int) where {Tme <: Unsigned} 
 
     f = (blockIdx().x - 1) * blockDim().x + threadIdx().x;
     if f > rows
         return nothing
     end
 
-
     r = rand()
     if  r < p # X error
         xzs[ibig,f] ⊻= ismallm

diff --git a/ext/QuantumCliffordGPUExt/fastmemlayout.jl b/ext/QuantumCliffordGPUExt/fastmemlayout.jl
@@ -1,7 +1,14 @@
 using LinearAlgebra: Adjoint
 using QuantumClifford: Tableau
 
-# when we use to_gpu, to_cpu the efffect of fastrow, fastcolumn disappears
+"""
+TableauCUDA is the type of Tableau with its data stored in CUDA arrays
+TableauAdj is the type of Tableau with its data stored in an adjoint of a CUDA arrays
+
+rest of the fastrow, fastcolumn functions are implemented in QuantumClifford.jl
+"""
+
+# todo when we use to_gpu, to_cpu the efffect of fastrow, fastcolumn disappears
 fastrow(t::TableauCUDA) = t
 fastrow(t::TableauAdj) = Tableau(t.phases, t.nqubits, CuArray(t.xzs))
 fastcolumn(t::TableauCUDA) = Tableau(t.phases, t.nqubits, CuArray(t.xzs')')

diff --git a/ext/QuantumCliffordGPUExt/pauli_frames.jl b/ext/QuantumCliffordGPUExt/pauli_frames.jl
@@ -1,7 +1,6 @@
-function apply!(f::PauliFrameGPU{T}, op::QuantumClifford.AbstractCliffordOperator) where {T <: Unsigned}
-    _apply!(f.frame, op; phases=Val(false))
-    return f
-end
+##############################
+# sMZ
+##############################
 
 function apply_sMZ_kernel!(xzs::DeviceMatrix{Tme},
                           measurements::DeviceMatrix{Bool},
@@ -18,7 +17,7 @@ function apply_sMZ_kernel!(xzs::DeviceMatrix{Tme},
     return nothing
 end
 
-function apply!(frame::PauliFrameGPU{T}, op::QuantumClifford.sMZ) where {T <: Unsigned} # TODO sMX, sMY
+function apply!(frame::PauliFrameGPU{T}, op::QuantumClifford.sMZ) where {T <: Unsigned}
     op.bit == 0 && return frame
     i = op.qubit
     xzs = frame.frame.tab.xzs
@@ -30,6 +29,10 @@ function apply!(frame::PauliFrameGPU{T}, op::QuantumClifford.sMZ) where {T <: Un
     return frame
 end
 
+##############################
+# sMRZ
+##############################
+
 function apply_sMRZ_kernel!(xzs::DeviceMatrix{Tme},
                           measurements::DeviceMatrix{Bool},
                           op::QuantumClifford.sMRZ,
@@ -59,81 +62,3 @@ function apply!(frame::PauliFrameGPU{T}, op::QuantumClifford.sMRZ) where {T <: U
     (@run_cuda apply_sMRZ_kernel!(xzs, frame.measurements, op, ibig, ismallm, length(frame)) length(frame))
     return frame
 end
-
-
-
-##################################################################################
-
-# function partition_circuit(::Type{T}, circ) where {T <: Unsigned}
-#     # time complexity: O(gates * qubits/64) for UInt64
-
-#     circ = if eltype(circ) <: QuantumClifford.CompactifiedGate
-#         circ
-#     else
-#         compactify_circuit(circ)
-#     end
-
-#     qmax = maximum((maximum(affectedqubits(g)) for g in circ))
-#     columns_cnt = QuantumClifford._div(T, qmax) + 1
-
-#     partitions = []
-#     partition_idx = []
-#     last_column_occupier = [0 for _ in 1:columns_cnt] # last_column_occupier[i] = group id of the last gate that occupies indice i
-
-#     for g in circ
-#         column_indices = [QuantumClifford.getbigindex(T, qbit) for qbit in affectedqubits(g)]
-#         my_partition_idx = maximum(last_column_occupier[column_indices]) + 1
-#         push!(partition_idx, my_partition_idx)
-#         if my_partition_idx > length(partitions)
-#             push!(partitions, [g])
-#         else
-#             push!(partitions[my_partition_idx], g)
-#         end
-#         last_column_occupier[column_indices] .= my_partition_idx
-#     end
-#     partitions
-# end
-
-# function partition_circuit(circ)
-#     # time complexity: O(gates * qubits/64) for UInt64
-
-#     circ = if eltype(circ) <: QuantumClifford.CompactifiedGate
-#         circ
-#     else
-#         compactify_circuit(circ)
-#     end
-
-#     qmax = maximum((maximum(affectedqubits(g)) for g in circ))
-
-#     partitions = []
-#     partition_idx = []
-#     last_column_occupier = [0 for _ in 1:qmax] # last_column_occupier[i] = group id of the last gate that occupies indice i
-
-#     for g in circ
-#         indices = [x for x in affectedqubits(g)]
-#         my_partition_idx = maximum(last_column_occupier[indices]) + 1
-#         push!(partition_idx, my_partition_idx)
-#         if my_partition_idx > length(partitions)
-#             push!(partitions, [g])
-#         else
-#             push!(partitions[my_partition_idx], g)
-#         end
-#         last_column_occupier[indices] .= my_partition_idx
-#     end
-#     partitions
-# end
-
-# function pftrajectories(state::PauliFrameGPU{T}, circuit) where {T <: Unsigned}
-#     for gates in partition_circuit(circuit)
-#         # for gate in gates
-#         #     CUDA.@sync apply!(state, gate)
-#         # end
-
-#         @sync begin
-#             for gate in gates
-#                 @async apply!(state, gate)
-#             end
-#         end
-#     end
-#     return state
-# end
diff --git a/ext/QuantumCliffordGPUExt/types.jl b/ext/QuantumCliffordGPUExt/types.jl
@@ -1,7 +1,8 @@
 using LinearAlgebra: Adjoint
 
-# todo is there anyway to do this automatically so that if the code in QuantumClifford changes, we don't have to change this?!
-# especially with UInt8 types
+
+const PhaseInnerType = UInt8
+const MeasurementInnerType = Bool
 
 CUDAValue{T} = CuArray{T, 0, CUDA.Mem.DeviceBuffer} where {T}
 CUDAVector{T} = CuArray{T, 1, CUDA.Mem.DeviceBuffer} where {T}
@@ -13,28 +14,27 @@ AdjCUDAVector{T} = CuArray{T, 1, CUDA.Mem.DeviceBuffer} where {T} # do not adjoi
 AdjCUDAMatrix{T} = Adjoint{T, CuArray{T, 2, CUDA.Mem.DeviceBuffer}} where {T}
 AdjCUDAParams = [AdjCUDAValue, AdjCUDAVector, AdjCUDAMatrix]
 
-# what is CuDeviceArray?
 DeviceValue{T} = Union{CuDeviceArray{T, 0, 1}, Adjoint{T, CuDeviceArray{T, 0, 1}}} where {T}
 DeviceVector{T} = Union{CuDeviceArray{T, 1, 1}, Adjoint{T, CuDeviceArray{T, 1, 1}}} where {T}
 DeviceMatrix{T} = Union{CuDeviceArray{T, 2, 1}, Adjoint{T, CuDeviceArray{T, 2, 1}}} where {T}
 
 
 function getTableauGPU(GPUValue, GPUVector, GPUMatrix)
-    TableauGPU{T} = QuantumClifford.Tableau{Tzv, Tm} where {T <: Unsigned, Tzv <: GPUVector{UInt8}, Tm <: GPUMatrix{T}}
+    TableauGPU{T} = QuantumClifford.Tableau{Tzv, Tm} where {T <: Unsigned, Tzv <: GPUVector{PhaseInnerType}, Tm <: GPUMatrix{T}}
 end
 function getStabilizerGPU(GPUValue, GPUVector, GPUMatrix, GPUTableau)
     StabilizerGPU{T} = QuantumClifford.Stabilizer{<:GPUTableau{T}} where {T <: Unsigned}
 end
 function getPauliOperatorGPU(GPUValue, GPUVector, GPUMatrix, GPUTableau)
-    PauliOperatorGPU{T} = QuantumClifford.PauliOperator{Tz, Tv} where {T <: Unsigned, Tz<:GPUValue{UInt8}, Tv<:GPUVector{T}}
+    PauliOperatorGPU{T} = QuantumClifford.PauliOperator{Tz, Tv} where {T <: Unsigned, Tz<:GPUValue{PhaseInnerType}, Tv<:GPUVector{T}}
 end
 
 # todo. type definition here is stronger than the code in pauliframes.jl  this will cause serious problems
 # especially because its not obvious whether TFrame is Tableau or Stabilizer in pauliframes.jl
 # and we are assuming that TMeasurement is made of booleans
 function getPauliFrameGPU(GPUValue, GPUVector, GPUMatrix, GPUTableau)
     StabilizerGPU{T} = getStabilizerGPU(GPUValue, GPUVector, GPUMatrix, GPUTableau){T} where {T <: Unsigned}
-    PauliFrameGPU{T} = QuantumClifford.PauliFrame{TFrame, TMeasurement} where {T <: Unsigned, TFrame <: StabilizerGPU{T}, TMeasurement <: GPUMatrix{Bool}}
+    PauliFrameGPU{T} = QuantumClifford.PauliFrame{TFrame, TMeasurement} where {T <: Unsigned, TFrame <: StabilizerGPU{T}, TMeasurement <: GPUMatrix{MeasurementInnerType}}
 end
 
 const TableauCUDA{T} = getTableauGPU(CUDAParams...){T} where {T <: Unsigned}

diff --git a/ext/QuantumCliffordGPUExt/utils.jl b/ext/QuantumCliffordGPUExt/utils.jl
@@ -9,7 +9,6 @@ macro run_cuda(call, ndrange)
         local config = CUDA.launch_configuration(kernel.fun)
         local threads = min($(esc(ndrange)), config.threads)
         local blocks = cld($(esc(ndrange)), threads)
-        # @show threads blocks
         kernel($(args...); threads=threads, blocks=blocks)
     end
 end