Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup #180

Merged
merged 19 commits into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions benchmark/benchmark-surface-code-gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,25 @@ using CSV

include("surface-code-circuit.jl")

function pftrajectories_threads(frames, ccircuit)
trajectories = size(frames.frame, 1)
nthr = min(Threads.nthreads(),trajectories÷(100))
if nthr>1
batchsize = trajectories÷nthr
Threads.@threads for i in 1:nthr
b = (i-1)*batchsize+1
e = i==nthr ? trajectories : i*batchsize
pftrajectories((@view frames[b:e]), ccircuit)
end
else
pftrajectories(frames, ccircuit)
end
return frames
end

function get_stats()
println("Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREADS = $(Sys.CPU_THREADS)`...")

# Powers of 2 to benchmark
powers = reverse(3:19) # start from the hardest one first! (fail fast)
n_values = 2 .^ powers
Expand All @@ -33,10 +51,10 @@ function get_stats()
cpu_frames() = to_cpu(QuantumClifford._create_pauliframe(ccircuit; trajectories=n))
gpu_frames() = to_gpu(QuantumClifford._create_pauliframe(ccircuit; trajectories=n))

cpu_row() = pftrajectories(fastrow(cpu_frames()), ccircuit)
cpu_column() = pftrajectories(fastcolumn(cpu_frames()), ccircuit)
gpu_row() = pftrajectories(fastrow(gpu_frames()), ccircuit)
gpu_column() = pftrajectories(fastcolumn(gpu_frames()), ccircuit)
cpu_row() = pftrajectories_threads(fastrow(cpu_frames()), ccircuit)
cpu_column() = pftrajectories_threads(fastcolumn(cpu_frames()), ccircuit)
gpu_row() = pftrajectories(fastrow(gpu_frames()), ccircuit), synchronize()
gpu_column() = pftrajectories(fastcolumn(gpu_frames()), ccircuit), synchronize()

push!(cpu_row_times, @belapsed $cpu_row())
push!(cpu_column_times, @belapsed $cpu_column())
Expand Down
1 change: 0 additions & 1 deletion ext/QuantumCliffordGPUExt/QuantumCliffordGPUExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,4 @@ include("pauli_frames.jl")
include("fastmemlayout.jl")
include("apply_noise.jl")

# todo hide _apply function later. only for internal use
end
1 change: 1 addition & 0 deletions ext/QuantumCliffordGPUExt/adapters.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
const InnerGPUType = UInt32;
const InnerCPUType = UInt64;

# todo. make the conversion types consiting with this...
# also define a cpu default type?!

Expand Down
72 changes: 42 additions & 30 deletions ext/QuantumCliffordGPUExt/apply.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
using QuantumClifford: getxbit, getzbit, setxbit, setzbit

##############################
# SingleQubitOperator Kernel
##############################

function single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
phases::DeviceVector{Tmz},
op::SingleQubitOperator,
rows::Unsigned,
rows::Int,
compute_phases::Bool) where {Tme <: Unsigned, Tmz <: Unsigned}
idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if idx > rows
Expand All @@ -12,10 +16,9 @@ function single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
c = op.q
r = idx
sh = QuantumClifford.getshift(Tme, c)
xx,zx,xz,zz = Tme.((op.xx,op.zx,op.xz,op.zz)) .<< sh # maybe do this in parent class?
xx,zx,xz,zz = Tme.((op.xx,op.zx,op.xz,op.zz)) .<< sh # maybe putting this in parent function call will improve speed?
anticom = ~iszero((~zz & xz & ~xx & zx) | ( zz & ~xz & xx & zx) | (zz & xz & xx & ~zx))

# todo. in future each gpu core can be responsible for multiple rows
x = getxbit(xzs, r, c)
z = getzbit(xzs, r, c)
setxbit(xzs, r, c, (x&xx)⊻(z&zx))
Expand All @@ -38,10 +41,14 @@ function single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
return nothing
end

##############################
# AbstractSingleQubitOperator Kernel
##############################

function abstract_single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
phases::DeviceVector{Tmz},
gate::QuantumClifford.AbstractSingleQubitOperator,
rows::Unsigned,
rows::Int,
compute_phases::Bool) where {Tme <: Unsigned, Tmz <: Unsigned}
idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if idx > rows
Expand All @@ -59,32 +66,14 @@ function abstract_single_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
return nothing
end

function _apply!(stab::StabilizerGPU{T},
op::QuantumClifford.SingleQubitOperator;
phases::Val{B}=Val(true)) where {B, T <: Unsigned}
# todo how to use phases similar to before in kernel functions??!
rows::Unsigned = size(stab, 1)
tab = QuantumClifford.tab(stab)
# todo. why can't I pass phases=compute_phases normally without function call?
(@run_cuda single_qubit_gpu_kernel(tab.xzs, tab.phases, op, rows, B) rows)
return stab
end

function _apply!(stab::StabilizerGPU{T},
op::QuantumClifford.AbstractSingleQubitOperator;
phases::Val{B}=Val(true)) where {B, T <: Unsigned}

rows::Unsigned = size(stab, 1)
tab = QuantumClifford.tab(stab)
# todo. why can't I pass phases=compute_phases normally without function call?
(@run_cuda abstract_single_qubit_gpu_kernel(tab.xzs, tab.phases, op, rows, B) rows)
return stab
end
##############################
# AbstractTwoQubitOperator Kernel
##############################

function two_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
phases::DeviceVector{Tze},
gate::QuantumClifford.AbstractTwoQubitOperator, # todo. change to two qubit operator instead os abstract version
rows::Unsigned,
gate::QuantumClifford.AbstractTwoQubitOperator,
rows::Int,
compute_phases::Bool=true) where {Tme <: Unsigned, Tze <: Unsigned}
idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
if idx > rows
Expand Down Expand Up @@ -112,13 +101,36 @@ function two_qubit_gpu_kernel(xzs::DeviceMatrix{Tme},
return nothing
end

##############################
# Wrapper _apply! functions
##############################

function _apply!(stab::StabilizerGPU{T},
op::QuantumClifford.SingleQubitOperator;

phases::Val{B}=Val(true)) where {B, T <: Unsigned}
rows = size(stab, 1)
tab = QuantumClifford.tab(stab)
(@run_cuda single_qubit_gpu_kernel(tab.xzs, tab.phases, op, rows, B) rows)
return stab
end

function _apply!(stab::StabilizerGPU{T},
op::QuantumClifford.AbstractSingleQubitOperator;
phases::Val{B}=Val(true)) where {B, T <: Unsigned}

rows = size(stab, 1)
tab = QuantumClifford.tab(stab)
(@run_cuda abstract_single_qubit_gpu_kernel(tab.xzs, tab.phases, op, rows, B) rows)
return stab
end

function _apply!(stab::StabilizerGPU{T},
gate::G;
phases::Val{B}=Val(true)) where {B, G<:QuantumClifford.AbstractTwoQubitOperator, T <: Unsigned} # todo. change to two qubit operator instead os abstract version
rows::Unsigned = size(stab, 1)
phases::Val{B}=Val(true)) where {B, G<:QuantumClifford.AbstractTwoQubitOperator, T <: Unsigned}

rows = size(stab, 1)
tab = QuantumClifford.tab(stab)
# todo. why can't I pass compute_phases=compute_phases normally without function call?
(@run_cuda two_qubit_gpu_kernel(tab.xzs, tab.phases, gate, rows, B) rows)
return stab
end
7 changes: 4 additions & 3 deletions ext/QuantumCliffordGPUExt/apply_noise.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
using QuantumClifford: _div, _mod

#according to https://github.com/JuliaGPU/CUDA.jl/blob/ac1bc29a118e7be56d9edb084a4dea4224c1d707/test/core/device/random.jl#L33
#CUDA.jl supports calling rand() inside kernel
function applynoise!(frame::PauliFrameGPU{T},noise::UnbiasedUncorrelatedNoise,i::Int) where {T <: Unsigned}
p = noise.errprobthird
lowbit = T(1)
Expand All @@ -9,7 +11,7 @@ function applynoise!(frame::PauliFrameGPU{T},noise::UnbiasedUncorrelatedNoise,i:

stab = frame.frame
xzs = tab(stab).xzs
rows::Unsigned = size(stab, 1)
rows = size(stab, 1)

@run_cuda applynoise_kernel(xzs, p, ibig, ismallm, rows) rows
return frame
Expand All @@ -20,14 +22,13 @@ function applynoise_kernel(xzs::DeviceMatrix{Tme},
p::Real,
ibig::Int,
ismallm::Tme,
rows::Unsigned) where {Tme <: Unsigned}
rows::Int) where {Tme <: Unsigned}

f = (blockIdx().x - 1) * blockDim().x + threadIdx().x;
if f > rows
return nothing
end


r = rand()
if r < p # X error
xzs[ibig,f] ⊻= ismallm
Expand Down
9 changes: 8 additions & 1 deletion ext/QuantumCliffordGPUExt/fastmemlayout.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
using LinearAlgebra: Adjoint
using QuantumClifford: Tableau

# when we use to_gpu, to_cpu the efffect of fastrow, fastcolumn disappears
"""
TableauCUDA is the type of Tableau with its data stored in CUDA arrays
TableauAdj is the type of Tableau with its data stored in an adjoint of a CUDA arrays

rest of the fastrow, fastcolumn functions are implemented in QuantumClifford.jl
"""

# todo when we use to_gpu, to_cpu the efffect of fastrow, fastcolumn disappears
fastrow(t::TableauCUDA) = t
fastrow(t::TableauAdj) = Tableau(t.phases, t.nqubits, CuArray(t.xzs))
fastcolumn(t::TableauCUDA) = Tableau(t.phases, t.nqubits, CuArray(t.xzs')')
Expand Down
91 changes: 8 additions & 83 deletions ext/QuantumCliffordGPUExt/pauli_frames.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
function apply!(f::PauliFrameGPU{T}, op::QuantumClifford.AbstractCliffordOperator) where {T <: Unsigned}
_apply!(f.frame, op; phases=Val(false))
return f
end
##############################
# sMZ
##############################

function apply_sMZ_kernel!(xzs::DeviceMatrix{Tme},
measurements::DeviceMatrix{Bool},
Expand All @@ -18,7 +17,7 @@ function apply_sMZ_kernel!(xzs::DeviceMatrix{Tme},
return nothing
end

function apply!(frame::PauliFrameGPU{T}, op::QuantumClifford.sMZ) where {T <: Unsigned} # TODO sMX, sMY
function apply!(frame::PauliFrameGPU{T}, op::QuantumClifford.sMZ) where {T <: Unsigned}
op.bit == 0 && return frame
i = op.qubit
xzs = frame.frame.tab.xzs
Expand All @@ -30,6 +29,10 @@ function apply!(frame::PauliFrameGPU{T}, op::QuantumClifford.sMZ) where {T <: Un
return frame
end

##############################
# sMRZ
##############################

function apply_sMRZ_kernel!(xzs::DeviceMatrix{Tme},
measurements::DeviceMatrix{Bool},
op::QuantumClifford.sMRZ,
Expand Down Expand Up @@ -59,81 +62,3 @@ function apply!(frame::PauliFrameGPU{T}, op::QuantumClifford.sMRZ) where {T <: U
(@run_cuda apply_sMRZ_kernel!(xzs, frame.measurements, op, ibig, ismallm, length(frame)) length(frame))
return frame
end



##################################################################################

# function partition_circuit(::Type{T}, circ) where {T <: Unsigned}
# # time complexity: O(gates * qubits/64) for UInt64

# circ = if eltype(circ) <: QuantumClifford.CompactifiedGate
# circ
# else
# compactify_circuit(circ)
# end

# qmax = maximum((maximum(affectedqubits(g)) for g in circ))
# columns_cnt = QuantumClifford._div(T, qmax) + 1

# partitions = []
# partition_idx = []
# last_column_occupier = [0 for _ in 1:columns_cnt] # last_column_occupier[i] = group id of the last gate that occupies indice i

# for g in circ
# column_indices = [QuantumClifford.getbigindex(T, qbit) for qbit in affectedqubits(g)]
# my_partition_idx = maximum(last_column_occupier[column_indices]) + 1
# push!(partition_idx, my_partition_idx)
# if my_partition_idx > length(partitions)
# push!(partitions, [g])
# else
# push!(partitions[my_partition_idx], g)
# end
# last_column_occupier[column_indices] .= my_partition_idx
# end
# partitions
# end

# function partition_circuit(circ)
# # time complexity: O(gates * qubits/64) for UInt64

# circ = if eltype(circ) <: QuantumClifford.CompactifiedGate
# circ
# else
# compactify_circuit(circ)
# end

# qmax = maximum((maximum(affectedqubits(g)) for g in circ))

# partitions = []
# partition_idx = []
# last_column_occupier = [0 for _ in 1:qmax] # last_column_occupier[i] = group id of the last gate that occupies indice i

# for g in circ
# indices = [x for x in affectedqubits(g)]
# my_partition_idx = maximum(last_column_occupier[indices]) + 1
# push!(partition_idx, my_partition_idx)
# if my_partition_idx > length(partitions)
# push!(partitions, [g])
# else
# push!(partitions[my_partition_idx], g)
# end
# last_column_occupier[indices] .= my_partition_idx
# end
# partitions
# end

# function pftrajectories(state::PauliFrameGPU{T}, circuit) where {T <: Unsigned}
# for gates in partition_circuit(circuit)
# # for gate in gates
# # CUDA.@sync apply!(state, gate)
# # end

# @sync begin
# for gate in gates
# @async apply!(state, gate)
# end
# end
# end
# return state
# end
12 changes: 6 additions & 6 deletions ext/QuantumCliffordGPUExt/types.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
using LinearAlgebra: Adjoint

# todo is there anyway to do this automatically so that if the code in QuantumClifford changes, we don't have to change this?!
# especially with UInt8 types

const PhaseInnerType = UInt8
const MeasurementInnerType = Bool

CUDAValue{T} = CuArray{T, 0, CUDA.Mem.DeviceBuffer} where {T}
CUDAVector{T} = CuArray{T, 1, CUDA.Mem.DeviceBuffer} where {T}
Expand All @@ -13,28 +14,27 @@ AdjCUDAVector{T} = CuArray{T, 1, CUDA.Mem.DeviceBuffer} where {T} # do not adjoi
AdjCUDAMatrix{T} = Adjoint{T, CuArray{T, 2, CUDA.Mem.DeviceBuffer}} where {T}
AdjCUDAParams = [AdjCUDAValue, AdjCUDAVector, AdjCUDAMatrix]

# what is CuDeviceArray?
DeviceValue{T} = Union{CuDeviceArray{T, 0, 1}, Adjoint{T, CuDeviceArray{T, 0, 1}}} where {T}
DeviceVector{T} = Union{CuDeviceArray{T, 1, 1}, Adjoint{T, CuDeviceArray{T, 1, 1}}} where {T}
DeviceMatrix{T} = Union{CuDeviceArray{T, 2, 1}, Adjoint{T, CuDeviceArray{T, 2, 1}}} where {T}


function getTableauGPU(GPUValue, GPUVector, GPUMatrix)
TableauGPU{T} = QuantumClifford.Tableau{Tzv, Tm} where {T <: Unsigned, Tzv <: GPUVector{UInt8}, Tm <: GPUMatrix{T}}
TableauGPU{T} = QuantumClifford.Tableau{Tzv, Tm} where {T <: Unsigned, Tzv <: GPUVector{PhaseInnerType}, Tm <: GPUMatrix{T}}
end
function getStabilizerGPU(GPUValue, GPUVector, GPUMatrix, GPUTableau)
StabilizerGPU{T} = QuantumClifford.Stabilizer{<:GPUTableau{T}} where {T <: Unsigned}
end
function getPauliOperatorGPU(GPUValue, GPUVector, GPUMatrix, GPUTableau)
PauliOperatorGPU{T} = QuantumClifford.PauliOperator{Tz, Tv} where {T <: Unsigned, Tz<:GPUValue{UInt8}, Tv<:GPUVector{T}}
PauliOperatorGPU{T} = QuantumClifford.PauliOperator{Tz, Tv} where {T <: Unsigned, Tz<:GPUValue{PhaseInnerType}, Tv<:GPUVector{T}}
end

# todo. type definition here is stronger than the code in pauliframes.jl this will cause serious problems
# especially because its not obvious whether TFrame is Tableau or Stabilizer in pauliframes.jl
# and we are assuming that TMeasurement is made of booleans
function getPauliFrameGPU(GPUValue, GPUVector, GPUMatrix, GPUTableau)
StabilizerGPU{T} = getStabilizerGPU(GPUValue, GPUVector, GPUMatrix, GPUTableau){T} where {T <: Unsigned}
PauliFrameGPU{T} = QuantumClifford.PauliFrame{TFrame, TMeasurement} where {T <: Unsigned, TFrame <: StabilizerGPU{T}, TMeasurement <: GPUMatrix{Bool}}
PauliFrameGPU{T} = QuantumClifford.PauliFrame{TFrame, TMeasurement} where {T <: Unsigned, TFrame <: StabilizerGPU{T}, TMeasurement <: GPUMatrix{MeasurementInnerType}}
end

const TableauCUDA{T} = getTableauGPU(CUDAParams...){T} where {T <: Unsigned}
Expand Down
1 change: 0 additions & 1 deletion ext/QuantumCliffordGPUExt/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ macro run_cuda(call, ndrange)
local config = CUDA.launch_configuration(kernel.fun)
local threads = min($(esc(ndrange)), config.threads)
local blocks = cld($(esc(ndrange)), threads)
# @show threads blocks
kernel($(args...); threads=threads, blocks=blocks)
end
end
Loading