From 44a18964a62652d853a494ded0b992e33d349ccd Mon Sep 17 00:00:00 2001
From: Anthony Micciche <32377705+amicciche@users.noreply.github.com>
Date: Thu, 31 Aug 2023 12:04:46 -0400
Subject: [PATCH 1/5] shor syndrome, classic xor, and pauli frame x measurement
 (#170)

---------

Co-authored-by: Stefan Krastanov <stefan@krastanov.org>
---
 CHANGELOG.md                                  |   6 +
 Project.toml                                  |   2 +-
 .../QuantumCliffordQuantikzExt.jl             |   1 +
 src/QuantumClifford.jl                        |   2 +-
 src/ecc/ECC.jl                                | 115 +++++++++++++++++-
 src/misc_ops.jl                               |   9 ++
 src/pauli_frames.jl                           |  25 +++-
 test/runtests.jl                              |   1 +
 test/test_ecc.jl                              |   4 +-
 test/test_ecc_syndromes.jl                    |  54 ++++++++
 test/test_jet.jl                              |   2 +-
 11 files changed, 208 insertions(+), 13 deletions(-)
 create mode 100644 test/test_ecc_syndromes.jl

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 47b8e2458..f69abf7b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,9 +2,15 @@
 
 - `permute` will be a wrapper around to `QuantumInterface.permutesubsystems`. Documentation for `permute!` would be similarly updated
 - reworking the rest of `NoisyCircuits` and moving it out of `Experimental`
+- `random_pauli(...; realphase=true)` should be the default
 
 # News
 
+## v0.8.16 - 2023-08-31
+
+- **(breaking)** Changes to the circuit generation in the non-public ECC module. Adding a Shor syndrome measurement circuit.
+- Added a `ClassicalXOR` gate, for now supporting only `PauliFrame`s.
+
 ## v0.8.15 - 2023-08-16
 
 - Initial support for GPU accelerated circuit simulation (with CUDA).
diff --git a/Project.toml b/Project.toml
index 68d25a160..47542bfa0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -47,7 +47,7 @@ Makie = "0.19.7"
 Nemo = "0.34, 0.35"
 Plots = "1.38.0"
 PrecompileTools = "1"
-Quantikz = "1.3"
+Quantikz = "1.3.1"
 QuantumInterface = "0.3.0"
 QuantumOpticsBase = "0.4"
 SIMD = "3.4.0"
diff --git a/ext/QuantumCliffordQuantikzExt/QuantumCliffordQuantikzExt.jl b/ext/QuantumCliffordQuantikzExt/QuantumCliffordQuantikzExt.jl
index b94627cf0..04aae7e66 100644
--- a/ext/QuantumCliffordQuantikzExt/QuantumCliffordQuantikzExt.jl
+++ b/ext/QuantumCliffordQuantikzExt/QuantumCliffordQuantikzExt.jl
@@ -72,6 +72,7 @@ function Quantikz.QuantikzOp(op::Reset) # TODO This is complicated because quant
 end
 Quantikz.QuantikzOp(op::NoiseOp) = Quantikz.Noise(op.indices)
 Quantikz.QuantikzOp(op::NoiseOpAll) = Quantikz.NoiseAll()
+Quantikz.QuantikzOp(op::ClassicalXOR) = Quantikz.ClassicalDecision(sort([op.store, op.bits...]))
 
 function lstring(pauli::PauliOperator)
     v = join(("\\mathtt{$(o)}" for o in replace(string(pauli)[3:end],"_"=>"I")),"\\\\")
diff --git a/src/QuantumClifford.jl b/src/QuantumClifford.jl
index 6cbfb56c9..6c77b1106 100644
--- a/src/QuantumClifford.jl
+++ b/src/QuantumClifford.jl
@@ -53,7 +53,7 @@ export
     # Misc Ops
     SparseGate,
     sMX, sMY, sMZ, PauliMeasurement, Reset, sMRX, sMRY, sMRZ,
-    BellMeasurement,
+    BellMeasurement, ClassicalXOR,
     VerifyOp,
     Register,
     # Enumeration and Randoms
diff --git a/src/ecc/ECC.jl b/src/ecc/ECC.jl
index 5eefd1b46..0410b5a60 100644
--- a/src/ecc/ECC.jl
+++ b/src/ecc/ECC.jl
@@ -44,6 +44,10 @@ end
 """Generate the non-fault-tolerant stabilizer measurement cicuit for a given code instance or parity check tableau.
 
 Use the `ancillary_index` and `bit_index` arguments to offset where the corresponding part the circuit starts.
+
+Returns the circuit, the number of ancillary qubits that were added, and a list of bit indices that will store the measurement results.
+
+See also: [`shor_syndrome_circuit`](@ref)
 """
 function naive_syndrome_circuit end
 
@@ -51,8 +55,15 @@ function naive_syndrome_circuit(code_type::AbstractECC, ancillary_index=1, bit_i
     naive_syndrome_circuit(parity_checks(code_type), ancillary_index, bit_index)
 end
 
-"""Circuit that measures the corresponding PauliOperator by using conditional gates into an ancillary
-qubit at index `nqubits(p)+ancillary_index` and stores the measurement result into classical bit `bit_index`."""
+"""Naive Pauli measurement circuit using a single ancillary qubit.
+
+Not a fault-tolerant circuit, but useful for testing purposes.
+
+Measures the corresponding `PauliOperator` by using conditional gates into an ancillary
+qubit at index `nqubits(p)+ancillary_index` and stores the measurement result
+into classical bit `bit_index`.
+
+See also: [`naive_syndrome_circuit`](@ref), [`shor_ancillary_paulimeasurement`](@ref)"""
 function naive_ancillary_paulimeasurement(p::PauliOperator, ancillary_index=1, bit_index=1)
     circuit = AbstractOperation[]
     numQubits = nqubits(p)
@@ -73,14 +84,106 @@ end
 
 function naive_syndrome_circuit(parity_check_tableau, ancillary_index=1, bit_index=1)
     naive_sc = AbstractOperation[]
+    ancillaries = 0
+    bits = 0
+    for check in parity_check_tableau
+        append!(naive_sc,naive_ancillary_paulimeasurement(check, ancillary_index+ancillaries, bit_index+bits))
+        ancillaries +=1
+        bits +=1
+    end
+
+    return naive_sc, ancillaries, bit_index:bit_index+bits-1
+end
+
+"""Generate the Shor fault-tolerant stabilizer measurement cicuit for a given code instance or parity check tableau.
+
+Use the `ancillary_index` and `bit_index` arguments to offset where the corresponding part the circuit starts.
+Ancillary qubits
+
+Returns:
+  - The cat state preparation circuit.
+  - The Shor syndrome measurement circuit.
+  - The number of ancillary qubits that were added.
+  - The list of bit indices that store the final measurement results.
+
+See also: [`naive_syndrome_circuit`](@ref)
+"""
+function shor_syndrome_circuit end
+
+function shor_syndrome_circuit(code_type::AbstractECC, ancillary_index=1, bit_index=1)
+    shor_syndrome_circuit(parity_checks(code_type), ancillary_index, bit_index)
+end
+
+"""Shor's Pauli measurement circuit using a multiple entangled ancillary qubits.
+
+A fault-tolerant circuit.
+
+Measures the corresponding PauliOperator by using conditional gates into multiple ancillary
+entangled qubits starting at index `nqubits(p)+ancillary_index`
+and stores the measurement result into classical bits starting at `bit_index`.
+The final measurement result is the XOR of all the bits.
+
+Returns:
+  - The cat state preparation circuit.
+  - The Shor syndrome measurement circuit.
+  - One more than the index of the last added ancillary qubit.
+  - One more than the index of the last added classical bit.
+
+See also: [`naive_syndrome_circuit`](@ref), [`naive_ancillary_paulimeasurement`](@ref)"""
+function shor_ancillary_paulimeasurement(p::PauliOperator, ancillary_index=1, bit_index=1)
+    init_ancil_index = ancillary_index
+    circuit = AbstractOperation[]
+    measurements = AbstractOperation[]
+    numQubits = nqubits(p)
+    for qubit in 1:numQubits
+        if p[qubit] == (1,0)
+            push!(circuit, sXCZ(qubit, numQubits + ancillary_index))
+        elseif p[qubit] == (0,1)
+            push!(circuit, sZCZ(qubit, numQubits + ancillary_index))
+        elseif p[qubit] == (1,1)
+            push!(circuit, sYCZ(qubit, numQubits + ancillary_index))
+        end
+        if p[qubit] != (0,0)
+            push!(measurements, sMRX(numQubits + ancillary_index, bit_index))
+            ancillary_index +=1
+            bit_index +=1
+        end
+    end
+    circuit = vcat(circuit,measurements)
+
+    cat_state_circuit = AbstractOperation[]
+    push!(cat_state_circuit, sHadamard(numQubits + init_ancil_index))
+    for i in (init_ancil_index+1):(ancillary_index -1)
+        push!(cat_state_circuit, sCNOT(numQubits + (i - 1), numQubits + i))
+    end
+
+    return cat_state_circuit, circuit, ancillary_index, bit_index
+end
+
+function shor_syndrome_circuit(parity_check_tableau, ancillary_index=1, bit_index=1)
+    shor_sc = AbstractOperation[]
+    xor_indices = []
+    cat_circuit = AbstractOperation[]
+    initial_ancillary_index = ancillary_index
 
     for check in parity_check_tableau
-        naive_sc = append!(naive_sc,naive_ancillary_paulimeasurement(check, ancillary_index, bit_index))
-        ancillary_index +=1
-        bit_index +=1
+        cat_circ, circ, new_ancillary_index, new_bit_index = shor_ancillary_paulimeasurement(check, ancillary_index, bit_index)
+        push!(xor_indices, collect(bit_index:new_bit_index-1))
+
+        append!(shor_sc, circ)
+        append!(cat_circuit, cat_circ)
+
+        ancillary_index = new_ancillary_index
+        bit_index = new_bit_index
+    end
+
+    final_bits = 0
+    for indices in xor_indices
+        push!(shor_sc, QuantumClifford.ClassicalXOR(indices, bit_index+final_bits))
+        final_bits += 1
     end
 
-    return naive_sc
+    return cat_circuit, shor_sc, ancillary_index-initial_ancillary_index, bit_index:bit_index+final_bits-1
 end
 
 """The distance of a code."""
diff --git a/src/misc_ops.jl b/src/misc_ops.jl
index 5361d21e7..af24b0395 100644
--- a/src/misc_ops.jl
+++ b/src/misc_ops.jl
@@ -126,3 +126,12 @@ function applywstatus!(s::AbstractQCState, v::VerifyOp) # XXX It assumes the oth
 end
 
 operatordeterminism(::Type{VerifyOp}) = DeterministicOperatorTrait()
+
+"""Applies an XOR gate to classical bits. Currently only implemented for funcitonality with pauli frames."""
+struct ClassicalXOR{N} <: AbstractOperation
+    "The indices of the classical bits to be xor-ed"
+    bits::NTuple{N,Int}
+    "The index of the classical bit that will store the results"
+    store::Int
+end
+ClassicalXOR(bits::Vector, store::Int) = ClassicalXOR(tuple(bits...),store)
diff --git a/src/pauli_frames.jl b/src/pauli_frames.jl
index ee0a1d1fb..71d5dc10b 100644
--- a/src/pauli_frames.jl
+++ b/src/pauli_frames.jl
@@ -57,7 +57,28 @@ function apply!(f::PauliFrame, op::AbstractCliffordOperator)
     return f
 end
 
-function apply!(frame::PauliFrame, op::sMZ) # TODO sMX, sMY
+function apply!(frame::PauliFrame, xor::ClassicalXOR)
+    for f in eachindex(frame)
+        value = frame.measurements[f,xor.bits[1]]
+        for i in xor.bits[2:end]
+            value ⊻= frame.measurements[f,i]
+        end
+        frame.measurements[f, xor.store] = value
+    end
+end
+
+function apply!(frame::PauliFrame, op::sMX) # TODO implement a faster direct version
+    op.bit == 0 && return frame
+    apply!(frame, sHadamard(op.qubit))
+    apply!(frame, sMZ(op.qubit, op.bit))
+end
+
+function apply!(frame::PauliFrame, op::sMRX) # TODO implement a faster direct version
+    apply!(frame, sHadamard(op.qubit))
+    apply!(frame, sMRZ(op.qubit, op.bit))
+end
+
+function apply!(frame::PauliFrame, op::sMZ) # TODO sMY, and faster sMX
     op.bit == 0 && return frame
     i = op.qubit
     xzs = frame.frame.tab.xzs
@@ -75,7 +96,7 @@ function apply!(frame::PauliFrame, op::sMZ) # TODO sMX, sMY
     return frame
 end
 
-function apply!(frame::PauliFrame, op::sMRZ) # TODO sMRX, sMRY
+function apply!(frame::PauliFrame, op::sMRZ) # TODO sMRY, and faster sMRX
     i = op.qubit
     xzs = frame.frame.tab.xzs
     T = eltype(xzs)
diff --git a/test/runtests.jl b/test/runtests.jl
index 3b7b5b4b2..a7099f8b4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -60,6 +60,7 @@ end
 @doset "enumerate"
 @doset "quantumoptics"
 @doset "ecc"
+@doset "ecc_syndromes"
 @doset "precompile"
 @doset "pauliframe"
 @doset "allocations"
diff --git a/test/test_ecc.jl b/test/test_ecc.jl
index eec808bca..5b7a16b9e 100644
--- a/test/test_ecc.jl
+++ b/test/test_ecc.jl
@@ -67,7 +67,7 @@ function test_naive_syndrome(c::AbstractECC)
     s1 = copy(logicalqubits)
     syndrome1 = [project!(s1, check)[3] for check in parity_checks(c)]
     # measure using `naive_syndrome_circuit`
-    naive_circuit = naive_syndrome_circuit(c)
+    naive_circuit, _ = naive_syndrome_circuit(c)
     ancillaryqubits = one(Stabilizer,code_s(c))
     s2 = copy(logicalqubits)
     syndrome2 = Register(s2⊗ancillaryqubits, falses(code_s(c)))
@@ -89,7 +89,7 @@ end
 
 function test_with_pframes(code)
     ecirc = encoding_circuit(code)
-    scirc = naive_syndrome_circuit(code)
+    scirc, _ = naive_syndrome_circuit(code)
     nframes = 10
     dataqubits = code_n(code)
     ancqubits = code_s(code)
diff --git a/test/test_ecc_syndromes.jl b/test/test_ecc_syndromes.jl
new file mode 100644
index 000000000..851f95157
--- /dev/null
+++ b/test/test_ecc_syndromes.jl
@@ -0,0 +1,54 @@
+using Test
+using QuantumClifford
+using QuantumClifford: mul_left!
+using QuantumClifford.ECC: AbstractECC, Steane7, Shor9, Bitflip3, naive_syndrome_circuit, encoding_circuit, shor_syndrome_circuit, code_n, code_s
+
+codes = [
+    Bitflip3(),
+    Steane7(),
+    Shor9(),
+]
+
+##
+
+function pframe_naive_vs_shor_syndrome(code)
+    ecirc = encoding_circuit(code)
+    naive_scirc, naive_ancillaries = naive_syndrome_circuit(code)
+    shor_cat_scirc, shor_scirc, shor_ancillaries, shor_bits = shor_syndrome_circuit(code)
+    nframes = 10
+    dataqubits = code_n(code)
+    syndromebits = code_s(code)
+    naive_qubits = dataqubits + syndromebits
+    shor_qubits = dataqubits + shor_ancillaries
+    # no noise
+    naive_frames = PauliFrame(nframes, naive_qubits, syndromebits)
+    shor_frames = PauliFrame(nframes, shor_qubits, last(shor_bits))
+    naive_circuit = [ecirc..., naive_scirc...]
+    shor_circuit = [ecirc..., shor_cat_scirc..., shor_scirc...]
+    pftrajectories(naive_frames, naive_circuit)
+    pftrajectories(shor_frames, shor_circuit)
+    @test pfmeasurements(naive_frames) == pfmeasurements(shor_frames)[:,shor_bits]
+    # with errors
+    for _ in 1:10
+        naive_frames = PauliFrame(nframes, naive_qubits, syndromebits)
+        shor_frames = PauliFrame(nframes, shor_qubits, last(shor_bits))
+        pftrajectories(naive_frames, ecirc)
+        pftrajectories(shor_frames, [ecirc..., shor_cat_scirc...])
+        # manually injecting the same type of noise in the frames -- not really a user accessible API
+        p = random_pauli(dataqubits, realphase=true)
+        pn = embed(naive_qubits, 1:dataqubits, p)
+        ps = embed(shor_qubits, 1:dataqubits, p)
+        mul_left!(naive_frames.frame, pn)
+        mul_left!(shor_frames.frame, ps)
+        # run the syndrome circuits using the public API
+        pftrajectories(naive_frames, naive_scirc)
+        pftrajectories(shor_frames, shor_scirc)
+        @test pfmeasurements(naive_frames) == pfmeasurements(shor_frames)[:,shor_bits]
+    end
+end
+
+@testset "naive and shor measurement circuits" begin
+    for c in codes
+        pframe_naive_vs_shor_syndrome(c)
+    end
+end
diff --git a/test/test_jet.jl b/test/test_jet.jl
index 994ea8712..744f78470 100644
--- a/test/test_jet.jl
+++ b/test/test_jet.jl
@@ -36,5 +36,5 @@ end
     )
     @show rep
     @test_broken length(JET.get_reports(rep)) == 0
-    @test length(JET.get_reports(rep)) <= 7
+    @test length(JET.get_reports(rep)) <= 9
 end

From 2cd865e0f96c6c71362beb4180646d568a231784 Mon Sep 17 00:00:00 2001
From: Shayan Pardis <46478622+Shayan-P@users.noreply.github.com>
Date: Thu, 31 Aug 2023 12:29:28 -0400
Subject: [PATCH 2/5] Add noise gate GPU support (#171)

---------

Co-authored-by: Stefan Krastanov <github.acc@krastanov.org>
---
 .../QuantumCliffordGPUExt.jl                  |  3 +-
 ext/QuantumCliffordGPUExt/apply_noise.jl      | 40 +++++++++++++++++++
 test/test_gpu.jl                              | 28 ++++++++++++-
 3 files changed, 69 insertions(+), 2 deletions(-)
 create mode 100644 ext/QuantumCliffordGPUExt/apply_noise.jl

diff --git a/ext/QuantumCliffordGPUExt/QuantumCliffordGPUExt.jl b/ext/QuantumCliffordGPUExt/QuantumCliffordGPUExt.jl
index c79571c8c..fd564ed2c 100644
--- a/ext/QuantumCliffordGPUExt/QuantumCliffordGPUExt.jl
+++ b/ext/QuantumCliffordGPUExt/QuantumCliffordGPUExt.jl
@@ -2,7 +2,7 @@ module QuantumCliffordGPUExt
 
 using CUDA
 using QuantumClifford
-import QuantumClifford: to_cpu, to_gpu, _apply!, apply!, pftrajectories, fastcolumn, fastrow
+import QuantumClifford: to_cpu, to_gpu, _apply!, apply!, pftrajectories, fastcolumn, fastrow, applynoise!
 
 include("utils.jl")
 include("types.jl")
@@ -10,6 +10,7 @@ include("adapters.jl")
 include("apply.jl")
 include("pauli_frames.jl")
 include("fastmemlayout.jl")
+include("apply_noise.jl")
 
 # todo hide _apply function later. only for internal use
 end
diff --git a/ext/QuantumCliffordGPUExt/apply_noise.jl b/ext/QuantumCliffordGPUExt/apply_noise.jl
new file mode 100644
index 000000000..57b4dfebf
--- /dev/null
+++ b/ext/QuantumCliffordGPUExt/apply_noise.jl
@@ -0,0 +1,40 @@
+using QuantumClifford: _div, _mod
+
+function applynoise!(frame::PauliFrameGPU{T},noise::UnbiasedUncorrelatedNoise,i::Int) where {T <: Unsigned}
+    p = noise.errprobthird
+    lowbit = T(1)
+    ibig = _div(T,i-1)+1
+    ismall = _mod(T,i-1)
+    ismallm = lowbit<<(ismall)
+
+    stab = frame.frame
+    xzs = tab(stab).xzs
+    rows::Unsigned = size(stab, 1)
+
+    @run_cuda applynoise_kernel(xzs, p, ibig, ismallm, rows) rows
+    return frame
+end
+
+
+function applynoise_kernel(xzs::DeviceMatrix{Tme},
+    p::Real,
+    ibig::Int,
+    ismallm::Tme,
+    rows::Unsigned) where {Tme <: Unsigned} 
+
+    f = (blockIdx().x - 1) * blockDim().x + threadIdx().x;
+    if f > rows
+        return nothing
+    end
+
+    r = rand()
+    if  r < p # X error
+        xzs[ibig,f] ⊻= ismallm
+    elseif r < 2p # Z error
+        xzs[end÷2+ibig,f] ⊻= ismallm
+    elseif r < 3p # Y error
+        xzs[ibig,f] ⊻= ismallm
+        xzs[end÷2+ibig,f] ⊻= ismallm
+    end
+    return nothing
+end
diff --git a/test/test_gpu.jl b/test/test_gpu.jl
index 6740c6404..a82f3bea3 100644
--- a/test/test_gpu.jl
+++ b/test/test_gpu.jl
@@ -34,8 +34,13 @@ function apply_single_or_double_qubit_and_compare(n, s, s_gpu)
     end
 end
 
+function approx(a, b, threshold)
+    maximum(abs.(a - b)) <= threshold
+end
+
+
 @testset "GPU" begin
-    CUDA.allowscalar(false) # todo how to add this to all tests.
+    CUDA.allowscalar(false) # make sure we are using GPU kernels and not iterating on indices
 
     @test begin
         p = random_pauli(3)
@@ -118,4 +123,25 @@ end
         end
         true
     end
+
+    @test begin
+        # test applynoise
+        N = 4
+        trajectories = 10000
+        f = PauliFrame(trajectories, N, N);
+        f = to_gpu(f)
+        p = 0.1
+        measurements = pftrajectories(f, [
+            sMZ(1, 1),
+            sHadamard(2),
+            sMZ(2, 2),
+            NoiseOp(UnbiasedUncorrelatedNoise(p), [3, 4]),
+            sMZ(3, 3),
+            sHadamard(4),
+            sMZ(4, 4)
+        ]).measurements
+        avg_result = to_cpu(sum(measurements, dims=1) / trajectories)
+        error_threshold = 0.02
+        approx(vec(avg_result), [0, .5, 2p, .5], error_threshold)
+    end
 end

From 5d7f2bf4e48caeee6698348d8b5f24877757d0f9 Mon Sep 17 00:00:00 2001
From: Stefan Krastanov <stefan@krastanov.org>
Date: Thu, 31 Aug 2023 13:47:37 -0400
Subject: [PATCH 3/5] docfix due to a breakingchange in naive_syndrome_circuit
 (#183)

---
 docs/src/ecc_example_sim.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/ecc_example_sim.md b/docs/src/ecc_example_sim.md
index 5feef8edf..e20f66703 100644
--- a/docs/src/ecc_example_sim.md
+++ b/docs/src/ecc_example_sim.md
@@ -26,7 +26,7 @@ ecirc = encoding_circuit(code)
 
 ... and the corresponding syndrome measurement circuit (the non-fault tolerant one)
 ```@example 1
-scirc = naive_syndrome_circuit(code)
+scirc, _ = naive_syndrome_circuit(code)
 ```
 
 The most straightforward way to start sampling syndromes is to set up a table of Pauli frames.

From 4e767d2163fb21d63090988ff713917a9570dfbc Mon Sep 17 00:00:00 2001
From: Stefan Krastanov <github.acc@krastanov.org>
Date: Thu, 31 Aug 2023 14:46:31 -0400
Subject: [PATCH 4/5] bump version number

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 47542bfa0..ae5bc7dbe 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "QuantumClifford"
 uuid = "0525e862-1e90-11e9-3e4d-1b39d7109de1"
 authors = ["Stefan Krastanov <stefan@krastanov.org>"]
-version = "0.8.15"
+version = "0.8.16"
 
 [deps]
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"

From 5c9822e18ddaa121aae3f818383a40ae553e9fba Mon Sep 17 00:00:00 2001
From: Shayan Pardis <46478622+Shayan-P@users.noreply.github.com>
Date: Fri, 8 Sep 2023 13:22:13 -0400
Subject: [PATCH 5/5] Optimize inner gpu type (#172)

---------

Co-authored-by: Stefan Krastanov <stefan@krastanov.org>
---
 benchmark/benchmark_gpu_innertype.jl     | 81 ++++++++++++++++++++++++
 ext/QuantumCliffordGPUExt/adapters.jl    | 43 +++++++------
 ext/QuantumCliffordGPUExt/apply_noise.jl |  1 +
 3 files changed, 107 insertions(+), 18 deletions(-)
 create mode 100644 benchmark/benchmark_gpu_innertype.jl

diff --git a/benchmark/benchmark_gpu_innertype.jl b/benchmark/benchmark_gpu_innertype.jl
new file mode 100644
index 000000000..b1e639a2c
--- /dev/null
+++ b/benchmark/benchmark_gpu_innertype.jl
@@ -0,0 +1,81 @@
+using Revise
+using QuantumClifford
+using QuantumClifford: to_cpu, to_gpu
+using CUDA
+using BenchmarkTools
+using Plots
+using ProgressBars
+using DataFrames
+using CSV
+
+include("surface-code-circuit.jl")
+
+ints = [UInt8, UInt16, UInt32, UInt64, UInt128]
+names = [:UInt8, :UInt16, :UInt32, :UInt64, :UInt128]
+
+function get_stats()
+    println("Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREADS = $(Sys.CPU_THREADS)`...")
+
+    # Powers of 2 to benchmark
+    powers = reverse(3:18) # start from the hardest one first! (fail fast)
+    n_values = 2 .^ powers
+
+    times = Dict([
+        sm=>[]
+        for (sm, tp) in zip(names, ints)
+    ])
+
+    ccircuit = if eltype(circ) <: QuantumClifford.CompactifiedGate
+        circ
+    else
+        compactify_circuit(circ)
+    end
+
+    CUDA.allowscalar(false)
+
+
+    for n in ProgressBar(n_values)    
+        for (sm, T) in zip(names, ints)
+            gpu_frames() = to_gpu(QuantumClifford._create_pauliframe(ccircuit; trajectories=n), T)
+            gpu_column() = pftrajectories(fastcolumn(gpu_frames()), ccircuit), synchronize()
+            @show n, T
+            @elapsed gpu_column() # to eliminate compile time
+            push!(times[sm], @belapsed $gpu_column())
+        end
+        # note. this is also measuring the time it takes to move data to gpu
+    end
+    df = DataFrame(times)
+    df.n_values=n_values
+    return df
+end
+
+function load_stats()
+    if isfile("gpu_innertype_comparison.csv")
+        DataFrame(CSV.File("gpu_innertype_comparison.csv"))
+    else
+        get_stats()
+    end
+end
+
+function save_stats(df)
+    CSV.write("gpu_innertype_comparison.csv", df)
+end
+
+function plot_result(df)
+    my_y_axis(arrs) = begin
+        arr = []
+        for a in arrs
+            append!(arr, a)
+        end
+        logarr = log.(arr)
+        exp.(range(min(logarr...), stop=max(logarr...), length=8))
+    end
+    plot(df.n_values, [df[!, col] for col in names], marker=:circle, label=reshape(["times $col" for col in names], (1, length(names))), xlabel="n", ylabel="Execution Time (s)", title="gpu inner type comparison (fast column)", xticks=df.n_values[1:2:end], xscale=:log2, yticks=my_y_axis([df[!, col] for col in names]), yscale=:log10, legend=:topleft, size=(1200, 800))
+end
+
+function main()
+    df = load_stats()
+    save_stats(df)
+    plot_result(df)
+    savefig("benchmark_gpu_innertype.png")
+end
diff --git a/ext/QuantumCliffordGPUExt/adapters.jl b/ext/QuantumCliffordGPUExt/adapters.jl
index 5b009728a..86d91685a 100644
--- a/ext/QuantumCliffordGPUExt/adapters.jl
+++ b/ext/QuantumCliffordGPUExt/adapters.jl
@@ -1,33 +1,40 @@
-# const InnerGPUType = UInt64;
+const InnerGPUType = UInt32;
+const InnerCPUType = UInt64;
 # todo. make the conversion types consiting with this...
 # also define a cpu default type?!
 
-to_gpu(array::AbstractArray) = CuArray(array);
+to_gpu(array::AbstractArray) = CuArray(array)
 to_cpu(array::AbstractArray) = Array(array);
 
+# changes the type to InnerGPUType or InnerCPUType as well as copying to cpu/gpu
+to_gpu(array::AbstractArray, tp::Type{T}) where {T <: Unsigned} = 
+    CuArray(reinterpret(T, collect(array)))
+to_cpu(array::AbstractArray, tp::Type{T}) where {T <: Unsigned} = 
+    Array(reinterpret(T, collect(array)))
+
 # maybe change the format of storing the data in gpu array 
 # so that it is more convinient to work with them on gpu?
 # todo later add some type checking to avoid copying (or throw error) if the data is already on gpu/cpu
-to_gpu(tab::QuantumClifford.Tableau) =
-    QuantumClifford.Tableau(to_gpu(tab.phases), tab.nqubits, to_gpu(tab.xzs))
+to_gpu(tab::QuantumClifford.Tableau, tp::Type{T}=InnerGPUType) where {T <: Unsigned} =
+    QuantumClifford.Tableau(to_gpu(tab.phases), tab.nqubits, to_gpu(tab.xzs, tp))
 
-to_cpu(tab::QuantumClifford.Tableau) =
-    QuantumClifford.Tableau(to_cpu(tab.phases), tab.nqubits, to_cpu(tab.xzs))
+to_cpu(tab::QuantumClifford.Tableau, tp::Type{T}=InnerCPUType) where {T <: Unsigned} =
+    QuantumClifford.Tableau(to_cpu(tab.phases), tab.nqubits, to_cpu(tab.xzs, tp))
 
-to_gpu(pauli::QuantumClifford.PauliOperator) =
-    QuantumClifford.PauliOperator(to_gpu(pauli.phase), pauli.nqubits, to_gpu(pauli.xz))
+to_gpu(pauli::QuantumClifford.PauliOperator, tp::Type{T}=InnerGPUType) where {T <: Unsigned} =
+    QuantumClifford.PauliOperator(to_gpu(pauli.phase), pauli.nqubits, to_gpu(pauli.xz, tp))
 
-to_cpu(pauli::QuantumClifford.PauliOperator) =
-    QuantumClifford.PauliOperator(to_cpu(pauli.phase), pauli.nqubits, to_cpu(pauli.xz))
+to_cpu(pauli::QuantumClifford.PauliOperator, tp::Type{T}=InnerCPUType) where {T <: Unsigned} =
+    QuantumClifford.PauliOperator(to_cpu(pauli.phase), pauli.nqubits, to_cpu(pauli.xz, tp))
 
-to_gpu(stabilizer::QuantumClifford.Stabilizer) =
-    Stabilizer(to_gpu(tab(stabilizer)))
+to_gpu(stabilizer::QuantumClifford.Stabilizer, tp::Type{T}=InnerGPUType) where {T <: Unsigned} =
+    Stabilizer(to_gpu(tab(stabilizer), tp))
 
-to_cpu(stabilizer::QuantumClifford.Stabilizer) = 
-    Stabilizer(to_cpu(tab(stabilizer)))
+to_cpu(stabilizer::QuantumClifford.Stabilizer, tp::Type{T}=InnerCPUType) where {T <: Unsigned} = 
+    Stabilizer(to_cpu(tab(stabilizer), tp))
 
-to_gpu(pauli_frame::QuantumClifford.PauliFrame) =
-    QuantumClifford.PauliFrame(to_gpu(pauli_frame.frame), to_gpu(pauli_frame.measurements))
+to_gpu(pauli_frame::QuantumClifford.PauliFrame, tp::Type{T}=InnerGPUType) where {T <: Unsigned} =
+    QuantumClifford.PauliFrame(to_gpu(pauli_frame.frame, tp), to_gpu(pauli_frame.measurements))
 
-to_cpu(pauli_frame::QuantumClifford.PauliFrame) =
-    QuantumClifford.PauliFrame(to_cpu(pauli_frame.frame), to_cpu(pauli_frame.measurements))
+to_cpu(pauli_frame::QuantumClifford.PauliFrame, tp::Type{T}=InnerCPUType) where {T <: Unsigned} =
+    QuantumClifford.PauliFrame(to_cpu(pauli_frame.frame, tp), to_cpu(pauli_frame.measurements))
diff --git a/ext/QuantumCliffordGPUExt/apply_noise.jl b/ext/QuantumCliffordGPUExt/apply_noise.jl
index 57b4dfebf..ecfcd647e 100644
--- a/ext/QuantumCliffordGPUExt/apply_noise.jl
+++ b/ext/QuantumCliffordGPUExt/apply_noise.jl
@@ -27,6 +27,7 @@ function applynoise_kernel(xzs::DeviceMatrix{Tme},
         return nothing
     end
 
+
     r = rand()
     if  r < p # X error
         xzs[ibig,f] ⊻= ismallm