diff --git a/README.md b/README.md
index 1d7a70a..7fc1f88 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
 # pytorch_cmspepr
 
-pytorch bindings for optimized knn and aggregation kernels
+pytorch bindings for optimized knn and aggregation kernels.
+
+Now also has a C++ extension for the [Object Condensation](https://arxiv.org/abs/2002.03605) loss function.
 
 
 ## Example
@@ -121,7 +123,7 @@ pytest tests
 
 ## Performance
 
-The following profiling code can be used:
+The following profiling code can be used (see the script [performance.py](scripts/performance.py)):
 
 ```python
 import time
@@ -171,4 +173,7 @@ CPU (torch_cmspepr) took 0.22623349189758302 sec/evt
 CPU (torch_cluster) took 0.2259768319129944 sec/evt
 CUDA (torch_cmspepr) took 0.026673252582550048 sec/evt
 CUDA (torch_cluster) took 0.22262062072753908 sec/evt
-```
\ No newline at end of file
+```
+
+Similarly, there is a profiling script available for object condensation, see [performance_oc.py](scripts/performance_oc.py).
+Here, a 3x speed-up is achieved w.r.t. the pure-Python implementation of object condensation; more importantly, memory consumption is drastically reduced.
diff --git a/extensions/oc_cpu.cpp b/extensions/oc_cpu.cpp
new file mode 100644
index 0000000..1178326
--- /dev/null
+++ b/extensions/oc_cpu.cpp
@@ -0,0 +1,345 @@
+#include <torch/extension.h>
+
+// #include <string> //size_t, just for helper function
+#include <cmath>
+// #include <iostream>
+
+#define CHECK_CPU(x) AT_ASSERTM((x).device().is_cpu(), #x " must be CPU tensor") // asserts that tensor x lives on the CPU
+#define I2D(i,j,Nj) ((j) + (Nj)*(i)) // flat index of element (i, j) in a row-major array with rows of length Nj; parenthesized so expression arguments expand safely
+
+/*
+Squared Euclidean distance between rows i and j of the coordinate array x.
+
+x is read as a flat row-major array in which every node owns ndim consecutive
+values, i.e. coordinate k of node n sits at x[n*ndim + k]. Equal indices give 0
+right away, without reading x.
+*/
+float calc_dist_sq(
+    const size_t i,
+    const size_t j,
+    const float_t *x,
+    const size_t ndim
+    ){
+    if (i == j) return 0;
+
+    // Start of the coordinate rows of both nodes
+    const float_t *row_i = x + ndim*i;
+    const float_t *row_j = x + ndim*j;
+
+    float_t sum_sq = 0;
+    for (size_t k = 0; k < ndim; k++) {
+        const float_t delta = row_i[k] - row_j[k];
+        sum_sq += delta * delta;
+    }
+    return sum_sq;
+    }
+
+// Per-node kernel: computes the potential terms V_att, V_rep and V_srp of node i_node and stores them through the output pointers.
+void oc_kernel(
+    // Global event info
+    const float_t* beta, // beta per node
+    const float_t* q,    // charge per node
+    const float_t* x,    // cluster space coordinates
+    const size_t n_dim_cluster_space, // Number of dimensions of the cluster space
+    const int32_t* cond_indices,     // indices of the condensation points
+    const int32_t* cond_counts,      // nr of nodes connected to the cond point
+    const size_t cond_indices_start, // row split start for cond points
+    const size_t cond_indices_end,   // row split end for cond points
+    const int32_t* which_cond_point, // (n_nodes,) array pointing to the cond point index (-1 marks a noise node)
+    const int32_t n_nodes, // Number of nodes in the event of this node
+
+    // To be parallelized over
+    const size_t i_node, // index of the node in question
+
+    // Outputs:
+    float_t * V_att, // attractive term towards the node's own cond point
+    float_t * V_rep, // summed repulsive term w.r.t. all other cond points of the event
+    float_t * V_srp  // short-range potential term w.r.t. the node's own cond point
+    ){
+
+    int32_t i_cond = which_cond_point[i_node];
+
+    // std::cout
+    //     << "i_node=" << i_node
+    //     << " i_cond=" << i_cond
+    //     << " q[i_node]=" << q[i_node]
+    //     << " cond_start=" << cond_indices_start
+    //     << " cond_end=" << cond_indices_end
+    //     << " n_nodes=" << n_nodes
+    //     << std::endl;
+
+    // V_att and V_srp
+    if (i_cond == -1 || i_node == (size_t)i_cond){
+        // Noise node, or a condensation point itself
+        // std::cout << "  Noise hit or cond point, V_att/V_srp set to 0." << std::endl;
+        *V_att = 0.;
+        *V_srp = 0.;
+        }
+    else {
+        float d_sq = calc_dist_sq(i_node, i_cond, x, n_dim_cluster_space);
+        float d = sqrt(d_sq);
+        float_t d_huber = d+0.00001 <= 4.0 ?  d_sq  :  2.0 * 4.0 * (d - 4.0) ; // d_sq while d (plus a small epsilon) is at most 4, linear in d beyond that
+        *V_att = d_huber * q[i_node] * q[i_cond] / (float)n_nodes;
+        // V_srp must still be normalized! This is done in the V_rep loop because the
+        // normalization numbers are easier to access there.
+        *V_srp = 1. / (20.*d_sq + 1.);
+        // std::cout << "  d_huber for i_node " << i_node << ": "
+        //     << d_huber
+        //     << "; d_sq=" << d_sq
+        //     << "; V_att=" << *V_att
+        //     << "; V_srp=" << *V_srp
+        //     << std::endl;
+        }
+
+    // V_rep: sum over the cond points of this event, skipping the node's own cond point
+    *V_rep = 0.;
+    for (size_t i=cond_indices_start; i<cond_indices_end; i++) {
+        int32_t i_cond_other = cond_indices[i];
+        if (i_cond_other == i_cond){
+            // Still have to normalize V_srp; this is a convenient albeit awkward time
+            // to do so: scale by -beta of the cond point, its node count, and the nr of cond points in the event.
+            *V_srp *= -beta[i_cond] / (float)cond_counts[i] / (float)(cond_indices_end-cond_indices_start);
+            // Should not repulse from own cond point, so skip V_rep calculation
+            continue;
+            }
+        float d_sq = calc_dist_sq(i_node, i_cond_other, x, n_dim_cluster_space);
+        float V_rep_this = exp(-4.0 * d_sq) * q[i_node] * q[i_cond_other];
+        if (V_rep_this < 0.) V_rep_this = 0.; // clamp negative contributions to zero
+        *V_rep += V_rep_this;
+        }
+    *V_rep /= (float)n_nodes;
+    }
+
+
+/*
+Computes the object condensation loss terms on CPU and returns them as a float tensor of
+length 5: [V_att, V_rep, V_srp, L_beta_cond_logterm, L_beta_noise], each averaged over events.
+
+beta, q and y hold one value per node and x holds one row of cluster space coordinates per
+node. In y, label 0 is background and the labels 1..max of every event must all be present.
+row_splits holds the node offsets per event: n_events+1 entries, running from 0 to n_nodes.
+Inputs that fail the checks below raise an error before any raw memory is indexed with them.
+*/
+torch::Tensor
+oc_cpu(
+        torch::Tensor beta_tensor,
+        torch::Tensor q_tensor,
+        torch::Tensor x_tensor,
+        torch::Tensor y_tensor,
+        torch::Tensor row_splits_tensor
+        ){
+
+    // Validate before touching raw memory: the data pointers below are indexed as dense,
+    // row-major arrays without any further bounds checking.
+    TORCH_CHECK(beta_tensor.device().is_cpu() && q_tensor.device().is_cpu() && x_tensor.device().is_cpu()
+        && y_tensor.device().is_cpu() && row_splits_tensor.device().is_cpu(), "oc_cpu: all inputs must be CPU tensors");
+    TORCH_CHECK(x_tensor.dim() == 2, "oc_cpu: x must be a 2D tensor");
+    TORCH_CHECK(row_splits_tensor.dim() == 1 && row_splits_tensor.size(0) >= 1, "oc_cpu: row_splits must be a non-empty 1D tensor");
+    TORCH_CHECK(beta_tensor.numel() == x_tensor.size(0) && q_tensor.numel() == x_tensor.size(0) && y_tensor.numel() == x_tensor.size(0), "oc_cpu: beta, q, x and y must describe the same number of nodes");
+    // Strided views would be misread through the raw pointers, so make everything dense first
+    beta_tensor = beta_tensor.contiguous(); q_tensor = q_tensor.contiguous(); x_tensor = x_tensor.contiguous();
+    y_tensor = y_tensor.contiguous(); row_splits_tensor = row_splits_tensor.contiguous();
+
+    const size_t n_nodes = q_tensor.size(0);
+    const auto n_dim_cluster_space = x_tensor.size(1);
+    const size_t n_events = row_splits_tensor.size(0) - 1;
+
+    auto beta = beta_tensor.data_ptr<float_t>();
+    auto q = q_tensor.data_ptr<float_t>();
+    auto x = x_tensor.data_ptr<float_t>();
+    auto y = y_tensor.data_ptr<int32_t>();
+    auto row_splits = row_splits_tensor.data_ptr<int32_t>();
+
+    // Row splits must tile [0, n_nodes) in order, so that every node index used below is in
+    // range and every node is visited exactly once.
+    TORCH_CHECK(row_splits[0] == 0 && (size_t)row_splits[n_events] == n_nodes, "oc_cpu: row_splits must start at 0 and end at the number of nodes");
+    for (size_t i_event=0; i_event<n_events; i_event++) { TORCH_CHECK(row_splits[i_event] <= row_splits[i_event+1], "oc_cpu: row_splits must be sorted"); }
+    // Labels index the per-event arrays as y-1, so a negative label would write out of bounds
+    for (size_t i_node=0; i_node<n_nodes; i_node++) { TORCH_CHECK(y[i_node] >= 0, "oc_cpu: y must not contain negative labels"); }
+
+    // Determine number of condensation points per event (and total)
+    size_t n_cond = 0;
+    int32_t* n_cond_per_event = (int32_t *)malloc(n_events * sizeof(int32_t));
+    for (size_t i_event=0; i_event<n_events; i_event++) {
+        int32_t y_max = 0;
+        for (int32_t i_node=row_splits[i_event]; i_node<row_splits[i_event+1]; i_node++)
+            if (y[i_node] > y_max) y_max = y[i_node];
+        n_cond_per_event[i_event] = y_max;
+        n_cond += y_max;
+    }
+
+    // Determine the row splits for cond points
+    // e.g. n_cond_per_event = [2, 4, 1]
+    // then cond_indices_row_splits = [0, 2, 6, 7]
+    int32_t* cond_indices_row_splits = (int32_t *)malloc((n_events+1) * sizeof(int32_t));
+    cond_indices_row_splits[0] = 0;
+    for (size_t i_event=0; i_event<n_events; i_event++)
+        cond_indices_row_splits[i_event+1] = cond_indices_row_splits[i_event] + n_cond_per_event[i_event];
+
+    // Determine the condensation point indices per event
+    // (basically scatter_max(scatter_max(...)) )
+    // O(N) complexity, but could kernelize at a later stage.
+    int32_t* cond_indices = (int32_t *)malloc(n_cond * sizeof(int32_t));
+    int32_t* cond_counts = (int32_t *)malloc(n_cond * sizeof(int32_t));
+    size_t i_cond_indices_filler = 0;
+    int32_t* which_cond_point = (int32_t *)malloc(n_nodes * sizeof(int32_t));
+    bool has_empty_label = false; // set when a label in 1..max has no node at all
+    for (size_t i_event=0; i_event<n_events; i_event++) {
+        // Open up three arrays, all sized nr of cond points in this event:
+        // - q_max, which holds the max charge found so far per cond point
+        // - count, which holds the number of nodes per cluster / cond point
+        // - i_max, which holds the index where the max charge was found (-1: none yet).
+        size_t n_cond_this_event = n_cond_per_event[i_event];
+        float_t* q_max = (float_t *)malloc(n_cond_this_event * sizeof(float_t));
+        int32_t* count = (int32_t *)malloc(n_cond_this_event * sizeof(int32_t));
+        int32_t* i_max = (int32_t *)malloc(n_cond_this_event * sizeof(int32_t));
+        for(size_t i=0; i<n_cond_this_event; i++){
+            q_max[i] = 0.;
+            count[i] = 0;
+            i_max[i] = -1;
+            }
+        // Loop over nodes in event, overwrite q_max and i_max when necessary
+        for (int32_t i_node=row_splits[i_event]; i_node<row_splits[i_event+1]; i_node++){
+            int32_t y_node = y[i_node];
+            if (y_node == 0) continue; // Bkg nodes don't belong to a cond point
+            count[y_node-1]++;
+            // The first node of a label always becomes the candidate, so that i_max is
+            // defined even when no charge exceeds the initial q_max of 0.
+            if (i_max[y_node-1] < 0 || q[i_node] > q_max[y_node-1]){
+                q_max[y_node-1] = q[i_node];
+                i_max[y_node-1] = i_node;
+                }
+            }
+
+        // Loop over nodes in event, use i_max to determine per node to which
+        // cond point it belongs
+        for (int32_t i_node=row_splits[i_event]; i_node<row_splits[i_event+1]; i_node++){
+            int32_t y_node = y[i_node];
+            if (y_node == 0){
+                // Bkg nodes don't belong to a cond point
+                which_cond_point[i_node] = -1;
+                }
+            else {
+                which_cond_point[i_node] = i_max[y_node-1];
+                }
+            }
+        // Copy the i_max and count info to the global cond_indices/counts array
+        for(size_t i=0; i<n_cond_this_event; i++){
+            if (i_max[i] < 0) has_empty_label = true;
+            cond_indices[i_cond_indices_filler] = i_max[i];
+            cond_counts[i_cond_indices_filler] = count[i];
+            i_cond_indices_filler++;
+            }
+        free(q_max);
+        free(i_max);
+        free(count);
+    }
+
+    // A gap in the labels would leave a cond point without a node index; release what was
+    // allocated so far and reject the input.
+    if (has_empty_label){
+        free(n_cond_per_event); free(cond_indices_row_splits);
+        free(cond_indices); free(cond_counts); free(which_cond_point);
+        TORCH_CHECK(false, "oc_cpu: y must not contain gaps, every label from 1 to the maximum needs at least one node per event");
+        }
+
+    // Prepare output tensor
+    auto options = torch::TensorOptions().dtype(torch::kFloat32);
+    auto losses_tensor = torch::zeros({ 5 }, options);
+    auto losses = losses_tensor.data_ptr<float_t>();
+
+
+    float* V_att = (float *)malloc(n_nodes * sizeof(float));
+    float* V_rep = (float *)malloc(n_nodes * sizeof(float));
+    float* V_srp = (float *)malloc(n_nodes * sizeof(float));
+
+    // Loop over events
+    for (size_t i_event=0; i_event<n_events; i_event++) {
+        size_t cond_start = cond_indices_row_splits[i_event];
+        size_t cond_end = cond_indices_row_splits[i_event+1];
+
+        size_t node_start = row_splits[i_event];
+        size_t node_end = row_splits[i_event+1];
+
+        // Loop over nodes
+        for (size_t i_node=node_start; i_node<node_end; i_node++) {
+            oc_kernel(
+                // Global event info
+                beta,
+                q,
+                x,
+                n_dim_cluster_space,
+                cond_indices,
+                cond_counts,
+                cond_start,
+                cond_end,
+                which_cond_point,
+                node_end-node_start,
+                // This node (to be parallelized in a CUDA kernel)
+                i_node,
+                // Output
+                &(V_att[i_node]),
+                &(V_rep[i_node]),
+                &(V_srp[i_node])
+                );
+            }
+        }
+
+    // L_beta_cond_logterm and L_beta_noise
+    // L_beta_cond_logterm = (-0.2 * torch.log(beta_cond + 1e-9)).mean()
+    // L_beta_noise = sB * beta[is_noise].mean(); sB multiplication done in Python
+    float L_beta_cond_logterm = 0.;
+    float L_beta_noise = 0.;
+    for (size_t i_event=0; i_event<n_events; i_event++) {
+        // L_beta_cond_logterm
+        size_t cond_start = cond_indices_row_splits[i_event];
+        size_t cond_end = cond_indices_row_splits[i_event+1];
+        float n_cond_this_event = cond_end - cond_start;
+        for (size_t i_cond=cond_start; i_cond<cond_end; i_cond++) {
+            float beta_cond = beta[cond_indices[i_cond]];
+            L_beta_cond_logterm += -0.2 * log(beta_cond + 0.000000001) / (float)n_cond_this_event;
+            }
+        // L_beta_noise
+        float L_beta_noise_this_event = 0.;
+        int n_noise_this_event = 0;
+        for (int32_t i_node=row_splits[i_event]; i_node<row_splits[i_event+1]; i_node++) {
+            if (y[i_node] == 0){
+                L_beta_noise_this_event += beta[i_node];
+                n_noise_this_event++;
+                }
+            }
+        if (n_noise_this_event>0)
+            L_beta_noise += L_beta_noise_this_event / (float)n_noise_this_event ;
+        }
+    losses[3] = L_beta_cond_logterm / (float)n_events;
+    losses[4] = L_beta_noise / (float)n_events;
+
+    free(n_cond_per_event);
+    free(cond_indices_row_splits);
+    free(cond_indices);
+    free(cond_counts);
+    free(which_cond_point);
+
+    float V_att_sum = 0.;
+    float V_rep_sum = 0.;
+    float V_srp_sum = 0.;
+    for (size_t i_node=0; i_node<n_nodes; i_node++){
+        V_att_sum += V_att[i_node];
+        V_rep_sum += V_rep[i_node];
+        V_srp_sum += V_srp[i_node];
+        }
+
+    losses[0] = V_att_sum / (float)n_events;
+    losses[1] = V_rep_sum / (float)n_events;
+    losses[2] = V_srp_sum / (float)n_events;
+
+    free(V_att);
+    free(V_rep);
+    free(V_srp);
+
+    return losses_tensor;
+}
+
+TORCH_LIBRARY(oc_cpu, m) {  // registers the op; the tests in this change call it as torch.ops.oc_cpu.oc_cpu
+  m.def("oc_cpu", oc_cpu);
+}
\ No newline at end of file
diff --git a/scripts/performance_oc.py b/scripts/performance_oc.py
new file mode 100644
index 0000000..542eb1d
--- /dev/null
+++ b/scripts/performance_oc.py
@@ -0,0 +1,78 @@
+import torch
+from torch_geometric.data import Data
+import torch_cmspepr
+
+import tqdm
+import time
+
+
+def make_random_event(n_nodes=10000, n_events=5):
+    model_out = torch.rand((n_nodes, 32))
+
+    # Varying event sizes
+    event_fracs = torch.normal(torch.ones(n_events), 0.1)
+    event_fracs /= event_fracs.sum()
+    event_sizes = (event_fracs * n_nodes).type(torch.int)
+    event_sizes[-1] += n_nodes - event_sizes.sum()  # Make sure it adds up to n_nodes
+
+    batch = torch.arange(n_events).repeat_interleave(event_sizes)
+    row_splits = torch.cat(
+        (torch.zeros(1, dtype=torch.int), torch.cumsum(event_sizes, 0))
+    )
+
+    ys = []
+    for i_event in range(n_events):
+        # Somewhere between 3 and 8 particles
+        n_clusters = torch.randint(3, 8, (1,)).item()
+        cluster_fracs = torch.randint(50, 200, (n_clusters,)).type(torch.float)
+        cluster_fracs[0] += 200  # Boost the amount of noise relatively
+        cluster_fracs /= cluster_fracs.sum()
+        cluster_sizes = (cluster_fracs * event_sizes[i_event]).type(torch.int)
+        # Make sure it adds up to n_nodes in this event
+        cluster_sizes[-1] += event_sizes[i_event] - cluster_sizes.sum()
+        ys.append(torch.arange(n_clusters).repeat_interleave(cluster_sizes))
+    y = torch.cat(ys)
+
+    y = y.type(torch.int)
+    row_splits = row_splits.type(torch.int)
+    return model_out, y, batch, row_splits
+
+
+def test_oc_performance():
+    try:
+        import cmspepr_hgcal_core.objectcondensation as objectcondensation
+    except ImportError:
+        print('Install cmspepr_hgcal_core to run this test')
+        return
+
+    objectcondensation.ObjectCondensation.beta_term_option = 'short_range_potential'
+    objectcondensation.ObjectCondensation.sB = 1.0
+
+    t_py = 0.0
+    t_cpp = 0.0
+    N = 1000
+    for i_test in tqdm.tqdm(range(N)):
+        # Don't count prep work in performance
+        model_out, y, batch, row_splits = make_random_event()
+        data = Data(y=y.type(torch.long), batch=batch)
+        beta = torch.sigmoid(model_out[:, 0]).contiguous()
+        q = objectcondensation.calc_q_betaclip(
+            torch.sigmoid(model_out[:, 0])
+        ).contiguous()
+        x = model_out[:, 1:].contiguous()
+
+        t0 = time.perf_counter()
+        objectcondensation.oc_loss(model_out, data)
+        t1 = time.perf_counter()
+        torch_cmspepr.oc(beta, q, x, y, batch)
+        t2 = time.perf_counter()
+
+        t_py += t1 - t0
+        t_cpp += t2 - t1
+
+    print(f'Average python time: {t_py/N:.4f}')
+    print(f'Average cpp    time: {t_cpp/N:.4f}')
+    print(f'Speed up is {t_py/t_cpp:.2f}x')
+
+
+test_oc_performance()
diff --git a/setup.py b/setup.py
index 0c10eea..a98a717 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,8 @@
     extra_link_args=['-s']
     )
 extensions_cpu = [
-    CppExtension('select_knn_cpu', ['extensions/select_knn_cpu.cpp'], **cpu_kwargs)
+    CppExtension('select_knn_cpu', ['extensions/select_knn_cpu.cpp'], **cpu_kwargs),
+    CppExtension('oc_cpu', ['extensions/oc_cpu.cpp'], **cpu_kwargs)
     ]
 cuda_kwargs = dict(
     include_dirs=[extensions_dir],
@@ -77,7 +78,7 @@ def repr_ext(ext):
 tests_require = ['pytest', 'pytest-cov', 'scipy']
 setup(
     name='torch_cmspepr',
-    version='1.0.0',
+    version='1.1.0',
     author='Lindsey Gray <Lindsey.Gray@cern.ch>, Jan Kieseler <jan.kieseler@cern.ch>, Thomas Klijnsma <thomasklijnsma@gmail.com>',
     author_email='Lindsey.Gray@cern.ch',
     url='',
diff --git a/tests/test_oc.py b/tests/test_oc.py
new file mode 100644
index 0000000..dd56334
--- /dev/null
+++ b/tests/test_oc.py
@@ -0,0 +1,190 @@
+import os.path as osp
+from math import log
+
+import torch
+from torch_geometric.data import Data
+
+SO_DIR = osp.dirname(osp.dirname(osp.abspath(__file__)))
+
+
+def calc_q_betaclip(beta, qmin=1.0):
+    return (beta.clip(0.0, 1 - 1e-4) / 1.002).arctanh() ** 2 + qmin
+
+
+# Single event
+class single:
+    # fmt: off
+    model_out = torch.FloatTensor([
+        # Event 0
+        # beta x0    x1        y   (the beta column holds raw values; the sigmoid is applied below)
+        [0.01, 0.40, 0.40],  # 0
+        [0.02, 0.10, 0.90],  # 0
+        [0.12, 0.70, 0.70],  # 1 <- d_sq to cond point = 0.02^2 + 0.02^2 = 0.0008; d=0.0283
+        [0.01, 0.90, 0.10],  # 0
+        [0.13, 0.72, 0.72],  # 1 <-- cond point for y=1
+        ])
+    # fmt: on
+    x = model_out[:, 1:].contiguous()
+    y = torch.LongTensor([0, 0, 1, 0, 1])
+    batch = torch.LongTensor([0, 0, 0, 0, 0])
+    beta = torch.sigmoid(model_out[:, 0]).contiguous()
+    q = calc_q_betaclip(beta)
+
+    @classmethod
+    def d(cls, i, j):
+        return ((cls.x[i] - cls.x[j]) ** 2).sum()  # squared distance, despite the short name
+
+    # Manual OC: hand-written reference values for the five loss terms of this fixture
+    @classmethod
+    def losses(cls):
+        beta = single.beta
+        q = single.q
+        d = single.d
+
+        V_att = d(2, 4) * q[2] * q[4] / 5.0  # d() is the squared distance; the distance is small, so it equals d_huber
+        V_rep = (
+            torch.exp(-4.0 * d(0, 4)) * q[0] * q[4]
+            + torch.exp(-4.0 * d(1, 4)) * q[1] * q[4]
+            + torch.exp(-4.0 * d(3, 4)) * q[3] * q[4]
+        ) / 5.0  # 5 nodes in the event
+        V_srp = -1.0 / (20.0 * d(2, 4) + 1.0) * beta[4] / 2.0  # 2 nodes carry y=1; the event has a single cond point
+        L_beta_cond_logterm = -0.2 * log(beta[4] + 1e-9)
+        L_beta_noise = (beta[0] + beta[1] + beta[3]) / 3.0
+
+        losses_man = torch.FloatTensor(
+            [V_att, V_rep, V_srp, L_beta_cond_logterm, L_beta_noise]
+        )
+        return losses_man
+
+
+def test_oc_cpu_single():
+    torch.ops.load_library(osp.join(SO_DIR, 'oc_cpu.so'))
+
+    losses_cpp = torch.ops.oc_cpu.oc_cpu(
+        single.beta,
+        single.q,
+        single.x,
+        single.y.type(torch.int),
+        torch.IntTensor([0, 5]),
+    )
+
+    losses_man = single.losses()
+    print(f'{losses_man=}')
+    print(f'{losses_cpp=}')
+    assert torch.allclose(losses_cpp, losses_man, rtol=0.001, atol=0.001)
+
+
+def test_oc_python_single():
+    import torch_cmspepr
+
+    losses = torch_cmspepr.oc(single.beta, single.q, single.x, single.y, single.batch)
+    losses_man = single.losses()
+    print(f'{losses_man=}')
+    print(f'{losses=}')
+    assert torch.allclose(losses, losses_man, rtol=0.001, atol=0.001)
+
+
+class multiple:
+    # fmt: off
+    model_out = torch.FloatTensor([
+        # Event 0
+        # beta x0    x1       idx y   (the beta column holds raw values; the sigmoid is applied below)
+        [0.01, 0.40, 0.40],  #  0 0
+        [0.02, 0.10, 0.90],  #  1 0
+        [0.12, 0.70, 0.70],  #  2 1 <- d_sq to cond point = 0.02^2 + 0.02^2 = 0.0008; d=0.0283
+        [0.01, 0.90, 0.10],  #  3 0
+        [0.13, 0.72, 0.72],  #  4 1 <-- cond point for y=1
+        # Event 1
+        [0.11, 0.40, 0.40],  #  5 2
+        [0.02, 0.10, 0.90],  #  6 0
+        [0.12, 0.70, 0.70],  #  7 1 <-- cond point for y=1
+        [0.01, 0.90, 0.10],  #  8 0
+        [0.13, 0.72, 0.72],  #  9 2 <-- cond point for y=2
+        [0.11, 0.72, 0.72],  # 10 1
+        ])
+    x = model_out[:,1:].contiguous()
+    y = torch.LongTensor([
+        0, 0, 1, 0, 1,    # Event 0
+        2, 0, 1, 0, 2, 1  # Event 1
+        ])
+    batch = torch.LongTensor([
+        0, 0, 0, 0, 0,    # Event 0
+        1, 1, 1, 1, 1, 1  # Event 1
+        ])
+    # fmt: on
+    row_splits = torch.IntTensor([0, 5, 11])  # event boundaries: nodes [0, 5) and [5, 11)
+    beta = torch.sigmoid(model_out[:, 0]).contiguous()
+    q = calc_q_betaclip(beta).contiguous()
+
+
+def test_oc_cpu_batch():
+    torch.ops.load_library(osp.join(SO_DIR, 'oc_cpu.so'))
+    try:
+        import cmspepr_hgcal_core.objectcondensation as objectcondensation
+    except ImportError:
+        print('Install cmspepr_hgcal_core to run this test')
+        return
+
+    objectcondensation.ObjectCondensation.beta_term_option = 'short_range_potential'
+    objectcondensation.ObjectCondensation.sB = 1.0
+
+    loss_py = objectcondensation.oc_loss(
+        multiple.model_out, Data(y=multiple.y, batch=multiple.batch)
+    )
+    losses_py = torch.FloatTensor(
+        [
+            loss_py["V_att"],
+            loss_py["V_rep"],
+            loss_py["L_beta_sig"],
+            loss_py["L_beta_cond_logterm"],
+            loss_py["L_beta_noise"],
+        ]
+    )
+    losses_cpp = torch.ops.oc_cpu.oc_cpu(
+        multiple.beta,
+        multiple.q,
+        multiple.x,
+        multiple.y.type(torch.int),
+        multiple.row_splits,
+    )
+    print(losses_py)
+    print(losses_cpp)
+    # Lots of rounding errors in python vs c++, can't compare too rigorously
+    assert torch.allclose(losses_cpp, losses_py, rtol=0.01, atol=0.01)
+
+
+def test_oc_python_batch():
+    import torch_cmspepr
+
+    try:
+        import cmspepr_hgcal_core.objectcondensation as objectcondensation
+    except ImportError:
+        print('Install cmspepr_hgcal_core to run this test')
+        return
+
+    objectcondensation.ObjectCondensation.beta_term_option = 'short_range_potential'
+    objectcondensation.ObjectCondensation.sB = 1.0
+
+    loss_py = objectcondensation.oc_loss(
+        multiple.model_out, Data(y=multiple.y, batch=multiple.batch)
+    )
+    losses_py = torch.FloatTensor(
+        [
+            loss_py["V_att"],
+            loss_py["V_rep"],
+            loss_py["L_beta_sig"],
+            loss_py["L_beta_cond_logterm"],
+            loss_py["L_beta_noise"],
+        ]
+    )
+    losses = torch_cmspepr.oc(
+        multiple.beta,
+        multiple.q,
+        multiple.x,
+        multiple.y.type(torch.int),
+        multiple.batch,
+    )
+    print(losses_py)
+    print(losses)
+    # Lots of rounding errors in python vs c++, can't compare too rigorously
+    assert torch.allclose(losses, losses_py, rtol=0.01, atol=0.01)
diff --git a/torch_cmspepr/__init__.py b/torch_cmspepr/__init__.py
index 6a5ffa0..8ddbad9 100644
--- a/torch_cmspepr/__init__.py
+++ b/torch_cmspepr/__init__.py
@@ -3,7 +3,7 @@
 import logging
 import torch
 
-__version__ = '1.0.0'
+__version__ = '1.1.0'
 
 
 def setup_logger(name: str = "cmspepr") -> logging.Logger:
@@ -49,8 +49,9 @@ def load_ops(so_file):
 THISDIR = osp.dirname(osp.abspath(__file__))
 load_ops(osp.join(THISDIR, "../select_knn_cpu.so"))
 load_ops(osp.join(THISDIR, "../select_knn_cuda.so"))
-
+load_ops(osp.join(THISDIR, "../oc_cpu.so"))
 
 from torch_cmspepr.select_knn import select_knn, knn_graph
+from torch_cmspepr.oc import oc
 
-__all__ = ['select_knn', 'knn_graph', 'logger']
+__all__ = ['select_knn', 'knn_graph', 'oc', 'logger']
diff --git a/torch_cmspepr/oc.py b/torch_cmspepr/oc.py
new file mode 100644
index 0000000..607b249
--- /dev/null
+++ b/torch_cmspepr/oc.py
@@ -0,0 +1,49 @@
+import torch
+
+
+@torch.jit.script
+def oc(
+    beta: torch.FloatTensor,
+    q: torch.FloatTensor,
+    x: torch.FloatTensor,
+    y: torch.LongTensor,  # Use long for consistency
+    batch: torch.LongTensor,  # Use long for consistency
+    sB: float = 1.0,
+):
+    """
+    Calculate the object condensation loss function.
+
+    Args:
+        beta (torch.FloatTensor): Beta as described in https://arxiv.org/abs/2002.03605;
+            simply a sigmoid of the raw model output
+        q (torch.FloatTensor): Charge q per node; usually a function of beta.
+        x (torch.FloatTensor): Latent clustering space coordinates for every node.
+        y (torch.LongTensor): Clustering truth. WARNING: The torch.op expects y to be
+            nicely *incremental*. There should not be any holes in it.
+        batch (torch.LongTensor): Batch vector to designate event boundaries. WARNING:
+            It is expected that batch is *sorted*.
+
+    Returns:
+        torch.FloatTensor: A len-5 tensor with the 5 loss components of the OC loss
+            function: V_att, V_rep, V_srp, L_beta_cond_logterm, and L_beta_noise. The
+            full OC loss is simply the sum of this tensor.
+    """
+    N = beta.size(0)
+    assert beta.dim() == 1
+    assert q.dim() == 1
+    assert beta.size() == q.size()
+    assert x.size(0) == N
+    assert y.size(0) == N
+    assert batch.size(0) == N
+    device = beta.device
+
+    # TEMPORARY: No GPU version available yet.
+    assert device == torch.device('cpu')
+
+    # Translate batch vector into row splits
+    counts = torch.zeros(batch.max() + 1, dtype=torch.int, device=device)
+    counts.scatter_add_(0, batch, torch.ones_like(batch, dtype=torch.int))
+    counts = torch.cat((torch.zeros(1, dtype=torch.int, device=device), counts))
+    row_splits = torch.cumsum(counts, 0).type(torch.int)
+
+    return torch.ops.oc_cpu.oc_cpu(beta, q, x, y.type(torch.int), row_splits)