From 965ef8f096b3840d29abe8ba3c4c4fd2c690c2af Mon Sep 17 00:00:00 2001
From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com>
Date: Mon, 6 Jan 2025 13:08:36 -0500
Subject: [PATCH] Use a naive sparse histogram. (#534)

* Use a naive sparse histogram.

* Remove make_from_counts
---
 src/lsdb/io/to_hats.py             | 13 ++++++-------
 tests/lsdb/catalog/test_catalog.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/src/lsdb/io/to_hats.py b/src/lsdb/io/to_hats.py
index 9ef35006..51b811cb 100644
--- a/src/lsdb/io/to_hats.py
+++ b/src/lsdb/io/to_hats.py
@@ -6,13 +6,12 @@
 
 import dask
 import hats as hc
-import hats.pixel_math.healpix_shim as hp
 import nested_pandas as npd
 import numpy as np
 from hats.catalog import PartitionInfo
 from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset as HCHealpixDataset
 from hats.pixel_math import HealpixPixel, spatial_index_to_healpix
-from hats.pixel_math.sparse_histogram import SparseHistogram
+from hats.pixel_math.sparse_histogram import HistogramAggregator, SparseHistogram
 from upath import UPath
 
 if TYPE_CHECKING:
@@ -42,7 +41,7 @@ def perform_write(
         at the specified order.
     """
     if len(df) == 0:
-        return 0, SparseHistogram.make_empty(histogram_order)
+        return 0, SparseHistogram([], [], histogram_order)
     pixel_dir = hc.io.pixel_directory(base_catalog_dir, hp_pixel.order, hp_pixel.pixel)
     hc.io.file_io.make_directory(pixel_dir, exist_ok=True)
     pixel_path = hc.io.paths.pixel_catalog_file(base_catalog_dir, hp_pixel)
@@ -64,7 +63,7 @@ def calculate_histogram(df: npd.NestedFrame, histogram_order: int) -> SparseHist
     order_pixels = spatial_index_to_healpix(df.index.to_numpy(), target_order=histogram_order)
     gb = df.groupby(order_pixels, sort=False).apply(len)
     indexes, counts_at_indexes = gb.index.to_numpy(), gb.to_numpy(na_value=0)
-    return SparseHistogram.make_from_counts(indexes, counts_at_indexes, histogram_order)
+    return SparseHistogram(indexes, counts_at_indexes, histogram_order)
 
 
 # pylint: disable=protected-access
@@ -116,11 +115,11 @@ def to_hats(
     )
     new_hc_structure.catalog_info.to_properties_file(base_catalog_path)
     # Save the point distribution map
-    full_histogram = np.zeros(hp.order2npix(histogram_order))
+    total_histogram = HistogramAggregator(histogram_order)
     for partition_hist in histograms:
-        full_histogram += partition_hist.to_array()
+        total_histogram.add(partition_hist)
     point_map_path = hc.io.paths.get_point_map_file_pointer(base_catalog_path)
-    hc.io.file_io.write_fits_image(full_histogram, point_map_path)
+    hc.io.file_io.write_fits_image(total_histogram.full_histogram, point_map_path)
 
 
 def write_partitions(
diff --git a/tests/lsdb/catalog/test_catalog.py b/tests/lsdb/catalog/test_catalog.py
index e04ac61c..3fd52be6 100644
--- a/tests/lsdb/catalog/test_catalog.py
+++ b/tests/lsdb/catalog/test_catalog.py
@@ -261,6 +261,34 @@ def test_save_catalog_when_catalog_is_empty(small_sky_order1_catalog, tmp_path):
         cone_search_catalog.to_hats(base_catalog_path)
 
 
+def test_save_big_catalog(tmp_path):
+    """Load a catalog with many partitions, and save with to_hats."""
+    mock_partition_df = pd.DataFrame(
+        {
+            "ra": np.linspace(0, 360, 100_000),
+            "dec": np.linspace(-90, 90, 100_000),
+            "id": np.arange(100_000, 200_000),
+        }
+    )
+
+    base_catalog_path = tmp_path / "big_sky"
+
+    kwargs = {
+        "catalog_name": "big_sky",
+        "catalog_type": "object",
+        "lowest_order": 6,
+        "highest_order": 10,
+        "threshold": 500,
+    }
+
+    catalog = lsdb.from_dataframe(mock_partition_df, margin_threshold=None, **kwargs)
+
+    catalog.to_hats(base_catalog_path)
+
+    read_catalog = hc.read_hats(base_catalog_path)
+    assert len(read_catalog.get_healpix_pixels()) == len(catalog.get_healpix_pixels())
+
+
 def test_save_catalog_with_some_empty_partitions(small_sky_order1_catalog, tmp_path):
     base_catalog_path = tmp_path / "small_sky"
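
A minimal sketch of the aggregation pattern this patch adopts, using only the
hats symbols the diff itself imports (HistogramAggregator, SparseHistogram).
The pixel indexes and counts below are invented for illustration, and the
assumption that full_histogram densifies to one numpy bin per HEALPix pixel is
inferred from it being handed directly to write_fits_image:

    from hats.pixel_math.sparse_histogram import HistogramAggregator, SparseHistogram

    histogram_order = 6  # same parameter perform_write/calculate_histogram receive

    # Per-partition (pixel index, count) pairs at histogram_order; the values
    # here are made up for illustration, not taken from any real catalog.
    partition_histograms = [
        SparseHistogram([10, 11, 12], [5, 3, 7], histogram_order),
        SparseHistogram([11, 4000], [2, 9], histogram_order),
        SparseHistogram([], [], histogram_order),  # empty partition, as in perform_write
    ]

    # Accumulate the sparse per-partition histograms into one total, instead of
    # allocating a dense np.zeros(order2npix(...)) array up front as before.
    total_histogram = HistogramAggregator(histogram_order)
    for partition_hist in partition_histograms:
        total_histogram.add(partition_hist)

    # Densify only once, at write time; counts for shared pixels are summed.
    dense = total_histogram.full_histogram
    assert dense.sum() == 5 + 3 + 7 + 2 + 9

This mirrors the to_hats change above: partitions stay sparse until the single
point-map FITS image is written, so memory no longer scales with the full pixel
count at histogram_order during the reduction.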