From 965ef8f096b3840d29abe8ba3c4c4fd2c690c2af Mon Sep 17 00:00:00 2001
From: Melissa DeLucchi <113376043+delucchi-cmu@users.noreply.github.com>
Date: Mon, 6 Jan 2025 13:08:36 -0500
Subject: [PATCH] Use a naive sparse histogram. (#534)

* Use a naive sparse histogram.

* Remove make_from_counts
---
 src/lsdb/io/to_hats.py             | 13 ++++++-------
 tests/lsdb/catalog/test_catalog.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/src/lsdb/io/to_hats.py b/src/lsdb/io/to_hats.py
index 9ef35006..51b811cb 100644
--- a/src/lsdb/io/to_hats.py
+++ b/src/lsdb/io/to_hats.py
@@ -6,13 +6,12 @@
 
 import dask
 import hats as hc
-import hats.pixel_math.healpix_shim as hp
 import nested_pandas as npd
 import numpy as np
 from hats.catalog import PartitionInfo
 from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset as HCHealpixDataset
 from hats.pixel_math import HealpixPixel, spatial_index_to_healpix
-from hats.pixel_math.sparse_histogram import SparseHistogram
+from hats.pixel_math.sparse_histogram import HistogramAggregator, SparseHistogram
 from upath import UPath
 
 if TYPE_CHECKING:
@@ -42,7 +41,7 @@ def perform_write(
         at the specified order.
     """
     if len(df) == 0:
-        return 0, SparseHistogram.make_empty(histogram_order)
+        return 0, SparseHistogram([], [], histogram_order)
     pixel_dir = hc.io.pixel_directory(base_catalog_dir, hp_pixel.order, hp_pixel.pixel)
     hc.io.file_io.make_directory(pixel_dir, exist_ok=True)
     pixel_path = hc.io.paths.pixel_catalog_file(base_catalog_dir, hp_pixel)
@@ -64,7 +63,7 @@ def calculate_histogram(df: npd.NestedFrame, histogram_order: int) -> SparseHist
     order_pixels = spatial_index_to_healpix(df.index.to_numpy(), target_order=histogram_order)
     gb = df.groupby(order_pixels, sort=False).apply(len)
     indexes, counts_at_indexes = gb.index.to_numpy(), gb.to_numpy(na_value=0)
-    return SparseHistogram.make_from_counts(indexes, counts_at_indexes, histogram_order)
+    return SparseHistogram(indexes, counts_at_indexes, histogram_order)
 
 
 # pylint: disable=protected-access
@@ -116,11 +115,11 @@ def to_hats(
     )
     new_hc_structure.catalog_info.to_properties_file(base_catalog_path)
     # Save the point distribution map
-    full_histogram = np.zeros(hp.order2npix(histogram_order))
+    total_histogram = HistogramAggregator(histogram_order)
     for partition_hist in histograms:
-        full_histogram += partition_hist.to_array()
+        total_histogram.add(partition_hist)
     point_map_path = hc.io.paths.get_point_map_file_pointer(base_catalog_path)
-    hc.io.file_io.write_fits_image(full_histogram, point_map_path)
+    hc.io.file_io.write_fits_image(total_histogram.full_histogram, point_map_path)
 
 
 def write_partitions(
diff --git a/tests/lsdb/catalog/test_catalog.py b/tests/lsdb/catalog/test_catalog.py
index e04ac61c..3fd52be6 100644
--- a/tests/lsdb/catalog/test_catalog.py
+++ b/tests/lsdb/catalog/test_catalog.py
@@ -261,6 +261,34 @@ def test_save_catalog_when_catalog_is_empty(small_sky_order1_catalog, tmp_path):
         cone_search_catalog.to_hats(base_catalog_path)
 
 
+def test_save_big_catalog(tmp_path):
+    """Load a catalog with many partitions, and save with to_hats."""
+    mock_partition_df = pd.DataFrame(
+        {
+            "ra": np.linspace(0, 360, 100_000),
+            "dec": np.linspace(-90, 90, 100_000),
+            "id": np.arange(100_000, 200_000),
+        }
+    )
+
+    base_catalog_path = tmp_path / "big_sky"
+
+    kwargs = {
+        "catalog_name": "big_sky",
+        "catalog_type": "object",
+        "lowest_order": 6,
+        "highest_order": 10,
+        "threshold": 500,
+    }
+
+    catalog = lsdb.from_dataframe(mock_partition_df, margin_threshold=None, **kwargs)
+
+    catalog.to_hats(base_catalog_path)
+
+    read_catalog = hc.read_hats(base_catalog_path)
+    assert len(read_catalog.get_healpix_pixels()) == len(catalog.get_healpix_pixels())
+
+
 def test_save_catalog_with_some_empty_partitions(small_sky_order1_catalog, tmp_path):
     base_catalog_path = tmp_path / "small_sky"
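
A minimal sketch of the aggregation pattern this patch adopts, using only the
hats symbols the diff itself imports (HistogramAggregator, SparseHistogram).
The pixel indexes and counts below are invented for illustration, and the
assumption that full_histogram densifies to one numpy bin per HEALPix pixel is
inferred from it being handed directly to write_fits_image:

    from hats.pixel_math.sparse_histogram import HistogramAggregator, SparseHistogram

    histogram_order = 6  # same parameter perform_write/calculate_histogram receive

    # Per-partition (pixel index, count) pairs at histogram_order; the values
    # here are made up for illustration, not taken from any real catalog.
    partition_histograms = [
        SparseHistogram([10, 11, 12], [5, 3, 7], histogram_order),
        SparseHistogram([11, 4000], [2, 9], histogram_order),
        SparseHistogram([], [], histogram_order),  # empty partition, as in perform_write
    ]

    # Accumulate the sparse per-partition histograms into one total, instead of
    # allocating a dense np.zeros(order2npix(...)) array up front as before.
    total_histogram = HistogramAggregator(histogram_order)
    for partition_hist in partition_histograms:
        total_histogram.add(partition_hist)

    # Densify only once, at write time; counts for shared pixels are summed.
    dense = total_histogram.full_histogram
    assert dense.sum() == 5 + 3 + 7 + 2 + 9

This mirrors the to_hats change above: partitions stay sparse until the single
point-map FITS image is written, so memory no longer scales with the full pixel
count at histogram_order during the reduction.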