Save named indices in from_dataframe (#539)

* Doesn't destroy index for Rubin
* Fixed dataframe -> self.dataframe
* Fixed all dataframe -> self.dataframe
* Keep named indices in from_dataframe
* Install nested packages from source in docs

---------

Co-authored-by: Neven Caplar <[email protected]>
astronomy-commons · Jan 21, 2025 · 1c5c44f · 1c5c44f
1 parent a64e163
commit 1c5c44f
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 0 deletions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -13,3 +13,5 @@ sphinx-book-theme
 sphinx-design
 git+https://github.com/astronomy-commons/hats.git@main
 git+https://github.com/astronomy-commons/hats-import.git@main
+git+https://github.com/lincc-frameworks/nested-pandas.git@main
+git+https://github.com/lincc-frameworks/nested-dask.git@main
diff --git a/src/lsdb/loaders/dataframe/dataframe_catalog_loader.py b/src/lsdb/loaders/dataframe/dataframe_catalog_loader.py
@@ -27,6 +27,7 @@
     _append_partition_information_to_dataframe,
     _extra_property_dict,
     _generate_dask_dataframe,
+    _has_named_index,
 )
 from lsdb.types import DaskDFPixelMap
 
@@ -170,6 +171,8 @@ def load_catalog(self) -> Catalog:
     def _set_spatial_index(self):
         """Generates the spatial indices for each data point and assigns
         the spatial index column as the Dataframe index."""
+        if _has_named_index(self.dataframe):
+            self.dataframe.reset_index(inplace=True)
         self.dataframe[SPATIAL_INDEX_COLUMN] = compute_spatial_index(
             ra_values=self.dataframe[self.catalog_info.ra_column].to_numpy(),
             dec_values=self.dataframe[self.catalog_info.dec_column].to_numpy(),

diff --git a/src/lsdb/loaders/dataframe/from_dataframe_utils.py b/src/lsdb/loaders/dataframe/from_dataframe_utils.py
@@ -148,3 +148,17 @@ def _extra_property_dict(est_size_bytes: int):
     properties["hats_version"] = "v0.1"
 
     return properties
+
+
+def _has_named_index(dataframe: npd.NestedFrame) -> bool:
+    """Heuristic to determine if a dataframe has some meaningful index.
+
+    This will reject dataframes with no index name for a single index,
+    or empty names for multi-index (e.g. [] or [None]).
+    """
+    if dataframe.index.name is not None:
+        # Single index with a given name.
+        return True
+    if len(dataframe.index.names) == 0 or all(name is None for name in dataframe.index.names):
+        return False  # no names at all, or every entry in `names` is None
+    return True  # `index.name` is None, but at least one entry in `names` is set
diff --git a/tests/lsdb/loaders/dataframe/test_from_dataframe.py b/tests/lsdb/loaders/dataframe/test_from_dataframe.py
@@ -402,3 +402,23 @@ def test_from_dataframe_with_arrow_schema(small_sky_order1_df, small_sky_order1_
     expected_schema = hc.read_hats(small_sky_order1_dir).schema
     catalog = lsdb.from_dataframe(small_sky_order1_df, schema=expected_schema)
     assert catalog.hc_structure.schema is expected_schema
+
+
+def test_from_dataframe_keeps_named_index(small_sky_order1_df):
+    assert small_sky_order1_df.index.name is None  # precondition: fixture index is unnamed
+    small_sky_order1_df.set_index("id", inplace=True)  # make "id" a named index
+    catalog = lsdb.from_dataframe(small_sky_order1_df)
+    assert catalog._ddf.index.name == "_healpix_29"  # "id" is no longer the catalog index
+    assert "id" in catalog.columns  # the named index is kept as a regular column
+    ids = catalog["id"].compute().to_numpy()
+    expected_ids = small_sky_order1_df.index.to_numpy()
+    assert np.array_equal(ids, expected_ids)  # same id values, element by element
+
+
+def test_from_dataframe_does_not_keep_unnamed_index(small_sky_order1_df):
+    assert small_sky_order1_df.index.name is None  # precondition: fixture index is unnamed
+    range_index = pd.RangeIndex(start=0, stop=len(small_sky_order1_df), step=1)
+    assert small_sky_order1_df.index.equals(range_index)  # fixture index is a plain 0..n-1 range
+    catalog = lsdb.from_dataframe(small_sky_order1_df)
+    assert catalog._ddf.index.name == "_healpix_29"
+    assert "index" not in catalog.columns  # the unnamed index did not become an "index" column