Skip to content

Commit

Permalink
Save named indices in from_dataframe (#539)
Browse files Browse the repository at this point in the history
* Doesn't destroy index for Rubin

* Fixed dataframe -> self.dataframe

* Fixed all dataframe->self.dataframe

* Keep named indices in from_dataframe

* Install nested packages from source in docs

---------

Co-authored-by: Neven Caplar <[email protected]>
  • Loading branch information
camposandro and nevencaplar authored Jan 21, 2025
1 parent a64e163 commit 1c5c44f
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@ sphinx-book-theme
sphinx-design
git+https://github.com/astronomy-commons/hats.git@main
git+https://github.com/astronomy-commons/hats-import.git@main
git+https://github.com/lincc-frameworks/nested-pandas.git@main
git+https://github.com/lincc-frameworks/nested-dask.git@main
3 changes: 3 additions & 0 deletions src/lsdb/loaders/dataframe/dataframe_catalog_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
_append_partition_information_to_dataframe,
_extra_property_dict,
_generate_dask_dataframe,
_has_named_index,
)
from lsdb.types import DaskDFPixelMap

Expand Down Expand Up @@ -170,6 +171,8 @@ def load_catalog(self) -> Catalog:
def _set_spatial_index(self):
"""Generates the spatial indices for each data point and assigns
the spatial index column as the Dataframe index."""
if _has_named_index(self.dataframe):
self.dataframe.reset_index(inplace=True)
self.dataframe[SPATIAL_INDEX_COLUMN] = compute_spatial_index(
ra_values=self.dataframe[self.catalog_info.ra_column].to_numpy(),
dec_values=self.dataframe[self.catalog_info.dec_column].to_numpy(),
Expand Down
14 changes: 14 additions & 0 deletions src/lsdb/loaders/dataframe/from_dataframe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,17 @@ def _extra_property_dict(est_size_bytes: int):
properties["hats_version"] = "v0.1"

return properties


def _has_named_index(dataframe: npd.NestedFrame) -> bool:
    """Tell whether the dataframe's index carries at least one name.

    Used as a heuristic for "this index is meaningful": an index whose
    name is set, or a multi-index with at least one non-``None`` level
    name, counts as named. An index with no name, or a multi-index whose
    level names are all ``None`` (or empty), does not.

    Args:
        dataframe: The dataframe whose index is inspected.

    Returns:
        True if the index has a name or any named level, False otherwise.
    """
    index = dataframe.index
    # A name set directly on the index is enough on its own.
    if index.name is not None:
        return True
    # Otherwise, look for any level that was given a name.
    return any(level_name is not None for level_name in index.names)
20 changes: 20 additions & 0 deletions tests/lsdb/loaders/dataframe/test_from_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,3 +402,23 @@ def test_from_dataframe_with_arrow_schema(small_sky_order1_df, small_sky_order1_
expected_schema = hc.read_hats(small_sky_order1_dir).schema
catalog = lsdb.from_dataframe(small_sky_order1_df, schema=expected_schema)
assert catalog.hc_structure.schema is expected_schema


def test_from_dataframe_keeps_named_index(small_sky_order1_df):
    """A named index on the input dataframe is kept as a catalog column."""
    # Precondition: the fixture starts out without a named index.
    assert small_sky_order1_df.index.name is None
    small_sky_order1_df.set_index("id", inplace=True)

    catalog = lsdb.from_dataframe(small_sky_order1_df)

    # The catalog is indexed by the spatial index column...
    assert catalog._ddf.index.name == "_healpix_29"
    # ...and the former "id" index is available as a column with the same values.
    assert "id" in catalog.columns
    assert np.array_equal(
        catalog["id"].compute().to_numpy(),
        small_sky_order1_df.index.to_numpy(),
    )


def test_from_dataframe_does_not_keep_unnamed_index(small_sky_order1_df):
    """An unnamed default index does not show up as an "index" column."""
    # Precondition: the fixture has a plain, unnamed 0..n-1 range index.
    assert small_sky_order1_df.index.name is None
    default_index = pd.RangeIndex(len(small_sky_order1_df))
    assert small_sky_order1_df.index.equals(default_index)

    catalog = lsdb.from_dataframe(small_sky_order1_df)

    assert catalog._ddf.index.name == "_healpix_29"
    assert "index" not in catalog.columns

0 comments on commit 1c5c44f

Please sign in to comment.