This repository has been archived by the owner on Jan 14, 2025. It is now read-only.

Commit

add check functions
dougbrn committed Oct 23, 2023
1 parent 1e5c20b commit d71db3a
Showing 3 changed files with 124 additions and 6 deletions.
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -17,7 +17,7 @@ dynamic=["version"]
dependencies = [
'pandas',
'numpy<=1.23.5',
'dask>=2023.5.0',
'dask>=2023.6.1',
'dask[distributed]',
'pyarrow',
'pyvo',
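The dask floor moves from 2023.5.0 to 2023.6.1, presumably so that `set_index` accepts the `sort=` keyword used throughout the ensemble changes below (that link is inferred, not stated in the commit). A minimal sketch of what the flag changes, using plain Dask rather than TAPE:

```python
import dask.dataframe as dd
import pandas as pd

# Toy frame whose 'id' values are not in order across partitions.
pdf = pd.DataFrame({"id": [8002, 8001, 8001, 8002], "flux": [1.0, 2.0, 3.0, 4.0]})
ddf = dd.from_pandas(pdf, npartitions=2)

# sort=True (dask's default): shuffle the data so the index is globally sorted
# and the divisions are known.
sorted_ddf = ddf.set_index("id", drop=True, sort=True)

# sort=False: set the index within each existing partition, skipping the
# global shuffle; rows keep their current partition and order.
unsorted_ddf = ddf.set_index("id", drop=True, sort=False)

print(sorted_ddf.known_divisions, unsorted_ddf.known_divisions)  # True False
```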
src/tape/ensemble.py (74 changes: 69 additions & 5 deletions)
@@ -7,6 +7,7 @@
import pandas as pd

from dask.distributed import Client
from collections import Counter

from .analysis.base import AnalysisFunction
from .analysis.feature_extractor import BaseLightCurveFeature, FeatureExtractor
@@ -151,7 +152,7 @@ def insert_sources(

# Create the new row and set the partitioning to match the original dataframe.
df2 = dd.DataFrame.from_dict(rows, npartitions=1)
df2 = df2.set_index(self._id_col, drop=True)
df2 = df2.set_index(self._id_col, drop=True, sort=False)

# Save the divisions and number of partitions.
prev_div = self._source.divisions
@@ -206,6 +207,54 @@ def info(self, verbose=True, memory_usage=True, **kwargs):
print("Source Table")
self._source.info(verbose=verbose, memory_usage=memory_usage, **kwargs)

def check_sorted(self, table="object"):
"""Checks to see if an Ensemble Dataframe is sorted on the index.
Parameters
----------
table: `str`, optional
The table to check; one of "object" or "source". Defaults to "object".
Returns
-------
A boolean value indicating whether the index is sorted (True)
or not (False)
"""
if table == "object":
idx = self._object.index
elif table == "source":
idx = self._source.index
else:
raise ValueError(f"{table} is not one of 'object' or 'source'")
return idx.map_partitions(lambda a: np.all(a[:-1] <= a[1:])).compute().all()

def check_lightcurve_cohesion(self):
"""Checks to see if lightcurves are split across multiple partitions.
With partitioned data, and source information represented by rows, it
is possible that, when loading data or manipulating it (most likely via
a repartition), the sources for a given object will be split among
multiple partitions. This function checks whether all
lightcurves are "cohesive", meaning the sources for that object only
live in a single partition of the dataset.
Returns
-------
A boolean value indicating whether the sources tied to a given object
are only found in a single partition (True), or if they are split
across multiple partitions (False)
"""
idx = self._source.index
counts = idx.map_partitions(lambda a: Counter(a.unique())).compute()

unq_counter = counts[0]
for i in range(len(counts) - 1):
unq_counter += counts[i + 1]
if any(c >= 2 for c in unq_counter.values()):
return False
return True

def compute(self, table=None, **kwargs):
"""Wrapper for dask.dataframe.DataFrame.compute()
@@ -802,7 +851,9 @@ def batch(self, func, *args, meta=None, use_map=True, compute=True, on=None, **k
Determines whether `dask.dataframe.DataFrame.map_partitions` is
used (True). Using map_partitions is generally more efficient, but
requires that the data from each lightcurve be housed in a single
partition. If False, a groupby will be performed instead.
partition. This can be checked using
`Ensemble.check_lightcurve_cohesion`. If False, a groupby will be
performed instead.
compute: `boolean`
Determines whether to compute the result immediately or hold for a
later compute call.
@@ -961,6 +1012,7 @@ def from_dask_dataframe(
sync_tables=True,
npartitions=None,
partition_size=None,
sort=False,
**kwargs,
):
"""Read in Dask dataframe(s) into an ensemble object
@@ -985,6 +1037,9 @@
partition_size: `int`, optional
If specified, attempts to repartition the ensemble to partitions
of size `partition_size`.
sort: `bool`, optional
If True, sorts the DataFrame by the id column before setting the index.
Otherwise, sets the index within the existing partitions. Defaults to False.
Returns
----------
@@ -994,14 +1049,14 @@
self._load_column_mapper(column_mapper, **kwargs)

# Set the index of the source frame and save the resulting table
self._source = source_frame.set_index(self._id_col, drop=True)
self._source = source_frame.set_index(self._id_col, drop=True, sort=sort)

if object_frame is None: # generate an indexed object table from source
self._object = self._generate_object_table()

else:
self._object = object_frame
self._object = self._object.set_index(self._id_col)
self._object = self._object.set_index(self._id_col, sort=sort)

# Optionally sync the tables, recalculates nobs columns
if sync_tables:
@@ -1148,6 +1203,7 @@ def from_parquet(
additional_cols=True,
npartitions=None,
partition_size=None,
sort=False,
**kwargs,
):
"""Read in parquet file(s) into an ensemble object
@@ -1181,6 +1237,9 @@
partition_size: `int`, optional
If specified, attempts to repartition the ensemble to partitions
of size `partition_size`.
sort: `bool`, optional
If True, sorts the DataFrame by the id column before setting the index.
Otherwise, sets the index within the existing partitions. Defaults to False.
Returns
----------
@@ -1218,6 +1277,7 @@
sync_tables=sync_tables,
npartitions=npartitions,
partition_size=partition_size,
sort=sort,
**kwargs,
)

@@ -1275,7 +1335,7 @@ def available_datasets(self):

return {key: datasets_file[key]["description"] for key in datasets_file.keys()}

def from_source_dict(self, source_dict, column_mapper=None, npartitions=1, **kwargs):
def from_source_dict(self, source_dict, column_mapper=None, npartitions=1, sort=False, **kwargs):
"""Load the sources into an ensemble from a dictionary.
Parameters
@@ -1288,6 +1348,9 @@ def from_source_dict(self, source_dict, column_mapper=None, npartitions=1, **kwa
npartitions: `int`, optional
If specified, attempts to repartition the ensemble to the specified
number of partitions
sort: `bool`, optional
If True, sorts the DataFrame by the id column before setting the index.
Otherwise, sets the index within the existing partitions. Defaults to False.
Returns
----------
@@ -1304,6 +1367,7 @@ def from_source_dict(self, source_dict, column_mapper=None, npartitions=1, **kwa
column_mapper=column_mapper,
sync_tables=True,
npartitions=npartitions,
sort=sort,
**kwargs,
)

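A hedged usage sketch of the two new check functions. It assumes TAPE's `Ensemble` and `ColumnMapper` interfaces as exercised by the tests below; the toy data, column names, and the local `Client` are illustrative only.

```python
import numpy as np
from dask.distributed import Client
from tape import Ensemble
from tape.utils import ColumnMapper  # import path assumed

# Interleaved ids so that, with sort=False, the partitions end up out of order
# and both objects have rows in both partitions.
rows = {
    "id": [8002, 8001, 8002, 8001],
    "time": [10.1, 10.2, 11.1, 11.2],
    "flux": [1.0, 2.0, 3.0, 4.0],
    "err": [0.1, 0.1, 0.2, 0.2],
    "band": ["g", "g", "r", "r"],
}
cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")

ens = Ensemble(client=Client())
ens.from_source_dict(rows, column_mapper=cmap, npartitions=2, sort=False)

print(ens.check_sorted("source"))        # False for this layout: partition indexes are out of order
print(ens.check_lightcurve_cohesion())   # False: each object's sources span both partitions
```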
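The `batch` docstring above now points `use_map` users at `check_lightcurve_cohesion`; one way to act on that, continuing the sketch above with `np.mean` standing in for a real analysis function:

```python
# map_partitions (use_map=True) is only safe when each lightcurve is confined
# to a single partition; otherwise fall back to the groupby code path.
use_map = ens.check_lightcurve_cohesion()
result = ens.batch(np.mean, "flux", use_map=use_map)  # "flux" is the toy flux column above
print(result)
```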
tests/tape_tests/test_ensemble.py (54 changes: 54 additions & 0 deletions)
@@ -435,6 +435,60 @@ def test_core_wrappers(parquet_ensemble):
parquet_ensemble.compute()


@pytest.mark.parametrize("data_sorted", [True, False])
def test_check_sorted(dask_client, data_sorted):
# Create some fake data.

if data_sorted:
rows = {
"id": [8001, 8001, 8001, 8001, 8002, 8002, 8002, 8002, 8002],
"time": [10.1, 10.2, 10.2, 11.1, 11.2, 11.3, 11.4, 15.0, 15.1],
"band": ["g", "g", "b", "g", "b", "g", "g", "g", "g"],
"err": [1.0, 2.0, 1.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0],
"flux": [1.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0],
}
else:
rows = {
"id": [8001, 8002, 8001, 8001, 8002, 8002, 8001, 8002, 8002],
"time": [10.1, 10.2, 10.2, 11.1, 11.2, 11.3, 11.4, 15.0, 15.1],
"band": ["g", "g", "b", "g", "b", "g", "g", "g", "g"],
"err": [1.0, 2.0, 1.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0],
"flux": [1.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0],
}
cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")
ens = Ensemble(client=dask_client)
ens.from_source_dict(rows, column_mapper=cmap, sort=False)

assert ens.check_sorted("source") == data_sorted


@pytest.mark.parametrize("data_cohesion", [True, False])
def test_check_lightcurve_cohesion(dask_client, data_cohesion):
# Create some fake data.

if data_cohesion:
rows = {
"id": [8001, 8001, 8001, 8001, 8001, 8002, 8002, 8002, 8002],
"time": [10.1, 10.2, 10.2, 11.1, 11.2, 11.3, 11.4, 15.0, 15.1],
"band": ["g", "g", "b", "g", "b", "g", "g", "g", "g"],
"err": [1.0, 2.0, 1.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0],
"flux": [1.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0],
}
else:
rows = {
"id": [8001, 8001, 8001, 8001, 8002, 8002, 8002, 8002, 8001],
"time": [10.1, 10.2, 10.2, 11.1, 11.2, 11.3, 11.4, 15.0, 15.1],
"band": ["g", "g", "b", "g", "b", "g", "g", "g", "g"],
"err": [1.0, 2.0, 1.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0],
"flux": [1.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0],
}
cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")
ens = Ensemble(client=dask_client)
ens.from_source_dict(rows, column_mapper=cmap, sort=False, npartitions=2)

assert ens.check_lightcurve_cohesion() == data_cohesion


def test_persist(dask_client):
# Create some fake data.
rows = {
