
Commit

Merge pull request #276 from lincc-frameworks/check_functions
add check functions
dougbrn authored Oct 25, 2023
2 parents 1e5c20b + 66203f9 commit 4243b85
Showing 3 changed files with 154 additions and 8 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -17,7 +17,7 @@ dynamic=["version"]
dependencies = [
'pandas',
'numpy<=1.23.5',
'dask>=2023.5.0',
'dask>=2023.6.1',
'dask[distributed]',
'pyarrow',
'pyvo',
94 changes: 89 additions & 5 deletions src/tape/ensemble.py
@@ -7,6 +7,7 @@
import pandas as pd

from dask.distributed import Client
from collections import Counter

from .analysis.base import AnalysisFunction
from .analysis.feature_extractor import BaseLightCurveFeature, FeatureExtractor
@@ -151,7 +152,7 @@ def insert_sources(

# Create the new row and set the partitioning to match the original dataframe.
df2 = dd.DataFrame.from_dict(rows, npartitions=1)
df2 = df2.set_index(self._id_col, drop=True)
df2 = df2.set_index(self._id_col, drop=True, sort=True)

# Save the divisions and number of partitions.
prev_div = self._source.divisions
@@ -169,6 +170,8 @@ def insert_sources(
elif self._source.npartitions != prev_num:
self._source = self._source.repartition(npartitions=prev_num)

return self

def client_info(self):
"""Calls the Dask Client, which returns cluster information
@@ -206,6 +209,57 @@ def info(self, verbose=True, memory_usage=True, **kwargs):
print("Source Table")
self._source.info(verbose=verbose, memory_usage=memory_usage, **kwargs)

def check_sorted(self, table="object"):
"""Checks to see if an Ensemble Dataframe is sorted (increasing) on
the index.
Parameters
----------
table: `str`, optional
The table to check.
Returns
-------
A boolean value indicating whether the index is sorted (True)
or not (False)
"""
if table == "object":
idx = self._object.index
elif table == "source":
idx = self._source.index
else:
raise ValueError(f"{table} is not one of 'object' or 'source'")

# Use the existing index function to check if it's sorted (increasing)
return idx.is_monotonic_increasing.compute()

def check_lightcurve_cohesion(self):
"""Checks to see if lightcurves are split across multiple partitions.
With partitioned data, and source information represented by rows, it
is possible that when loading data or manipulating it in some way (most
likely a repartition) the sources for a given object will be split
among multiple partitions. This function checks whether all
lightcurves are "cohesive", meaning the sources for each object
live in a single partition of the dataset.
Returns
-------
A boolean value indicating whether the sources tied to a given object
are only found in a single partition (True), or if they are split
across multiple partitions (False)
"""
idx = self._source.index
counts = idx.map_partitions(lambda a: Counter(a.unique())).compute()

unq_counter = counts[0]
for i in range(1, len(counts)):
unq_counter += counts[i]
if any(c >= 2 for c in unq_counter.values()):
return False
return True

def compute(self, table=None, **kwargs):
"""Wrapper for dask.dataframe.DataFrame.compute()
@@ -802,7 +856,9 @@ def batch(self, func, *args, meta=None, use_map=True, compute=True, on=None, **k
Determines whether `dask.dataframe.DataFrame.map_partitions` is
used (True). Using map_partitions is generally more efficient, but
requires that the data from each lightcurve be housed in a single
partition. If False, a groupby will be performed instead.
partition. This can be checked using
`Ensemble.check_lightcurve_cohesion`. If False, a groupby will be
performed instead.
compute: `boolean`
Determines whether to compute the result immediately or hold for a
later compute call.
@@ -961,6 +1017,8 @@ def from_dask_dataframe(
sync_tables=True,
npartitions=None,
partition_size=None,
sorted=False,
sort=False,
**kwargs,
):
"""Read in Dask dataframe(s) into an ensemble object
@@ -985,6 +1043,12 @@
partition_size: `int`, optional
If specified, attempts to repartition the ensemble to partitions
of size `partition_size`.
sorted: `bool`, optional
If the index column is already sorted in increasing order.
Defaults to False.
sort: `bool`, optional
If True, sorts the DataFrame by the id column. Otherwise, sets the
index on the individual existing partitions. Defaults to False.
Returns
----------
@@ -994,14 +1058,14 @@
self._load_column_mapper(column_mapper, **kwargs)

# Set the index of the source frame and save the resulting table
self._source = source_frame.set_index(self._id_col, drop=True)
self._source = source_frame.set_index(self._id_col, drop=True, sorted=sorted, sort=sort)

if object_frame is None: # generate an indexed object table from source
self._object = self._generate_object_table()

else:
self._object = object_frame
self._object = self._object.set_index(self._id_col)
self._object = self._object.set_index(self._id_col, sorted=sorted, sort=sort)

# Optionally sync the tables, recalculates nobs columns
if sync_tables:
@@ -1148,6 +1212,8 @@ def from_parquet(
additional_cols=True,
npartitions=None,
partition_size=None,
sorted=False,
sort=False,
**kwargs,
):
"""Read in parquet file(s) into an ensemble object
@@ -1181,6 +1247,12 @@
partition_size: `int`, optional
If specified, attempts to repartition the ensemble to partitions
of size `partition_size`.
sorted: `bool`, optional
If the index column is already sorted in increasing order.
Defaults to False.
sort: `bool`, optional
If True, sorts the DataFrame by the id column. Otherwise, sets the
index on the individual existing partitions. Defaults to False.
Returns
----------
Expand Down Expand Up @@ -1218,6 +1290,8 @@ def from_parquet(
sync_tables=sync_tables,
npartitions=npartitions,
partition_size=partition_size,
sorted=sorted,
sort=sort,
**kwargs,
)

@@ -1275,7 +1349,9 @@ def available_datasets(self):

return {key: datasets_file[key]["description"] for key in datasets_file.keys()}

def from_source_dict(self, source_dict, column_mapper=None, npartitions=1, **kwargs):
def from_source_dict(
self, source_dict, column_mapper=None, npartitions=1, sorted=False, sort=False, **kwargs
):
"""Load the sources into an ensemble from a dictionary.
Parameters
@@ -1288,6 +1364,12 @@ def from_source_dict(self, source_dict, column_mapper=None, npartitions=1, **kwa
npartitions: `int`, optional
If specified, attempts to repartition the ensemble to the specified
number of partitions
sorted: `bool`, optional
If the index column is already sorted in increasing order.
Defaults to False.
sort: `bool`, optional
If True, sorts the DataFrame by the id column. Otherwise, sets the
index on the individual existing partitions. Defaults to False.
Returns
----------
@@ -1304,6 +1386,8 @@ def from_source_dict(self, source_dict, column_mapper=None, npartitions=1, **kwa
column_mapper=column_mapper,
sync_tables=True,
npartitions=npartitions,
sorted=sorted,
sort=sort,
**kwargs,
)

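The two new methods are read-only diagnostics on the Ensemble's Dask frames: check_sorted() computes whether a table's index is monotonically increasing, and check_lightcurve_cohesion() computes whether every object's sources sit in a single partition, which is the condition that batch(..., use_map=True) relies on. A minimal usage sketch, not part of this diff; the top-level imports, the Client setup, and the Ensemble(client=...) signature are assumptions based on the test suite:

# Hypothetical usage sketch of the new check functions.
from dask.distributed import Client
from tape import Ensemble, ColumnMapper  # assumed top-level exports

client = Client(n_workers=1, threads_per_worker=1)
ens = Ensemble(client=client)

rows = {
    "id": [8002, 8002, 8001, 8001],
    "time": [10.1, 10.2, 11.1, 11.2],
    "flux": [1.0, 2.0, 3.0, 4.0],
    "err": [0.1, 0.1, 0.2, 0.2],
    "band": ["g", "g", "r", "r"],
}
cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux",
                    err_col="err", band_col="band")
ens.from_source_dict(rows, column_mapper=cmap, npartitions=2, sort=False)

# True only if the source index increases monotonically across partitions.
print(ens.check_sorted("source"))

# True only if each object's sources live in a single partition; batch()
# with use_map=True requires this property.
print(ens.check_lightcurve_cohesion())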
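The new sorted/sort keyword arguments on the loaders are forwarded to Dask's set_index when the source and object frames are indexed, which is why the dask pin moves to 2023.6.1. Continuing the hypothetical sketch above (same ens, cmap, and rows): sort=True pays for a shuffle to get a globally sorted index, while sorted=True promises the id column is already in increasing order so the shuffle can be skipped; from_parquet and from_dask_dataframe forward the same two flags.

# sort=True: shuffle and sort on the id column while setting the index
# (reloading the same ensemble purely for illustration).
ens.from_source_dict(rows, column_mapper=cmap, npartitions=2, sort=True)
assert ens.check_sorted("source")

# sorted=True: the ids below are already in increasing order, so
# set_index can keep the existing partition layout.
presorted_rows = {
    "id": [8001, 8001, 8002, 8002],
    "time": [11.1, 11.2, 10.1, 10.2],
    "flux": [3.0, 4.0, 1.0, 2.0],
    "err": [0.2, 0.2, 0.1, 0.1],
    "band": ["r", "r", "g", "g"],
}
ens.from_source_dict(presorted_rows, column_mapper=cmap, npartitions=1, sorted=True)
assert ens.check_sorted("source")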
66 changes: 64 additions & 2 deletions tests/tape_tests/test_ensemble.py
@@ -372,8 +372,15 @@ def test_insert_paritioned(dask_client):
"flux": [0.5 * float(i) for i in range(num_points)],
"band": [all_bands[i % 4] for i in range(num_points)],
}
cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")
ens.from_source_dict(rows, column_mapper=cmap, npartitions=4)
cmap = ColumnMapper(
id_col="id",
time_col="time",
flux_col="flux",
err_col="err",
band_col="band",
provenance_col="provenance",
)
ens.from_source_dict(rows, column_mapper=cmap, npartitions=4, sort=True)

# Save the old data for comparison.
old_data = ens.compute("source")
@@ -435,6 +442,61 @@ def test_core_wrappers(parquet_ensemble):
parquet_ensemble.compute()


@pytest.mark.parametrize("data_sorted", [True, False])
@pytest.mark.parametrize("npartitions", [1, 2])
def test_check_sorted(dask_client, data_sorted, npartitions):
# Create some fake data.

if data_sorted:
rows = {
"id": [8001, 8001, 8001, 8001, 8002, 8002, 8002, 8002, 8002],
"time": [10.1, 10.2, 10.2, 11.1, 11.2, 11.3, 11.4, 15.0, 15.1],
"band": ["g", "g", "b", "g", "b", "g", "g", "g", "g"],
"err": [1.0, 2.0, 1.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0],
"flux": [1.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0],
}
else:
rows = {
"id": [8002, 8002, 8002, 8002, 8002, 8001, 8001, 8002, 8002],
"time": [10.1, 10.2, 10.2, 11.1, 11.2, 11.3, 11.4, 15.0, 15.1],
"band": ["g", "g", "b", "g", "b", "g", "g", "g", "g"],
"err": [1.0, 2.0, 1.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0],
"flux": [1.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0],
}
cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")
ens = Ensemble(client=dask_client)
ens.from_source_dict(rows, column_mapper=cmap, sort=False, npartitions=npartitions)

assert ens.check_sorted("source") == data_sorted


@pytest.mark.parametrize("data_cohesion", [True, False])
def test_check_lightcurve_cohesion(dask_client, data_cohesion):
# Create some fake data.

if data_cohesion:
rows = {
"id": [8001, 8001, 8001, 8001, 8001, 8002, 8002, 8002, 8002],
"time": [10.1, 10.2, 10.2, 11.1, 11.2, 11.3, 11.4, 15.0, 15.1],
"band": ["g", "g", "b", "g", "b", "g", "g", "g", "g"],
"err": [1.0, 2.0, 1.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0],
"flux": [1.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0],
}
else:
rows = {
"id": [8001, 8001, 8001, 8001, 8002, 8002, 8002, 8002, 8001],
"time": [10.1, 10.2, 10.2, 11.1, 11.2, 11.3, 11.4, 15.0, 15.1],
"band": ["g", "g", "b", "g", "b", "g", "g", "g", "g"],
"err": [1.0, 2.0, 1.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0],
"flux": [1.0, 2.0, 5.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0],
}
cmap = ColumnMapper(id_col="id", time_col="time", flux_col="flux", err_col="err", band_col="band")
ens = Ensemble(client=dask_client)
ens.from_source_dict(rows, column_mapper=cmap, sort=False, npartitions=2)

assert ens.check_lightcurve_cohesion() == data_cohesion


def test_persist(dask_client):
# Create some fake data.
rows = {
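The cohesion check exercised by test_check_lightcurve_cohesion is a small counting argument: each partition contributes a Counter of its unique ids, the per-partition counters are summed, and any id whose total reaches two or more must appear in more than one partition. The same logic on plain Python lists, as a standalone sketch independent of the Ensemble class:

from collections import Counter

# Each list stands in for the unique index values of one partition.
partitions = [[8001, 8002], [8002, 8003]]  # 8002 spans two partitions

# Each unique id contributes 1 per partition it appears in, so a summed
# count >= 2 means that object's sources are split across partitions.
total = Counter()
for part in partitions:
    total += Counter(part)

cohesive = not any(count >= 2 for count in total.values())
print(cohesive)  # False: object 8002 is split across partitions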
