From 98b9cfc78283b0ef9e705596c0688feebc4eb8aa Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 30 Jan 2024 15:40:20 -0800 Subject: [PATCH] add sampling and inspection docs --- .../tutorials/working_with_the_ensemble.ipynb | 64 ++++++++++++++++++- src/tape/ensemble.py | 9 ++- 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/docs/tutorials/working_with_the_ensemble.ipynb b/docs/tutorials/working_with_the_ensemble.ipynb index 25a4818e..d98a6cfa 100644 --- a/docs/tutorials/working_with_the_ensemble.ipynb +++ b/docs/tutorials/working_with_the_ensemble.ipynb @@ -90,6 +90,7 @@ " err_col=\"error\",\n", " band_col=\"band\",\n", " npartitions=1,\n", + " sort=True,\n", ")" ] }, @@ -130,7 +131,9 @@ ")\n", "\n", "# Pass the ColumnMapper along to from_pandas\n", - "ens.from_pandas(source_frame=source_table, object_frame=object_table, column_mapper=col_map, npartitions=1)" + "ens.from_pandas(\n", + " source_frame=source_table, object_frame=object_table, column_mapper=col_map, npartitions=1, sort=True\n", + ")" ] }, { @@ -201,10 +204,11 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Inspection, Filtering, and Selecting\n", + "## Inspection and Filtering\n", "\n", "The `Ensemble` contains an assortment of functions for inspecting and filtering your data." ] @@ -290,6 +294,40 @@ "ens.source.compute()" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Several methods exist to access individual lightcurves within the `Ensemble`. The first of these is the `to_timeseries` function. This allows you to supply a given object ID, and returns a `TimeSeries` object (see the `TimeSeries` documentation)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ens.to_timeseries(8003).data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, if you aren't interested in a particular lightcurve, you can draw a random one from the `Ensemble` using `Ensemble.select_random_timeseries`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ens.select_random_timeseries(seed=1).data" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -397,6 +435,28 @@ "In the above operations, we remove any rows that have at least 1 NaN value present. And then filter such that only lightcurves which have at least 50 measurements are retained." ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sampling\n", + "\n", + "In addition to filtering by specific constraints, it's possible to select a subset of your data to work with. `Ensemble.sample` will randomly select a fraction of objects from the full object list. By default this will return a new ensemble." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subset_ens = ens.sample(frac=0.5, overwrite=False) # select ~half of the objects, don't overwrite ens\n", + "\n", + "print(\"Number of pre-sampled objects: \", len(ens.object))\n", + "print(\"Number of post-sampled objects: \", len(subset_ens.object))" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py index f97aeec4..fd4433f5 100644 --- a/src/tape/ensemble.py +++ b/src/tape/ensemble.py @@ -475,10 +475,13 @@ def persist(self, **kwargs): def sample(self, overwrite=False, **kwargs): """Selects a sample of objects. + This sampling will be lazily applied to the SourceFrame as well, but + will not affect any additional result frames. 
+ Parameters ---------- overwrite: boolean, optional - Indicates whether to overwrite the current ensemble (set True), or + Indicates whether to overwrite the current ensemble (set True), or create a new ensemble for the subset of objects (set False). **kwargs: keyword arguments passed along to @@ -488,7 +491,7 @@ def sample(self, overwrite=False, **kwargs): ---------- ensemble: `tape.ensemble.Ensemble` A new ensemble with the subset of data selected - + """ # first do an object sync, ensure object table is up to date @@ -504,7 +507,7 @@ def sample(self, overwrite=False, **kwargs): # sync to source, removes all tied sources self._lazy_sync_tables(table="source") - return self # current in-place implementation + return self # current in-place implementation else: # make a new ensemble # TODO: Investigate shared client warning