From f41f21c6f11def2928cdb0270e17161080dfce39 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Thu, 28 Mar 2024 09:48:21 -0700 Subject: [PATCH 1/8] fix divisions warnings --- docs/examples/rrlyr-period.ipynb | 2 +- docs/gettingstarted/quickstart.ipynb | 2 +- docs/tutorials/batch_showcase.ipynb | 1 + docs/tutorials/common_data_operations.ipynb | 2 +- docs/tutorials/structure_function_showcase.ipynb | 12 +++--------- docs/tutorials/tape_datasets.ipynb | 14 +++++++++++--- docs/tutorials/using_ray_with_the_ensemble.ipynb | 6 +++--- docs/tutorials/working_with_the_ensemble.ipynb | 8 ++++---- docs/tutorials/working_with_the_timeseries.ipynb | 7 ++++--- 9 files changed, 29 insertions(+), 25 deletions(-) diff --git a/docs/examples/rrlyr-period.ipynb b/docs/examples/rrlyr-period.ipynb index b3e2a0d0..1a0b9beb 100644 --- a/docs/examples/rrlyr-period.ipynb +++ b/docs/examples/rrlyr-period.ipynb @@ -42,7 +42,7 @@ "outputs": [], "source": [ "# Load SDSS Stripe 82 RR Lyrae catalog\n", - "ens = Ensemble(client=False).from_dataset(\"s82_rrlyrae\")" + "ens = Ensemble(client=False).from_dataset(\"s82_rrlyrae\", sorted=True)" ] }, { diff --git a/docs/gettingstarted/quickstart.ipynb b/docs/gettingstarted/quickstart.ipynb index 8ced1893..c2946a6b 100644 --- a/docs/gettingstarted/quickstart.ipynb +++ b/docs/gettingstarted/quickstart.ipynb @@ -38,7 +38,7 @@ "from tape import Ensemble\n", "\n", "ens = Ensemble() # Initialize a TAPE Ensemble\n", - "ens.from_dataset(\"s82_qso\")" + "ens.from_dataset(\"s82_qso\", sorted=True)" ] }, { diff --git a/docs/tutorials/batch_showcase.ipynb b/docs/tutorials/batch_showcase.ipynb index b25b78fa..c9b1a9c4 100644 --- a/docs/tutorials/batch_showcase.ipynb +++ b/docs/tutorials/batch_showcase.ipynb @@ -67,6 +67,7 @@ "ens.from_source_dict(\n", " source_dict,\n", " column_mapper=ColumnMapper(id_col=\"id\", time_col=\"mjd\", flux_col=\"flux\", err_col=\"err\", band_col=\"band\"),\n", + " sorted=True,\n", ")" ] }, diff --git a/docs/tutorials/common_data_operations.ipynb b/docs/tutorials/common_data_operations.ipynb index 51e326a7..26def4bc 100644 --- a/docs/tutorials/common_data_operations.ipynb +++ b/docs/tutorials/common_data_operations.ipynb @@ -37,7 +37,7 @@ "\n", "ens = Ensemble()\n", "\n", - "ens.from_dataset(\"s82_rrlyrae\", sort=True)" + "ens.from_dataset(\"s82_rrlyrae\", sorted=True)" ] }, { diff --git a/docs/tutorials/structure_function_showcase.ipynb b/docs/tutorials/structure_function_showcase.ipynb index d7aafb12..cfa2cb0e 100644 --- a/docs/tutorials/structure_function_showcase.ipynb +++ b/docs/tutorials/structure_function_showcase.ipynb @@ -158,13 +158,6 @@ "in a TAPE `ensemble`. " ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -249,6 +242,7 @@ "ens.from_source_dict(\n", " {\"id_ens\": id_ens, \"t_ens\": t_ens, \"y_ens\": y_ens, \"yerr_ens\": yerr_ens, \"filter_ens\": filter_ens},\n", " column_mapper=manual_colmap,\n", + " sorted=True,\n", ")" ] }, @@ -564,11 +558,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.9" + "version": "3.10.11" }, "vscode": { "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + "hash": "83afbb17b435d9bf8b0d0042367da76f26510da1c5781f0ff6e6c518eab621ec" } } }, diff --git a/docs/tutorials/tape_datasets.ipynb b/docs/tutorials/tape_datasets.ipynb index 0d1b397f..614bcc99 100644 --- a/docs/tutorials/tape_datasets.ipynb +++ b/docs/tutorials/tape_datasets.ipynb @@ -46,7 +46,7 @@ ")\n", "\n", "# Read in data from a parquet file that contains source (timeseries) data\n", - "ens.from_parquet(source_file=f\"{rel_path}/source/test_source.parquet\", column_mapper=col_map)\n", + "ens.from_parquet(source_file=f\"{rel_path}/source/test_source.parquet\", column_mapper=col_map, sorted=True)\n", "\n", "ens.source.head(5) # View the first 5 entries of the source table" ] @@ -80,6 +80,7 @@ " source_file=f\"{rel_path}/source/test_source.parquet\",\n", " object_file=f\"{rel_path}/object/test_object.parquet\",\n", " column_mapper=col_map,\n", + " sorted=True,\n", ")\n", "\n", "ens.object.head(5) # View the first 5 entries of the object table" @@ -147,7 +148,7 @@ "metadata": {}, "outputs": [], "source": [ - "ens.from_dataset(\"s82_rrlyrae\") # Let's grab the Stripe 82 RR Lyrae\n", + "ens.from_dataset(\"s82_rrlyrae\", sorted=True) # Let's grab the Stripe 82 RR Lyrae\n", "\n", "ens.object.head(5)" ] @@ -209,10 +210,17 @@ "source": [ "colmap = ColumnMapper(id_col=\"id\", time_col=\"time\", flux_col=\"flux\", err_col=\"error\", band_col=\"band\")\n", "ens = Ensemble()\n", - "ens.from_source_dict(source_dict, column_mapper=colmap)\n", + "ens.from_source_dict(source_dict, column_mapper=colmap, sorted=True)\n", "\n", "ens.info()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/tutorials/using_ray_with_the_ensemble.ipynb b/docs/tutorials/using_ray_with_the_ensemble.ipynb index 6e1f7332..0b3b3389 100644 --- a/docs/tutorials/using_ray_with_the_ensemble.ipynb +++ b/docs/tutorials/using_ray_with_the_ensemble.ipynb @@ -80,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - "ens.from_dataset(\"s82_qso\")\n", + "ens.from_dataset(\"s82_qso\", sorted=True)\n", "ens.source = ens.source.repartition(npartitions=10)\n", "ens.batch(\n", " calc_sf2, use_map=False\n", @@ -117,7 +117,7 @@ "%%time\n", "\n", "ens = Ensemble(client=False) # Do not use a client\n", - "ens.from_dataset(\"s82_qso\")\n", + "ens.from_dataset(\"s82_qso\", sorted=True)\n", "ens.source = ens.source.repartition(npartitions=10)\n", "ens.batch(calc_sf2, use_map=False)" ] @@ -151,7 +151,7 @@ "%%time\n", "\n", "ens = Ensemble()\n", - "ens.from_dataset(\"s82_qso\")\n", + "ens.from_dataset(\"s82_qso\", sorted=True)\n", "ens.source = ens.source.repartition(npartitions=10)\n", "ens.batch(calc_sf2, use_map=False).compute()" ] diff --git a/docs/tutorials/working_with_the_ensemble.ipynb b/docs/tutorials/working_with_the_ensemble.ipynb index b56f8961..c7ffa163 100644 --- a/docs/tutorials/working_with_the_ensemble.ipynb +++ b/docs/tutorials/working_with_the_ensemble.ipynb @@ -26,7 +26,7 @@ "source": [ "from tape.ensemble import Ensemble\n", "\n", - "ens = Ensemble().from_dataset(\"s82_rrlyrae\", sort=True)" + "ens = Ensemble().from_dataset(\"s82_rrlyrae\", sorted=True)" ] }, { @@ -197,7 +197,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Storing and Accessing Result Frames\n", + "## Storing and Accessing Result Frames\n", "The `Ensemble` provides a powerful batching interface, `Ensemble.batch()`, to perform analysis functions in parallel across your lightcurves.\n", "\n", "For the below example, we use the included suite of analysis functions to apply `tape.analysis.calc_stetson_J` on our dataset. (For more info on `Ensemble.batch()`, including providing your own custom functions, see the [Ensemble Batch Showcase](https://tape.readthedocs.io/en/latest/tutorials/batch_showcase.html#) )" @@ -319,7 +319,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Keeping the Object and Source Tables in Sync\n", + "## Keeping the Object and Source Tables in Sync\n", "\n", "The `TAPE` `Ensemble` attempts to lazily \"sync\" the Object and Source tables such that:\n", "\n", @@ -397,7 +397,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.11" }, "vscode": { "interpreter": { diff --git a/docs/tutorials/working_with_the_timeseries.ipynb b/docs/tutorials/working_with_the_timeseries.ipynb index 09c5f2b0..40d75628 100644 --- a/docs/tutorials/working_with_the_timeseries.ipynb +++ b/docs/tutorials/working_with_the_timeseries.ipynb @@ -40,6 +40,7 @@ " flux_col=\"psFlux\",\n", " err_col=\"psFluxErr\",\n", " band_col=\"filterName\",\n", + " sorted=True,\n", ")" ] }, @@ -77,7 +78,7 @@ ], "metadata": { "kernelspec": { - "display_name": "py310", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -91,11 +92,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.11" }, "vscode": { "interpreter": { - "hash": "08968836a6367873274ed1d5e98a07391f42fc3a62bd5aba54afbd7b11ba8673" + "hash": "83afbb17b435d9bf8b0d0042367da76f26510da1c5781f0ff6e6c518eab621ec" } } }, From f3919ce8c34d626b345a2205f0edf7085e48cd5b Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Thu, 28 Mar 2024 10:10:21 -0700 Subject: [PATCH 2/8] revert common data notebook for calc_nobs issue --- docs/tutorials/common_data_operations.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/common_data_operations.ipynb b/docs/tutorials/common_data_operations.ipynb index 26def4bc..a045b4df 100644 --- a/docs/tutorials/common_data_operations.ipynb +++ b/docs/tutorials/common_data_operations.ipynb @@ -37,7 +37,7 @@ "\n", "ens = Ensemble()\n", "\n", - "ens.from_dataset(\"s82_rrlyrae\", sorted=True)" + "ens.from_dataset(\"s82_rrlyrae\")" ] }, { From 8e8aa5a209d95a06225f722ae6e1769db6f90d1e Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Thu, 28 Mar 2024 10:45:24 -0700 Subject: [PATCH 3/8] re-add common_data_operations --- docs/tutorials/common_data_operations.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/common_data_operations.ipynb b/docs/tutorials/common_data_operations.ipynb index a045b4df..26def4bc 100644 --- a/docs/tutorials/common_data_operations.ipynb +++ b/docs/tutorials/common_data_operations.ipynb @@ -37,7 +37,7 @@ "\n", "ens = Ensemble()\n", "\n", - "ens.from_dataset(\"s82_rrlyrae\")" + "ens.from_dataset(\"s82_rrlyrae\", sorted=True)" ] }, { From 3c9351c59960263b6ff04df297f289b6a6d94c28 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Thu, 28 Mar 2024 13:38:10 -0700 Subject: [PATCH 4/8] fix calc_nobs cell --- docs/tutorials/common_data_operations.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/common_data_operations.ipynb b/docs/tutorials/common_data_operations.ipynb index 26def4bc..37bb410f 100644 --- a/docs/tutorials/common_data_operations.ipynb +++ b/docs/tutorials/common_data_operations.ipynb @@ -251,7 +251,7 @@ "source": [ "ens.calc_nobs(by_band=True)\n", "\n", - "ens.object[[\"nobs_u\", \"nobs_g\", \"nobs_r\", \"nobs_i\", \"nobs_z\", \"nobs_total\"]].head(5)" + "ens.object.head(5)[[\"nobs_u\", \"nobs_g\", \"nobs_r\", \"nobs_i\", \"nobs_z\", \"nobs_total\"]]" ] }, { From 583013ea9b94cc9c5ad979dc40bb38aa23e27421 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Thu, 28 Mar 2024 14:19:04 -0700 Subject: [PATCH 5/8] leave common data operations for now --- docs/tutorials/common_data_operations.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/common_data_operations.ipynb b/docs/tutorials/common_data_operations.ipynb index 37bb410f..53f7b1ce 100644 --- a/docs/tutorials/common_data_operations.ipynb +++ b/docs/tutorials/common_data_operations.ipynb @@ -37,7 +37,7 @@ "\n", "ens = Ensemble()\n", "\n", - "ens.from_dataset(\"s82_rrlyrae\", sorted=True)" + "ens.from_dataset(\"s82_rrlyrae\", sort=True)" ] }, { From ef87dd52bc6e48d34fcf665ab16a79e6ca8bc4d5 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Thu, 28 Mar 2024 14:47:50 -0700 Subject: [PATCH 6/8] hacky solution to from_dataset --- docs/tutorials/common_data_operations.ipynb | 2 +- src/tape/ensemble.py | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/tutorials/common_data_operations.ipynb b/docs/tutorials/common_data_operations.ipynb index 53f7b1ce..37bb410f 100644 --- a/docs/tutorials/common_data_operations.ipynb +++ b/docs/tutorials/common_data_operations.ipynb @@ -37,7 +37,7 @@ "\n", "ens = Ensemble()\n", "\n", - "ens.from_dataset(\"s82_rrlyrae\", sort=True)" + "ens.from_dataset(\"s82_rrlyrae\", sorted=True)" ] }, { diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py index 4d35fd8d..308a85ff 100644 --- a/src/tape/ensemble.py +++ b/src/tape/ensemble.py @@ -2034,12 +2034,25 @@ def from_dataset(self, dataset, **kwargs): band_col=dataset_map["band"], ) - return self.from_parquet( - source_file=dataset_info["source_file"], - object_file=dataset_info["object_file"], + # repartition and sort these demo datasets + src = dd.read_parquet(dataset_info["source_file"]).repartition(npartitions=4).persist() + obj = dd.read_parquet(dataset_info["object_file"]).repartition(npartitions=2).persist() + + src = src.set_index(dataset_map["id"], sort=True) + obj = obj.set_index(dataset_map["id"], sort=True) + + return self.from_dask_dataframe( + source_frame=src, + object_frame=obj, column_mapper=col_map, **kwargs, ) + # return self.from_parquet( + # source_file=dataset_info["source_file"], + # object_file=dataset_info["object_file"], + # column_mapper=col_map, + # **kwargs, + # ) def available_datasets(self): """Retrieve descriptions of available TAPE datasets. From a6ea6072a0790dad084f44297265a76efb990421 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Thu, 28 Mar 2024 15:08:43 -0700 Subject: [PATCH 7/8] fix save issue --- docs/tutorials/common_data_operations.ipynb | 22 ++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/tutorials/common_data_operations.ipynb b/docs/tutorials/common_data_operations.ipynb index 37bb410f..1f92cba3 100644 --- a/docs/tutorials/common_data_operations.ipynb +++ b/docs/tutorials/common_data_operations.ipynb @@ -249,7 +249,7 @@ "metadata": {}, "outputs": [], "source": [ - "ens.calc_nobs(by_band=True)\n", + "ens.calc_nobs(by_band=True, temporary=False)\n", "\n", "ens.object.head(5)[[\"nobs_u\", \"nobs_g\", \"nobs_r\", \"nobs_i\", \"nobs_z\", \"nobs_total\"]]" ] @@ -464,8 +464,8 @@ "metadata": {}, "outputs": [], "source": [ - "ens.source.repartition(partition_size=\"100MB\").update_ensemble() # 100MBs is generally recommended\n", - "ens.source # In this case, we have a small set of data that easily fits into one partition" + "ens.source.repartition(partition_size=\"100MB\") # 100MBs is generally recommended\n", + "# In this case, we have a small set of data that easily fits into one partition" ] }, { @@ -523,6 +523,15 @@ "In some situations, you may find yourself running a given workflow many times. Due to the nature of lazy-computation, this will involve repeated execution of data I/O, pre-processing steps, initial analysis, etc. In these situations, it may be effective to instead save the ensemble state to disk after completion of these initial processing steps. To accomplish this, we can use the `Ensemble.save_ensemble()` function." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ens.object.head(5)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -551,6 +560,13 @@ "new_ens = Ensemble()\n", "new_ens.from_ensemble(\"./ensemble\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From d6cdc976efb39539ef7c189fcbfa13da9a42ee14 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Thu, 28 Mar 2024 15:28:25 -0700 Subject: [PATCH 8/8] remove commented out lines --- src/tape/ensemble.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py index 308a85ff..9a6ef7f9 100644 --- a/src/tape/ensemble.py +++ b/src/tape/ensemble.py @@ -2047,12 +2047,6 @@ def from_dataset(self, dataset, **kwargs): column_mapper=col_map, **kwargs, ) - # return self.from_parquet( - # source_file=dataset_info["source_file"], - # object_file=dataset_info["object_file"], - # column_mapper=col_map, - # **kwargs, - # ) def available_datasets(self): """Retrieve descriptions of available TAPE datasets.