diff --git a/docs/examples/rrlyr-period.ipynb b/docs/examples/rrlyr-period.ipynb index b3e2a0d0..1a0b9beb 100644 --- a/docs/examples/rrlyr-period.ipynb +++ b/docs/examples/rrlyr-period.ipynb @@ -42,7 +42,7 @@ "outputs": [], "source": [ "# Load SDSS Stripe 82 RR Lyrae catalog\n", - "ens = Ensemble(client=False).from_dataset(\"s82_rrlyrae\")" + "ens = Ensemble(client=False).from_dataset(\"s82_rrlyrae\", sorted=True)" ] }, { diff --git a/docs/gettingstarted/quickstart.ipynb b/docs/gettingstarted/quickstart.ipynb index 47a54d79..41da4ca5 100644 --- a/docs/gettingstarted/quickstart.ipynb +++ b/docs/gettingstarted/quickstart.ipynb @@ -48,7 +48,7 @@ "from tape import Ensemble\n", "\n", "ens = Ensemble() # Initialize a TAPE Ensemble\n", - "ens.from_dataset(\"s82_qso\")" + "ens.from_dataset(\"s82_qso\", sorted=True)" ] }, { diff --git a/docs/tutorials/batch_showcase.ipynb b/docs/tutorials/batch_showcase.ipynb index b25b78fa..c9b1a9c4 100644 --- a/docs/tutorials/batch_showcase.ipynb +++ b/docs/tutorials/batch_showcase.ipynb @@ -67,6 +67,7 @@ "ens.from_source_dict(\n", " source_dict,\n", " column_mapper=ColumnMapper(id_col=\"id\", time_col=\"mjd\", flux_col=\"flux\", err_col=\"err\", band_col=\"band\"),\n", + " sorted=True,\n", ")" ] }, diff --git a/docs/tutorials/common_data_operations.ipynb b/docs/tutorials/common_data_operations.ipynb index 7dec3e76..9dee7c93 100644 --- a/docs/tutorials/common_data_operations.ipynb +++ b/docs/tutorials/common_data_operations.ipynb @@ -37,7 +37,7 @@ "\n", "ens = Ensemble()\n", "\n", - "ens.from_dataset(\"s82_rrlyrae\", sort=True)" + "ens.from_dataset(\"s82_rrlyrae\", sorted=True)" ] }, { @@ -252,9 +252,9 @@ "metadata": {}, "outputs": [], "source": [ - "ens.calc_nobs(by_band=True)\n", + "ens.calc_nobs(by_band=True, temporary=False)\n", "\n", - "ens.object[[\"nobs_u\", \"nobs_g\", \"nobs_r\", \"nobs_i\", \"nobs_z\", \"nobs_total\"]].head(5)" + "ens.object.head(5)[[\"nobs_u\", \"nobs_g\", \"nobs_r\", \"nobs_i\", \"nobs_z\", \"nobs_total\"]]" ] }, { @@ -467,8 +467,8 @@ "metadata": {}, "outputs": [], "source": [ - "ens.source.repartition(partition_size=\"100MB\").update_ensemble() # 100MBs is generally recommended\n", - "ens.source # In this case, we have a small set of data that easily fits into one partition" + "ens.source.repartition(partition_size=\"100MB\") # 100MBs is generally recommended\n", + "# In this case, we have a small set of data that easily fits into one partition" ] }, { @@ -548,6 +548,15 @@ "In some situations, you may find yourself running a given workflow many times. Due to the nature of lazy-computation, this will involve repeated execution of data I/O, pre-processing steps, initial analysis, etc. In these situations, it may be effective to instead save the ensemble state to disk after completion of these initial processing steps. To accomplish this, we can use the `Ensemble.save_ensemble()` function." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ens.object.head(5)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -576,6 +585,13 @@ "new_ens = Ensemble()\n", "new_ens.from_ensemble(\"./ensemble\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/tutorials/structure_function_showcase.ipynb b/docs/tutorials/structure_function_showcase.ipynb index d7aafb12..cfa2cb0e 100644 --- a/docs/tutorials/structure_function_showcase.ipynb +++ b/docs/tutorials/structure_function_showcase.ipynb @@ -158,13 +158,6 @@ "in a TAPE `ensemble`. " ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -249,6 +242,7 @@ "ens.from_source_dict(\n", " {\"id_ens\": id_ens, \"t_ens\": t_ens, \"y_ens\": y_ens, \"yerr_ens\": yerr_ens, \"filter_ens\": filter_ens},\n", " column_mapper=manual_colmap,\n", + " sorted=True,\n", ")" ] }, @@ -564,11 +558,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.9" + "version": "3.10.11" }, "vscode": { "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + "hash": "83afbb17b435d9bf8b0d0042367da76f26510da1c5781f0ff6e6c518eab621ec" } } }, diff --git a/docs/tutorials/tape_datasets.ipynb b/docs/tutorials/tape_datasets.ipynb index 0d1b397f..614bcc99 100644 --- a/docs/tutorials/tape_datasets.ipynb +++ b/docs/tutorials/tape_datasets.ipynb @@ -46,7 +46,7 @@ ")\n", "\n", "# Read in data from a parquet file that contains source (timeseries) data\n", - "ens.from_parquet(source_file=f\"{rel_path}/source/test_source.parquet\", column_mapper=col_map)\n", + "ens.from_parquet(source_file=f\"{rel_path}/source/test_source.parquet\", column_mapper=col_map, sorted=True)\n", "\n", "ens.source.head(5) # View the first 5 entries of the source table" ] @@ -80,6 +80,7 @@ " source_file=f\"{rel_path}/source/test_source.parquet\",\n", " object_file=f\"{rel_path}/object/test_object.parquet\",\n", " column_mapper=col_map,\n", + " sorted=True,\n", ")\n", "\n", "ens.object.head(5) # View the first 5 entries of the object table" @@ -147,7 +148,7 @@ "metadata": {}, "outputs": [], "source": [ - "ens.from_dataset(\"s82_rrlyrae\") # Let's grab the Stripe 82 RR Lyrae\n", + "ens.from_dataset(\"s82_rrlyrae\", sorted=True) # Let's grab the Stripe 82 RR Lyrae\n", "\n", "ens.object.head(5)" ] @@ -209,10 +210,17 @@ "source": [ "colmap = ColumnMapper(id_col=\"id\", time_col=\"time\", flux_col=\"flux\", err_col=\"error\", band_col=\"band\")\n", "ens = Ensemble()\n", - "ens.from_source_dict(source_dict, column_mapper=colmap)\n", + "ens.from_source_dict(source_dict, column_mapper=colmap, sorted=True)\n", "\n", "ens.info()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/tutorials/using_ray_with_the_ensemble.ipynb b/docs/tutorials/using_ray_with_the_ensemble.ipynb index 6e1f7332..0b3b3389 100644 --- a/docs/tutorials/using_ray_with_the_ensemble.ipynb +++ b/docs/tutorials/using_ray_with_the_ensemble.ipynb @@ -80,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - "ens.from_dataset(\"s82_qso\")\n", + "ens.from_dataset(\"s82_qso\", sorted=True)\n", "ens.source = ens.source.repartition(npartitions=10)\n", "ens.batch(\n", " calc_sf2, use_map=False\n", @@ -117,7 +117,7 @@ "%%time\n", "\n", "ens = Ensemble(client=False) # Do not use a client\n", - "ens.from_dataset(\"s82_qso\")\n", + "ens.from_dataset(\"s82_qso\", sorted=True)\n", "ens.source = ens.source.repartition(npartitions=10)\n", "ens.batch(calc_sf2, use_map=False)" ] @@ -151,7 +151,7 @@ "%%time\n", "\n", "ens = Ensemble()\n", - "ens.from_dataset(\"s82_qso\")\n", + "ens.from_dataset(\"s82_qso\", sorted=True)\n", "ens.source = ens.source.repartition(npartitions=10)\n", "ens.batch(calc_sf2, use_map=False).compute()" ] diff --git a/docs/tutorials/working_with_the_ensemble.ipynb b/docs/tutorials/working_with_the_ensemble.ipynb index b56f8961..c7ffa163 100644 --- a/docs/tutorials/working_with_the_ensemble.ipynb +++ b/docs/tutorials/working_with_the_ensemble.ipynb @@ -26,7 +26,7 @@ "source": [ "from tape.ensemble import Ensemble\n", "\n", - "ens = Ensemble().from_dataset(\"s82_rrlyrae\", sort=True)" + "ens = Ensemble().from_dataset(\"s82_rrlyrae\", sorted=True)" ] }, { @@ -197,7 +197,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Storing and Accessing Result Frames\n", + "## Storing and Accessing Result Frames\n", "The `Ensemble` provides a powerful batching interface, `Ensemble.batch()`, to perform analysis functions in parallel across your lightcurves.\n", "\n", "For the below example, we use the included suite of analysis functions to apply `tape.analysis.calc_stetson_J` on our dataset. (For more info on `Ensemble.batch()`, including providing your own custom functions, see the [Ensemble Batch Showcase](https://tape.readthedocs.io/en/latest/tutorials/batch_showcase.html#) )" @@ -319,7 +319,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Keeping the Object and Source Tables in Sync\n", + "## Keeping the Object and Source Tables in Sync\n", "\n", "The `TAPE` `Ensemble` attempts to lazily \"sync\" the Object and Source tables such that:\n", "\n", @@ -397,7 +397,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.11" }, "vscode": { "interpreter": { diff --git a/docs/tutorials/working_with_the_timeseries.ipynb b/docs/tutorials/working_with_the_timeseries.ipynb index 05bddcbf..3889532e 100644 --- a/docs/tutorials/working_with_the_timeseries.ipynb +++ b/docs/tutorials/working_with_the_timeseries.ipynb @@ -40,6 +40,7 @@ " flux_col=\"psFlux\",\n", " err_col=\"psFluxErr\",\n", " band_col=\"filterName\",\n", + " sorted=True,\n", ")" ] }, @@ -85,7 +86,7 @@ ], "metadata": { "kernelspec": { - "display_name": "py310", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -103,7 +104,7 @@ }, "vscode": { "interpreter": { - "hash": "08968836a6367873274ed1d5e98a07391f42fc3a62bd5aba54afbd7b11ba8673" + "hash": "83afbb17b435d9bf8b0d0042367da76f26510da1c5781f0ff6e6c518eab621ec" } } }, diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py index 4d35fd8d..9a6ef7f9 100644 --- a/src/tape/ensemble.py +++ b/src/tape/ensemble.py @@ -2034,9 +2034,16 @@ def from_dataset(self, dataset, **kwargs): band_col=dataset_map["band"], ) - return self.from_parquet( - source_file=dataset_info["source_file"], - object_file=dataset_info["object_file"], + # repartition and sort these demo datasets + src = dd.read_parquet(dataset_info["source_file"]).repartition(npartitions=4).persist() + obj = dd.read_parquet(dataset_info["object_file"]).repartition(npartitions=2).persist() + + src = src.set_index(dataset_map["id"], sort=True) + obj = obj.set_index(dataset_map["id"], sort=True) + + return self.from_dask_dataframe( + source_frame=src, + object_frame=obj, column_mapper=col_map, **kwargs, )