Merge pull request #411 from lincc-frameworks/doc_div_warnings

fix divisions warnings
lincc-frameworks · Mar 28, 2024 · 4a8849f · 4a8849f
2 parents 60f12c3 + d6cdc97
commit 4a8849f
Show file tree

Hide file tree

Showing 10 changed files with 58 additions and 31 deletions.
diff --git a/docs/examples/rrlyr-period.ipynb b/docs/examples/rrlyr-period.ipynb
@@ -42,7 +42,7 @@
    "outputs": [],
    "source": [
     "# Load SDSS Stripe 82 RR Lyrae catalog\n",
-    "ens = Ensemble(client=False).from_dataset(\"s82_rrlyrae\")"
+    "ens = Ensemble(client=False).from_dataset(\"s82_rrlyrae\", sorted=True)"
    ]
   },
   {

diff --git a/docs/gettingstarted/quickstart.ipynb b/docs/gettingstarted/quickstart.ipynb
@@ -48,7 +48,7 @@
     "from tape import Ensemble\n",
     "\n",
     "ens = Ensemble()  # Initialize a TAPE Ensemble\n",
-    "ens.from_dataset(\"s82_qso\")"
+    "ens.from_dataset(\"s82_qso\", sorted=True)"
    ]
   },
   {

diff --git a/docs/tutorials/batch_showcase.ipynb b/docs/tutorials/batch_showcase.ipynb
@@ -67,6 +67,7 @@
     "ens.from_source_dict(\n",
     "    source_dict,\n",
     "    column_mapper=ColumnMapper(id_col=\"id\", time_col=\"mjd\", flux_col=\"flux\", err_col=\"err\", band_col=\"band\"),\n",
+    "    sorted=True,\n",
     ")"
    ]
   },

diff --git a/docs/tutorials/common_data_operations.ipynb b/docs/tutorials/common_data_operations.ipynb
@@ -37,7 +37,7 @@
     "\n",
     "ens = Ensemble()\n",
     "\n",
-    "ens.from_dataset(\"s82_rrlyrae\", sort=True)"
+    "ens.from_dataset(\"s82_rrlyrae\", sorted=True)"
    ]
   },
   {
@@ -252,9 +252,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ens.calc_nobs(by_band=True)\n",
+    "ens.calc_nobs(by_band=True, temporary=False)\n",
     "\n",
-    "ens.object[[\"nobs_u\", \"nobs_g\", \"nobs_r\", \"nobs_i\", \"nobs_z\", \"nobs_total\"]].head(5)"
+    "ens.object.head(5)[[\"nobs_u\", \"nobs_g\", \"nobs_r\", \"nobs_i\", \"nobs_z\", \"nobs_total\"]]"
    ]
   },
   {
@@ -467,8 +467,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ens.source.repartition(partition_size=\"100MB\").update_ensemble()  # 100MBs is generally recommended\n",
-    "ens.source  # In this case, we have a small set of data that easily fits into one partition"
+    "ens.source.repartition(partition_size=\"100MB\")  # 100MBs is generally recommended\n",
+    "# In this case, we have a small set of data that easily fits into one partition"
    ]
   },
   {
@@ -548,6 +548,15 @@
     "In some situations, you may find yourself running a given workflow many times. Due to the nature of lazy-computation, this will involve repeated execution of data I/O, pre-processing steps, initial analysis, etc. In these situations, it may be effective to instead save the ensemble state to disk after completion of these initial processing steps. To accomplish this, we can use the `Ensemble.save_ensemble()` function."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ens.object.head(5)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -576,6 +585,13 @@
     "new_ens = Ensemble()\n",
     "new_ens.from_ensemble(\"./ensemble\")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/docs/tutorials/structure_function_showcase.ipynb b/docs/tutorials/structure_function_showcase.ipynb
@@ -158,13 +158,6 @@
     "in a TAPE `ensemble`. "
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -249,6 +242,7 @@
     "ens.from_source_dict(\n",
     "    {\"id_ens\": id_ens, \"t_ens\": t_ens, \"y_ens\": y_ens, \"yerr_ens\": yerr_ens, \"filter_ens\": filter_ens},\n",
     "    column_mapper=manual_colmap,\n",
+    "    sorted=True,\n",
     ")"
    ]
   },
@@ -564,11 +558,11 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.9"
+   "version": "3.10.11"
   },
   "vscode": {
    "interpreter": {
-    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+    "hash": "83afbb17b435d9bf8b0d0042367da76f26510da1c5781f0ff6e6c518eab621ec"
    }
   }
  },

diff --git a/docs/tutorials/tape_datasets.ipynb b/docs/tutorials/tape_datasets.ipynb
@@ -46,7 +46,7 @@
     ")\n",
     "\n",
     "# Read in data from a parquet file that contains source (timeseries) data\n",
-    "ens.from_parquet(source_file=f\"{rel_path}/source/test_source.parquet\", column_mapper=col_map)\n",
+    "ens.from_parquet(source_file=f\"{rel_path}/source/test_source.parquet\", column_mapper=col_map, sorted=True)\n",
     "\n",
     "ens.source.head(5)  # View the first 5 entries of the source table"
    ]
@@ -80,6 +80,7 @@
     "    source_file=f\"{rel_path}/source/test_source.parquet\",\n",
     "    object_file=f\"{rel_path}/object/test_object.parquet\",\n",
     "    column_mapper=col_map,\n",
+    "    sorted=True,\n",
     ")\n",
     "\n",
     "ens.object.head(5)  # View the first 5 entries of the object table"
@@ -147,7 +148,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ens.from_dataset(\"s82_rrlyrae\")  # Let's grab the Stripe 82 RR Lyrae\n",
+    "ens.from_dataset(\"s82_rrlyrae\", sorted=True)  # Let's grab the Stripe 82 RR Lyrae\n",
     "\n",
     "ens.object.head(5)"
    ]
@@ -209,10 +210,17 @@
    "source": [
     "colmap = ColumnMapper(id_col=\"id\", time_col=\"time\", flux_col=\"flux\", err_col=\"error\", band_col=\"band\")\n",
     "ens = Ensemble()\n",
-    "ens.from_source_dict(source_dict, column_mapper=colmap)\n",
+    "ens.from_source_dict(source_dict, column_mapper=colmap, sorted=True)\n",
     "\n",
     "ens.info()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/docs/tutorials/using_ray_with_the_ensemble.ipynb b/docs/tutorials/using_ray_with_the_ensemble.ipynb
@@ -80,7 +80,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ens.from_dataset(\"s82_qso\")\n",
+    "ens.from_dataset(\"s82_qso\", sorted=True)\n",
     "ens.source = ens.source.repartition(npartitions=10)\n",
     "ens.batch(\n",
     "    calc_sf2, use_map=False\n",
@@ -117,7 +117,7 @@
     "%%time\n",
     "\n",
     "ens = Ensemble(client=False)  # Do not use a client\n",
-    "ens.from_dataset(\"s82_qso\")\n",
+    "ens.from_dataset(\"s82_qso\", sorted=True)\n",
     "ens.source = ens.source.repartition(npartitions=10)\n",
     "ens.batch(calc_sf2, use_map=False)"
    ]
@@ -151,7 +151,7 @@
     "%%time\n",
     "\n",
     "ens = Ensemble()\n",
-    "ens.from_dataset(\"s82_qso\")\n",
+    "ens.from_dataset(\"s82_qso\", sorted=True)\n",
     "ens.source = ens.source.repartition(npartitions=10)\n",
     "ens.batch(calc_sf2, use_map=False).compute()"
    ]

diff --git a/docs/tutorials/working_with_the_ensemble.ipynb b/docs/tutorials/working_with_the_ensemble.ipynb
@@ -26,7 +26,7 @@
    "source": [
     "from tape.ensemble import Ensemble\n",
     "\n",
-    "ens = Ensemble().from_dataset(\"s82_rrlyrae\", sort=True)"
+    "ens = Ensemble().from_dataset(\"s82_rrlyrae\", sorted=True)"
    ]
   },
   {
@@ -197,7 +197,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Storing and Accessing Result Frames\n",
+    "## Storing and Accessing Result Frames\n",
     "The `Ensemble` provides a powerful batching interface, `Ensemble.batch()`, to perform analysis functions in parallel across your lightcurves.\n",
     "\n",
     "For the below example, we use the included suite of analysis functions to apply `tape.analysis.calc_stetson_J` on our dataset. (For more info on `Ensemble.batch()`, including providing your own custom functions, see the [Ensemble Batch Showcase](https://tape.readthedocs.io/en/latest/tutorials/batch_showcase.html#) )"
@@ -319,7 +319,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Keeping the Object and Source Tables in Sync\n",
+    "## Keeping the Object and Source Tables in Sync\n",
     "\n",
     "The `TAPE` `Ensemble` attempts to lazily \"sync\" the Object and Source tables such that:\n",
     "\n",
@@ -397,7 +397,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.10.11"
   },
   "vscode": {
    "interpreter": {

diff --git a/docs/tutorials/working_with_the_timeseries.ipynb b/docs/tutorials/working_with_the_timeseries.ipynb
@@ -40,6 +40,7 @@
     "    flux_col=\"psFlux\",\n",
     "    err_col=\"psFluxErr\",\n",
     "    band_col=\"filterName\",\n",
+    "    sorted=True,\n",
     ")"
    ]
   },
@@ -85,7 +86,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "py310",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -103,7 +104,7 @@
   },
   "vscode": {
    "interpreter": {
-    "hash": "08968836a6367873274ed1d5e98a07391f42fc3a62bd5aba54afbd7b11ba8673"
+    "hash": "83afbb17b435d9bf8b0d0042367da76f26510da1c5781f0ff6e6c518eab621ec"
    }
   }
  },

diff --git a/src/tape/ensemble.py b/src/tape/ensemble.py
@@ -2034,9 +2034,16 @@ def from_dataset(self, dataset, **kwargs):
             band_col=dataset_map["band"],
         )
 
-        return self.from_parquet(
-            source_file=dataset_info["source_file"],
-            object_file=dataset_info["object_file"],
+        # repartition and sort these demo datasets
+        src = dd.read_parquet(dataset_info["source_file"]).repartition(npartitions=4).persist()
+        obj = dd.read_parquet(dataset_info["object_file"]).repartition(npartitions=2).persist()
+
+        src = src.set_index(dataset_map["id"], sort=True)
+        obj = obj.set_index(dataset_map["id"], sort=True)
+
+        return self.from_dask_dataframe(
+            source_frame=src,
+            object_frame=obj,
             column_mapper=col_map,
             **kwargs,
         )