Merge pull request #74 from ocefpaf/close_73
Implements a better return for multiple dataframes and caches the results.
ocefpaf authored Oct 13, 2023
2 parents 59ac88b + 81fdc17 commit 29284a3
Showing 2 changed files with 35 additions and 14 deletions.
36 changes: 24 additions & 12 deletions gliderpy/fetchers.py
@@ -3,6 +3,8 @@
 """
 
 import functools
+from copy import copy
+from typing import Optional
 
 import httpx
@@ -21,6 +23,23 @@
 _server = "https://gliders.ioos.us/erddap"
 
 
+@functools.lru_cache(maxsize=128)
+def _to_pandas_multiple(glider_grab):
+    """Thin wrapper to cache the results when multiple datasets are requested."""
+    df_all = {}
+    glider_grab_copy = copy(glider_grab)
+    for dataset_id in glider_grab_copy.datasets["Dataset ID"]:
+        glider_grab_copy.fetcher.dataset_id = dataset_id
+        df = glider_grab_copy.fetcher.to_pandas(
+            index_col="time (UTC)",
+            parse_dates=True,
+        )
+        dataset_url = glider_grab_copy.fetcher.get_download_url().split("?")[0]
+        df = standardise_df(df, dataset_url)
+        df_all.update({dataset_id: df})
+    return df_all
+
+
 def standardise_df(df, dataset_url):
     """
     Standardise variable names in a dataset and add column for url
@@ -57,25 +76,18 @@ def to_pandas(self):
         """
         Fetches data from the server and reads into a pandas dataframe
-        :return: pandas dataframe with datetime UTC as index
+        :return: pandas dataframe with datetime UTC as index, multiple dataset_ids dataframes are stored in a dictionary
         """
         if self.fetcher.dataset_id:
             df = self.fetcher.to_pandas(
                 index_col="time (UTC)",
                 parse_dates=True,
             )
         elif not self.fetcher.dataset_id and self.datasets is not None:
-            df_all = []
-            for dataset_id in self.datasets["Dataset ID"]:
-                self.fetcher.dataset_id = dataset_id
-                df = self.fetcher.to_pandas(
-                    index_col="time (UTC)",
-                    parse_dates=True,
-                )
-                dataset_url = self.fetcher.get_download_url().split("?")[0]
-                df = standardise_df(df, dataset_url)
-                df_all.append(df)
-            return pd.concat(df_all)
+            df_all = _to_pandas_multiple(self)
+            # We need to reset to avoid fetching a single dataset_id when making multiple requests.
+            self.fetcher.dataset_id = None
+            return df_all
         else:
             raise ValueError(
                 f"Must provide a {self.fetcher.dataset_id} or `query` terms to download data.",
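Two details of the new _to_pandas_multiple wrapper are easy to miss: functools.lru_cache keys the cache on the glider_grab argument itself, so that object must be hashable, and the loop works on a copy so the instance serving as the cache key is never mutated mid-iteration. Here is a minimal sketch of the pattern; the Grabber class and fetch_all function are hypothetical stand-ins, not gliderpy code:

import functools
from copy import copy


class Grabber:
    """Stand-in for GliderDataFetcher; instances hash by identity."""

    def __init__(self, dataset_ids):
        self.dataset_ids = dataset_ids
        self.dataset_id = None


@functools.lru_cache(maxsize=128)
def fetch_all(grabber):
    """Fetch every dataset once; later calls with the same object hit the cache."""
    g = copy(grabber)  # mutate the copy, not the object used as the cache key
    results = {}
    for dataset_id in g.dataset_ids:
        g.dataset_id = dataset_id
        results[dataset_id] = f"data for {dataset_id}"  # pretend network fetch
    return results


g = Grabber(("ds-a", "ds-b"))
assert fetch_all(g) is fetch_all(g)  # the second call is served from the cache

One consequence of identity-based hashing (the default for plain Python objects) is that only repeated calls on the same fetcher hit the cache; a freshly constructed fetcher with identical query terms fetches again.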
13 changes: 11 additions & 2 deletions notebooks/00-quick_intro.ipynb
@@ -114,8 +114,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = glider_grab.to_pandas()\n",
-    "df.head()"
+    "datasets = glider_grab.to_pandas()\n",
+    "datasets.keys()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "datasets[\"ru23-20121025T1944\"].head()"
+   ]
+  },
   {
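For readers following along outside the notebook, the changed cells boil down to the flow below. This is a hedged sketch: the query call and its argument order are assumed from the gliderpy quick-intro notebook, and the dataset ID is taken from the diff above.

from gliderpy.fetchers import GliderDataFetcher

glider_grab = GliderDataFetcher()
# Assumed signature: (min_lat, max_lat, min_lon, max_lon, min_time, max_time).
glider_grab.query(10, 40, -90, 8, "2010-01-01", "2013-06-02")

datasets = glider_grab.to_pandas()  # now a dict: {dataset_id: DataFrame}
print(datasets.keys())

df = datasets["ru23-20121025T1944"]  # one DataFrame per glider
print(df.head())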
