Merge pull request #134 from MOchiara/chiara-patch-36
[FEAT] Add function to check for any odd profile duration
MOchiara authored Nov 21, 2024
2 parents 0663544 + 722614b commit 02a726e
Showing 5 changed files with 185 additions and 11 deletions.
84 changes: 80 additions & 4 deletions glidertest/plots.py
@@ -431,8 +431,8 @@ def plot_prof_monotony(ds: xr.Dataset, ax: plt.Axes = None, **kw: dict, ) -> tup
fig = plt.gcf()

ax[0].plot(ds.TIME, ds.PROFILE_NUMBER)
ylabel = ds.PROFILE_NUMBER.attrs.get('long_name', 'Profile Number')
ax[0].set(ylabel=ylabel)

ax[0].set(ylabel='Profile number')
if len(np.where((np.diff(ds.PROFILE_NUMBER) != 0) & (np.diff(ds.PROFILE_NUMBER) != 1))[0]) == 0:
ax[1].text(0.2, 0.5, 'Data are monotonically increasing - no issues identified',
transform=ax[1].transAxes)
@@ -441,7 +441,7 @@ def plot_prof_monotony(ds: xr.Dataset, ax: plt.Axes = None, **kw: dict, ) -> tup
ds.DEPTH[np.where((np.diff(ds.PROFILE_NUMBER) != 0) & (np.diff(ds.PROFILE_NUMBER) != 1))],
s=10, c='red', label='Depth at which we have issues \n with the profile number assigned')
ax[1].legend()
ax[1].set(ylabel='Depth')
ax[1].set(ylabel='Depth (m)')
ax[1].invert_yaxis()
ax[1].xaxis.set_major_locator(plt.MaxNLocator(8))
utilities._time_axis_formatter(ax[1], ds, format_x_axis=True)
@@ -974,7 +974,7 @@ def plot_hysteresis(ds, var='DOXY', v_res=1, perct_err=2, ax=None):
df, diff, err, rms = tools.compute_hyst_stat(ds, var=var, v_res=v_res)
with plt.style.context(glidertest_style_file):
if ax is None:
fig = plt.figure(figsize=(14, 10))
fig = plt.figure()
force_plot = True
else:
fig = plt.gcf()
@@ -1009,3 +1009,79 @@ def plot_hysteresis(ds, var='DOXY', v_res=1, perct_err=2, ax=None):
if force_plot:
plt.show()
return fig, ax


def plot_outlier_duration(ds: xr.Dataset, rolling_mean: pd.Series, overtime, std=2, ax=None):
"""
Generates two plots to visualize profile durations and highlight outliers.
This helps identify profiles with abnormal durations by comparing the actual profile durations
to a rolling mean, and by visualizing the shape and depth of the selected outlier profiles.
Parameters
----------
ds : An xarray object containing at least the variables 'TIME', 'DEPTH', and 'PROFILE_NUMBER'.
These are used to compute the profile durations and plot depth profiles.
rolling_mean : A series representing the rolling mean of the profile durations,
which is used to highlight outliers based on standard deviation.
overtime : A list of profile numbers identified as having unusual durations.
These profiles are marked on the plot to highlight the outliers.
std : float, optional, default 2
The number of standard deviations above and below the rolling mean that will be used to define the range
of "normal" durations. Profiles outside this range are considered outliers.
ax : The axes object on which to plot the results. If not provided, a new figure with two subplots is created.
Returns
-------
fig : The figure containing the generated plots.
1. A plot showing the profile durations with the rolling mean and the range defined by the rolling mean ± `std`
(standard deviation). The range is highlighted in orange.
2. A scatter plot of the profile depths, with outlier profiles marked in red. These outliers are determined based
on the duration exceeding the threshold defined by the rolling mean ± `std`.
ax : A 1x2 array of axes used for the two subplots.
Original author
----------------
Chiara Monforte
"""

with plt.style.context(glidertest_style_file):
if ax is None:
fig,ax = plt.subplots(1,2)
force_plot = True
else:
fig = plt.gcf()
force_plot = False

data = tools.compute_prof_duration(ds)
ax[0].plot(data['profile_num'], data['profile_duration'], label='Profile duration')
ax[0].plot(data['profile_num'], rolling_mean, label='Rolling mean')
ax[0].fill_between(data['profile_num'], rolling_mean - (np.std(rolling_mean) * std),
rolling_mean + (np.std(rolling_mean) * std), color='orange',
edgecolor="orange", alpha=0.5,
label=f'Rolling mean ± {std} std')
ax[0].legend()
ax[0].grid()
ax[0].set(xlabel='Profile number', ylabel='Profile duration (min)')

ax[1].scatter(ds.TIME, ds.DEPTH, s=0.1)
for i in range(len(overtime)):
profile = ds.TIME.where(ds.PROFILE_NUMBER == overtime[i]).dropna(dim='N_MEASUREMENTS')
ax[1].scatter(profile.TIME, profile.DEPTH, s=0.1, c='red', label='Profiles with odd duration')
ax[1].invert_yaxis()
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
ax[1].legend(by_label.values(), by_label.keys(), markerscale=8., loc='lower right')
ax[1].set(ylabel='Depth (m)')
ax[1].grid()
every_nth = 2
for n, label in enumerate(ax[1].xaxis.get_ticklabels()):
if n % every_nth != 0:
label.set_visible(False)
if force_plot:
plt.show()
return fig, ax
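
Not part of the diff — a minimal usage sketch of the new plotting helper, assuming the sample dataset from `fetchers.load_sample_dataset()` (the same one the tests use). It shows the intended call chain and that a pre-made 1x2 array of axes can be passed via `ax` instead of letting the function create its own figure.

```python
# Hypothetical usage sketch, not part of this commit.
import matplotlib.pyplot as plt
from glidertest import fetchers, tools, plots

ds = fetchers.load_sample_dataset()            # sample OG1 dataset used in the tests
duration = tools.compute_prof_duration(ds)     # per-profile duration in minutes
rolling_mean, overtime = tools.find_outlier_duration(duration, rolling=20, std=2)

# Supply our own 1x2 axes instead of letting the function create a new figure
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
plots.plot_outlier_duration(ds, rolling_mean, overtime, std=2, ax=ax)
plt.show()
```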
61 changes: 58 additions & 3 deletions glidertest/tools.py
@@ -1,4 +1,5 @@
import numpy as np
import pandas
import pandas as pd
import xarray as xr
from scipy import stats
@@ -354,7 +355,7 @@ def quant_hysteresis(ds: xr.Dataset, var='DOXY', v_res=1):
----------------
Chiara Monforte
"""
# utilities._check_necessary_variables(ds, ['PROFILE_NUMBER', 'DEPTH', var])
utilities._check_necessary_variables(ds, ['PROFILE_NUMBER', 'DEPTH', var])
p = 1 # Horizontal resolution
z = v_res # Vertical resolution
with warnings.catch_warnings():
@@ -366,7 +367,7 @@ def quant_hysteresis(ds: xr.Dataset, var='DOXY', v_res=1):
return df


def compute_hyst_stat(ds, var='DOXY', v_res=1):
def compute_hyst_stat(ds: xr.Dataset, var='DOXY', v_res=1):
"""
This function computes some basic statistics for the differences between climb and dive data
@@ -393,4 +394,58 @@ def compute_hyst_stat(ds, var='DOXY', v_res=1):
diff = abs(df.dive - df.climb)
err = (diff * 100) / np.nanmedian([df.dive, df.climb], axis=0)
rms = np.sqrt(np.nanmean(abs(df.dive - df.climb) ** 2))
return df, diff, err, rms
return df, diff, err, rms


def compute_prof_duration(ds: xr.Dataset):
"""
This function computes the duration of each profile in minutes.
Parameters
----------
ds: xarray Dataset in OG1 format containing at least TIME and PROFILE_NUMBER. Data should not be gridded.
Returns
-------
df: pandas dataframe containing the profile number and the duration of that profile in minutes
Original author
----------------
Chiara Monforte
"""
utilities._check_necessary_variables(ds, ['PROFILE_NUMBER', 'TIME'])
deltat = ds.TIME.groupby(ds.PROFILE_NUMBER).max() - ds.TIME.groupby(ds.PROFILE_NUMBER).min()
deltat_min = (pd.to_timedelta(deltat).astype('timedelta64[s]') / 60).astype('int64')
df = pd.DataFrame(data={'profile_num': deltat.PROFILE_NUMBER, 'profile_duration': deltat_min})
return df
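
Not part of the diff — a toy illustration of what `compute_prof_duration` returns, using a made-up two-profile dataset (`ds_toy` exists only for this demonstration).

```python
# Hypothetical illustration, not part of this commit: a tiny synthetic OG1-like
# dataset with two profiles, to show the DataFrame compute_prof_duration returns.
import pandas as pd
import xarray as xr
from glidertest import tools

times = pd.date_range("2024-01-01", periods=5, freq="10min")
ds_toy = xr.Dataset({
    "TIME": ("N_MEASUREMENTS", times),
    "PROFILE_NUMBER": ("N_MEASUREMENTS", [1, 1, 2, 2, 2]),
})

df = tools.compute_prof_duration(ds_toy)
print(df)
# Expected: profile 1 spans 10 minutes, profile 2 spans 20 minutes
#    profile_num  profile_duration
# 0            1                10
# 1            2                20
```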

def find_outlier_duration(df: pd.DataFrame, rolling=20, std=2):
"""
This function identifies profiles with an unusually long or short duration ('outliers'), based on a rolling mean of the profile durations.
Parameters
----------
df: pandas dataframe containing the profile number and the duration of that profile in minutes
rolling: window size for the rolling mean
std: number of standard deviations from the rolling mean beyond which a profile duration is considered 'odd'
Returns
-------
The function prints a statement if profiles with 'odd' durations are detected.
rolling_mean: Rolling mean of the profile durations computed with the set window
overt_prof: Profile numbers whose duration lies more than the set number of standard deviations above or below the rolling mean
Original author
----------------
Chiara Monforte
"""


rolling_mean = df['profile_duration'].rolling(window=rolling, min_periods=1).mean()
overtime = np.where((df['profile_duration'] > rolling_mean + (np.std(rolling_mean) * std)) | (
df['profile_duration'] < rolling_mean - (np.std(rolling_mean) * std)))
overt_prof = df['profile_num'][overtime[0]].values
if len(overtime[0]) > 0:
print(
f'There are {len(overtime[0])} profiles whose duration differs from the rolling mean of the nearby {rolling} profiles by more than {std} standard deviations. Further checks are recommended')
return rolling_mean, overt_prof
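
Not part of the diff — a standalone sketch with made-up durations showing the criterion `find_outlier_duration` applies: each profile duration is compared against the rolling mean of the surrounding profiles, and anything more than `std` standard deviations (of that rolling mean) above or below it is flagged.

```python
# Hypothetical sketch with made-up durations, not part of this commit.
import pandas as pd
from glidertest import tools

# 20 fake profiles of 30 minutes each, with one 90-minute outlier at profile 10
df = pd.DataFrame({
    "profile_num": range(20),
    "profile_duration": [30] * 10 + [90] + [30] * 9,
})

rolling_mean, overt_prof = tools.find_outlier_duration(df, rolling=5, std=2)
print(overt_prof)
# With these numbers the 90-minute profile is flagged; because the rolling mean
# trails the spike, the few profiles immediately after it are flagged as well.
```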
38 changes: 36 additions & 2 deletions notebooks/demo.ipynb
@@ -30,6 +30,7 @@
"- Vertical velocity (from a flight model)"
]
},

{
"cell_type": "code",
"execution_count": null,
@@ -127,7 +128,7 @@
"outputs": [],
"source": [
"# Basic diagnostics of the gridding in the dataset\n",
"plots.plot_grid_spacing(ds);"
"plots.plot_grid_spacing(ds)"
]
},
{
@@ -138,7 +139,19 @@
"outputs": [],
"source": [
"# Basic diagnostics of the watermass properties\n",
"plots.plot_ts(ds);"
"fig, ax = plots.plot_ts(ds)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9610be35-62f0-4801-9a86-bd7952756679",
"metadata": {},
"outputs": [],
"source": [
"# Make is a square instead \n",
"fig.set_size_inches(10,10)\n",
"fig"
]
},
{
@@ -186,6 +199,27 @@
"fig, ax = plots.plot_basic_vars(ds,v_res=1, start_prof=0, end_prof=int(ds.PROFILE_NUMBER.max()))"
]
},
{
"cell_type": "markdown",
"id": "0c7eadb3-ead8-4ada-8d2f-d7e524b083b6",
"metadata": {},
"source": [
"We can check for inconsistent profile duration as well which can highlight possible issues with how the profile number is assigned or weird navigation patterns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f87de529-8226-4897-8ff7-f8c3c5fd22f6",
"metadata": {},
"outputs": [],
"source": [
"duration = tools.compute_prof_duration(ds)\n",
"rolling_mean, overtime = tools.find_outlier_duration(duration,rolling=20, std=2)\n",
"\n",
"fig, ax = plots.plot_outlier_duration(ds, rolling_mean, overtime, std=2)"
]
},
{
"cell_type": "markdown",
"id": "3c8e018e-389b-410b-8455-d37d6effca7e",
11 changes: 9 additions & 2 deletions tests/test_plots.py
@@ -67,8 +67,15 @@ def test_temporal_drift(var='DOXY'):
def test_profile_check():
ds = fetchers.load_sample_dataset()
tools.check_monotony(ds.PROFILE_NUMBER)
plots.plot_prof_monotony(ds)

fig, ax = plots.plot_prof_monotony(ds)
assert ax[0].get_ylabel() == 'Profile number'
assert ax[1].get_ylabel() == 'Depth (m)'
duration = tools.compute_prof_duration(ds)
rolling_mean, overtime = tools.find_outlier_duration(duration, rolling=20, std=2)
fig, ax = plots.plot_outlier_duration(ds, rolling_mean, overtime, std = 2)
assert ax[0].get_ylabel() == 'Profile duration (min)'
assert ax[0].get_xlabel() == 'Profile number'
assert ax[1].get_ylabel() == 'Depth (m)'

def test_basic_statistics():
ds = fetchers.load_sample_dataset()
2 changes: 2 additions & 0 deletions tests/test_tools.py
@@ -28,6 +28,8 @@ def test_check_monotony():
temperature_monotony = tools.check_monotony(ds.TEMP)
assert profile_number_monotony
assert not temperature_monotony
duration = tools.compute_prof_duration(ds)
rolling_mean, overtime = tools.find_outlier_duration(duration, rolling=20, std=2)


def test_vert_vel():
