Merge pull request #134 from MOchiara/chiara-patch-36
[FEAT] Add function to check for any odd profile duration
MOchiara authored Nov 21, 2024
2 parents 0663544 + 722614b commit 02a726e
Showing 5 changed files with 185 additions and 11 deletions.
84 changes: 80 additions & 4 deletions glidertest/plots.py
@@ -431,8 +431,8 @@ def plot_prof_monotony(ds: xr.Dataset, ax: plt.Axes = None, **kw: dict, ) -> tup
fig = plt.gcf()

ax[0].plot(ds.TIME, ds.PROFILE_NUMBER)
ylabel = ds.PROFILE_NUMBER.attrs.get('long_name', 'Profile Number')
ax[0].set(ylabel=ylabel)

ax[0].set(ylabel='Profile number')
if len(np.where((np.diff(ds.PROFILE_NUMBER) != 0) & (np.diff(ds.PROFILE_NUMBER) != 1))[0]) == 0:
ax[1].text(0.2, 0.5, 'Data are monotonically increasing - no issues identified',
transform=ax[1].transAxes)
@@ -441,7 +441,7 @@ def plot_prof_monotony(ds: xr.Dataset, ax: plt.Axes = None, **kw: dict, ) -> tup
ds.DEPTH[np.where((np.diff(ds.PROFILE_NUMBER) != 0) & (np.diff(ds.PROFILE_NUMBER) != 1))],
s=10, c='red', label='Depth at which we have issues \n with the profile number assigned')
ax[1].legend()
ax[1].set(ylabel='Depth')
ax[1].set(ylabel='Depth (m)')
ax[1].invert_yaxis()
ax[1].xaxis.set_major_locator(plt.MaxNLocator(8))
utilities._time_axis_formatter(ax[1], ds, format_x_axis=True)
@@ -974,7 +974,7 @@ def plot_hysteresis(ds, var='DOXY', v_res=1, perct_err=2, ax=None):
df, diff, err, rms = tools.compute_hyst_stat(ds, var=var, v_res=v_res)
with plt.style.context(glidertest_style_file):
if ax is None:
fig = plt.figure(figsize=(14, 10))
fig = plt.figure()
force_plot = True
else:
fig = plt.gcf()
@@ -1009,3 +1009,79 @@ def plot_hysteresis(ds, var='DOXY', v_res=1, perct_err=2, ax=None):
if force_plot:
plt.show()
return fig, ax


def plot_outlier_duration(ds: xr.Dataset, rolling_mean: pd.Series, overtime, std=2, ax=None):
"""
Generates two plots to visualize profile durations and highlight outliers.
This helps identify profiles with abnormal durations by comparing the actual profile durations
to a rolling mean, and by visualizing the shape and depth of the selected outlier profiles.
Parameters
----------
ds : An xarray object containing at least the variables 'TIME', 'DEPTH', and 'PROFILE_NUMBER'.
These are used to compute the profile durations and plot depth profiles.
rolling_mean : A series representing the rolling mean of the profile durations,
which is used to highlight outliers based on standard deviation.
overtime : A list of profile numbers identified as having unusual durations.
These profiles are marked on the plot to highlight the outliers.
std : float, optional, default 2
The number of standard deviations above and below the rolling mean that will be used to define the range
of "normal" durations. Profiles outside this range are considered outliers.
ax : The axes object on which to plot the results. If not provided, a new figure with two subplots is created.
Returns
-------
fig : The figure containing the generated plots.
1. A plot showing the profile durations with the rolling mean and the range defined by the rolling mean ± `std`
(standard deviation). The range is highlighted in orange.
2. A scatter plot of the profile depths, with outlier profiles marked in red. These outliers are determined based
on the duration exceeding the threshold defined by the rolling mean ± `std`.
ax : A 1x2 array of axes used for the two subplots.
Original author
----------------
Chiara Monforte
"""

with plt.style.context(glidertest_style_file):
if ax is None:
fig,ax = plt.subplots(1,2)
force_plot = True
else:
fig = plt.gcf()
force_plot = False

data = tools.compute_prof_duration(ds)
ax[0].plot(data['profile_num'], data['profile_duration'], label='Profile duration')
ax[0].plot(data['profile_num'], rolling_mean, label='Rolling mean')
ax[0].fill_between(data['profile_num'], rolling_mean - (np.std(rolling_mean) * std),
rolling_mean + (np.std(rolling_mean) * std), color='orange',
edgecolor="orange", alpha=0.5,
label=f'Rolling mean ± {std} std')
ax[0].legend()
ax[0].grid()
ax[0].set(xlabel='Profile number', ylabel='Profile duration (min)')

ax[1].scatter(ds.TIME, ds.DEPTH, s=0.1)
for i in range(len(overtime)):
profile = ds.TIME.where(ds.PROFILE_NUMBER == overtime[i]).dropna(dim='N_MEASUREMENTS')
ax[1].scatter(profile.TIME, profile.DEPTH, s=0.1, c='red', label='Profiles with odd duration')
ax[1].invert_yaxis()
handles, labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(labels, handles))
ax[1].legend(by_label.values(), by_label.keys(), markerscale=8., loc='lower right')
ax[1].set(ylabel='Depth (m)')
ax[1].grid()
every_nth = 2
for n, label in enumerate(ax[1].xaxis.get_ticklabels()):
if n % every_nth != 0:
label.set_visible(False)
if force_plot:
plt.show()
return fig, ax
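
Not part of the diff — a minimal usage sketch of the new plotting helper, assuming the sample dataset from `fetchers.load_sample_dataset()` (the same one the tests use). It shows the intended call chain and that a pre-made 1x2 array of axes can be passed via `ax` instead of letting the function create its own figure.

```python
# Hypothetical usage sketch, not part of this commit.
import matplotlib.pyplot as plt
from glidertest import fetchers, tools, plots

ds = fetchers.load_sample_dataset()            # sample OG1 dataset used in the tests
duration = tools.compute_prof_duration(ds)     # per-profile duration in minutes
rolling_mean, overtime = tools.find_outlier_duration(duration, rolling=20, std=2)

# Supply our own 1x2 axes instead of letting the function create a new figure
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
plots.plot_outlier_duration(ds, rolling_mean, overtime, std=2, ax=ax)
plt.show()
```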
61 changes: 58 additions & 3 deletions glidertest/tools.py
@@ -1,4 +1,5 @@
import numpy as np
import pandas
import pandas as pd
import xarray as xr
from scipy import stats
@@ -354,7 +355,7 @@ def quant_hysteresis(ds: xr.Dataset, var='DOXY', v_res=1):
----------------
Chiara Monforte
"""
# utilities._check_necessary_variables(ds, ['PROFILE_NUMBER', 'DEPTH', var])
utilities._check_necessary_variables(ds, ['PROFILE_NUMBER', 'DEPTH', var])
p = 1 # Horizontal resolution
z = v_res # Vertical resolution
with warnings.catch_warnings():
@@ -366,7 +367,7 @@ def quant_hysteresis(ds: xr.Dataset, var='DOXY', v_res=1):
return df


def compute_hyst_stat(ds, var='DOXY', v_res=1):
def compute_hyst_stat(ds: xr.Dataset, var='DOXY', v_res=1):
"""
This function computes some basic statistics for the differences between climb and dive data
@@ -393,4 +394,58 @@ def compute_hyst_stat(ds, var='DOXY', v_res=1):
diff = abs(df.dive - df.climb)
err = (diff * 100) / np.nanmedian([df.dive, df.climb], axis=0)
rms = np.sqrt(np.nanmean(abs(df.dive - df.climb) ** 2))
return df, diff, err, rms
return df, diff, err, rms


def compute_prof_duration(ds: xr.Dataset):
"""
This function computes the duration of each profile in minutes.
Parameters
----------
ds: xarray Dataset in OG1 format containing at least TIME and PROFILE_NUMBER. Data should not be gridded.
Returns
-------
df: pandas dataframe containing the profile number and the duration of that profile in minutes
Original author
----------------
Chiara Monforte
"""
utilities._check_necessary_variables(ds, ['PROFILE_NUMBER', 'TIME'])
deltat = ds.TIME.groupby(ds.PROFILE_NUMBER).max() - ds.TIME.groupby(ds.PROFILE_NUMBER).min()
deltat_min = (pd.to_timedelta(deltat).astype('timedelta64[s]') / 60).astype('int64')
df = pd.DataFrame(data={'profile_num': deltat.PROFILE_NUMBER, 'profile_duration': deltat_min})
return df
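
Not part of the diff — a toy illustration of what `compute_prof_duration` returns, using a made-up two-profile dataset (`ds_toy` exists only for this demonstration).

```python
# Hypothetical illustration, not part of this commit: a tiny synthetic OG1-like
# dataset with two profiles, to show the DataFrame compute_prof_duration returns.
import pandas as pd
import xarray as xr
from glidertest import tools

times = pd.date_range("2024-01-01", periods=5, freq="10min")
ds_toy = xr.Dataset({
    "TIME": ("N_MEASUREMENTS", times),
    "PROFILE_NUMBER": ("N_MEASUREMENTS", [1, 1, 2, 2, 2]),
})

df = tools.compute_prof_duration(ds_toy)
print(df)
# Expected: profile 1 spans 10 minutes, profile 2 spans 20 minutes
#    profile_num  profile_duration
# 0            1                10
# 1            2                20
```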

def find_outlier_duration(df: pd.DataFrame, rolling=20, std=2):
"""
This function identifies profiles with an unusually long or short duration ('outliers'), based on a rolling mean of the profile durations.
Parameters
----------
df: pandas dataframe containing the profile number and the duration of that profile in minutes
rolling: window size for the rolling mean
std: number of standard deviations from the rolling mean beyond which a profile duration is considered 'odd'
Returns
-------
The function prints a statement if profiles with 'odd' durations are detected.
rolling_mean: Rolling mean of the profile durations computed with the set window
overt_prof: Profile numbers whose duration lies more than the set number of standard deviations above or below the rolling mean
Original author
----------------
Chiara Monforte
"""


rolling_mean = df['profile_duration'].rolling(window=rolling, min_periods=1).mean()
overtime = np.where((df['profile_duration'] > rolling_mean + (np.std(rolling_mean) * std)) | (
df['profile_duration'] < rolling_mean - (np.std(rolling_mean) * std)))
overt_prof = df['profile_num'][overtime[0]].values
if len(overtime[0]) > 0:
print(
f'There are {len(overtime[0])} profiles whose duration differs from the rolling mean of the nearby {rolling} profiles by more than {std} standard deviations. Further checks are recommended')
return rolling_mean, overt_prof
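
Not part of the diff — a standalone sketch with made-up durations showing the criterion `find_outlier_duration` applies: each profile duration is compared against the rolling mean of the surrounding profiles, and anything more than `std` standard deviations (of that rolling mean) above or below it is flagged.

```python
# Hypothetical sketch with made-up durations, not part of this commit.
import pandas as pd
from glidertest import tools

# 20 fake profiles of 30 minutes each, with one 90-minute outlier at profile 10
df = pd.DataFrame({
    "profile_num": range(20),
    "profile_duration": [30] * 10 + [90] + [30] * 9,
})

rolling_mean, overt_prof = tools.find_outlier_duration(df, rolling=5, std=2)
print(overt_prof)
# With these numbers the 90-minute profile is flagged; because the rolling mean
# trails the spike, the few profiles immediately after it are flagged as well.
```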
38 changes: 36 additions & 2 deletions notebooks/demo.ipynb
@@ -30,6 +30,7 @@
"- Vertical velocity (from a flight model)"
]
},

{
"cell_type": "code",
"execution_count": null,
@@ -127,7 +128,7 @@
"outputs": [],
"source": [
"# Basic diagnostics of the gridding in the dataset\n",
"plots.plot_grid_spacing(ds);"
"plots.plot_grid_spacing(ds)"
]
},
{
@@ -138,7 +139,19 @@
"outputs": [],
"source": [
"# Basic diagnostics of the watermass properties\n",
"plots.plot_ts(ds);"
"fig, ax = plots.plot_ts(ds)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9610be35-62f0-4801-9a86-bd7952756679",
"metadata": {},
"outputs": [],
"source": [
"# Make is a square instead \n",
"fig.set_size_inches(10,10)\n",
"fig"
]
},
{
@@ -186,6 +199,27 @@
"fig, ax = plots.plot_basic_vars(ds,v_res=1, start_prof=0, end_prof=int(ds.PROFILE_NUMBER.max()))"
]
},
{
"cell_type": "markdown",
"id": "0c7eadb3-ead8-4ada-8d2f-d7e524b083b6",
"metadata": {},
"source": [
"We can check for inconsistent profile duration as well which can highlight possible issues with how the profile number is assigned or weird navigation patterns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f87de529-8226-4897-8ff7-f8c3c5fd22f6",
"metadata": {},
"outputs": [],
"source": [
"duration = tools.compute_prof_duration(ds)\n",
"rolling_mean, overtime = tools.find_outlier_duration(duration,rolling=20, std=2)\n",
"\n",
"fig, ax = plots.plot_outlier_duration(ds, rolling_mean, overtime, std=2)"
]
},
{
"cell_type": "markdown",
"id": "3c8e018e-389b-410b-8455-d37d6effca7e",
11 changes: 9 additions & 2 deletions tests/test_plots.py
@@ -67,8 +67,15 @@ def test_temporal_drift(var='DOXY'):
def test_profile_check():
ds = fetchers.load_sample_dataset()
tools.check_monotony(ds.PROFILE_NUMBER)
plots.plot_prof_monotony(ds)

fig, ax = plots.plot_prof_monotony(ds)
assert ax[0].get_ylabel() == 'Profile number'
assert ax[1].get_ylabel() == 'Depth (m)'
duration = tools.compute_prof_duration(ds)
rolling_mean, overtime = tools.find_outlier_duration(duration, rolling=20, std=2)
fig, ax = plots.plot_outlier_duration(ds, rolling_mean, overtime, std = 2)
assert ax[0].get_ylabel() == 'Profile duration (min)'
assert ax[0].get_xlabel() == 'Profile number'
assert ax[1].get_ylabel() == 'Depth (m)'

def test_basic_statistics():
ds = fetchers.load_sample_dataset()
2 changes: 2 additions & 0 deletions tests/test_tools.py
@@ -28,6 +28,8 @@ def test_check_monotony():
temperature_monotony = tools.check_monotony(ds.TEMP)
assert profile_number_monotony
assert not temperature_monotony
duration = tools.compute_prof_duration(ds)
rolling_mean, overtime = tools.find_outlier_duration(duration, rolling=20, std=2)


def test_vert_vel():
