diff --git a/glidertest/plots.py b/glidertest/plots.py index f53147e..697570e 100644 --- a/glidertest/plots.py +++ b/glidertest/plots.py @@ -431,8 +431,8 @@ def plot_prof_monotony(ds: xr.Dataset, ax: plt.Axes = None, **kw: dict, ) -> tup fig = plt.gcf() ax[0].plot(ds.TIME, ds.PROFILE_NUMBER) - ylabel = ds.PROFILE_NUMBER.attrs.get('long_name', 'Profile Number') - ax[0].set(ylabel=ylabel) + + ax[0].set(ylabel='Profile number') if len(np.where((np.diff(ds.PROFILE_NUMBER) != 0) & (np.diff(ds.PROFILE_NUMBER) != 1))[0]) == 0: ax[1].text(0.2, 0.5, 'Data are monotonically increasing - no issues identified', transform=ax[1].transAxes) @@ -441,7 +441,7 @@ def plot_prof_monotony(ds: xr.Dataset, ax: plt.Axes = None, **kw: dict, ) -> tup ds.DEPTH[np.where((np.diff(ds.PROFILE_NUMBER) != 0) & (np.diff(ds.PROFILE_NUMBER) != 1))], s=10, c='red', label='Depth at which we have issues \n with the profile number assigned') ax[1].legend() - ax[1].set(ylabel='Depth') + ax[1].set(ylabel='Depth (m)') ax[1].invert_yaxis() ax[1].xaxis.set_major_locator(plt.MaxNLocator(8)) utilities._time_axis_formatter(ax[1], ds, format_x_axis=True) @@ -974,7 +974,7 @@ def plot_hysteresis(ds, var='DOXY', v_res=1, perct_err=2, ax=None): df, diff, err, rms = tools.compute_hyst_stat(ds, var=var, v_res=v_res) with plt.style.context(glidertest_style_file): if ax is None: - fig = plt.figure(figsize=(14, 10)) + fig = plt.figure() force_plot = True else: fig = plt.gcf() @@ -1009,3 +1009,79 @@ def plot_hysteresis(ds, var='DOXY', v_res=1, perct_err=2, ax=None): if force_plot: plt.show() return fig, ax + + +def plot_outlier_duration(ds: xr.Dataset, rolling_mean: pd.Series, overtime, std=2, ax=None): + """ + Generates two plots to visualize profile durations and highlight outliers. + This helps identify profiles with abnormal durations by comparing the actual profile durations + to a rolling mean, and by visualizing the shape and depth of the selected outlier profiles. + + Parameters + ---------- + ds : An xarray object containing at least the variables 'TIME', 'DEPTH', and 'PROFILE_NUMBER'. + These are used to compute the profile durations and plot depth profiles. + + rolling_mean : A series representing the rolling mean of the profile durations, + which is used to highlight outliers based on standard deviation. + + overtime : A list of profile numbers identified as having unusual durations. + These profiles are marked on the plot to highlight the outliers. + + std : float, optional, default 2 + The number of standard deviations above and below the rolling mean that will be used to define the range + of "normal" durations. Profiles outside this range are considered outliers. + + ax :The axes object on which to plot the results. If not provided, a new figure with two subplots is created. + + Returns + ------- + fig : The figure containing the generated plots. + 1. A plot showing the profile durations with the rolling mean and the range defined by the rolling mean ± `std` + (standard deviation). The range is highlighted in orange. + 2. A scatter plot of the profile depths, with outlier profiles marked in red. These outliers are determined based + on the duration exceeding the threshold defined by the rolling mean ± `std`. + + ax : A 1x2 array of axes used for the two subplots. + + Original author + ------ + Chiara Monforte + """ + + with plt.style.context(glidertest_style_file): + if ax is None: + fig,ax = plt.subplots(1,2) + force_plot = True + else: + fig = plt.gcf() + force_plot = False + + data = tools.compute_prof_duration(ds) + ax[0].plot(data['profile_num'], data['profile_duration'], label='Profile duration') + ax[0].plot(data['profile_num'], rolling_mean, label='Rolling mean') + ax[0].fill_between(data['profile_num'], rolling_mean - (np.std(rolling_mean) * std), + rolling_mean + (np.std(rolling_mean) * std), color='orange', + edgecolor="orange", alpha=0.5, + label=f'Rolling mean ± {std} std') + ax[0].legend() + ax[0].grid() + ax[0].set(xlabel='Profile number', ylabel='Profile duration (min)') + + ax[1].scatter(ds.TIME, ds.DEPTH, s=0.1) + for i in range(len(overtime)): + profile = ds.TIME.where(ds.PROFILE_NUMBER == overtime[i]).dropna(dim='N_MEASUREMENTS') + ax[1].scatter(profile.TIME, profile.DEPTH, s=0.1, c='red', label='Profiles with odd duration') + ax[1].invert_yaxis() + handles, labels = plt.gca().get_legend_handles_labels() + by_label = dict(zip(labels, handles)) + ax[1].legend(by_label.values(), by_label.keys(), markerscale=8., loc='lower right') + ax[1].set(ylabel='Depth (m)') + ax[1].grid() + every_nth = 2 + for n, label in enumerate(ax[1].xaxis.get_ticklabels()): + if n % every_nth != 0: + label.set_visible(False) + if force_plot: + plt.show() + return fig, ax \ No newline at end of file diff --git a/glidertest/tools.py b/glidertest/tools.py index 5a19aa6..d78c1db 100644 --- a/glidertest/tools.py +++ b/glidertest/tools.py @@ -1,4 +1,5 @@ import numpy as np +import pandas import pandas as pd import xarray as xr from scipy import stats @@ -354,7 +355,7 @@ def quant_hysteresis(ds: xr.Dataset, var='DOXY', v_res=1): ---------------- Chiara Monforte """ - # utilities._check_necessary_variables(ds, ['PROFILE_NUMBER', 'DEPTH', var]) + utilities._check_necessary_variables(ds, ['PROFILE_NUMBER', 'DEPTH', var]) p = 1 # Horizontal resolution z = v_res # Vertical resolution with warnings.catch_warnings(): @@ -366,7 +367,7 @@ def quant_hysteresis(ds: xr.Dataset, var='DOXY', v_res=1): return df -def compute_hyst_stat(ds, var='DOXY', v_res=1): +def compute_hyst_stat(ds: xr.Dataset, var='DOXY', v_res=1): """ This function computes some basic statistics for the differences between climb and dive data @@ -393,4 +394,58 @@ def compute_hyst_stat(ds, var='DOXY', v_res=1): diff = abs(df.dive - df.climb) err = (diff * 100) / np.nanmedian([df.dive, df.climb], axis=0) rms = np.sqrt(np.nanmean(abs(df.dive - df.climb) ** 2)) - return df, diff, err, rms \ No newline at end of file + return df, diff, err, rms + + +def compute_prof_duration(ds:xr.Dataset): + """ + This function computes some basic statistics for the differences between climb and dive data + + Parameters + ---------- + ds: xarray on OG1 format containing at least time and profile_number. Data should not be gridded. + + Returns + ------- + df: pandas dataframe containing the profile number and the duration of that profile in minutes + + Original author + ---------------- + Chiara Monforte + """ + utilities._check_necessary_variables(ds, ['PROFILE_NUMBER', 'TIME']) + deltat = ds.TIME.groupby(ds.PROFILE_NUMBER).max() - ds.TIME.groupby(ds.PROFILE_NUMBER).min() + deltat_min = (pd.to_timedelta(deltat).astype('timedelta64[s]') / 60).astype('int64') + df = pd.DataFrame(data={'profile_num': deltat.PROFILE_NUMBER, 'profile_duration': deltat_min}) + return df + +def find_outlier_duration(df: pd.DataFrame, rolling=20, std=2): + """ + This function computes some basic statistics for the differences between climb and dive data + + Parameters + ---------- + df: pandas dataframe containing the profile number and the duration of that profile in minutes + rolling: window size for the rolling mean + std: number of standard deviations to use for 'odd' profile duration + + Returns + ------- + The function prints a statement in case there are profiles with 'odd' duration, + rolling_mean: Rolling mean of profile duration computed with the set window + overt_prof: Profiles where the duration is above the set rolling mean with added standard deviation + + Original author + ---------------- + Chiara Monforte + """ + + + rolling_mean = df['profile_duration'].rolling(window=rolling, min_periods=1).mean() + overtime = np.where((df['profile_duration'] > rolling_mean + (np.std(rolling_mean) * std)) | ( + df['profile_duration'] < rolling_mean - (np.std(rolling_mean) * std))) + overt_prof = df['profile_num'][overtime[0]].values + if len(overtime[0]) > 0: + print( + f'There are {len(overtime[0])} profiles where the duration differs by {std} standard deviations of the nearby {rolling} profiles have been detected. Further checks are recommended') + return rolling_mean, overt_prof \ No newline at end of file diff --git a/notebooks/demo.ipynb b/notebooks/demo.ipynb index 6c6a37a..ce0b197 100644 --- a/notebooks/demo.ipynb +++ b/notebooks/demo.ipynb @@ -30,6 +30,7 @@ "- Vertical velocity (from a flight model)" ] }, + { "cell_type": "code", "execution_count": null, @@ -127,7 +128,7 @@ "outputs": [], "source": [ "# Basic diagnostics of the gridding in the dataset\n", - "plots.plot_grid_spacing(ds);" + "plots.plot_grid_spacing(ds)" ] }, { @@ -138,7 +139,19 @@ "outputs": [], "source": [ "# Basic diagnostics of the watermass properties\n", - "plots.plot_ts(ds);" + "fig, ax = plots.plot_ts(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9610be35-62f0-4801-9a86-bd7952756679", + "metadata": {}, + "outputs": [], + "source": [ + "# Make is a square instead \n", + "fig.set_size_inches(10,10)\n", + "fig" ] }, { @@ -186,6 +199,27 @@ "fig, ax = plots.plot_basic_vars(ds,v_res=1, start_prof=0, end_prof=int(ds.PROFILE_NUMBER.max()))" ] }, + { + "cell_type": "markdown", + "id": "0c7eadb3-ead8-4ada-8d2f-d7e524b083b6", + "metadata": {}, + "source": [ + "We can check for inconsistent profile duration as well which can highlight possible issues with how the profile number is assigned or weird navigation patterns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f87de529-8226-4897-8ff7-f8c3c5fd22f6", + "metadata": {}, + "outputs": [], + "source": [ + "duration = tools.compute_prof_duration(ds)\n", + "rolling_mean, overtime = tools.find_outlier_duration(duration,rolling=20, std=2)\n", + "\n", + "fig, ax = plots.plot_outlier_duration(ds, rolling_mean, overtime, std=2)" + ] + }, { "cell_type": "markdown", "id": "3c8e018e-389b-410b-8455-d37d6effca7e", diff --git a/tests/test_plots.py b/tests/test_plots.py index 5ce9316..9538700 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -67,8 +67,15 @@ def test_temporal_drift(var='DOXY'): def test_profile_check(): ds = fetchers.load_sample_dataset() tools.check_monotony(ds.PROFILE_NUMBER) - plots.plot_prof_monotony(ds) - + fig, ax = plots.plot_prof_monotony(ds) + assert ax[0].get_ylabel() == 'Profile number' + assert ax[1].get_ylabel() == 'Depth (m)' + duration = tools.compute_prof_duration(ds) + rolling_mean, overtime = tools.find_outlier_duration(duration, rolling=20, std=2) + fig, ax = plots.plot_outlier_duration(ds, rolling_mean, overtime, std = 2) + assert ax[0].get_ylabel() == 'Profile duration (min)' + assert ax[0].get_xlabel() == 'Profile number' + assert ax[1].get_ylabel() == 'Depth (m)' def test_basic_statistics(): ds = fetchers.load_sample_dataset() diff --git a/tests/test_tools.py b/tests/test_tools.py index f49c4f3..a7ac19b 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -28,6 +28,8 @@ def test_check_monotony(): temperature_monotony = tools.check_monotony(ds.TEMP) assert profile_number_monotony assert not temperature_monotony + duration = tools.compute_prof_duration(ds) + rolling_mean, overtime = tools.find_outlier_duration(duration, rolling=20, std=2) def test_vert_vel():