From 748a1d74573d384a479b2d180693fb276e1a9277 Mon Sep 17 00:00:00 2001 From: Henry Date: Sun, 26 Nov 2023 18:23:09 +0100 Subject: [PATCH] :art: allow custom subselection, add NA if not available - Figure 2: add custom selection of models to aggregate best 5 models of several datasets (custom plotting for paper) - rotate performance label - add NA if model did not run (here: error or not finished within 24h) --- project/01_2_performance_plots.ipynb | 207 +++++++++++++++----- project/01_2_performance_plots.py | 154 ++++++++++++--- project/03_2_best_models_comparison_fig2.py | 18 +- vaep/plotting/__init__.py | 33 +++- 4 files changed, 322 insertions(+), 90 deletions(-) diff --git a/project/01_2_performance_plots.ipynb b/project/01_2_performance_plots.ipynb index 18229bd9e..df3aaacc6 100644 --- a/project/01_2_performance_plots.ipynb +++ b/project/01_2_performance_plots.ipynb @@ -32,6 +32,7 @@ "import random\n", "from pathlib import Path\n", "\n", + "from IPython.display import display\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", @@ -119,6 +120,7 @@ "# Machine parsed metadata from rawfile workflow\n", "fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'\n", "models: str = 'Median,CF,DAE,VAE' # picked models to compare (comma separated)\n", + "sel_models: str = '' # user defined comparison (comma separated)\n", "# Restrict plotting to top N methods for imputation based on error of validation data, maximum 10\n", "plot_to_n: int = 5" ] @@ -184,7 +186,10 @@ "METRIC = 'MAE'\n", "MIN_FREQ = None\n", "MODELS_PASSED = args.models.split(',')\n", - "MODELS = MODELS_PASSED.copy()" + "MODELS = MODELS_PASSED.copy()\n", + "SEL_MODELS = None\n", + "if args.sel_models:\n", + " SEL_MODELS = args.sel_models.split(',')" ] }, { @@ -243,7 +248,7 @@ "id": "ffc6d140-f48e-4477-84f3-47a196e0a3d8", "metadata": {}, "source": [ - "## Across data completeness" + "## data completeness across entire data" ] }, { @@ -258,7 +263,6 @@ "# load frequency of training features...\n", "# needs to be pickle -> index.name needed\n", "freq_feat = vaep.io.datasplits.load_freq(args.data, file='freq_features.json')\n", - "\n", "freq_feat.head() # training data" ] }, @@ -272,7 +276,15 @@ "outputs": [], "source": [ "prop = freq_feat / len(data.train_X.index.levels[0])\n", - "prop.to_frame()" + "prop.sort_values().to_frame().plot()" + ] + }, + { + "cell_type": "markdown", + "id": "19e5adfb", + "metadata": {}, + "source": [ + "View training data in wide format" ] }, { @@ -288,6 +300,14 @@ "data.train_X" ] }, + { + "cell_type": "markdown", + "id": "21102a1d", + "metadata": {}, + "source": [ + "Number of samples and features:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -301,6 +321,14 @@ "print(f\"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}\")" ] }, + { + "cell_type": "markdown", + "id": "61186a4e", + "metadata": {}, + "source": [ + "Collect outputs in excel file:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -312,7 +340,8 @@ "source": [ "fname = args.folder_experiment / '01_2_performance_summary.xlsx'\n", "dumps[fname.stem] = fname\n", - "writer = pd.ExcelWriter(fname)" + "writer = pd.ExcelWriter(fname)\n", + "print(f\"Saving to: {fname}\")" ] }, { @@ -320,7 +349,7 @@ "id": "bbe028c4-190d-4d50-b8a7-d109817d7b98", "metadata": {}, "source": [ - "# Model specifications\n", + "## Model specifications\n", "- used for bar plot annotations" ] }, @@ -365,19 +394,8 @@ "outputs": [], "source": [ "# index name\n", - "freq_feat.index.name 
= data.train_X.columns.name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8088a91f-6aaa-4b9d-b855-332d2bbf5780", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# index name\n", + "freq_feat.index.name = data.train_X.columns.name\n", + "# sample index name\n", "sample_index_name = data.train_X.index.name" ] }, @@ -446,7 +464,7 @@ "lines_to_next_cell": 0 }, "source": [ - "## Select top N for plotting and set colors" + "### Select top N for plotting and set colors" ] }, { @@ -478,18 +496,20 @@ "source": [ "mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS]\n", "mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f')\n", - "mae_stats_ordered_val" + "mae_stats_ordered_val.T" ] }, { "cell_type": "markdown", "id": "f5b33f93", - "metadata": { - "lines_to_next_cell": 0 - }, + "metadata": {}, "source": [ - "Hack color order, by assing CF, DAE and VAE unique colors no matter their order\n", - "Could be extended to all supported imputation methods" + "Some model have fixed colors, others are assigned randomly\n", + "\n", + "> Note\n", + ">\n", + "> 1. The order of \"new\" models is important for the color assignment.\n", + "> 2. User defined model keys for the same model with two configuration will yield different colors." ] }, { @@ -514,12 +534,9 @@ }, "outputs": [], "source": [ - "# For top_N -> define colors\n", "TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n]\n", - "\n", "TOP_N_COLOR_PALETTE = {model: color for model,\n", " color in zip(TOP_N_ORDER, COLORS_TO_USE)}\n", - "\n", "TOP_N_ORDER" ] }, @@ -678,7 +695,7 @@ }, "outputs": [], "source": [ - "errors_val.describe() # mean of means" + "errors_val.describe()[ORDER_MODELS].T # mean of means" ] }, { @@ -692,7 +709,7 @@ "outputs": [], "source": [ "c_avg_error = 2\n", - "mask = (errors_val[MODELS] >= c_avg_error).any(axis=1)\n", + "mask = (errors_val[TOP_N_ORDER] >= c_avg_error).any(axis=1)\n", "errors_val.loc[mask]" ] }, @@ -715,15 +732,16 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(8, 3))\n", - "ax, errors_binned = vaep.plotting.errors.plot_errors_binned(\n", + "ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", " pred_val[\n", " [TARGET_COL] + TOP_N_ORDER\n", " ],\n", + " feat_medians=data.train_X.median(),\n", " ax=ax,\n", " palette=TOP_N_COLOR_PALETTE,\n", " metric_name=METRIC,)\n", "ax.set_ylabel(f\"Average error ({METRIC})\")\n", - "fname = args.out_figures / f'2_{group}_errors_binned_by_int_val.pdf'\n", + "fname = args.out_figures / f'2_{group}_errors_binned_by_feat_median_val.pdf'\n", "figures[fname.stem] = fname\n", "vaep.savefig(ax.get_figure(), name=fname)" ] @@ -845,7 +863,7 @@ "lines_to_next_cell": 0 }, "source": [ - "## Intensity distribution as histogram\n", + "### Intensity distribution as histogram\n", "Plot top 4 models predictions for intensities in test data" ] }, @@ -880,8 +898,8 @@ " ax=ax,\n", " alpha=0.5,\n", " )\n", - " _ = [(l.set_rotation(90))\n", - " for l in ax.get_xticklabels()]\n", + " _ = [(l_.set_rotation(90))\n", + " for l_ in ax.get_xticklabels()]\n", " ax.legend()\n", "\n", "axes[0].set_ylabel('Number of observations')\n", @@ -1217,7 +1235,7 @@ " build_text,\n", " axis=1)\n", "except KeyError:\n", - " logger.warning(\"No model PIMMS models in comparsion. Using empty text\")\n", + " logger.warning(\"No PIMMS models in comparsion. 
Using empty text\")\n", " text = pd.Series('', index=model_configs.columns)\n", "\n", "_to_plot.loc[\"text\"] = text\n", @@ -1235,12 +1253,13 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(4, 2))\n", - "ax = _to_plot.loc[[feature_names.name]].plot.bar(rot=0,\n", - " ylabel=f\"{METRIC} for {feature_names.name} ({n_in_comparison:,} intensities)\",\n", - " # title=f'performance on test data (based on {n_in_comparison:,} measurements)',\n", - " color=COLORS_TO_USE,\n", - " ax=ax,\n", - " width=.8)\n", + "ax = _to_plot.loc[[feature_names.name]].plot.bar(\n", + " rot=0,\n", + " ylabel=f\"{METRIC} for {feature_names.name} ({n_in_comparison:,} intensities)\",\n", + " # title=f'performance on test data (based on {n_in_comparison:,} measurements)',\n", + " color=COLORS_TO_USE,\n", + " ax=ax,\n", + " width=.8)\n", "ax = vaep.plotting.add_height_to_barplot(ax, size=5)\n", "ax = vaep.plotting.add_text_to_barplot(ax, _to_plot.loc[\"text\"], size=5)\n", "ax.set_xticklabels([])\n", @@ -1273,7 +1292,7 @@ "id": "d88c21c7", "metadata": {}, "source": [ - "Plot error by median feature intensity" + "### Plot error by median feature intensity" ] }, { @@ -1306,6 +1325,106 @@ "errors_binned" ] }, + { + "cell_type": "markdown", + "id": "26370a1a", + "metadata": {}, + "source": [ + "### Custom model selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "712faf9a", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "if SEL_MODELS:\n", + " metrics = vaep.models.Metrics()\n", + " test_metrics = metrics.add_metrics(\n", + " pred_test[['observed', *SEL_MODELS]], key='test data')\n", + " test_metrics = pd.DataFrame(test_metrics)[SEL_MODELS]\n", + " test_metrics\n", + "\n", + " n_in_comparison = int(test_metrics.loc['N'].unique()[0])\n", + " n_in_comparison\n", + "\n", + " _to_plot = test_metrics.loc[METRIC].to_frame().T\n", + " _to_plot.index = [feature_names.name]\n", + " _to_plot\n", + "\n", + " try:\n", + " text = model_configs[[\"latent_dim\", \"hidden_layers\"]].apply(\n", + " build_text,\n", + " axis=1)\n", + " except KeyError:\n", + " logger.warning(\"No PIMMS models in comparsion. 
Using empty text\")\n", + " text = pd.Series('', index=model_configs.columns)\n", + "\n", + " _to_plot.loc[\"text\"] = text\n", + " _to_plot = _to_plot.fillna('')\n", + " _to_plot\n", + "\n", + " fig, ax = plt.subplots(figsize=(4, 2))\n", + " ax = _to_plot.loc[[feature_names.name]].plot.bar(\n", + " rot=0,\n", + " ylabel=f\"{METRIC} for {feature_names.name} ({n_in_comparison:,} intensities)\",\n", + " # title=f'performance on test data (based on {n_in_comparison:,} measurements)',\n", + " color=COLORS_TO_USE,\n", + " ax=ax,\n", + " width=.8)\n", + " ax = vaep.plotting.add_height_to_barplot(ax, size=5)\n", + " ax = vaep.plotting.add_text_to_barplot(ax, _to_plot.loc[\"text\"], size=5)\n", + " ax.set_xticklabels([])\n", + " fname = args.out_figures / f'2_{group}_performance_test_sel.pdf'\n", + " figures[fname.stem] = fname\n", + " vaep.savefig(fig, name=fname)\n", + "\n", + " dumps[fname.stem] = fname.with_suffix('.csv')\n", + " _to_plot_long = _to_plot.T\n", + " _to_plot_long = _to_plot_long.rename(\n", + " {feature_names.name: 'metric_value'}, axis=1)\n", + " _to_plot_long['data level'] = feature_names.name\n", + " _to_plot_long = _to_plot_long.set_index('data level', append=True)\n", + " _to_plot_long.to_csv(fname.with_suffix('.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a578570", + "metadata": {}, + "outputs": [], + "source": [ + "# custom selection\n", + "if SEL_MODELS:\n", + " vaep.plotting.make_large_descriptors(6)\n", + " fig, ax = plt.subplots(figsize=(8, 2))\n", + "\n", + " ax, errors_binned = vaep.plotting.errors.plot_errors_by_median(\n", + " pred=pred_test[\n", + " [TARGET_COL] + SEL_MODELS\n", + " ],\n", + " feat_medians=data.train_X.median(),\n", + " ax=ax,\n", + " metric_name=METRIC,\n", + " palette=COLORS_TO_USE\n", + " )\n", + " ax.set_ylim(0, 1.5)\n", + " # for text in ax.legend().get_texts():\n", + " # text.set_fontsize(6)\n", + " fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'\n", + " figures[fname.stem] = fname\n", + " vaep.savefig(ax.get_figure(), name=fname)\n", + " # vaep.plotting.make_large_descriptors(6)\n", + " dumps[fname.stem] = fname.with_suffix('.csv')\n", + " errors_binned.to_csv(fname.with_suffix('.csv'))\n", + " display(errors_binned)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/project/01_2_performance_plots.py b/project/01_2_performance_plots.py index 53221562e..fa8954e20 100644 --- a/project/01_2_performance_plots.py +++ b/project/01_2_performance_plots.py @@ -32,6 +32,7 @@ import random from pathlib import Path +from IPython.display import display import matplotlib.pyplot as plt import numpy as np import pandas as pd @@ -94,6 +95,7 @@ def build_text(s): # Machine parsed metadata from rawfile workflow fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' models: str = 'Median,CF,DAE,VAE' # picked models to compare (comma separated) +sel_models: str = '' # user defined comparison (comma separated) # Restrict plotting to top N methods for imputation based on error of validation data, maximum 10 plot_to_n: int = 5 @@ -119,6 +121,9 @@ def build_text(s): MIN_FREQ = None MODELS_PASSED = args.models.split(',') MODELS = MODELS_PASSED.copy() +SEL_MODELS = None +if args.sel_models: + SEL_MODELS = args.sel_models.split(',') # %% @@ -146,34 +151,43 @@ def build_text(s): vaep.savefig(fig, name=fname) # %% [markdown] -# ## Across data completeness +# ## data completeness across entire data # %% # load frequency of training features... 
# needs to be pickle -> index.name needed
 freq_feat = vaep.io.datasplits.load_freq(args.data, file='freq_features.json')
-
 freq_feat.head()  # training data
 
 # %%
 prop = freq_feat / len(data.train_X.index.levels[0])
-prop.to_frame()
+prop.sort_values().to_frame().plot()
+
+# %% [markdown]
+# View training data in wide format
 
 # %%
 data.to_wide_format()
 data.train_X
 
+# %% [markdown]
+# Number of samples and features:
+
 # %%
 N_SAMPLES, M_FEAT = data.train_X.shape
 print(f"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}")
 
+# %% [markdown]
+# Collect outputs in an Excel file:
+
 # %%
 fname = args.folder_experiment / '01_2_performance_summary.xlsx'
 dumps[fname.stem] = fname
 writer = pd.ExcelWriter(fname)
+print(f"Saving to: {fname}")
 
 # %% [markdown]
-# # Model specifications
+# ## Model specifications
 # - used for bar plot annotations
 
 # %%
@@ -196,9 +210,7 @@ def build_text(s):
 # %%
 # index name
 freq_feat.index.name = data.train_X.columns.name
-
-# %%
-# index name
+# sample index name
 sample_index_name = data.train_X.index.name
 
 # %% [markdown]
@@ -228,7 +240,7 @@ def build_text(s):
 errors_val  # over all samples and all features
 
 # %% [markdown]
-# ## Select top N for plotting and set colors
+# ### Select top N for plotting and set colors
 # %%
 ORDER_MODELS = (errors_val
                 .abs()
@@ -241,22 +253,36 @@ def build_text(s):
 # %%
 mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS]
 mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f')
-mae_stats_ordered_val
+mae_stats_ordered_val.T
 
 # %% [markdown]
-# Hack color order, by assing CF, DAE and VAE unique colors no matter their order
-# Could be extended to all supported imputation methods
+# Some models have fixed colors, others are assigned randomly
+#
+# > Note
+# >
+# > 1. The order of "new" models is important for the color assignment.
+# > 2. User-defined model keys for the same model with two configurations will yield different colors.
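+#
+# A minimal sketch of this behavior ('KNN' and 'KNN2' are hypothetical
+# user-defined keys for the same underlying model, assumed to have no
+# fixed color):
+
+# %%
+# fixed-color models keep their colors regardless of position; the two
+# hypothetical keys for the same model get two distinct colors,
+# assigned in order of appearance
+example_colors = vaep.plotting.defaults.assign_colors(
+    ['CF', 'DAE', 'VAE', 'KNN', 'KNN2'])
+sns.color_palette(example_colors)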
+ # %% COLORS_TO_USE = vaep.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS)) sns.color_palette(COLORS_TO_USE) # %% -# For top_N -> define colors TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n] - TOP_N_COLOR_PALETTE = {model: color for model, color in zip(TOP_N_ORDER, COLORS_TO_USE)} - TOP_N_ORDER # %% [markdown] @@ -336,11 +350,11 @@ def build_text(s): # Some interpolated features are missing # %% -errors_val.describe() # mean of means +errors_val.describe()[ORDER_MODELS].T # mean of means # %% c_avg_error = 2 -mask = (errors_val[MODELS] >= c_avg_error).any(axis=1) +mask = (errors_val[TOP_N_ORDER] >= c_avg_error).any(axis=1) errors_val.loc[mask] @@ -350,15 +364,16 @@ def build_text(s): # %% fig, ax = plt.subplots(figsize=(8, 3)) -ax, errors_binned = vaep.plotting.errors.plot_errors_binned( +ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( pred_val[ [TARGET_COL] + TOP_N_ORDER ], + feat_medians=data.train_X.median(), ax=ax, palette=TOP_N_COLOR_PALETTE, metric_name=METRIC,) ax.set_ylabel(f"Average error ({METRIC})") -fname = args.out_figures / f'2_{group}_errors_binned_by_int_val.pdf' +fname = args.out_figures / f'2_{group}_errors_binned_by_feat_median_val.pdf' figures[fname.stem] = fname vaep.savefig(ax.get_figure(), name=fname) @@ -409,7 +424,7 @@ def build_text(s): writer.close() # %% [markdown] -# ## Intensity distribution as histogram +# ### Intensity distribution as histogram # Plot top 4 models predictions for intensities in test data # %% min_max = vaep.plotting.data.min_max(pred_test[TARGET_COL]) @@ -434,8 +449,8 @@ def build_text(s): ax=ax, alpha=0.5, ) - _ = [(l.set_rotation(90)) - for l in ax.get_xticklabels()] + _ = [(l_.set_rotation(90)) + for l_ in ax.get_xticklabels()] ax.legend() axes[0].set_ylabel('Number of observations') @@ -608,7 +623,7 @@ def highlight_min(s, color, tolerence=0.00001): build_text, axis=1) except KeyError: - logger.warning("No model PIMMS models in comparsion. Using empty text") + logger.warning("No PIMMS models in comparsion. 
Using empty text") text = pd.Series('', index=model_configs.columns) _to_plot.loc["text"] = text @@ -618,12 +633,13 @@ def highlight_min(s, color, tolerence=0.00001): # %% fig, ax = plt.subplots(figsize=(4, 2)) -ax = _to_plot.loc[[feature_names.name]].plot.bar(rot=0, - ylabel=f"{METRIC} for {feature_names.name} ({n_in_comparison:,} intensities)", - # title=f'performance on test data (based on {n_in_comparison:,} measurements)', - color=COLORS_TO_USE, - ax=ax, - width=.8) +ax = _to_plot.loc[[feature_names.name]].plot.bar( + rot=0, + ylabel=f"{METRIC} for {feature_names.name} ({n_in_comparison:,} intensities)", + # title=f'performance on test data (based on {n_in_comparison:,} measurements)', + color=COLORS_TO_USE, + ax=ax, + width=.8) ax = vaep.plotting.add_height_to_barplot(ax, size=5) ax = vaep.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=5) ax.set_xticklabels([]) @@ -642,7 +658,7 @@ def highlight_min(s, color, tolerence=0.00001): # %% [markdown] -# Plot error by median feature intensity +# ### Plot error by median feature intensity # %% fig, ax = plt.subplots(figsize=(8, 2)) @@ -665,6 +681,86 @@ def highlight_min(s, color, tolerence=0.00001): errors_binned.to_csv(fname.with_suffix('.csv')) errors_binned +# %% [markdown] +# ### Custom model selection + +# %% +if SEL_MODELS: + metrics = vaep.models.Metrics() + test_metrics = metrics.add_metrics( + pred_test[['observed', *SEL_MODELS]], key='test data') + test_metrics = pd.DataFrame(test_metrics)[SEL_MODELS] + test_metrics + + n_in_comparison = int(test_metrics.loc['N'].unique()[0]) + n_in_comparison + + _to_plot = test_metrics.loc[METRIC].to_frame().T + _to_plot.index = [feature_names.name] + _to_plot + + try: + text = model_configs[["latent_dim", "hidden_layers"]].apply( + build_text, + axis=1) + except KeyError: + logger.warning("No PIMMS models in comparsion. 
Using empty text") + text = pd.Series('', index=model_configs.columns) + + _to_plot.loc["text"] = text + _to_plot = _to_plot.fillna('') + _to_plot + + fig, ax = plt.subplots(figsize=(4, 2)) + ax = _to_plot.loc[[feature_names.name]].plot.bar( + rot=0, + ylabel=f"{METRIC} for {feature_names.name} ({n_in_comparison:,} intensities)", + # title=f'performance on test data (based on {n_in_comparison:,} measurements)', + color=COLORS_TO_USE, + ax=ax, + width=.8) + ax = vaep.plotting.add_height_to_barplot(ax, size=5) + ax = vaep.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=5) + ax.set_xticklabels([]) + fname = args.out_figures / f'2_{group}_performance_test_sel.pdf' + figures[fname.stem] = fname + vaep.savefig(fig, name=fname) + + dumps[fname.stem] = fname.with_suffix('.csv') + _to_plot_long = _to_plot.T + _to_plot_long = _to_plot_long.rename( + {feature_names.name: 'metric_value'}, axis=1) + _to_plot_long['data level'] = feature_names.name + _to_plot_long = _to_plot_long.set_index('data level', append=True) + _to_plot_long.to_csv(fname.with_suffix('.csv')) + + +# %% +# custom selection +if SEL_MODELS: + vaep.plotting.make_large_descriptors(6) + fig, ax = plt.subplots(figsize=(8, 2)) + + ax, errors_binned = vaep.plotting.errors.plot_errors_by_median( + pred=pred_test[ + [TARGET_COL] + SEL_MODELS + ], + feat_medians=data.train_X.median(), + ax=ax, + metric_name=METRIC, + palette=COLORS_TO_USE + ) + ax.set_ylim(0, 1.5) + # for text in ax.legend().get_texts(): + # text.set_fontsize(6) + fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf' + figures[fname.stem] = fname + vaep.savefig(ax.get_figure(), name=fname) + # vaep.plotting.make_large_descriptors(6) + dumps[fname.stem] = fname.with_suffix('.csv') + errors_binned.to_csv(fname.with_suffix('.csv')) + display(errors_binned) + # %% (errors_binned .set_index( diff --git a/project/03_2_best_models_comparison_fig2.py b/project/03_2_best_models_comparison_fig2.py index 55c5a0d5c..39921344e 100644 --- a/project/03_2_best_models_comparison_fig2.py +++ b/project/03_2_best_models_comparison_fig2.py @@ -36,18 +36,18 @@ FOLDER = Path('runs/mnar_mcar/') SIZE = 'l' files_in = { - 'protein groups': FOLDER / 'pg_l_25MNAR/figures/2_1_performance_test.csv', - 'peptides': FOLDER / 'pep_l_25MNAR/figures/2_1_performance_test.csv', - 'precursors': FOLDER / 'evi_l_25MNAR/figures/2_1_performance_test.csv' + 'protein groups': FOLDER / 'pg_l_25MNAR/figures/2_1_performance_test_sel.csv', + 'peptides': FOLDER / 'pep_l_25MNAR/figures/2_1_performance_test_sel.csv', + 'precursors': FOLDER / 'evi_l_25MNAR/figures/2_1_performance_test_sel.csv' } # %% FOLDER = Path('runs/mnar_mcar/') SIZE = 'm' files_in = { - 'protein groups': FOLDER / 'pg_m_25MNAR/figures/2_1_performance_test.csv', - 'peptides': FOLDER / 'pep_m_25MNAR/figures/2_1_performance_test.csv', - 'precursors': FOLDER / 'evi_m_25MNAR/figures/2_1_performance_test.csv' + 'protein groups': FOLDER / 'pg_m_25MNAR/figures/2_1_performance_test_sel.csv', + 'peptides': FOLDER / 'pep_m_25MNAR/figures/2_1_performance_test_sel.csv', + 'precursors': FOLDER / 'evi_m_25MNAR/figures/2_1_performance_test_sel.csv' } # %% @@ -104,7 +104,9 @@ fontsize=7 )) -ax = vaep.plotting.add_height_to_barplot(ax, size=5) + +ax = vaep.plotting.add_height_to_barplot(ax, size=6, rotated=True) +ax.set_ylim((0, 0.75)) ax.legend(fontsize=5, loc='lower right') text = ( df['text'] @@ -113,7 +115,7 @@ .stack().loc[pd.IndexSlice[ORDER_MODELS, ORDER_DATA]] ) -ax = vaep.plotting.add_text_to_barplot(ax, text, size=5) +ax = 
vaep.plotting.add_text_to_barplot(ax, text, size=6)
 fig = ax.get_figure()
 fig.tight_layout()
 vaep.savefig(fig, fname)
diff --git a/vaep/plotting/__init__.py b/vaep/plotting/__init__.py
index a1fab486a..38e39c8d4 100644
--- a/vaep/plotting/__init__.py
+++ b/vaep/plotting/__init__.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from functools import partial
 import numpy as np
 import pandas as pd
 import matplotlib
@@ -155,30 +156,47 @@ def add_prop_as_second_yaxis(ax: matplotlib.axes.Axes, n_samples: int,
     return ax2
 
 
-def add_height_to_barplot(ax, size=5):
+def add_height_to_barplot(ax, size=5, rotated=False):
+    # rebind ax.annotate with shared defaults via functools.partial;
+    # keyword arguments passed at call time still override these defaults
+    ax.annotate = partial(ax.annotate, text='NA',
+                          xytext=(0, int(size / 2)),
+                          ha='center',
+                          size=size,
+                          textcoords='offset points')
+    ax.annotate = partial(ax.annotate,
+                          rotation=0,
+                          va='center')
+    if rotated:
+        ax.annotate = partial(ax.annotate,
+                              xytext=(1, int(size / 3)),
+                              rotation=90,
+                              va='bottom')
     for bar in ax.patches:
         if not bar.get_height():
+            # no height means the model did not run: annotate 'NA' at the baseline
+            xy = (bar.get_x() + bar.get_width() / 2,
+                  0.0)
+            ax.annotate(text='NA',
+                        xy=xy,
+                        )
             continue
         ax.annotate(text=format(bar.get_height(), '.2f'),
                     xy=(bar.get_x() + bar.get_width() / 2,
                         bar.get_height()),
-                    xytext=(0, int(size / 2)),
-                    ha='center',
-                    va='center',
-                    size=size,
-                    textcoords='offset points')
+                    )
     return ax
 
 
 def add_text_to_barplot(ax, text, size=5):
-    for bar, text in zip(ax.patches, text):
-        logger.debug(f"{bar = }, f{text = }, {bar.get_height() = }")
+    for bar, text_ in zip(ax.patches, text):
+        logger.debug(f"{bar = }, {text_ = }, {bar.get_height() = }")
         if not bar.get_height():
             continue
-        ax.annotate(text=text,
+        ax.annotate(text=text_,
                     xy=(bar.get_x() + bar.get_width() / 2,
                         bar.get_height()),
-                    xytext=(0, -5),
+                    xytext=(1, -5),
                     rotation=90,
                     ha='center',
                     va='top',