Skip to content

Commit

Permalink
🎨 minor improvements, adapt to high dimensional datasets
Browse files Browse the repository at this point in the history
- added FEATURES_CUTOFF (to restrict it to a reasonable size for quadratic computations as e.g. correlation matrices)
- added titles to note sampling
- use dots to show data better in some (line) plots

Added brief execution hint for notebook
  • Loading branch information
Henry committed Feb 8, 2024
1 parent 48cfd6d commit 18caa3a
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 36 deletions.
105 changes: 71 additions & 34 deletions project/00_5_training_data_exploration.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,18 @@
# -
#
# Does not save filtered data (this is done by the splitting notebook); it only creates visualisations.
#
# Expected current format:
# - wide format (samples x features)
# > not the default output in MS-based proteomics
#
# An example of peptides in wide format would be:
#
# | Sample ID | pep A | pep B | pep C | ... |
# | --- | --- | --- | --- | --- |
# | sample_01 | 0.1 | 0.2 | 0.3 | ... |
# | sample_02 | 0.2 | NA | 0.4 | ... |
# | sample_03 | 0.3 | 0.2 | 0.1 | ... |

# %%
from __future__ import annotations
Expand Down Expand Up @@ -67,20 +79,20 @@ def only_every_x_ticks(ax, x=2, axis=None):
def use_first_n_chars_in_labels(ax, x=2):
"""Take first N characters of labels and use them as new labels"""
# xaxis
_new_labels = [l.get_text()[:x]
for l in ax.get_xticklabels()]
_new_labels = [_l.get_text()[:x]
for _l in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels)
# yaxis
_new_labels = [l.get_text()[:x] for l in ax.get_yticklabels()]
_new_labels = [_l.get_text()[:x] for _l in ax.get_yticklabels()]
_ = ax.set_yticklabels(_new_labels)
return ax


def split_xticklabels(ax, PG_SEPARATOR=';'):
"""Split labels by PG_SEPARATOR and only use first part"""
if PG_SEPARATOR is not None:
_new_labels = [l.get_text().split(PG_SEPARATOR)[0]
for l in ax.get_xticklabels()]
_new_labels = [_l.get_text().split(PG_SEPARATOR)[0]
for _l in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels)
return ax

Expand Down Expand Up @@ -134,6 +146,7 @@ def get_dynamic_range(min_max):
SAMPLE_FIRST_N_CHARS: int = 16 # number of characters used for sample names
# if True, do not use tick on heatmap - only label
NO_TICK_LABELS_ON_HEATMAP: bool = True
FEATURES_CUTOFF: int = 10_000 # cutoff for number of features to plot in clustermaps or heatmaps, randomly selected


# %% [markdown]
Expand All @@ -160,11 +173,12 @@ def get_dynamic_range(min_max):
elif FN_INTENSITIES.suffix == '.csv':
data = pd.read_csv(FN_INTENSITIES, index_col=INDEX_COL, nrows=N_FIRST_ROWS)
data

# %%
if LONG_FORMAT:
data = data.squeeze().unstack()
if LOG_TRANSFORM:
data = np.log2(data).astype(float)
data = np.log2(data + 1).astype(float)


# drop entirely missing rows or columns
Expand Down Expand Up @@ -240,27 +254,27 @@ def get_dynamic_range(min_max):
fig = plotting.data.plot_missing_dist_highdim(data,
min_feat_per_sample=min_feat_per_sample,
min_samples_per_feat=min_samples_per_feat)
fname = FIGUREFOLDER / f'dist_all_lineplot_w_cutoffs.pdf'
fname = FIGUREFOLDER / 'dist_all_lineplot_w_cutoffs.pdf'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname)

# %%
fig = plotting.data.plot_missing_dist_highdim(data)
fname = FIGUREFOLDER / f'dist_all_lineplot_wo_cutoffs.pdf'
fname = FIGUREFOLDER / 'dist_all_lineplot_wo_cutoffs.pdf'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname)

# %%
fig = plotting.data.plot_missing_pattern_histogram(data,
min_feat_per_sample=min_feat_per_sample,
min_samples_per_feat=min_samples_per_feat)
fname = FIGUREFOLDER / f'dist_all_histogram_w_cutoffs.pdf'
fname = FIGUREFOLDER / 'dist_all_histogram_w_cutoffs.pdf'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname)

# %%
fig = plotting.data.plot_missing_pattern_histogram(data)
fname = FIGUREFOLDER / f'dist_all_histogram_wo_cutoffs.pdf'
fname = FIGUREFOLDER / 'dist_all_histogram_wo_cutoffs.pdf'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname)

Expand All @@ -269,7 +283,7 @@ def get_dynamic_range(min_max):

# %%
fig = plotting.data.plot_missing_dist_boxplots(data)
fname = FIGUREFOLDER / f'dist_all_boxplots.pdf'
fname = FIGUREFOLDER / 'dist_all_boxplots.pdf'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname)

Expand All @@ -279,7 +293,7 @@ def get_dynamic_range(min_max):
# %%
fig = plotting.data.plot_missing_pattern_violinplot(
data, min_feat_per_sample, min_samples_per_feat)
fname = FIGUREFOLDER / f'dist_all_violin_plot.pdf'
fname = FIGUREFOLDER / 'dist_all_violin_plot.pdf'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname)

Expand All @@ -306,8 +320,19 @@ def get_dynamic_range(min_max):

# %%
# %%time
corr_lower_triangle = analyzers.corr_lower_triangle(data)
if data.shape[1] > FEATURES_CUTOFF:
selected = data.sample(n=FEATURES_CUTOFF, axis=1, random_state=42)
FEATURES_CUTOFF_TEXT = f'{FEATURES_CUTOFF:,d} randomly selected {COL_INDEX_NAME}'
else:
FEATURES_CUTOFF = data.shape[1]
FEATURES_CUTOFF_TEXT = f'{FEATURES_CUTOFF:,d} {COL_INDEX_NAME}'
selected = data
FEATURES_CUTOFF_TEXT

# %%
corr_lower_triangle = analyzers.corr_lower_triangle(selected)
fig, axes = analyzers.plot_corr_histogram(corr_lower_triangle, bins=40)
fig.suptitle(f'Histogram of correlations based on {FEATURES_CUTOFF_TEXT}')
fname = FIGUREFOLDER / 'corr_histogram_feat.pdf'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname)
Expand All @@ -320,6 +345,7 @@ def get_dynamic_range(min_max):
cv = data.std() / data.mean()
# biological coefficient of variation: standard deviation (variation) w.r.t mean
ax = cv.hist(bins=30)
ax.set_title(f'Histogram of coefficient of variation (CV) of {FEATURES_CUTOFF_TEXT}')
fname = FIGUREFOLDER / 'CV_histogram_features.pdf'
files_out[fname.name] = fname
vaep.savefig(ax.get_figure(), name=fname)
Expand All @@ -331,18 +357,22 @@ def get_dynamic_range(min_max):
# needs to deal with duplicates
# notna = data.notna().T.drop_duplicates().T
# get index and column names
vaep.plotting.make_large_descriptors(8)
cg = sns.clustermap(data.notna(),
vaep.plotting.make_large_descriptors(5)

cg = sns.clustermap(selected.notna(),
cbar_pos=None,
figsize=(8, 8))
ax = cg.ax_heatmap
if PG_SEPARATOR is not None:
_new_labels = [l.get_text().split(PG_SEPARATOR)[0]
for l in ax.get_xticklabels()]
_new_labels = [_l.get_text().split(PG_SEPARATOR)[0]
for _l in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels)
if NO_TICK_LABELS_ON_HEATMAP:
ax.set_xticks([])
ax.set_yticks([])
# cg.fig.suptitle(f'Present-absent pattern of {FEATURES_CUTOFF_TEXT}')
ax.set_title(f'Present-absent pattern of {FEATURES_CUTOFF_TEXT}')
cg.fig.tight_layout()
fname = FIGUREFOLDER / 'clustermap_present_absent_pattern.png'
files_out[fname.name] = fname
vaep.savefig(cg.fig,
Expand All @@ -355,21 +385,23 @@ def get_dynamic_range(min_max):

# %%
assert (len(cg.dendrogram_row.reordered_ind), len(
cg.dendrogram_col.reordered_ind)) == data.shape
cg.dendrogram_col.reordered_ind)) == selected.shape

# %%
vaep.plotting.make_large_descriptors(8)
fig, ax = plt.subplots(figsize=(4, 4))
vaep.plotting.make_large_descriptors(5)
fig, ax = plt.subplots(figsize=(8, 8))
ax = sns.heatmap(
data.iloc[cg.dendrogram_row.reordered_ind,
cg.dendrogram_col.reordered_ind],
selected.iloc[cg.dendrogram_row.reordered_ind,
cg.dendrogram_col.reordered_ind],
ax=ax,
)
ax.set_title(f'Heatmap of intensities clustered by missing pattern of {FEATURES_CUTOFF_TEXT}',
fontsize=8)
only_every_x_ticks(ax, x=2)
use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)
if PG_SEPARATOR is not None:
_new_labels = [l.get_text().split(PG_SEPARATOR)[0]
for l in ax.get_xticklabels()]
_new_labels = [_l.get_text().split(PG_SEPARATOR)[0]
for _l in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels)
if NO_TICK_LABELS_ON_HEATMAP:
ax.set_xticks([])
Expand All @@ -386,18 +418,20 @@ def get_dynamic_range(min_max):
fig, ax = plt.subplots(figsize=(4, 4))
ax = sns.heatmap(
analyzers.corr_lower_triangle(
data.iloc[:, cg.dendrogram_col.reordered_ind]),
selected.iloc[:, cg.dendrogram_col.reordered_ind]),
vmin=-1,
vmax=1,
cbar_kws={'shrink': 0.75},
ax=ax,
square=True,
)
ax.set_title(f'Heatmap of feature correlation of {FEATURES_CUTOFF_TEXT}',
fontsize=8)
_ = only_every_x_ticks(ax, x=2)
_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)
if PG_SEPARATOR is not None:
_new_labels = [l.get_text().split(PG_SEPARATOR)[0]
for l in ax.get_xticklabels()]
_new_labels = [_l.get_text().split(PG_SEPARATOR)[0]
for _l in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels)
if NO_TICK_LABELS_ON_HEATMAP:
ax.set_xticks([])
Expand All @@ -408,7 +442,8 @@ def get_dynamic_range(min_max):

# %%
lower_corr = analyzers.corr_lower_triangle(
data.T.iloc[:, cg.dendrogram_row.reordered_ind])
selected.T.iloc[:, cg.dendrogram_row.reordered_ind])

# %%
fig, ax = plt.subplots(figsize=(4, 4))
ax = sns.heatmap(
Expand All @@ -424,29 +459,31 @@ def get_dynamic_range(min_max):
if NO_TICK_LABELS_ON_HEATMAP:
ax.set_xticks([])
ax.set_yticks([])
ax.set_title(f'Heatmap of sample correlation based on {FEATURES_CUTOFF_TEXT}', fontsize=7)
fname = FIGUREFOLDER / 'heatmap_sample_correlation.png'
files_out[fname.name] = fname
vaep.savefig(fig, name=fname, pdf=False, dpi=600)

# %%
vaep.plotting.make_large_descriptors(12)
vaep.plotting.make_large_descriptors(6)
kwargs = dict()
if NO_TICK_LABELS_ON_HEATMAP:
kwargs['xticklabels'] = False
kwargs['yticklabels'] = False
cg = get_clustermap(data, **kwargs)
cg = get_clustermap(selected, **kwargs)
ax = cg.ax_heatmap
if PG_SEPARATOR is not None:
_new_labels = [l.get_text().split(PG_SEPARATOR)[0]
for l in ax.get_xticklabels()]
_new_labels = [_l.get_text().split(PG_SEPARATOR)[0]
for _l in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels)
_ = only_every_x_ticks(ax, x=2, axis=0)
_ = use_first_n_chars_in_labels(ax, x=SAMPLE_FIRST_N_CHARS)

# ax.set_title(f'Clustermap of intensities based on {FEATURES_CUTOFF_TEXT}', fontsize=7)
# cg.fig.tight_layout() # tight_layout makes the cbar a bit ugly
cg.fig.suptitle(f'Clustermap of intensities based on {FEATURES_CUTOFF_TEXT}', fontsize=7)
fname = FIGUREFOLDER / 'clustermap_intensities_normalized.png'
files_out[fname.name] = fname
cg.fig.savefig(fname, dpi=300) # avoid tight_layout
# tight_layout makes the cbar a bit ugly
# vaep.savefig(cg.fig,
# name=fname,
# pdf=False)
Expand Down
10 changes: 10 additions & 0 deletions project/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,16 @@ misc | misc_sampling_in_pandas.ipynb | How to sample in pandas

# Notebook descriptions (To be completed)

## Inspect dataset

### `00_5_training_data_exploration.py`

Can be executed manually:

```bash
jupytext 00_5_training_data_exploration.py --to ipynb -o - | papermill - runs/example/00_5_training_data_exploration.ipynb -f config/single_dev_dataset/example/inspect_data.yaml
```

## Single experiment run
### `01_0_split_data.ipynb`

Expand Down
4 changes: 2 additions & 2 deletions vaep/plotting/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def plot_missing_dist_highdim(data: pd.DataFrame,
.size()
.sort_index()
.plot
.line(style='-',
.line(style='.',
ax=axes[0])
)
ax.set_ylabel('observations (samples)')
Expand All @@ -146,7 +146,7 @@ def plot_missing_dist_highdim(data: pd.DataFrame,
.size()
.sort_index()
.plot
.line(style='-',
.line(style='.',
ax=axes[1])
)
if min_samples_per_feat is not None:
Expand Down

0 comments on commit 18caa3a

Please sign in to comment.