From ae6f64ab7dbde32af305003a74e2a5edee02d0ee Mon Sep 17 00:00:00 2001 From: Yigitcan Oezer Date: Wed, 20 Mar 2024 11:17:54 +0100 Subject: [PATCH] fix: drum extraction score_informed added (refactored) --- ...um_extraction_kam_nmf_score_informed.ipynb | 478 ++++++++++++++++++ 1 file changed, 478 insertions(+) create mode 100755 demo_drum_extraction_kam_nmf_score_informed.ipynb diff --git a/demo_drum_extraction_kam_nmf_score_informed.ipynb b/demo_drum_extraction_kam_nmf_score_informed.ipynb new file mode 100755 index 0000000..3b92cd6 --- /dev/null +++ b/demo_drum_extraction_kam_nmf_score_informed.ipynb @@ -0,0 +1,478 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Date: Jun 2019 (*Review: March 2024*)\n", + "\n", + "
Programmer: Christian Dittmar, Yiğitcan Özer\n", + "\n", + "This is the demo script that illustrates the main functionalities of the 'NMF toolbox'. For a detailed description we refer to [1,2] (see References below).\n", + "\n", + "#### The notebook proceeds in the following steps:\n", + "
1. It loads an example audio file containing drums and melodic instruments.\n", + "
2. It computes the STFT of the audio data.\n", + "
3. It applies KAM and NMF as described in [2], with score-informed initialization of the components.\n", + "
4. It visualizes the decomposition results.\n", + "
5. It resynthesizes the separated audio streams and saves them as wav files to the hard drive." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import scipy.io.wavfile as wav\n", + "import IPython.display as ipd\n", + "\n", + "from libnmfd.dsp.algorithms import hpss_kam_fitzgerald\n", + "from libnmfd.dsp.filters import alpha_wiener_filter\n", + "from libnmfd.dsp.transforms import forward_stft, inverse_stft, log_freq_log_mag\n", + "from libnmfd.utils import make_monaural, pcm_int16_to_float32np\n", + "from libnmfd.utils.core_utils import percussiveness_estimation, visualize_components_kam, visualize_components_nmf\n", + "from libnmfd.core.nmfconv import conv_model, drum_specific_soft_constraints_nmf, \\\n", + " init_activations, init_templates, nmfd\n", + "\n", + "INPUT_DIR = 'data/'\n", + "OUT_DIR = 'output/'\n", + "\n", + "# create the output directory if it doesn't exist\n", + "if not os.path.isdir(OUT_DIR):\n", + " os.makedirs(OUT_DIR)\n", + "\n", + "filename = 'runningExample_IGotYouMixture.wav'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Load the audio signal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# read signal\n", + "fs, x = wav.read(os.path.join(INPUT_DIR, filename))\n", + "\n", + "# make monaural if necessary\n", + "x = make_monaural(x)\n", + "\n", + "# convert wav from int16 to float32\n", + "x = pcm_int16_to_float32np(x)\n", + "\n", + "# read corresponding transcription files\n", + "melody_transcription = np.loadtxt(os.path.join(INPUT_DIR, 'runningExample_IGotYouMelody.txt'))\n", + "drums_transcription = np.loadtxt(os.path.join(INPUT_DIR, 'runningExample_IGotYouDrums.txt'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Compute STFT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# spectral parameters\n", + "BLOCK_SIZE = 2048\n", + "HOP_SIZE = 512\n", + "\n", + "# STFT computation\n", + "X, A, P = forward_stft(x, block_size=BLOCK_SIZE, hop_size=HOP_SIZE, reconst_mirror=True, append_frames=True)\n", + "\n", + "# get dimensions and time and freq resolutions\n", + "num_bins, num_frames = X.shape\n", + "time_res = HOP_SIZE / fs\n", + "freq_res = fs / BLOCK_SIZE\n", + "\n", + "# get logarithmically-spaced frequency axis version for visualization purposes\n", + "log_freq_log_mag_A, log_freq_axis = log_freq_log_mag(A=A, freq_res=freq_res)\n", + "num_log_bins = len(log_freq_axis)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. 
Apply KAM-based Harmonic Percussive Separation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set common parameters\n", + "num_iter_kam = 30\n", + "kam_A, kern, kern_ord = hpss_kam_fitzgerald(X=A,\n", + " num_iter=num_iter_kam, \n", + " kern_dim=13)\n", + "\n", + "# visualize\n", + "fh1 = visualize_components_kam(kam_A, time_res=time_res, freq_res=freq_res, font_size=14)\n", + "\n", + "# save result\n", + "fh1.savefig(os.path.join(OUT_DIR, 'demoDrumExtractionKAM_NMF_percThreshold_KAM.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "audios = []\n", + "\n", + "# resynthesize KAM results\n", + "for k in range(2):\n", + " Y = kam_A[k] * np.exp(1j * P);\n", + " y, _ = inverse_stft(X=Y, block_size=BLOCK_SIZE, hop_size=HOP_SIZE, reconst_mirror=True,\n", + " append_frames=True, num_samp=len(x))\n", + " audios.append(y)\n", + " # save result\n", + " out_filepath = os.path.join(OUT_DIR,\n", + " 'demoDrumExtractionKAM_NMF_percThreshold_KAM_component_{}_extracted_from_{}'.format(k, filename))\n", + " \n", + " wav.write(filename=out_filepath, rate=fs, data=y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Input audio mixture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ipd.Audio(x, rate=fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Percussive component based on KAM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ipd.Audio(audios[0].T, rate=fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Harmonic component based on KAM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ipd.Audio(audios[1].T, rate=fs)" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# concatenate new NMF target\n", + "V = np.concatenate([kam_A[0], kam_A[1]])\n", + "num_double_bins = V.shape[0]\n", + "\n", + "# prepare matrix to revert concatenation,\n", + "accu_mat = np.concatenate([np.eye(num_bins), np.eye(num_bins)], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Apply score-informed NMF to KAM-based target" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set common parameters\n", + "num_iter_nmf = 30 # in the score-informed case, less iterations are necessary\n", + "num_template_frames = 1 # this can also be adjusted upwards\n", + "\n", + "# generate score-informed templates for the melodic part\n", + "pitched_W = init_templates(num_bins=num_bins,\n", + " num_template_frames=num_template_frames,\n", + " freq_res=freq_res,\n", + " pitches=melody_transcription[:, 1], \n", + " strategy='pitched')\n", + "\n", + "# generate score-informed activations for the melodic part\n", + "pitched_H = init_activations(num_frames=num_frames,\n", + " time_res=time_res,\n", + " pitches=melody_transcription[:, 1],\n", + " onsets=melody_transcription[:, 0],\n", + " durations=melody_transcription[:, 2],\n", + " strategy='pitched')\n", + "\n", + "# generate score-informed activations for the drum part\n", + "drums_H = init_activations(num_frames=num_frames,\n", + " time_res=time_res,\n", + " onsets=drums_transcription[:, 0],\n", + " drums=drums_transcription[:, 1],\n", + " decay=0.75,\n", + " strategy='drums')\n", + "\n", + "\n", + "num_comp_drum = drums_H.shape[0]\n", + "drums_W = init_templates(num_bins=num_bins, strategy='drums')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# join drum and pitched initialization\n", + "init_H = np.concatenate([drums_H, pitched_H], axis=0)\n", + "\n", + "# now 
get the total number of components\n", + "num_comp = init_H.shape[0]\n", + "\n", + "# fill remaining parts of templates with random values\n", + "# create joint templates\n", + "init_W = list()\n", + "informed_weight = 5\n", + "\n", + "# fill missing template parts with noise\n", + "for k in range(num_comp):\n", + " if k < num_comp_drum:\n", + " init_W.append(np.concatenate([informed_weight * drums_W[k]/drums_W[k].max().reshape(-1, 1), \n", + " np.random.rand(num_bins, num_template_frames)], axis=0))\n", + " else:\n", + " init_W.append(np.concatenate([np.random.rand(num_bins, num_template_frames), \n", + " informed_weight * pitched_W[k-num_comp_drum]/pitched_W[k-num_comp_drum].max()],\n", + " axis=0))\n", + "\n", + " # normalize to unit sum\n", + " init_W[k] /= init_W[k].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NMFD core method\n", + "nmfd_W, nmfd_H, _, _, tensor_W = nmfd(V=V, \n", + " num_comp=num_comp, \n", + " num_frames=num_frames, \n", + " num_iter=num_iter_nmf,\n", + " num_template_frames=num_template_frames,\n", + " init_W=init_W,\n", + " init_H=init_H,\n", + " num_bins=num_double_bins,\n", + " # set soft constraint parameters\n", + " func_preprocess=drum_specific_soft_constraints_nmf,\n", + " kern=kern,\n", + " decay=0.75)\n", + "\n", + "\n", + "# set percussiveness to ground truth information\n", + "perc_weight = np.concatenate([np.ones((1, len(drums_W))), \n", + " np.zeros((1, len(pitched_W)))], \n", + " axis=1)\n", + "\n", + "# compute separate models for percussive and harmonic part\n", + "Vp = conv_model(W=tensor_W, H=np.diag(perc_weight[0]) @ nmfd_H)\n", + "Vh = conv_model(W=tensor_W, H=np.diag(1-perc_weight[0]) @ nmfd_H)\n", + " \n", + "# accumulate back to original spectrum, reverting the stacking\n", + "# this step is described in the last paragraph of sec. 
2.4 in [2]\n", + "Ap = accu_mat @ Vp\n", + "Ah = accu_mat @ Vh\n", + "\n", + "# alpha-Wiener filtering\n", + "nmfd_A, _ = alpha_wiener_filter(mixture_X=A, source_A=[Ap, Ah], alpha=1.0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# visualize results\n", + "# create reduced version of templates for visualization\n", + "nmfdW_vis = list()\n", + "for nmfdW_curr in nmfd_W:\n", + " nmfdW_curr = accu_mat @ nmfdW_curr\n", + " nmfdW_vis.append(nmfdW_curr)\n", + "\n", + "fh2, _ = visualize_components_nmf(V=A, \n", + " W=nmfdW_vis, \n", + " H=nmfd_H, \n", + " comp_V=nmfd_A, \n", + " freq_res=freq_res, \n", + " time_res=time_res, \n", + " font_size=14);\n", + "\n", + "# save result\n", + "fh2.savefig(os.path.join(OUT_DIR, 'demoDrumExtractionKAM_NMF_scoreInformed_NMF.png'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "audios = []\n", + "\n", + "# resynthesize results of NMF with soft constraints and score information\n", + "for k in range(2):\n", + " Y = nmfd_A[k] * np.exp(1j * P);\n", + " y, _ = inverse_stft(X=Y,\n", + " block_size=BLOCK_SIZE,\n", + " hop_size=HOP_SIZE,\n", + " reconst_mirror=True,\n", + " append_frames=True,\n", + " num_samp=len(x))\n", + "\n", + " # save result\n", + " out_filepath = os.path.join(OUT_DIR,\n", + " 'demoDrumExtractionKAM_NMF_scoreInformed_NMF_component_{}_extracted_from_{}'.format(k, filename))\n", + " \n", + " wav.write(filename=out_filepath, rate=fs, data=y)\n", + " audios.append(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Input audio mixture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ipd.Audio(x, rate=fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Percussive component based on KAM + NMF + Score-Informed Initialization " + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ipd.Audio(audios[0].T, rate=fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Harmonic component based on KAM + NMF + Score-Informed Initialization " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ipd.Audio(audios[1].T, rate=fs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reference: \n", + "[1] Christian Dittmar, Meinard Müller\n", + "
**Reverse Engineering the Amen Break — Score-Informed Separation and Restoration Applied to Drum Recordings**\n", + "
IEEE/ACM Transactions on Audio, Speech, and Language Processing, 24(9): 1531-1543, 2016.\n", + "
\n", + "[2] Christian Dittmar, Patricio López-Serrano, Meinard Müller\n", + "
**Unifying Local and Global Methods for Harmonic-Percussive Source Separation**\n", + "
In Proceedings of the IEEE International Conference on Acoustics,
Speech, and Signal Processing (ICASSP), 2018.\n", + "\n", + "#### If you use the libnmfd (NMF toolbox) please refer to \n", + "[3] Patricio López-Serrano, Christian Dittmar, Yiğitcan Özer, and Meinard Müller
\n", + "**NMF Toolbox: Music Processing Applications of Nonnegative Matrix Factorization**
\n", + "In Proceedings of the International Conference on Digital Audio Effects (DAFx), 2019." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}