Added method for initializing NMF drum templates based on provided drum samples, some more beautifications
groupmm · Aug 9, 2024 · 9b2502f · 9b2502f
1 parent 810076c
commit 9b2502f
Show file tree

Hide file tree

Showing 3 changed files with 125 additions and 109 deletions.
diff --git a/demo_drum_extraction_kam_nmf_score_informed.ipynb b/demo_drum_extraction_kam_nmf_score_informed.ipynb
@@ -33,17 +33,17 @@
    "source": [
     "import os\n",
     "import numpy as np\n",
-    "import scipy.io.wavfile as wav\n",
+    "import soundfile as sf\n",
     "import IPython.display as ipd\n",
     "\n",
-    "from libnmfd.core.nmfconv import conv_model, drum_specific_soft_constraints_nmf, \\\n",
+    "from libnmfd.core.nmfconv import conv_model, \\\n",
     "    init_activations, init_templates, nmfd\n",
     "from libnmfd.dsp.algorithms import hpss_kam_fitzgerald\n",
     "from libnmfd.dsp.filters import alpha_wiener_filter\n",
     "from libnmfd.dsp.transforms import forward_stft, inverse_stft, log_freq_log_mag\n",
     "from libnmfd.utils import make_monaural, pcm_int16_to_float32np\n",
     "from libnmfd.utils.core_utils import percussiveness_estimation, visualize_components_kam, visualize_components_nmf\n",
-    "\n",
+    "from libnmfd.utils.core_utils import drum_specific_soft_constraints_nmf\n",
     "\n",
     "INPUT_DIR = 'data/'\n",
     "OUT_DIR = 'output/'\n",
@@ -69,14 +69,12 @@
    "outputs": [],
    "source": [
     "# read signal\n",
-    "fs, x = wav.read(os.path.join(INPUT_DIR, filename))\n",
+    "\n",
+    "x, fs = sf.read(file=os.path.join(INPUT_DIR, filename),dtype=np.float32)\n",
     "\n",
     "# make monaural if necessary\n",
     "x = make_monaural(x)\n",
     "\n",
-    "# convert wav from int16 to float32\n",
-    "x = pcm_int16_to_float32np(x)\n",
-    "\n",
     "# read corresponding transcription files\n",
     "melody_transcription = np.loadtxt(os.path.join(INPUT_DIR, 'runningExample_IGotYouMelody.txt'))\n",
     "drums_transcription = np.loadtxt(os.path.join(INPUT_DIR, 'runningExample_IGotYouDrums.txt'))"
@@ -155,8 +153,8 @@
     "    # save result\n",
     "    out_filepath = os.path.join(OUT_DIR,\n",
     "                                'demoDrumExtractionKAM_NMF_percThreshold_KAM_component_{}_extracted_from_{}'.format(k, filename))\n",
-    "    \n",
-    "    wav.write(filename=out_filepath, rate=fs, data=y)"
+    "\n",
+    "    sf.write(file=out_filepath, data=y, samplerate=fs)"
    ]
   },
   {
@@ -263,7 +261,7 @@
     "\n",
     "\n",
     "num_comp_drum = drums_H.shape[0]\n",
-    "drums_W = init_templates(num_bins=num_bins, strategy='drums')"
+    "drums_W = init_templates(num_bins=num_bins, strategy='drums', num_template_frames=num_template_frames)"
    ]
   },
   {
@@ -382,8 +380,9 @@
     "    # save result\n",
     "    out_filepath = os.path.join(OUT_DIR,\n",
     "                                'demoDrumExtractionKAM_NMF_scoreInformed_NMF_component_{}_extracted_from_{}'.format(k, filename))\n",
-    "    \n",
-    "    wav.write(filename=out_filepath, rate=fs, data=y)\n",
+    "\n",
+    "    sf.write(file=out_filepath, data=y, samplerate=fs)\n",
+    "\n",
     "    audios.append(y)"
    ]
   },
@@ -457,7 +456,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -471,9 +470,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.11"
+   "version": "3.8.19"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/demo_drum_sound_separation_nmf.ipynb b/demo_drum_sound_separation_nmf.ipynb
@@ -28,7 +28,7 @@
     "from libnmfd.core.nmfconv import init_activations, init_templates, nmfd\n",
     "from libnmfd.dsp.filters import alpha_wiener_filter\n",
     "from libnmfd.dsp.transforms import forward_stft, inverse_stft\n",
-    "from libnmfd.utils import make_monaural, pcm_int16_to_float32np\n",
+    "from libnmfd.utils import make_monaural #, pcm_int16_to_float32np\n",
     "from libnmfd.utils.core_utils import visualize_components_nmf\n",
     "\n",
     "INPUT_DIR = 'data/'\n",
@@ -61,88 +61,6 @@
     "x = make_monaural(x)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "desired_drum_classes = ['kick', 'snare', 'hihat']\n",
-    "\n",
-    "# set common parameters\n",
-    "num_comp = 3\n",
-    "num_iter = 30\n",
-    "num_template_frames = 8\n",
-    "\n",
-    "# spectral parameters\n",
-    "BLOCK_SIZE = 2048\n",
-    "HOP_SIZE = 512\n",
-    "\n",
-    "counter = 0\n",
-    "init_W_drums = list()\n",
-    "\n",
-    "\n",
-    "\n",
-    "for drum_class in desired_drum_classes:\n",
-    "    # parse all audio files\n",
-    "    \n",
-    "    drum_audio_files = os.listdir(os.path.join(INPUT_DIR, drum_class))\n",
-    "    print(drum_audio_files)\n",
-    "\n",
-    "    drum_class_audios = None\n",
-    "    \n",
-    "    for drum_audio_file in drum_audio_files:\n",
-    "        dx, fs = sf.read(file=os.path.join(INPUT_DIR, drum_class, drum_audio_file),dtype=np.float32)\n",
-    "\n",
-    "        # make monaural if necessary\n",
-    "        dx = make_monaural(dx)\n",
-    "\n",
-    "        # normalize amplitude\n",
-    "        dx = 0.99 * dx / np.max(np.abs(dx))\n",
-    "\n",
-    "        # concatenate all audios for one drum class\n",
-    "        if drum_class_audios is None:\n",
-    "            drum_class_audios = dx.copy()\n",
-    "        else:\n",
-    "            drum_class_audios = np.hstack([drum_class_audios, dx])\n",
-    "\n",
-    "\n",
-    "    \n",
-    "    # STFT computation\n",
-    "    _, A, _ = forward_stft(drum_class_audios, block_size=BLOCK_SIZE, hop_size=HOP_SIZE, reconst_mirror=True, append_frames=True)\n",
-    "\n",
-    "    # get dimensions and time and freq resolutions\n",
-    "    num_bins, num_frames = A.shape\n",
-    "    time_res = HOP_SIZE / fs\n",
-    "    freq_res = fs / BLOCK_SIZE    \n",
-    "\n",
-    "    # generate initial guess for templates\n",
-    "    init_W = init_templates(num_comp=1,\n",
-    "                            num_bins=num_bins,\n",
-    "                            strategy='random')\n",
-    "    \n",
-    "    # generate initial activations\n",
-    "    init_H = init_activations(num_comp=1,\n",
-    "                              num_frames=num_frames,\n",
-    "                              strategy='uniform')    \n",
-    "\n",
-    "    # NMFD core method\n",
-    "    nmfd_W, nmfd_H, nmfd_V, divKL, _ = nmfd(V=A, \n",
-    "                                            num_comp=1, \n",
-    "                                            num_frames=num_frames, \n",
-    "                                            num_iter=num_iter,\n",
-    "                                            num_template_frames=num_template_frames,\n",
-    "                                            init_W=init_W,\n",
-    "                                            init_H=init_H)\n",
-    "\n",
-    "\n",
-    "    init_W_drums.append(np.array(nmfd_W).squeeze(0).copy())\n",
-    "    \n",
-    "\n",
-    "        \n",
-    "    "
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -188,8 +106,11 @@
     "num_template_frames = 8\n",
     "\n",
     "# generate initial guess for templates\n",
-    "init_W = init_W_drums\n",
-    "\n",
+    "# the 'drums' strategy learns the templates from the drum samples shipped in the data folder\n",
+    "init_W = init_templates(num_comp=num_comp,\n",
+    "                        num_bins=num_bins,\n",
+    "                        num_template_frames=num_template_frames,\n",
+    "                        strategy='drums')\n",
     "\n",
     "# generate initial activations\n",
     "init_H = init_activations(num_comp=num_comp,\n",
@@ -310,6 +231,25 @@
    "source": [
     "ipd.Audio(audios[2].T, rate=fs)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Reference: \n",
+    "[1] Christian Dittmar, Meinard Müller\n",
+    "<br>**Reverse Engineering the Amen Break — Score-Informed Separation and Restoration Applied to Drum Recordings**\n",
+    "<br>IEEE/ACM Transactions on Audio, Speech, and Language Processing, 24(9): 1531-1543, 2016.\n",
+    "<br>\n",
+    "[2] Christian Dittmar, Patricio López-Serrano, Meinard Müller\n",
+    "<br>**Unifying Local and Global Methods for Harmonic-Percussive Source Separation**\n",
+    "<br>In Proceedings of the IEEE International Conference on Acoustics,<br>Speech, and Signal Processing (ICASSP), 2018.\n",
+    "\n",
+    "#### If you use libnmfd (NMF Toolbox), please refer to\n",
+    "[3] Patricio López-Serrano, Christian Dittmar, Yiğitcan Özer, and Meinard Müller<br>\n",
+    "**NMF Toolbox: Music Processing Applications of Nonnegative Matrix Factorization**<br>\n",
+    "In Proceedings of the International Conference on Digital Audio Effects (DAFx), 2019."
+   ]
   }
  ],
  "metadata": {
@@ -328,7 +268,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.18"
+   "version": "3.8.19"
   }
  },
  "nbformat": 4,

diff --git a/libnmfd/core/nmfconv.py b/libnmfd/core/nmfconv.py
@@ -1,11 +1,13 @@
+import os
 import numpy as np
+import soundfile as sf
 from tqdm import tnrange
 from typing import List, Tuple, Union
 
 from libnmfd.dsp.filters import nema
-from libnmfd.utils import EPS, load_matlab_dict, midi2freq
-from libnmfd.utils.core_utils import drum_specific_soft_constraints_nmf
-
+from libnmfd.dsp.transforms import forward_stft
+from libnmfd.utils import EPS, midi2freq, make_monaural
+#from libnmfd.utils.core_utils import drum_specific_soft_constraints_nmf
 
 def nmf_conv(V:np.ndarray,
              num_comp: int = 3,
@@ -425,6 +427,83 @@ def shift_operator(A: np.ndarray,
 
     return shifted
 
+def initialize_drum_specific_nmfd_templates(desired_drum_classes: List[str] = None,
+                                            num_iter: int = 30,
+                                            num_template_frames: int = 8,
+                                            block_size: int = 2048,
+                                            hop_size: int = 512,
+                                            fs: int = 44100,
+                                            input_dir: str = 'data/') -> List[np.ndarray]:
+    """Learn one NMFD spectrogram template per drum class from sample recordings.
+
+    The method assumes that a folder with the same name as each desired drum class is present
+    inside ``input_dir``. Each folder should contain single samples of the target drum sound.
+    All samples of a class are made monaural, resampled to ``fs`` if necessary, peak-normalized
+    and concatenated. The spectrogram returned by ``forward_stft`` for that signal is decomposed
+    by ``nmfd`` with a single component, and the learned template is collected.
+
+    Parameters
+    ----------
+    desired_drum_classes: list of str
+        Names of the drum classes, i.e., of the sub-folders of ``input_dir``.
+        Defaults to ['kick', 'snare', 'hihat'].
+    num_iter: int
+        Number of NMFD iterations
+    num_template_frames: int
+        Number of template frames handed to NMFD
+    block_size: int
+        Block size handed to the STFT
+    hop_size: int
+        Hop size handed to the STFT
+    fs: int
+        Target sampling rate. Samples recorded at a different rate are resampled to it.
+    input_dir: str
+        Directory that contains one sub-folder per drum class
+
+    Returns
+    -------
+    init_W_drums: list of np.ndarray
+        One learned template per drum class, in the order of ``desired_drum_classes``
+
+    Raises
+    ------
+    NotADirectoryError
+        If the folder of a drum class does not exist
+    ValueError
+        If the folder of a drum class does not contain any usable (non-silent) audio
+    """
+    # set some default classes in case of empty user input
+    if desired_drum_classes is None:
+        desired_drum_classes = ['kick', 'snare', 'hihat']
+
+    # initialize empty list
+    init_W_drums = list()
+
+    for drum_class in desired_drum_classes:
+        drum_class_dir = os.path.join(input_dir, drum_class)
+
+        # check if folder exists
+        if not os.path.isdir(drum_class_dir):
+            raise NotADirectoryError(f"The specified folder {drum_class_dir} does not exist.")
+
+        # parse all audio files; sorted, because os.listdir returns the entries in arbitrary order
+        drum_audio_files = sorted(os.listdir(drum_class_dir))
+
+        drum_audios = list()
+
+        for drum_audio_file in drum_audio_files:
+            drum_audio_path = os.path.join(drum_class_dir, drum_audio_file)
+
+            # nested folders cannot be decoded as audio
+            if not os.path.isfile(drum_audio_path):
+                continue
+
+            dx, orig_fs = sf.read(file=drum_audio_path, dtype=np.float32)
+
+            # make monaural if necessary
+            dx = make_monaural(dx)
+
+            # resample if necessary
+            if orig_fs != fs:
+                # local import, scipy.signal is only required in this branch
+                from scipy.signal import resample
+
+                # the new length scales with target rate / original rate and has to be an integer
+                dx = resample(dx, int(round(len(dx) * fs / orig_fs)))
+
+            # empty or silent samples carry no information and would cause a division by zero
+            peak = np.max(np.abs(dx)) if dx.size > 0 else 0.0
+            if peak == 0:
+                continue
+
+            # normalize amplitude
+            dx = 0.99 * dx / peak
+
+            drum_audios.append(dx)
+
+        if not drum_audios:
+            raise ValueError(f"No usable audio found in {drum_class_dir}.")
+
+        # concatenate all audios for one drum class (single call instead of growing the array)
+        drum_class_audios = np.hstack(drum_audios)
+
+        # STFT computation
+        _, A, _ = forward_stft(drum_class_audios, block_size=block_size, hop_size=hop_size, reconst_mirror=True, append_frames=True)
+
+        # get dimensions
+        num_bins, num_frames = A.shape
+
+        # generate initial guess for templates
+        init_W = init_templates(num_comp=1,
+                                num_bins=num_bins,
+                                strategy='random')
+
+        # generate initial activations
+        init_H = init_activations(num_comp=1,
+                                  num_frames=num_frames,
+                                  strategy='uniform')
+
+        # NMFD core method, only the learned templates are of interest here
+        nmfd_W, _, _, _, _ = nmfd(V=A,
+                                  num_comp=1,
+                                  num_frames=num_frames,
+                                  num_iter=num_iter,
+                                  num_template_frames=num_template_frames,
+                                  init_W=init_W, init_H=init_H)
+
+        # adjust the dimensions: drop the leading singleton (single component) axis
+        init_W_drums.append(np.array(nmfd_W).squeeze(0).copy())
+
+    return init_W_drums
 
 def init_templates(num_comp: int = None,
                    num_bins: int = None,
@@ -522,12 +601,10 @@ def init_templates(num_comp: int = None,
                 init_W[k][bin_range, :] = 1/(g+1)
 
     elif strategy == 'drums':
-        dict_W = load_matlab_dict('data/dictW.mat', 'dictW')
-
-        if num_bins == dict_W.shape[0]:
-            for k in range(dict_W.shape[1]):
-                init_W.append(dict_W[:, k].reshape(-1, 1) * np.linspace(1, 0.1, num_template_frames))
 
+        # call sub-routine that extracts the NMFD templates for drums
+        init_W = initialize_drum_specific_nmfd_templates(num_template_frames=num_template_frames)
+
         # needs to be overwritten
         num_comp = len(init_W)