-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_preperation.py
105 lines (87 loc) · 3.6 KB
/
data_preperation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import h5_getter
import numpy as np
import os
import pypianoroll
import pandas as pd
import pretty_midi
import matplotlib.pyplot as plt
import librosa
import music21
# Root of the project; every dataset path below is derived from it.
root_dir = '.'
# Folder that holds the dataset files (id list, genre labels, h5 files).
RESULTS_PATH = os.path.join(root_dir, 'dataset')
data_dir = root_dir + '/dataset/lpd_5_cleansed'
music_dataset_lpd_dir = root_dir + '/dataset/lmd_matched_h5'
# cleansed_ids.txt has no header row and two id columns. Splitting on any run
# of whitespace keeps the ids in columns 0 and 1 whether the file separates
# them with one space, several spaces, or a tab.
# NOTE(review): column 0 is presumably the LPD file id (MIDI md5) and column 1
# the MSD track id, as the mapping names below suggest - confirm against the file.
cleansed_ids = pd.read_csv(os.path.join(RESULTS_PATH, 'cleansed_ids.txt'),
                           sep=r'\s+', header=None)
# Lookup tables in both directions between the two id columns.
lpd_to_msd_ids = dict(zip(cleansed_ids[0], cleansed_ids[1]))
msd_to_lpd_ids = dict(zip(cleansed_ids[1], cleansed_ids[0]))
# Utility functions for retrieving paths
def msd_id_to_dirs(msd_id):
    """Build the relative path prefix for an MSD ID.

    The third, fourth and fifth characters of the ID become three nested
    directory levels, followed by the full ID itself.
    E.g. TRABCD12345678 -> A/B/C/TRABCD12345678
    """
    level_one = msd_id[2]
    level_two = msd_id[3]
    level_three = msd_id[4]
    return os.path.join(level_one, level_two, level_three, msd_id)
def msd_id_to_h5(msd_id):
    """Return the path of the .h5 file that corresponds to an MSD ID.

    The file lives under ``RESULTS_PATH/lmd_matched_h5`` in the directory
    prefix produced by ``msd_id_to_dirs``.
    """
    h5_relative_path = msd_id_to_dirs(msd_id) + '.h5'
    return os.path.join(RESULTS_PATH, 'lmd_matched_h5', h5_relative_path)
# Build the path to the midi .npz file in the LPD cleansed folder
def get_midi_npz_path(msd_id, midi_md5):
    """Return the path of a song's .npz file inside ``data_dir``.

    The file is named after ``midi_md5`` and sits in the directory prefix
    derived from ``msd_id``. Nothing is opened or loaded here.
    """
    song_dir = msd_id_to_dirs(msd_id)
    npz_name = midi_md5 + '.npz'
    return os.path.join(data_dir, song_dir, npz_name)
# Build the path to the .mid file under the lmd_matched_h5 dataset folder
def get_midi_path(msd_id, midi_md5):
    """Return the path of a song's .mid file inside ``music_dataset_lpd_dir``.

    The file is named after ``midi_md5`` and sits in the directory prefix
    derived from ``msd_id``. Nothing is opened or loaded here.
    """
    song_dir = msd_id_to_dirs(msd_id)
    midi_name = midi_md5 + '.mid'
    return os.path.join(music_dataset_lpd_dir, song_dir, midi_name)
# Parse the genre annotation file into parallel track-id / genre lists
genre_file_dir = os.path.join('dataset', 'msd_tagtraum_cd1.cls')
ids = []
genres = []
with open(genre_file_dir) as f:
    for raw_line in f:
        # Lines starting with '#' are skipped
        if raw_line[0] == '#':
            continue
        fields = raw_line.strip().split("\t")
        # A line carries a track id followed by one or two genre labels;
        # every label becomes its own (track id, genre) row. Lines with any
        # other number of fields are ignored.
        if len(fields) in (2, 3):
            track_id, *labels = fields
            ids.extend([track_id] * len(labels))
            genres.extend(labels)
genre_df = pd.DataFrame(data={"TrackID": ids, "Genre": genres})
# Select the track ids labelled 'Pop_Rock', then translate the ones that
# have an entry in msd_to_lpd_ids into their LPD ids
is_pop_rock = genre_df['Genre'] == 'Pop_Rock'
pop_ids = genre_df.loc[is_pop_rock, 'TrackID'].tolist()
pop_lpd_ids = [
    msd_to_lpd_ids[track_id]
    for track_id in pop_ids
    if track_id in msd_to_lpd_ids
]
# Extract the piano notes and chords of every pop song as string tokens.
# `notes` ends up with one list of tokens per song (empty when the song has
# no piano part).
notes = []
# Instrument class used to pick out the piano part of each song
piano_class = music21.instrument.Piano
for song_number, lpd_file_name in enumerate(pop_lpd_ids, start=1):
    msd_file_name = lpd_to_msd_ids[lpd_file_name]
    # Load the .npz file and write it out as a .mid file next to it,
    # which is then parsed with music21
    npz_path = get_midi_npz_path(msd_file_name, lpd_file_name)
    multitrack = pypianoroll.load(npz_path)
    new_midi_path = npz_path[:-4] + '.mid'
    pypianoroll.write(new_midi_path, multitrack)
    midi = music21.converter.parse(new_midi_path)
    s2 = music21.instrument.partitionByInstrument(midi)
    # Filter for only the piano part; if several parts are piano, the last
    # one is kept
    piano_part = None
    # NOTE(review): partitionByInstrument can return None when the file has
    # no instrument information (version dependent) - treat that as
    # "no piano part" instead of crashing on iteration
    if s2 is not None:
        for part in s2:
            if isinstance(part.getInstrument(), piano_class):
                piano_part = part
    notes_song = []
    if piano_part:  # Some songs somehow have no piano parts
        for element in piano_part:
            if isinstance(element, music21.note.Note):
                # A single note is stored as its pitch name
                notes_song.append(str(element.pitch))
            elif isinstance(element, music21.chord.Chord):
                # A chord is stored as its normal order (a list of
                # integers) joined with dots
                notes_song.append('.'.join(str(n) for n in element.normalOrder))
    notes.append(notes_song)
    # Progress output: number of songs processed so far
    print(song_number)