-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
112 lines (99 loc) · 3.04 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import argparse
import os
from datasets import Audio, load_dataset
from pipelines import Downloader, PromptCreator
from pipelines.utils import hash_url
def build_parser():
    """Build the command-line parser for the dataset pipeline.

    Returns:
        argparse.ArgumentParser: parser for the source/output dataset names
        and the shuffle, download, cache and batching options.
    """
    parser = argparse.ArgumentParser(
        description="Process datasets to create prompt and download data"
    )
    parser.add_argument(
        "--huggingface", type=str, help="Huggingface dataset name", required=True
    )
    parser.add_argument(
        "--shuffle",
        action="store_true",
        help="Shuffle the input dataset",
    )
    parser.add_argument(
        "--download", action="store_true", help="Download the dataset"
    )
    parser.add_argument(
        "--use_cache",
        action="store_true",
        help="Use cache for prompt generation",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=1,
        help="Batch size for prompt generation",
    )
    parser.add_argument(
        "--output_dataset",
        type=str,
        help="Output dataset name",
        required=True,
    )
    parser.add_argument(
        "--cache_dir",
        type=str,
        default=".pipelines",
        help="Cache directory",
    )
    parser.add_argument(
        "--max_files",
        type=int,
        default=0,
        help="Maximum number of files to download",
    )
    parser.add_argument(
        "--audio_duration",
        type=int,
        default=60 * 10,
        help="Duration of the audio file",
    )
    return parser


def _cached_audio_path(cache_dir, url):
    """Return the cached ``.mp3`` path for *url*, or ``None`` if it is absent.

    The file name is ``hash_url(url)`` with an ``.mp3`` suffix inside
    *cache_dir*.
    """
    path = os.path.join(cache_dir, f"{hash_url(url)}.mp3")
    return path if os.path.exists(path) else None


def _attach_cached_audio(split_data, cache_dir):
    """Add an ``audio`` column of cached file paths and drop rows without one.

    Args:
        split_data: one split of the loaded dataset; must have a ``url`` column.
        cache_dir: directory that is searched for ``<hash>.mp3`` files.

    Returns:
        The split with a new ``audio`` column, restricted to the rows whose
        audio file exists in *cache_dir*.
    """
    # Read only the "url" column: iterating whole rows would materialise
    # every other column of every row just to look up one field.
    audio_files = [_cached_audio_path(cache_dir, url) for url in split_data["url"]]
    split_data = split_data.add_column("audio", audio_files)
    # delete all rows without audio
    return split_data.filter(lambda row: row["audio"] is not None)


def main(argv=None):
    """Run the pipeline: load, optionally shuffle/download, prompt, and push.

    Args:
        argv: argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    dataset = load_dataset(args.huggingface)

    if args.shuffle:
        for split in dataset:
            # flatten_indices() follows the shuffle for performance
            dataset[split] = dataset[split].shuffle(seed=42).flatten_indices()

    if args.download:
        # Only the constructor is invoked here; presumably it performs the
        # download as a side effect -- confirm against pipelines.Downloader.
        Downloader(
            dataset,
            cache_dir=args.cache_dir,
            max_files=args.max_files,
            audio_duration=args.audio_duration,
        )

    # add audio files to the dataset
    # NOTE(review): this step runs whether or not --download was given, so
    # previously cached files can be reused -- confirm this is intended.
    for split in dataset:
        dataset[split] = _attach_cached_audio(dataset[split], args.cache_dir)

    dataset = PromptCreator(
        dataset,
        use_cache=args.use_cache,
        batch_size=args.batch_size,
        cache_dir=args.cache_dir,
    ).create_prompt()

    # cast audio column
    for split in dataset:
        dataset[split] = dataset[split].cast_column(
            "audio", Audio(mono=False, sampling_rate=44100)
        )

    dataset.push_to_hub(args.output_dataset)


if __name__ == "__main__":
    main()