
Unable to download voxpopuli in unit 6 [Errno22] #198

Open
Antoine101 opened this issue Nov 27, 2024 · 0 comments

Comments

@Antoine101

Hi folks,

When executing the following cell (from Unit 6 of the Audio Course, subsection "Fine-tune SpeechT5") in my notebook environment:

from datasets import load_dataset

dataset = load_dataset("facebook/voxpopuli", "nl", split="train")
len(dataset)

I get the error message below. The wav file mentioned at the end changes with each execution of the cell.
I didn't have any problems with the other datasets I had to download in the previous units of the course.


OSError Traceback (most recent call last)
Cell In[6], line 1
----> 1 dataset = load_dataset("facebook/voxpopuli", "nl")
2 len(dataset)

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\load.py:2154, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
2151 return builder_instance.as_streaming_dataset(split=split)
2153 # Download and prepare data
-> 2154 builder_instance.download_and_prepare(
2155 download_config=download_config,
2156 download_mode=download_mode,
2157 verification_mode=verification_mode,
2158 num_proc=num_proc,
2159 storage_options=storage_options,
2160 )
2162 # Build dataset for splits
2163 keep_in_memory = (
2164 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
2165 )

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\builder.py:924, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, dl_manager, base_path, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
922 if num_proc is not None:
923 prepare_split_kwargs["num_proc"] = num_proc
--> 924 self._download_and_prepare(
925 dl_manager=dl_manager,
926 verification_mode=verification_mode,
927 **prepare_split_kwargs,
928 **download_and_prepare_kwargs,
929 )
930 # Sync info
931 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\builder.py:1648, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs)
1647 def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
-> 1648 super()._download_and_prepare(
1649 dl_manager,
1650 verification_mode,
1651 check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
1652 or verification_mode == VerificationMode.ALL_CHECKS,
1653 **prepare_splits_kwargs,
1654 )

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\builder.py:978, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
976 split_dict = SplitDict(dataset_name=self.dataset_name)
977 split_generators_kwargs = self._make_split_generators_kwargs(prepare_split_kwargs)
--> 978 split_generators = self._split_generators(dl_manager, **split_generators_kwargs)
980 # Checksums verification
981 if verification_mode == VerificationMode.ALL_CHECKS and dl_manager.record_checksums:

File ~.cache\huggingface\modules\datasets_modules\datasets\facebook--voxpopuli\b5ff837284f0778eefe0f642734e142d8c3f574eba8c9c8a4b13602297f73604\voxpopuli.py:146, in Voxpopuli._split_generators(self, dl_manager)
142 meta_paths = dl_manager.download_and_extract(meta_urls)
143 audio_paths = dl_manager.download(audio_urls)
145 local_extracted_audio_paths = (
--> 146 dl_manager.extract(audio_paths) if not dl_manager.is_streaming else
147 {
148 split: {lang: [None] * len(audio_paths[split][lang]) for lang in self.config.languages} for split in splits
149 }
150 )
151 if self.config.name == "en_accented":
152 return [
153 datasets.SplitGenerator(
154 name=datasets.Split.TEST,
(...)
163 ),
164 ]

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\download\download_manager.py:299, in DownloadManager.extract(self, path_or_paths)
297 download_config.extract_compressed_file = True
298 extract_func = partial(self._download_single, download_config=download_config)
--> 299 extracted_paths = map_nested(
300 extract_func,
301 path_or_paths,
302 num_proc=download_config.num_proc,
303 desc="Extracting data files",
304 )
305 path_or_paths = NestedDataStructure(path_or_paths)
306 extracted_paths = NestedDataStructure(extracted_paths)

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\utils\py_utils.py:512, in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, batched, batch_size, types, disable_tqdm, desc)
509 batch_size = max(len(iterable) // num_proc + int(len(iterable) % num_proc > 0), 1)
510 iterable = list(iter_batched(iterable, batch_size))
511 mapped = [
--> 512 _single_map_nested((function, obj, batched, batch_size, types, None, True, None))
513 for obj in hf_tqdm(iterable, disable=disable_tqdm, desc=desc)
514 ]
515 if batched:
516 mapped = [mapped_item for mapped_batch in mapped for mapped_item in mapped_batch]

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\utils\py_utils.py:396, in _single_map_nested(args)
393 with hf_tqdm(pbar_iterable, disable=disable_tqdm, position=rank, unit="obj", desc=pbar_desc) as pbar:
394 if isinstance(data_struct, dict):
395 return {
--> 396 k: _single_map_nested((function, v, batched, batch_size, types, None, True, None)) for k, v in pbar
397 }
398 else:
399 mapped = [_single_map_nested((function, v, batched, batch_size, types, None, True, None)) for v in pbar]

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\utils\py_utils.py:399, in _single_map_nested(args)
395 return {
396 k: _single_map_nested((function, v, batched, batch_size, types, None, True, None)) for k, v in pbar
397 }
398 else:
--> 399 mapped = [_single_map_nested((function, v, batched, batch_size, types, None, True, None)) for v in pbar]
400 if isinstance(data_struct, list):
401 return mapped

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\utils\py_utils.py:373, in _single_map_nested(args)
371 return function([data_struct])[0]
372 else:
--> 373 return function(data_struct)
374 if (
375 batched
376 and not isinstance(data_struct, dict)
377 and isinstance(data_struct, types)
378 and all(not isinstance(v, (dict, types)) for v in data_struct)
379 ):
380 return [mapped_item for batch in iter_batched(data_struct, batch_size) for mapped_item in function(batch)]

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\download\download_manager.py:229, in DownloadManager._download_single(self, url_or_filename, download_config)
226 if is_relative_path(url_or_filename):
227 # append the relative path to the base_path
228 url_or_filename = url_or_path_join(self._base_path, url_or_filename)
--> 229 out = cached_path(url_or_filename, download_config=download_config)
230 out = tracked_str(out)
231 out.set_origin(url_or_filename)

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\utils\file_utils.py:251, in cached_path(url_or_filename, download_config, **download_kwargs)
248 return output_path
250 # Eager extraction
--> 251 output_path = ExtractManager(cache_dir=download_config.cache_dir).extract(
252 output_path, force_extract=download_config.force_extract
253 )
254 return relative_to_absolute_path(output_path)

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\utils\extract.py:48, in ExtractManager.extract(self, input_path, force_extract)
46 output_path = self._get_output_path(input_path)
47 if self._do_extract(output_path, force_extract):
---> 48 self.extractor.extract(input_path, output_path, extractor_format)
49 return output_path

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\utils\extract.py:332, in Extractor.extract(cls, input_path, output_path, extractor_format)
330 shutil.rmtree(output_path, ignore_errors=True)
331 extractor = cls.extractors[extractor_format]
--> 332 return extractor.extract(input_path, output_path)

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\site-packages\datasets\utils\extract.py:126, in TarExtractor.extract(input_path, output_path)
124 os.makedirs(output_path, exist_ok=True)
125 tar_file = tarfile.open(input_path)
--> 126 tar_file.extractall(output_path, members=TarExtractor.safemembers(tar_file, output_path))
127 tar_file.close()

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\tarfile.py:2302, in TarFile.extractall(self, path, members, numeric_owner, filter)
2297 if tarinfo.isdir():
2298 # For directories, delay setting attributes until later,
2299 # since permissions can interfere with extraction and
2300 # extracting contents can reset mtime.
2301 directories.append(tarinfo)
-> 2302 self._extract_one(tarinfo, path, set_attrs=not tarinfo.isdir(),
2303 numeric_owner=numeric_owner)
2305 # Reverse sort directories.
2306 directories.sort(key=lambda a: a.name, reverse=True)

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\tarfile.py:2369, in TarFile._extract_one(self, tarinfo, path, set_attrs, numeric_owner)
2365 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2366 set_attrs=set_attrs,
2367 numeric_owner=numeric_owner)
2368 except OSError as e:
-> 2369 self._handle_fatal_error(e)
2370 except ExtractError as e:
2371 self._handle_nonfatal_error(e)

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\tarfile.py:2365, in TarFile._extract_one(self, tarinfo, path, set_attrs, numeric_owner)
2362 self._check("r")
2364 try:
-> 2365 self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2366 set_attrs=set_attrs,
2367 numeric_owner=numeric_owner)
2368 except OSError as e:
2369 self._handle_fatal_error(e)

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\tarfile.py:2448, in TarFile._extract_member(self, tarinfo, targetpath, set_attrs, numeric_owner)
2445 self._dbg(1, tarinfo.name)
2447 if tarinfo.isreg():
-> 2448 self.makefile(tarinfo, targetpath)
2449 elif tarinfo.isdir():
2450 self.makedir(tarinfo, targetpath)

File C:\ProgramData\anaconda3\envs\hf_audio_course\Lib\tarfile.py:2494, in TarFile.makefile(self, tarinfo, targetpath)
2492 source.seek(tarinfo.offset_data)
2493 bufsize = self.copybufsize
-> 2494 with bltn_open(targetpath, "wb") as target:
2495 if tarinfo.sparse is not None:
2496 for offset, size in tarinfo.sparse:

OSError: [Errno 22] Invalid argument: 'C:\Users\APU\.cache\huggingface\datasets\downloads\extracted\49afd6ed6a1455c643d97d6bc9dcba821db2c51ee26fe1279355df74a0d10a31\train_part_4\20140116-0900-PLENARY-4-nl_20140116-10:46:51_1.wav'
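
For what it's worth, the path that fails contains colons in the timestamp part of the file name (20140116-10:46:51_1.wav). Colons are not valid in file names on Windows, so when tarfile opens the extraction target for writing it fails with [Errno 22] Invalid argument. A minimal sketch that reproduces the same kind of failure on Windows (the path below is made up, purely for illustration):

import os
import tempfile

# Hypothetical target path; the ':' characters mimic the timestamp in the
# wav file name shown in the traceback above.
bad_path = os.path.join(tempfile.gettempdir(), "20140116-10:46:51_1.wav")

try:
    # This mirrors what tarfile.TarFile.makefile() does during extraction.
    with open(bad_path, "wb") as target:
        target.write(b"")
except OSError as e:
    print(e)  # On Windows: [Errno 22] Invalid argument, as in the traceback

From the traceback, voxpopuli.py only extracts the audio archives when the dataset is not streamed (dl_manager.extract(audio_paths) if not dl_manager.is_streaming else ...), so streaming might be a way to sidestep the extraction step on Windows:

dataset = load_dataset("facebook/voxpopuli", "nl", split="train", streaming=True)

I haven't verified that a streamed dataset works with the rest of the Unit 6 fine-tuning code, though.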
