From 1f9ba34fa3d05f587aa76b18f3a474df146bf1b0 Mon Sep 17 00:00:00 2001
From: Andrei Fajardo <92402603+nerdai@users.noreply.github.com>
Date: Sat, 2 Dec 2023 15:13:28 -0500
Subject: [PATCH] Move download datasets logic out of download_utils.py (#9253)

* add modifications to allow for pdf download
* lint
* lint
* separate content vars
* move download datasets into separate sub module
* wip
* update commandline
* move util functions to utils
* use renamed module
* refactor to use new module name
* refactor to use new module name
* cr
* lint
* add missing import
* add missing import
* add missing import; add back download_llama_datasets
* fix command line
* get metadata from hub
* fix command line
* wip

---
 llama_index/command_line/command_line.py      |  21 +-
 llama_index/download/dataset.py               | 228 +++++++++++++++++
 .../download/{download_utils.py => module.py} | 237 +++++------------
 llama_index/download/utils.py                 |  88 +++++++
 llama_index/llama_dataset/download.py         |  48 ++--
 llama_index/llama_pack/download.py            |   2 +-
 llama_index/readers/download.py               |   2 +-
 llama_index/tools/download.py                 |   2 +-
 8 files changed, 433 insertions(+), 195 deletions(-)
 create mode 100644 llama_index/download/dataset.py
 rename llama_index/download/{download_utils.py => module.py} (50%)
 create mode 100644 llama_index/download/utils.py

diff --git a/llama_index/command_line/command_line.py b/llama_index/command_line/command_line.py
index 5d4810d960909..be3e704b6e517 100644
--- a/llama_index/command_line/command_line.py
+++ b/llama_index/command_line/command_line.py
@@ -2,7 +2,8 @@
 from typing import Any, Optional
 
 from llama_index.llama_dataset.download import (
-    LLAMA_DATASETS_URL,
+    LLAMA_DATASETS_LFS_URL,
+    LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
     download_llama_dataset,
 )
 from llama_index.llama_pack.download import LLAMA_HUB_URL, download_llama_pack
@@ -29,7 +30,8 @@ def handle_download_llama_dataset(
     llama_dataset_class: Optional[str] = None,
     download_dir: Optional[str] = None,
     llama_hub_url: str = LLAMA_HUB_URL,
-    llama_datasets_url: str = LLAMA_DATASETS_URL,
+    llama_datasets_lfs_url: str = LLAMA_DATASETS_LFS_URL,
+    llama_datasets_source_files_tree_url: str = LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
     **kwargs: Any,
 ) -> None:
     assert llama_dataset_class is not None
@@ -39,10 +41,11 @@
         llama_dataset_class=llama_dataset_class,
         download_dir=download_dir,
         llama_hub_url=llama_hub_url,
-        llama_datasets_url=llama_datasets_url,
+        llama_datasets_lfs_url=llama_datasets_lfs_url,
+        llama_datasets_source_files_tree_url=llama_datasets_source_files_tree_url,
     )
 
-    print(f"Successfully downloaded {llama_datasets_url} to {download_dir}")
+    print(f"Successfully downloaded {llama_dataset_class} to {download_dir}")
 
 
 def main() -> None:
@@ -106,9 +109,15 @@ def main() -> None:
         help="URL to llama hub.",
     )
     llamadataset_parser.add_argument(
-        "--llama-dataset-url",
+        "--llama-datasets-lfs-url",
         type=str,
-        default=LLAMA_DATASETS_URL,
+        default=LLAMA_DATASETS_LFS_URL,
         help="URL to llama datasets.",
     )
+    llamadataset_parser.add_argument(
+        "--llama-datasets-source-files-tree-url",
+        type=str,
+        default=LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
+        help="URL to llama datasets source files tree.",
+    )
     llamadataset_parser.set_defaults(
diff --git a/llama_index/download/dataset.py b/llama_index/download/dataset.py
new file mode 100644
index 0000000000000..358618f476e80
--- /dev/null
+++ b/llama_index/download/dataset.py
@@ -0,0 +1,228 @@
+"""Download."""
+
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+import requests
+import tqdm
+
+from llama_index.download.module import LLAMA_HUB_URL
+from llama_index.download.utils import (
+    get_file_content,
+    get_file_content_bytes,
+    initialize_directory,
+)
+
+LLAMA_DATASETS_LFS_URL = (
+    "https://media.githubusercontent.com/media/run-llama/llama_datasets/main"
+)
+
+LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL = (
+    "https://github.com/run-llama/llama_datasets/tree/main"
+)
+LLAMA_RAG_DATASET_FILENAME = "rag_dataset.json"
+LLAMA_SOURCE_FILES_PATH = "source_files"
+
+
+PATH_TYPE = Union[str, Path]
+
+
+def _get_source_files_list(source_tree_url: str, path: str) -> List[str]:
+    """Get the list of source files to download."""
+    resp = requests.get(source_tree_url + path + "?recursive=1")
+    payload = resp.json()["payload"]
+    return [item["name"] for item in payload["tree"]["items"]]
+
+
+def get_dataset_info(
+    local_dir_path: PATH_TYPE,
+    remote_dir_path: PATH_TYPE,
+    remote_source_dir_path: PATH_TYPE,
+    dataset_class: str,
+    refresh_cache: bool = False,
+    library_path: str = "library.json",
+    source_files_path: str = "source_files",
+    disable_library_cache: bool = False,
+) -> Dict:
+    """Get dataset info."""
+    if isinstance(local_dir_path, str):
+        local_dir_path = Path(local_dir_path)
+
+    local_library_path = f"{local_dir_path}/{library_path}"
+    dataset_id = None
+    source_files = []
+
+    # Check cache first
+    if not refresh_cache and os.path.exists(local_library_path):
+        with open(local_library_path) as f:
+            library = json.load(f)
+        if dataset_class in library:
+            dataset_id = library[dataset_class]["id"]
+            source_files = library[dataset_class].get("source_files", [])
+
+    # Fetch up-to-date library from remote repo if dataset_id not found
+    if dataset_id is None:
+        library_raw_content, _ = get_file_content(
+            str(remote_dir_path), f"/{library_path}"
+        )
+        library = json.loads(library_raw_content)
+        if dataset_class not in library:
+            raise ValueError("Loader class name not found in library")
+
+        dataset_id = library[dataset_class]["id"]
+        source_files = _get_source_files_list(
+            str(remote_source_dir_path), f"/{dataset_id}/{source_files_path}"
+        )
+
+        # create cache dir if needed
+        local_library_dir = os.path.dirname(local_library_path)
+        if not disable_library_cache:
+            if not os.path.exists(local_library_dir):
+                os.makedirs(local_library_dir)
+
+            # Update cache
+            with open(local_library_path, "w") as f:
+                f.write(library_raw_content)
+
+    if dataset_id is None:
+        raise ValueError("Dataset class name not found in library")
+
+    return {
+        "dataset_id": dataset_id,
+        "source_files": source_files,
+    }
+
+
+def download_dataset_and_source_files(
+    local_dir_path: PATH_TYPE,
+    remote_lfs_dir_path: PATH_TYPE,
+    source_files_dir_path: PATH_TYPE,
+    dataset_id: str,
+    source_files: List[str],
+    refresh_cache: bool = False,
+    base_file_name: str = "rag_dataset.json",
+    override_path: bool = False,
+    show_progress: bool = False,
+) -> None:
+    """Download dataset and source files."""
+    if isinstance(local_dir_path, str):
+        local_dir_path = Path(local_dir_path)
+
+    if override_path:
+        module_path = str(local_dir_path)
+    else:
+        module_path = f"{local_dir_path}/{dataset_id}"
+
+    if refresh_cache or not os.path.exists(module_path):
+        os.makedirs(module_path, exist_ok=True)
+        os.makedirs(f"{module_path}/{source_files_dir_path}", exist_ok=True)
+
+        rag_dataset_raw_content, _ = get_file_content(
+            str(remote_lfs_dir_path), f"/{dataset_id}/{base_file_name}"
+        )
+
+        with open(f"{module_path}/{base_file_name}", "w") as f:
+            f.write(rag_dataset_raw_content)
+
+        # Get content of source files
+        if show_progress:
+            source_files_iterator = tqdm.tqdm(source_files)
+        else:
+            source_files_iterator = source_files
+        for source_file in source_files_iterator:
+            if ".pdf" in source_file:
+                source_file_raw_content_bytes, _ = get_file_content_bytes(
+                    str(remote_lfs_dir_path),
+                    f"/{dataset_id}/{source_files_dir_path}/{source_file}",
+                )
+                with open(
+                    f"{module_path}/{source_files_dir_path}/{source_file}", "wb"
+                ) as f:
+                    f.write(source_file_raw_content_bytes)
+            else:
+                source_file_raw_content, _ = get_file_content(
+                    str(remote_lfs_dir_path),
+                    f"/{dataset_id}/{source_files_dir_path}/{source_file}",
+                )
+                with open(
+                    f"{module_path}/{source_files_dir_path}/{source_file}", "w"
+                ) as f:
+                    f.write(source_file_raw_content)
+
+
+def download_llama_dataset(
+    dataset_class: str,
+    llama_hub_url: str = LLAMA_HUB_URL,
+    llama_datasets_lfs_url: str = LLAMA_DATASETS_LFS_URL,
+    llama_datasets_source_files_tree_url: str = LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
+    refresh_cache: bool = False,
+    custom_dir: Optional[str] = None,
+    custom_path: Optional[str] = None,
+    source_files_dirpath: str = LLAMA_SOURCE_FILES_PATH,
+    library_path: str = "library.json",
+    base_file_name: str = "rag_dataset.json",
+    disable_library_cache: bool = False,
+    override_path: bool = False,
+    show_progress: bool = False,
+) -> Any:
+    """Download a LlamaDataset from Llama Hub.
+
+    Args:
+        dataset_class: The name of the LlamaDataset class you want to download,
+            such as `PaulGrahamEssayDataset`.
+        refresh_cache: If true, the local cache will be skipped and the
+            dataset will be fetched directly from the remote repo.
+        custom_dir: Custom dir name to download dataset into (under parent folder).
+        custom_path: Custom dirpath to download dataset into.
+        source_files_dirpath: The directory for storing source files.
+        library_path: File name of the library file.
+        base_file_name: The rag dataset json file name.
+        disable_library_cache: If true, skip the local library cache.
+        override_path: If true, write files directly into the download dir
+            rather than a subdirectory named after the dataset.
+        show_progress: If true, show a progress bar while downloading source files.
+
+    Returns:
+        A tuple of paths: the `rag_dataset.json` file and the source files directory.
+    """
+    # create directory / get path
+    dirpath = initialize_directory(custom_path=custom_path, custom_dir=custom_dir)
+
+    # fetch info from library.json file
+    dataset_info = get_dataset_info(
+        local_dir_path=dirpath,
+        remote_dir_path=llama_hub_url,
+        remote_source_dir_path=llama_datasets_source_files_tree_url,
+        dataset_class=dataset_class,
+        refresh_cache=refresh_cache,
+        library_path=library_path,
+        disable_library_cache=disable_library_cache,
+    )
+    dataset_id = dataset_info["dataset_id"]
+    source_files = dataset_info["source_files"]
+
+    download_dataset_and_source_files(
+        local_dir_path=dirpath,
+        remote_lfs_dir_path=llama_datasets_lfs_url,
+        source_files_dir_path=source_files_dirpath,
+        dataset_id=dataset_id,
+        source_files=source_files,
+        refresh_cache=refresh_cache,
+        base_file_name=base_file_name,
+        override_path=override_path,
+        show_progress=show_progress,
+    )
+
+    if override_path:
+        module_path = str(dirpath)
+    else:
+        module_path = f"{dirpath}/{dataset_id}"
+
+    return (
+        f"{module_path}/{LLAMA_RAG_DATASET_FILENAME}",
+        f"{module_path}/{LLAMA_SOURCE_FILES_PATH}",
+    )
diff --git a/llama_index/download/download_utils.py b/llama_index/download/module.py
similarity index 50%
rename from llama_index/download/download_utils.py
rename to llama_index/download/module.py
index 3f942196c77d2..817611785b42b 100644
--- a/llama_index/download/download_utils.py
+++ b/llama_index/download/module.py
@@ -7,24 +7,23 @@
 from enum import Enum
 from importlib import util
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Union
 
 import pkg_resources
 import requests
 from pkg_resources import DistributionNotFound
 
+from llama_index.download.utils import (
+    get_exports,
+    get_file_content,
+    initialize_directory,
+    rewrite_exports,
+)
+
 LLAMA_HUB_CONTENTS_URL = f"https://raw.githubusercontent.com/run-llama/llama-hub/main"
 LLAMA_HUB_PATH = "/llama_hub"
 LLAMA_HUB_URL = LLAMA_HUB_CONTENTS_URL + LLAMA_HUB_PATH
 
-REPO = "llama_datasets"
-BRANCH = "main"
-LLAMA_DATASETS_URL = (
-    f"https://media.githubusercontent.com/media/run-llama/{REPO}/{BRANCH}"
-)
-LLAMA_RAG_DATASET_FILENAME = "rag_dataset.json"
-
-
 PATH_TYPE = Union[str, Path]
 
 logger = logging.getLogger(__name__)
@@ -38,87 +37,6 @@ class MODULE_TYPE(str, Enum):
     DATASETS = "datasets"
 
 
-def _get_file_content(loader_hub_url: str, path: str) -> Tuple[str, int]:
-    """Get the content of a file from the GitHub REST API."""
-    resp = requests.get(loader_hub_url + path)
-    return resp.text, resp.status_code
-
-
-def get_exports(raw_content: str) -> List:
-    """Read content of a Python file and returns a list of exported class names.
-
-    For example:
-    ```python
-    from .a import A
-    from .b import B
-
-    __all__ = ["A", "B"]
-    ```
-    will return `["A", "B"]`.
-
-    Args:
-        - raw_content: The content of a Python file as a string.
-
-    Returns:
-        A list of exported class names.
-
-    """
-    exports = []
-    for line in raw_content.splitlines():
-        line = line.strip()
-        if line.startswith("__all__"):
-            exports = line.split("=")[1].strip().strip("[").strip("]").split(",")
-            exports = [export.strip().strip("'").strip('"') for export in exports]
-    return exports
-
-
-def rewrite_exports(exports: List[str], dirpath: str) -> None:
-    """Write the `__all__` variable to the `__init__.py` file in the modules dir.
-
-    Removes the line that contains `__all__` and appends a new line with the updated
-    `__all__` variable.
-
-    Args:
-        - exports: A list of exported class names.
-
-    """
-    init_path = f"{dirpath}/__init__.py"
-    with open(init_path) as f:
-        lines = f.readlines()
-    with open(init_path, "w") as f:
-        for line in lines:
-            line = line.strip()
-            if line.startswith("__all__"):
-                continue
-            f.write(line + os.linesep)
-        f.write(f"__all__ = {list(set(exports))}" + os.linesep)
-
-
-def initialize_directory(
-    custom_path: Optional[str] = None, custom_dir: Optional[str] = None
-) -> Path:
-    """Initialize directory."""
-    if custom_path is not None and custom_dir is not None:
-        raise ValueError(
-            "You cannot specify both `custom_path` and `custom_dir` at the same time."
-        )
-
-    custom_dir = custom_dir or "llamahub_modules"
-    if custom_path is not None:
-        dirpath = Path(custom_path)
-    else:
-        dirpath = Path(__file__).parent / custom_dir
-    if not os.path.exists(dirpath):
-        # Create a new directory because it does not exist
-        os.makedirs(dirpath)
-    if not os.path.exists(f"{dirpath}/__init__.py"):
-        # Create an empty __init__.py file if it does not exist yet
-        with open(f"{dirpath}/__init__.py", "w") as f:
-            pass
-
-    return dirpath
-
-
 def get_module_info(
     local_dir_path: PATH_TYPE,
     remote_dir_path: PATH_TYPE,
@@ -145,7 +63,7 @@
 
     # Fetch up-to-date library from remote repo if module_id not found
     if module_id is None:
-        library_raw_content, _ = _get_file_content(
+        library_raw_content, _ = get_file_content(
             str(remote_dir_path), f"/{library_path}"
         )
         library = json.loads(library_raw_content)
@@ -183,7 +101,6 @@ def download_module_and_reqs(
     use_gpt_index_import: bool = False,
     base_file_name: str = "base.py",
     override_path: bool = False,
-    is_dataset: bool = False,
 ) -> None:
     """Load module."""
     if isinstance(local_dir_path, str):
@@ -197,7 +114,7 @@
     if refresh_cache or not os.path.exists(module_path):
         os.makedirs(module_path, exist_ok=True)
 
-        basepy_raw_content, _ = _get_file_content(
+        basepy_raw_content, _ = get_file_content(
             str(remote_dir_path), f"/{module_id}/{base_file_name}"
         )
         if use_gpt_index_import:
@@ -211,48 +128,48 @@
         with open(f"{module_path}/{base_file_name}", "w") as f:
             f.write(basepy_raw_content)
 
-        # Get content of extra files if there are any
-        # and write them under the loader directory
-        for extra_file in extra_files:
-            extra_file_raw_content, _ = _get_file_content(
-                str(remote_dir_path), f"/{module_id}/{extra_file}"
-            )
-            # If the extra file is an __init__.py file, we need to
-            # add the exports to the __init__.py file in the modules directory
-            if extra_file == "__init__.py":
-                loader_exports = get_exports(extra_file_raw_content)
-                existing_exports = []
-                with open(local_dir_path / "__init__.py", "r+") as f:
-                    f.write(f"from .{module_id} import {', '.join(loader_exports)}")
-                    existing_exports = get_exports(f.read())
-                rewrite_exports(existing_exports + loader_exports, str(local_dir_path))
-
-            with open(f"{module_path}/{extra_file}", "w") as f:
-                f.write(extra_file_raw_content)
-
-    if not is_dataset:
-        # install requirements
-        requirements_path = f"{local_dir_path}/requirements.txt"
-
-        if not os.path.exists(requirements_path):
-            # NOTE: need to check the status code
-            response_txt, status_code = _get_file_content(
-                str(remote_dir_path), f"/{module_id}/requirements.txt"
-            )
-            if status_code == 200:
-                with open(requirements_path, "w") as f:
-                    f.write(response_txt)
-
-        # Install dependencies if there are any and not already installed
-        if os.path.exists(requirements_path):
-            try:
-                requirements = pkg_resources.parse_requirements(
-                    Path(requirements_path).open()
-                )
-                pkg_resources.require([str(r) for r in requirements])
-            except DistributionNotFound:
-                subprocess.check_call(
-                    [sys.executable, "-m", "pip", "install", "-r", requirements_path]
-                )
+    # Get content of extra files if there are any
+    # and write them under the loader directory
+    for extra_file in extra_files:
+        extra_file_raw_content, _ = get_file_content(
+            str(remote_dir_path), f"/{module_id}/{extra_file}"
+        )
+        # If the extra file is an __init__.py file, we need to
+        # add the exports to the __init__.py file in the modules directory
+        if extra_file == "__init__.py":
+            loader_exports = get_exports(extra_file_raw_content)
+            existing_exports = []
+            with open(local_dir_path / "__init__.py", "r+") as f:
+                f.write(f"from .{module_id} import {', '.join(loader_exports)}")
+                existing_exports = get_exports(f.read())
+            rewrite_exports(existing_exports + loader_exports, str(local_dir_path))
+
+        with open(f"{module_path}/{extra_file}", "w") as f:
+            f.write(extra_file_raw_content)
+
+    # install requirements
+    requirements_path = f"{local_dir_path}/requirements.txt"
+
+    if not os.path.exists(requirements_path):
+        # NOTE: need to check the status code
+        response_txt, status_code = get_file_content(
+            str(remote_dir_path), f"/{module_id}/requirements.txt"
+        )
+        if status_code == 200:
+            with open(requirements_path, "w") as f:
+                f.write(response_txt)
+
+    # Install dependencies if there are any and not already installed
+    if os.path.exists(requirements_path):
+        try:
+            requirements = pkg_resources.parse_requirements(
+                Path(requirements_path).open()
+            )
+            pkg_resources.require([str(r) for r in requirements])
+        except DistributionNotFound:
+            subprocess.check_call(
+                [sys.executable, "-m", "pip", "install", "-r", requirements_path]
+            )
 
 
 def download_llama_module(
@@ -266,8 +183,6 @@
     use_gpt_index_import: bool = False,
     disable_library_cache: bool = False,
     override_path: bool = False,
-    llama_datasets_url: str = LLAMA_DATASETS_URL,
-    is_dataset: bool = False,
 ) -> Any:
     """Download a module from LlamaHub.
 
@@ -307,55 +222,37 @@
     extra_files = module_info["extra_files"]
 
     # download the module, install requirements
-    if is_dataset:
-        download_remote_dir_path = llama_datasets_url
-        base_file_name = "rag_dataset.json"
-    else:
-        download_remote_dir_path = llama_hub_url
-
     download_module_and_reqs(
         local_dir_path=dirpath,
-        remote_dir_path=download_remote_dir_path,
+        remote_dir_path=llama_hub_url,
         module_id=module_id,
         extra_files=extra_files,
         refresh_cache=refresh_cache,
         use_gpt_index_import=use_gpt_index_import,
         base_file_name=base_file_name,
         override_path=override_path,
-        is_dataset=is_dataset,
     )
 
-    if is_dataset:
-        # no module to install, instead just store data files in specified path
-        if override_path:
-            module_path = str(dirpath)
-        else:
-            module_path = f"{dirpath}/{module_id}"
-
-        return f"{module_path}/{LLAMA_RAG_DATASET_FILENAME}", [
-            f"{module_path}/{el}" for el in extra_files
-        ]
+    # loads the module into memory
+    if override_path:
+        spec = util.spec_from_file_location(
+            "custom_module", location=f"{dirpath}/{base_file_name}"
+        )
+        if spec is None:
+            raise ValueError(f"Could not find file: {dirpath}/{base_file_name}.")
     else:
-        # loads the module into memory
-        if override_path:
-            spec = util.spec_from_file_location(
-                "custom_module", location=f"{dirpath}/{base_file_name}"
-            )
-            if spec is None:
-                raise ValueError(f"Could not find file: {dirpath}/{base_file_name}.")
-        else:
-            spec = util.spec_from_file_location(
-                "custom_module", location=f"{dirpath}/{module_id}/{base_file_name}"
-            )
-            if spec is None:
-                raise ValueError(
-                    f"Could not find file: {dirpath}/{module_id}/{base_file_name}."
-                )
-        module = util.module_from_spec(spec)
-        spec.loader.exec_module(module)  # type: ignore
+        spec = util.spec_from_file_location(
+            "custom_module", location=f"{dirpath}/{module_id}/{base_file_name}"
+        )
+        if spec is None:
+            raise ValueError(
+                f"Could not find file: {dirpath}/{module_id}/{base_file_name}."
+            )
+    module = util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # type: ignore
 
-        return getattr(module, module_class)
+    return getattr(module, module_class)
 
 
 def track_download(module_class: str, module_type: str) -> None:
diff --git a/llama_index/download/utils.py b/llama_index/download/utils.py
new file mode 100644
index 0000000000000..3fc03a59a2461
--- /dev/null
+++ b/llama_index/download/utils.py
@@ -0,0 +1,88 @@
+import os
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import requests
+
+
+def get_file_content(url: str, path: str) -> Tuple[str, int]:
+    """Get the content of a file from the GitHub REST API."""
+    resp = requests.get(url + path)
+    return resp.text, resp.status_code
+
+
+def get_file_content_bytes(url: str, path: str) -> Tuple[bytes, int]:
+    """Get the content of a file from the GitHub REST API."""
+    resp = requests.get(url + path)
+    return resp.content, resp.status_code
+
+
+def get_exports(raw_content: str) -> List:
+    """Read content of a Python file and returns a list of exported class names.
+
+    For example:
+    ```python
+    from .a import A
+    from .b import B
+
+    __all__ = ["A", "B"]
+    ```
+    will return `["A", "B"]`.
+
+    Args:
+        - raw_content: The content of a Python file as a string.
+
+    Returns:
+        A list of exported class names.
+
+    """
+    exports = []
+    for line in raw_content.splitlines():
+        line = line.strip()
+        if line.startswith("__all__"):
+            exports = line.split("=")[1].strip().strip("[").strip("]").split(",")
+            exports = [export.strip().strip("'").strip('"') for export in exports]
+    return exports
+
+
+def rewrite_exports(exports: List[str], dirpath: str) -> None:
+    """Write the `__all__` variable to the `__init__.py` file in the modules dir.
+
+    Removes the line that contains `__all__` and appends a new line with the updated
+    `__all__` variable.
+
+    Args:
+        - exports: A list of exported class names.
+
+    """
+    init_path = f"{dirpath}/__init__.py"
+    with open(init_path) as f:
+        lines = f.readlines()
+    with open(init_path, "w") as f:
+        for line in lines:
+            line = line.strip()
+            if line.startswith("__all__"):
+                continue
+            f.write(line + os.linesep)
+        f.write(f"__all__ = {list(set(exports))}" + os.linesep)
+
+
+def initialize_directory(
+    custom_path: Optional[str] = None, custom_dir: Optional[str] = None
+) -> Path:
+    """Initialize directory."""
+    if custom_path is not None and custom_dir is not None:
+        raise ValueError(
+            "You cannot specify both `custom_path` and `custom_dir` at the same time."
+        )
+
+    custom_dir = custom_dir or "llamadatasets"
+    if custom_path is not None:
+        dirpath = Path(custom_path)
+    else:
+        dirpath = Path(__file__).parent / custom_dir
+    if not os.path.exists(dirpath):
+        # Create a new directory because it does not exist
+        os.makedirs(dirpath)
+
+    return dirpath
diff --git a/llama_index/llama_dataset/download.py b/llama_index/llama_dataset/download.py
index 8c4a344760a2a..1a006b040c00b 100644
--- a/llama_index/llama_dataset/download.py
+++ b/llama_index/llama_dataset/download.py
@@ -1,13 +1,14 @@
 from typing import List, Tuple, Type
 
 from llama_index import Document
-from llama_index.download.download_utils import (
-    LLAMA_DATASETS_URL,
-    LLAMA_HUB_URL,
-    MODULE_TYPE,
-    download_llama_module,
-    track_download,
+from llama_index.download.dataset import (
+    LLAMA_DATASETS_LFS_URL,
+    LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
 )
+from llama_index.download.dataset import (
+    download_llama_dataset as download,
+)
+from llama_index.download.module import LLAMA_HUB_URL, MODULE_TYPE, track_download
 from llama_index.llama_dataset.base import BaseLlamaDataset
 from llama_index.llama_dataset.rag import LabelledRagDataset
 from llama_index.readers import SimpleDirectoryReader
@@ -17,34 +18,49 @@ def download_llama_dataset(
     llama_dataset_class: str,
     download_dir: str,
     llama_hub_url: str = LLAMA_HUB_URL,
-    llama_datasets_url: str = LLAMA_DATASETS_URL,
+    llama_datasets_lfs_url: str = LLAMA_DATASETS_LFS_URL,
+    llama_datasets_source_files_tree_url: str = LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
+    show_progress: bool = False,
 ) -> Tuple[Type[BaseLlamaDataset], List[Document]]:
-    """Download a single LlamaDataset from Llama Hub.
-
-    Args:
-        llama_dataset_class: The name of the LlamaPack class you want to download,
-            such as `PaulGrahamEssayDataset`.
-        refresh_cache: If true, the local cache will be skipped and the
-            loader will be fetched directly from the remote repo.
-        download_dir: Custom dirpath to download the pack into.
-
-    Returns:
-        A Loader.
-    """
+    """Download a single LlamaDataset from Llama Hub.
+
+    Args:
+        llama_dataset_class: The name of the LlamaDataset class you want to download,
+            such as `PaulGrahamEssayDataset`.
+        download_dir: Custom dirpath to download the dataset into.
+        llama_hub_url: URL to the llama hub library file.
+        llama_datasets_lfs_url: URL for LFS-tracked files in the llama_datasets repo.
+        llama_datasets_source_files_tree_url: URL for listing source_files contents.
+        show_progress: Boolean for showing progress on downloading source files.
+
+    Returns:
+        A `LabelledRagDataset` and a `List[Document]`.
+    """
-    filenames: Tuple[str, List[str]] = download_llama_module(
+    filenames: Tuple[str, str] = download(
         llama_dataset_class,
-        is_dataset=True,
         llama_hub_url=llama_hub_url,
-        llama_datasets_url=llama_datasets_url,
+        llama_datasets_lfs_url=llama_datasets_lfs_url,
+        llama_datasets_source_files_tree_url=llama_datasets_source_files_tree_url,
         refresh_cache=True,
         custom_path=download_dir,
         library_path="llama_datasets/library.json",
         disable_library_cache=True,
         override_path=True,
+        show_progress=show_progress,
     )
-    rag_dataset_filename, source_filenames = filenames
+    rag_dataset_filename, source_files_dir = filenames
     track_download(llama_dataset_class, MODULE_TYPE.DATASETS)
     return (
         LabelledRagDataset.from_json(rag_dataset_filename),
-        SimpleDirectoryReader(input_files=source_filenames).load_data(),
+        SimpleDirectoryReader(input_dir=source_files_dir).load_data(
+            show_progress=show_progress
+        ),
     )
diff --git a/llama_index/llama_pack/download.py b/llama_index/llama_pack/download.py
index d26d7c23d5541..0be93bc159216 100644
--- a/llama_index/llama_pack/download.py
+++ b/llama_index/llama_pack/download.py
@@ -1,6 +1,6 @@
 from typing import Type
 
-from llama_index.download.download_utils import (
+from llama_index.download.module import (
     LLAMA_HUB_URL,
     MODULE_TYPE,
     download_llama_module,
diff --git a/llama_index/readers/download.py b/llama_index/readers/download.py
index 2f87aea939b9f..3155d237de6af 100644
--- a/llama_index/readers/download.py
+++ b/llama_index/readers/download.py
@@ -7,7 +7,7 @@
 
 from typing import Optional, Type
 
-from llama_index.download.download_utils import (
+from llama_index.download.module import (
     LLAMA_HUB_URL,
     MODULE_TYPE,
     download_llama_module,
diff --git a/llama_index/tools/download.py b/llama_index/tools/download.py
index c30ce3a21e804..dddeb87efad02 100644
--- a/llama_index/tools/download.py
+++ b/llama_index/tools/download.py
@@ -2,7 +2,7 @@
 
 from typing import Optional, Type
 
-from llama_index.download.download_utils import (
+from llama_index.download.module import (
     LLAMA_HUB_URL,
     MODULE_TYPE,
     download_llama_module,
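
Usage sketch of the end-to-end flow this patch introduces, for review. The dataset name is the example cited in the docstrings (`PaulGrahamEssayDataset`); the download directory is illustrative, not part of the patch:

```python
from llama_index.llama_dataset.download import download_llama_dataset

# Fetches rag_dataset.json from the LFS media URL plus the dataset's
# source files (PDFs are written as bytes, other files as text) into
# ./data, then parses them into in-memory objects. Because the wrapper
# passes override_path=True with custom_path=download_dir, files land
# directly in ./data rather than in a per-dataset subdirectory.
rag_dataset, documents = download_llama_dataset(
    "PaulGrahamEssayDataset",
    "./data",
    show_progress=True,
)

# rag_dataset is the LabelledRagDataset loaded from ./data/rag_dataset.json;
# documents is the List[Document] read from ./data/source_files.
```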