From 1f9ba34fa3d05f587aa76b18f3a474df146bf1b0 Mon Sep 17 00:00:00 2001
From: Andrei Fajardo <92402603+nerdai@users.noreply.github.com>
Date: Sat, 2 Dec 2023 15:13:28 -0500
Subject: [PATCH] Move download datasets logic out of download_utils.py (#9253)

* add modifications to allow for pdf download
* lint
* lint
* separate content vars
* move download datasets into separate sub module
* wip
* update commandline
* move util functions to utils
* use renamed module
* refactor to use new module name
* refactor to use new module name
* cr
* lint
* add missing import
* add missing import
* add missing import; add back download_llama_datasets
* fix command line
* get metadata from hub
* fix command line
* wip

---
 llama_index/command_line/command_line.py      |  21 +-
 llama_index/download/dataset.py               | 228 +++++++++++++++++
 .../download/{download_utils.py => module.py} | 237 +++++------------
 llama_index/download/utils.py                 |  88 +++++++
 llama_index/llama_dataset/download.py         |  48 ++--
 llama_index/llama_pack/download.py            |   2 +-
 llama_index/readers/download.py               |   2 +-
 llama_index/tools/download.py                 |   2 +-
 8 files changed, 433 insertions(+), 195 deletions(-)
 create mode 100644 llama_index/download/dataset.py
 rename llama_index/download/{download_utils.py => module.py} (50%)
 create mode 100644 llama_index/download/utils.py

diff --git a/llama_index/command_line/command_line.py b/llama_index/command_line/command_line.py
index 5d4810d960909..be3e704b6e517 100644
--- a/llama_index/command_line/command_line.py
+++ b/llama_index/command_line/command_line.py
@@ -2,7 +2,8 @@
 from typing import Any, Optional
 
 from llama_index.llama_dataset.download import (
-    LLAMA_DATASETS_URL,
+    LLAMA_DATASETS_LFS_URL,
+    LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
     download_llama_dataset,
 )
 from llama_index.llama_pack.download import LLAMA_HUB_URL, download_llama_pack
@@ -29,7 +30,8 @@ def handle_download_llama_dataset(
     llama_dataset_class: Optional[str] = None,
     download_dir: Optional[str] = None,
     llama_hub_url: str = LLAMA_HUB_URL,
-    llama_datasets_url: str = LLAMA_DATASETS_URL,
+    llama_datasets_lfs_url: str = LLAMA_DATASETS_LFS_URL,
+    llama_datasets_source_files_tree_url: str = LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
     **kwargs: Any,
 ) -> None:
     assert llama_dataset_class is not None
@@ -39,10 +41,11 @@
         llama_dataset_class=llama_dataset_class,
         download_dir=download_dir,
         llama_hub_url=llama_hub_url,
-        llama_datasets_url=llama_datasets_url,
+        llama_datasets_lfs_url=llama_datasets_lfs_url,
+        llama_datasets_source_files_tree_url=llama_datasets_source_files_tree_url,
     )
 
-    print(f"Successfully downloaded {llama_datasets_url} to {download_dir}")
+    print(f"Successfully downloaded {llama_dataset_class} to {download_dir}")
 
 
 def main() -> None:
@@ -106,9 +109,15 @@ def main() -> None:
         help="URL to llama hub.",
     )
     llamadataset_parser.add_argument(
-        "--llama-dataset-url",
+        "--llama-datasets-lfs-url",
         type=str,
-        default=LLAMA_DATASETS_URL,
+        default=LLAMA_DATASETS_LFS_URL,
         help="URL to llama datasets.",
     )
+    llamadataset_parser.add_argument(
+        "--llama-datasets-source-files-tree-url",
+        type=str,
+        default=LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
+        help="URL to llama datasets source files tree.",
+    )
     llamadataset_parser.set_defaults(
diff --git a/llama_index/download/dataset.py b/llama_index/download/dataset.py
new file mode 100644
index 0000000000000..358618f476e80
--- /dev/null
+++ b/llama_index/download/dataset.py
@@ -0,0 +1,228 @@
+"""Download."""
+
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+import requests
+import tqdm
+
+from llama_index.download.module import LLAMA_HUB_URL
+from llama_index.download.utils import (
+    get_file_content,
+    get_file_content_bytes,
+    initialize_directory,
+)
+
+LLAMA_DATASETS_LFS_URL = (
+    "https://media.githubusercontent.com/media/run-llama/llama_datasets/main"
+)
+
+LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL = (
+    "https://github.com/run-llama/llama_datasets/tree/main"
+)
+LLAMA_RAG_DATASET_FILENAME = "rag_dataset.json"
+LLAMA_SOURCE_FILES_PATH = "source_files"
+
+
+PATH_TYPE = Union[str, Path]
+
+
+def _get_source_files_list(source_tree_url: str, path: str) -> List[str]:
+    """Get the list of source files to download."""
+    resp = requests.get(source_tree_url + path + "?recursive=1")
+    payload = resp.json()["payload"]
+    return [item["name"] for item in payload["tree"]["items"]]
+
+
+def get_dataset_info(
+    local_dir_path: PATH_TYPE,
+    remote_dir_path: PATH_TYPE,
+    remote_source_dir_path: PATH_TYPE,
+    dataset_class: str,
+    refresh_cache: bool = False,
+    library_path: str = "library.json",
+    source_files_path: str = "source_files",
+    disable_library_cache: bool = False,
+) -> Dict:
+    """Get dataset info."""
+    if isinstance(local_dir_path, str):
+        local_dir_path = Path(local_dir_path)
+
+    local_library_path = f"{local_dir_path}/{library_path}"
+    dataset_id = None
+    source_files = []
+
+    # Check cache first
+    if not refresh_cache and os.path.exists(local_library_path):
+        with open(local_library_path) as f:
+            library = json.load(f)
+        if dataset_class in library:
+            dataset_id = library[dataset_class]["id"]
+            source_files = library[dataset_class].get("source_files", [])
+
+    # Fetch up-to-date library from remote repo if dataset_id not found
+    if dataset_id is None:
+        library_raw_content, _ = get_file_content(
+            str(remote_dir_path), f"/{library_path}"
+        )
+        library = json.loads(library_raw_content)
+        if dataset_class not in library:
+            raise ValueError("Loader class name not found in library")
+
+        dataset_id = library[dataset_class]["id"]
+        source_files = _get_source_files_list(
+            str(remote_source_dir_path), f"/{dataset_id}/{source_files_path}"
+        )
+
+        # create cache dir if needed
+        local_library_dir = os.path.dirname(local_library_path)
+        if not disable_library_cache:
+            if not os.path.exists(local_library_dir):
+                os.makedirs(local_library_dir)
+
+            # Update cache
+            with open(local_library_path, "w") as f:
+                f.write(library_raw_content)
+
+    if dataset_id is None:
+        raise ValueError("Dataset class name not found in library")
+
+    return {
+        "dataset_id": dataset_id,
+        "source_files": source_files,
+    }
+
+
+def download_dataset_and_source_files(
+    local_dir_path: PATH_TYPE,
+    remote_lfs_dir_path: PATH_TYPE,
+    source_files_dir_path: PATH_TYPE,
+    dataset_id: str,
+    source_files: List[str],
+    refresh_cache: bool = False,
+    base_file_name: str = "rag_dataset.json",
+    override_path: bool = False,
+    show_progress: bool = False,
+) -> None:
+    """Download dataset and source files."""
+    if isinstance(local_dir_path, str):
+        local_dir_path = Path(local_dir_path)
+
+    if override_path:
+        module_path = str(local_dir_path)
+    else:
+        module_path = f"{local_dir_path}/{dataset_id}"
+
+    if refresh_cache or not os.path.exists(module_path):
+        os.makedirs(module_path, exist_ok=True)
+        os.makedirs(f"{module_path}/{source_files_dir_path}", exist_ok=True)
+
+        rag_dataset_raw_content, _ = get_file_content(
+            str(remote_lfs_dir_path), f"/{dataset_id}/{base_file_name}"
+        )
+
+        with open(f"{module_path}/{base_file_name}", "w") as f:
+            f.write(rag_dataset_raw_content)
+
+        # Get content of source files
+        if show_progress:
+            source_files_iterator = tqdm.tqdm(source_files)
+        else:
+            source_files_iterator = source_files
+        for source_file in source_files_iterator:
+            if ".pdf" in source_file:
+                source_file_raw_content_bytes, _ = get_file_content_bytes(
+                    str(remote_lfs_dir_path),
+                    f"/{dataset_id}/{source_files_dir_path}/{source_file}",
+                )
+                with open(
+                    f"{module_path}/{source_files_dir_path}/{source_file}", "wb"
+                ) as f:
+                    f.write(source_file_raw_content_bytes)
+            else:
+                source_file_raw_content, _ = get_file_content(
+                    str(remote_lfs_dir_path),
+                    f"/{dataset_id}/{source_files_dir_path}/{source_file}",
+                )
+                with open(
+                    f"{module_path}/{source_files_dir_path}/{source_file}", "w"
+                ) as f:
+                    f.write(source_file_raw_content)
+
+
+def download_llama_dataset(
+    dataset_class: str,
+    llama_hub_url: str = LLAMA_HUB_URL,
+    llama_datasets_lfs_url: str = LLAMA_DATASETS_LFS_URL,
+    llama_datasets_source_files_tree_url: str = LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
+    refresh_cache: bool = False,
+    custom_dir: Optional[str] = None,
+    custom_path: Optional[str] = None,
+    source_files_dirpath: str = LLAMA_SOURCE_FILES_PATH,
+    library_path: str = "library.json",
+    base_file_name: str = "rag_dataset.json",
+    disable_library_cache: bool = False,
+    override_path: bool = False,
+    show_progress: bool = False,
+) -> Any:
+    """Download a LlamaDataset from Llama Hub.
+
+    Args:
+        dataset_class: The name of the LlamaDataset class you want to download,
+            such as `PaulGrahamEssayDataset`.
+        refresh_cache: If true, the local cache will be skipped and the
+            dataset will be fetched directly from the remote repo.
+        custom_dir: Custom dir name to download dataset into (under parent folder).
+        custom_path: Custom dirpath to download dataset into.
+        source_files_dirpath: The directory for storing source files.
+        library_path: File name of the library file.
+        base_file_name: The rag dataset json file name.
+        disable_library_cache: If true, skip the local library cache.
+        override_path: If true, write files directly into the download dir
+            rather than a subdirectory named after the dataset.
+        show_progress: If true, show a progress bar while downloading source files.
+
+    Returns:
+        A tuple of paths: the `rag_dataset.json` file and the source files directory.
+    """
+    # create directory / get path
+    dirpath = initialize_directory(custom_path=custom_path, custom_dir=custom_dir)
+
+    # fetch info from library.json file
+    dataset_info = get_dataset_info(
+        local_dir_path=dirpath,
+        remote_dir_path=llama_hub_url,
+        remote_source_dir_path=llama_datasets_source_files_tree_url,
+        dataset_class=dataset_class,
+        refresh_cache=refresh_cache,
+        library_path=library_path,
+        disable_library_cache=disable_library_cache,
+    )
+    dataset_id = dataset_info["dataset_id"]
+    source_files = dataset_info["source_files"]
+
+    download_dataset_and_source_files(
+        local_dir_path=dirpath,
+        remote_lfs_dir_path=llama_datasets_lfs_url,
+        source_files_dir_path=source_files_dirpath,
+        dataset_id=dataset_id,
+        source_files=source_files,
+        refresh_cache=refresh_cache,
+        base_file_name=base_file_name,
+        override_path=override_path,
+        show_progress=show_progress,
+    )
+
+    if override_path:
+        module_path = str(dirpath)
+    else:
+        module_path = f"{dirpath}/{dataset_id}"
+
+    return (
+        f"{module_path}/{LLAMA_RAG_DATASET_FILENAME}",
+        f"{module_path}/{LLAMA_SOURCE_FILES_PATH}",
+    )
diff --git a/llama_index/download/download_utils.py b/llama_index/download/module.py
similarity index 50%
rename from llama_index/download/download_utils.py
rename to llama_index/download/module.py
index 3f942196c77d2..817611785b42b 100644
--- a/llama_index/download/download_utils.py
+++ b/llama_index/download/module.py
@@ -7,24 +7,23 @@
 from enum import Enum
 from importlib import util
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Union
 
 import pkg_resources
 import requests
 from pkg_resources import DistributionNotFound
 
+from llama_index.download.utils import (
+    get_exports,
+    get_file_content,
+    initialize_directory,
+    rewrite_exports,
+)
+
 LLAMA_HUB_CONTENTS_URL = f"https://raw.githubusercontent.com/run-llama/llama-hub/main"
 LLAMA_HUB_PATH = "/llama_hub"
 LLAMA_HUB_URL = LLAMA_HUB_CONTENTS_URL + LLAMA_HUB_PATH
 
-REPO = "llama_datasets"
-BRANCH = "main"
-LLAMA_DATASETS_URL = (
-    f"https://media.githubusercontent.com/media/run-llama/{REPO}/{BRANCH}"
-)
-LLAMA_RAG_DATASET_FILENAME = "rag_dataset.json"
-
-
 PATH_TYPE = Union[str, Path]
 
 logger = logging.getLogger(__name__)
@@ -38,87 +37,6 @@ class MODULE_TYPE(str, Enum):
     DATASETS = "datasets"
 
 
-def _get_file_content(loader_hub_url: str, path: str) -> Tuple[str, int]:
-    """Get the content of a file from the GitHub REST API."""
-    resp = requests.get(loader_hub_url + path)
-    return resp.text, resp.status_code
-
-
-def get_exports(raw_content: str) -> List:
-    """Read content of a Python file and returns a list of exported class names.
-
-    For example:
-    ```python
-    from .a import A
-    from .b import B
-
-    __all__ = ["A", "B"]
-    ```
-    will return `["A", "B"]`.
-
-    Args:
-        - raw_content: The content of a Python file as a string.
-
-    Returns:
-        A list of exported class names.
-
-    """
-    exports = []
-    for line in raw_content.splitlines():
-        line = line.strip()
-        if line.startswith("__all__"):
-            exports = line.split("=")[1].strip().strip("[").strip("]").split(",")
-            exports = [export.strip().strip("'").strip('"') for export in exports]
-    return exports
-
-
-def rewrite_exports(exports: List[str], dirpath: str) -> None:
-    """Write the `__all__` variable to the `__init__.py` file in the modules dir.
-
-    Removes the line that contains `__all__` and appends a new line with the updated
-    `__all__` variable.
-
-    Args:
-        - exports: A list of exported class names.
-
-    """
-    init_path = f"{dirpath}/__init__.py"
-    with open(init_path) as f:
-        lines = f.readlines()
-    with open(init_path, "w") as f:
-        for line in lines:
-            line = line.strip()
-            if line.startswith("__all__"):
-                continue
-            f.write(line + os.linesep)
-        f.write(f"__all__ = {list(set(exports))}" + os.linesep)
-
-
-def initialize_directory(
-    custom_path: Optional[str] = None, custom_dir: Optional[str] = None
-) -> Path:
-    """Initialize directory."""
-    if custom_path is not None and custom_dir is not None:
-        raise ValueError(
-            "You cannot specify both `custom_path` and `custom_dir` at the same time."
-        )
-
-    custom_dir = custom_dir or "llamahub_modules"
-    if custom_path is not None:
-        dirpath = Path(custom_path)
-    else:
-        dirpath = Path(__file__).parent / custom_dir
-    if not os.path.exists(dirpath):
-        # Create a new directory because it does not exist
-        os.makedirs(dirpath)
-    if not os.path.exists(f"{dirpath}/__init__.py"):
-        # Create an empty __init__.py file if it does not exist yet
-        with open(f"{dirpath}/__init__.py", "w") as f:
-            pass
-
-    return dirpath
-
-
 def get_module_info(
     local_dir_path: PATH_TYPE,
     remote_dir_path: PATH_TYPE,
@@ -145,7 +63,7 @@
 
     # Fetch up-to-date library from remote repo if module_id not found
     if module_id is None:
-        library_raw_content, _ = _get_file_content(
+        library_raw_content, _ = get_file_content(
             str(remote_dir_path), f"/{library_path}"
         )
         library = json.loads(library_raw_content)
@@ -183,7 +101,6 @@ def download_module_and_reqs(
     use_gpt_index_import: bool = False,
     base_file_name: str = "base.py",
     override_path: bool = False,
-    is_dataset: bool = False,
 ) -> None:
     """Load module."""
     if isinstance(local_dir_path, str):
@@ -197,7 +114,7 @@
     if refresh_cache or not os.path.exists(module_path):
         os.makedirs(module_path, exist_ok=True)
 
-        basepy_raw_content, _ = _get_file_content(
+        basepy_raw_content, _ = get_file_content(
             str(remote_dir_path), f"/{module_id}/{base_file_name}"
         )
         if use_gpt_index_import:
@@ -211,48 +128,48 @@
         with open(f"{module_path}/{base_file_name}", "w") as f:
             f.write(basepy_raw_content)
 
-        # Get content of extra files if there are any
-        # and write them under the loader directory
-        for extra_file in extra_files:
-            extra_file_raw_content, _ = _get_file_content(
-                str(remote_dir_path), f"/{module_id}/{extra_file}"
-            )
-            # If the extra file is an __init__.py file, we need to
-            # add the exports to the __init__.py file in the modules directory
-            if extra_file == "__init__.py":
-                loader_exports = get_exports(extra_file_raw_content)
-                existing_exports = []
-                with open(local_dir_path / "__init__.py", "r+") as f:
-                    f.write(f"from .{module_id} import {', '.join(loader_exports)}")
-                    existing_exports = get_exports(f.read())
-                rewrite_exports(existing_exports + loader_exports, str(local_dir_path))
-
-            with open(f"{module_path}/{extra_file}", "w") as f:
-                f.write(extra_file_raw_content)
-
-    if not is_dataset:
-        # install requirements
-        requirements_path = f"{local_dir_path}/requirements.txt"
-
-        if not os.path.exists(requirements_path):
-            # NOTE: need to check the status code
-            response_txt, status_code = _get_file_content(
-                str(remote_dir_path), f"/{module_id}/requirements.txt"
-            )
-            if status_code == 200:
-                with open(requirements_path, "w") as f:
-                    f.write(response_txt)
-
-        # Install dependencies if there are any and not already installed
-        if os.path.exists(requirements_path):
-            try:
-                requirements = pkg_resources.parse_requirements(
-                    Path(requirements_path).open()
-                )
-                pkg_resources.require([str(r) for r in requirements])
-            except DistributionNotFound:
-                subprocess.check_call(
-                    [sys.executable, "-m", "pip", "install", "-r", requirements_path]
-                )
+    # Get content of extra files if there are any
+    # and write them under the loader directory
+    for extra_file in extra_files:
+        extra_file_raw_content, _ = get_file_content(
+            str(remote_dir_path), f"/{module_id}/{extra_file}"
+        )
+        # If the extra file is an __init__.py file, we need to
+        # add the exports to the __init__.py file in the modules directory
+        if extra_file == "__init__.py":
+            loader_exports = get_exports(extra_file_raw_content)
+            existing_exports = []
+            with open(local_dir_path / "__init__.py", "r+") as f:
+                f.write(f"from .{module_id} import {', '.join(loader_exports)}")
+                existing_exports = get_exports(f.read())
+            rewrite_exports(existing_exports + loader_exports, str(local_dir_path))
+
+        with open(f"{module_path}/{extra_file}", "w") as f:
+            f.write(extra_file_raw_content)
+
+    # install requirements
+    requirements_path = f"{local_dir_path}/requirements.txt"
+
+    if not os.path.exists(requirements_path):
+        # NOTE: need to check the status code
+        response_txt, status_code = get_file_content(
+            str(remote_dir_path), f"/{module_id}/requirements.txt"
+        )
+        if status_code == 200:
+            with open(requirements_path, "w") as f:
+                f.write(response_txt)
+
+    # Install dependencies if there are any and not already installed
+    if os.path.exists(requirements_path):
+        try:
+            requirements = pkg_resources.parse_requirements(
+                Path(requirements_path).open()
+            )
+            pkg_resources.require([str(r) for r in requirements])
+        except DistributionNotFound:
+            subprocess.check_call(
+                [sys.executable, "-m", "pip", "install", "-r", requirements_path]
+            )
 
 
 def download_llama_module(
@@ -266,8 +183,6 @@
     use_gpt_index_import: bool = False,
     disable_library_cache: bool = False,
     override_path: bool = False,
-    llama_datasets_url: str = LLAMA_DATASETS_URL,
-    is_dataset: bool = False,
 ) -> Any:
     """Download a module from LlamaHub.
 
@@ -307,55 +222,37 @@
     extra_files = module_info["extra_files"]
 
     # download the module, install requirements
-    if is_dataset:
-        download_remote_dir_path = llama_datasets_url
-        base_file_name = "rag_dataset.json"
-    else:
-        download_remote_dir_path = llama_hub_url
-
     download_module_and_reqs(
         local_dir_path=dirpath,
-        remote_dir_path=download_remote_dir_path,
+        remote_dir_path=llama_hub_url,
         module_id=module_id,
         extra_files=extra_files,
         refresh_cache=refresh_cache,
         use_gpt_index_import=use_gpt_index_import,
         base_file_name=base_file_name,
         override_path=override_path,
-        is_dataset=is_dataset,
     )
 
-    if is_dataset:
-        # no module to install, instead just store data files in specified path
-        if override_path:
-            module_path = str(dirpath)
-        else:
-            module_path = f"{dirpath}/{module_id}"
-
-        return f"{module_path}/{LLAMA_RAG_DATASET_FILENAME}", [
-            f"{module_path}/{el}" for el in extra_files
-        ]
+    # loads the module into memory
+    if override_path:
+        spec = util.spec_from_file_location(
+            "custom_module", location=f"{dirpath}/{base_file_name}"
+        )
+        if spec is None:
+            raise ValueError(f"Could not find file: {dirpath}/{base_file_name}.")
     else:
-        # loads the module into memory
-        if override_path:
-            spec = util.spec_from_file_location(
-                "custom_module", location=f"{dirpath}/{base_file_name}"
-            )
-            if spec is None:
-                raise ValueError(f"Could not find file: {dirpath}/{base_file_name}.")
-        else:
-            spec = util.spec_from_file_location(
-                "custom_module", location=f"{dirpath}/{module_id}/{base_file_name}"
-            )
-            if spec is None:
-                raise ValueError(
-                    f"Could not find file: {dirpath}/{module_id}/{base_file_name}."
-                )
-        module = util.module_from_spec(spec)
-        spec.loader.exec_module(module)  # type: ignore
+        spec = util.spec_from_file_location(
+            "custom_module", location=f"{dirpath}/{module_id}/{base_file_name}"
+        )
+        if spec is None:
+            raise ValueError(
+                f"Could not find file: {dirpath}/{module_id}/{base_file_name}."
+            )
+    module = util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # type: ignore
 
-        return getattr(module, module_class)
+    return getattr(module, module_class)
 
 
 def track_download(module_class: str, module_type: str) -> None:
diff --git a/llama_index/download/utils.py b/llama_index/download/utils.py
new file mode 100644
index 0000000000000..3fc03a59a2461
--- /dev/null
+++ b/llama_index/download/utils.py
@@ -0,0 +1,88 @@
+import os
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import requests
+
+
+def get_file_content(url: str, path: str) -> Tuple[str, int]:
+    """Get the content of a file from the GitHub REST API."""
+    resp = requests.get(url + path)
+    return resp.text, resp.status_code
+
+
+def get_file_content_bytes(url: str, path: str) -> Tuple[bytes, int]:
+    """Get the content of a file from the GitHub REST API."""
+    resp = requests.get(url + path)
+    return resp.content, resp.status_code
+
+
+def get_exports(raw_content: str) -> List:
+    """Read content of a Python file and returns a list of exported class names.
+
+    For example:
+    ```python
+    from .a import A
+    from .b import B
+
+    __all__ = ["A", "B"]
+    ```
+    will return `["A", "B"]`.
+
+    Args:
+        - raw_content: The content of a Python file as a string.
+
+    Returns:
+        A list of exported class names.
+
+    """
+    exports = []
+    for line in raw_content.splitlines():
+        line = line.strip()
+        if line.startswith("__all__"):
+            exports = line.split("=")[1].strip().strip("[").strip("]").split(",")
+            exports = [export.strip().strip("'").strip('"') for export in exports]
+    return exports
+
+
+def rewrite_exports(exports: List[str], dirpath: str) -> None:
+    """Write the `__all__` variable to the `__init__.py` file in the modules dir.
+
+    Removes the line that contains `__all__` and appends a new line with the updated
+    `__all__` variable.
+
+    Args:
+        - exports: A list of exported class names.
+
+    """
+    init_path = f"{dirpath}/__init__.py"
+    with open(init_path) as f:
+        lines = f.readlines()
+    with open(init_path, "w") as f:
+        for line in lines:
+            line = line.strip()
+            if line.startswith("__all__"):
+                continue
+            f.write(line + os.linesep)
+        f.write(f"__all__ = {list(set(exports))}" + os.linesep)
+
+
+def initialize_directory(
+    custom_path: Optional[str] = None, custom_dir: Optional[str] = None
+) -> Path:
+    """Initialize directory."""
+    if custom_path is not None and custom_dir is not None:
+        raise ValueError(
+            "You cannot specify both `custom_path` and `custom_dir` at the same time."
+        )
+
+    custom_dir = custom_dir or "llamadatasets"
+    if custom_path is not None:
+        dirpath = Path(custom_path)
+    else:
+        dirpath = Path(__file__).parent / custom_dir
+    if not os.path.exists(dirpath):
+        # Create a new directory because it does not exist
+        os.makedirs(dirpath)
+
+    return dirpath
diff --git a/llama_index/llama_dataset/download.py b/llama_index/llama_dataset/download.py
index 8c4a344760a2a..1a006b040c00b 100644
--- a/llama_index/llama_dataset/download.py
+++ b/llama_index/llama_dataset/download.py
@@ -1,13 +1,14 @@
 from typing import List, Tuple, Type
 
 from llama_index import Document
-from llama_index.download.download_utils import (
-    LLAMA_DATASETS_URL,
-    LLAMA_HUB_URL,
-    MODULE_TYPE,
-    download_llama_module,
-    track_download,
+from llama_index.download.dataset import (
+    LLAMA_DATASETS_LFS_URL,
+    LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
 )
+from llama_index.download.dataset import (
+    download_llama_dataset as download,
+)
+from llama_index.download.module import LLAMA_HUB_URL, MODULE_TYPE, track_download
 from llama_index.llama_dataset.base import BaseLlamaDataset
 from llama_index.llama_dataset.rag import LabelledRagDataset
 from llama_index.readers import SimpleDirectoryReader
@@ -17,34 +18,49 @@ def download_llama_dataset(
     llama_dataset_class: str,
     download_dir: str,
     llama_hub_url: str = LLAMA_HUB_URL,
-    llama_datasets_url: str = LLAMA_DATASETS_URL,
+    llama_datasets_lfs_url: str = LLAMA_DATASETS_LFS_URL,
+    llama_datasets_source_files_tree_url: str = LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL,
+    show_progress: bool = False,
 ) -> Tuple[Type[BaseLlamaDataset], List[Document]]:
-    """Download a single LlamaDataset from Llama Hub.
-
-    Args:
-        llama_dataset_class: The name of the LlamaPack class you want to download,
-            such as `PaulGrahamEssayDataset`.
-        refresh_cache: If true, the local cache will be skipped and the
-            loader will be fetched directly from the remote repo.
-        download_dir: Custom dirpath to download the pack into.
-
-    Returns:
-        A Loader.
-    """
+    """Download a single LlamaDataset from Llama Hub.
+
+    Args:
+        llama_dataset_class: The name of the LlamaDataset class you want to download,
+            such as `PaulGrahamEssayDataset`.
+        download_dir: Custom dirpath to download the dataset into.
+        llama_hub_url: URL to the llama hub library file.
+        llama_datasets_lfs_url: URL for LFS-tracked files in the llama_datasets repo.
+        llama_datasets_source_files_tree_url: URL for listing source_files contents.
+        show_progress: Boolean for showing progress on downloading source files.
+
+    Returns:
+        A `LabelledRagDataset` and a `List[Document]`.
+    """
-    filenames: Tuple[str, List[str]] = download_llama_module(
+    filenames: Tuple[str, str] = download(
         llama_dataset_class,
-        is_dataset=True,
         llama_hub_url=llama_hub_url,
-        llama_datasets_url=llama_datasets_url,
+        llama_datasets_lfs_url=llama_datasets_lfs_url,
+        llama_datasets_source_files_tree_url=llama_datasets_source_files_tree_url,
         refresh_cache=True,
         custom_path=download_dir,
         library_path="llama_datasets/library.json",
         disable_library_cache=True,
         override_path=True,
+        show_progress=show_progress,
     )
-    rag_dataset_filename, source_filenames = filenames
+    rag_dataset_filename, source_files_dir = filenames
     track_download(llama_dataset_class, MODULE_TYPE.DATASETS)
     return (
         LabelledRagDataset.from_json(rag_dataset_filename),
-        SimpleDirectoryReader(input_files=source_filenames).load_data(),
+        SimpleDirectoryReader(input_dir=source_files_dir).load_data(
+            show_progress=show_progress
+        ),
     )
diff --git a/llama_index/llama_pack/download.py b/llama_index/llama_pack/download.py
index d26d7c23d5541..0be93bc159216 100644
--- a/llama_index/llama_pack/download.py
+++ b/llama_index/llama_pack/download.py
@@ -1,6 +1,6 @@
 from typing import Type
 
-from llama_index.download.download_utils import (
+from llama_index.download.module import (
     LLAMA_HUB_URL,
     MODULE_TYPE,
     download_llama_module,
diff --git a/llama_index/readers/download.py b/llama_index/readers/download.py
index 2f87aea939b9f..3155d237de6af 100644
--- a/llama_index/readers/download.py
+++ b/llama_index/readers/download.py
@@ -7,7 +7,7 @@
 
 from typing import Optional, Type
 
-from llama_index.download.download_utils import (
+from llama_index.download.module import (
     LLAMA_HUB_URL,
     MODULE_TYPE,
     download_llama_module,
diff --git a/llama_index/tools/download.py b/llama_index/tools/download.py
index c30ce3a21e804..dddeb87efad02 100644
--- a/llama_index/tools/download.py
+++ b/llama_index/tools/download.py
@@ -2,7 +2,7 @@
 
 from typing import Optional, Type
 
-from llama_index.download.download_utils import (
+from llama_index.download.module import (
     LLAMA_HUB_URL,
     MODULE_TYPE,
     download_llama_module,
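
Usage sketch of the end-to-end flow this patch introduces, for review. The dataset name is the example cited in the docstrings (`PaulGrahamEssayDataset`); the download directory is illustrative, not part of the patch:

```python
from llama_index.llama_dataset.download import download_llama_dataset

# Fetches rag_dataset.json from the LFS media URL plus the dataset's
# source files (PDFs are written as bytes, other files as text) into
# ./data, then parses them into in-memory objects. Because the wrapper
# passes override_path=True with custom_path=download_dir, files land
# directly in ./data rather than in a per-dataset subdirectory.
rag_dataset, documents = download_llama_dataset(
    "PaulGrahamEssayDataset",
    "./data",
    show_progress=True,
)

# rag_dataset is the LabelledRagDataset loaded from ./data/rag_dataset.json;
# documents is the List[Document] read from ./data/source_files.
```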