Use StorageLib to download dependencies (#1383)
* Use StorageLib to download dependencies

Signed-off-by: Ahmed Hussein <[email protected]>

Fixes #1364, Contributes to #1359

- Use the CspPath and CspFs to manage dependencies
- This allows more flexibility in specifying custom dependencies,
  including local disk storage (see the sketch below).
- Remove Pricing catalog from python package

* use requests library as a default download utility

Signed-off-by: Ahmed Hussein <[email protected]>

* address code reviews

Signed-off-by: Ahmed Hussein <[email protected]>

---------

Signed-off-by: Ahmed Hussein <[email protected]>
amahussein authored Oct 21, 2024
1 parent 98a862f commit b778edb
Showing 16 changed files with 930 additions and 309 deletions.
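
As an illustration of the flexibility mentioned in the commit message, a custom dependency entry pointing at local disk storage could look roughly like the sketch below. It mirrors the new "verification" schema introduced by this change; the file:// URI and the exact set of keys accepted for local resources are assumptions, not taken from the repository.

# Hypothetical dependency entry mirroring the new "verification" schema.
# The file:// URI and the placeholder hash are illustrative assumptions.
custom_spark_dep = {
    "name": "Apache Spark (local copy)",
    "uri": "file:///opt/deps/spark-3.5.0-bin-hadoop3.tgz",
    "verification": {
        "hashLib": {
            "type": "sha512",
            "value": "<sha512-of-the-archive>",
        },
        "size": 400395283,
    },
    "type": "archive",
    "relativePath": "jars/*",
}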
12 changes: 8 additions & 4 deletions user_tools/pyproject.toml
@@ -24,7 +24,7 @@ dependencies = [
"numpy<=1.24.4",
"chevron==0.14.0",
"fastprogress==1.0.3",
"fastcore==1.5.29",
"fastcore==1.7.10",
"fire>=0.5.0",
"pandas==1.4.3",
"pyYAML>=6.0",
@@ -37,8 +37,8 @@ dependencies = [
"urllib3==1.26.19",
"beautifulsoup4==4.11.2",
"pygments==2.15.0",
# used to apply validator on objects and models
"pydantic==2.1.1",
# used to apply validator on objects and models. "2.9.2" contains from_json method.
"pydantic==2.9.2",
# used to help pylint understand pydantic
"pylint-pydantic==0.3.0",
# used for common API to access remote filesystems like local/s3/gcs/hdfs
@@ -76,7 +76,11 @@ version = {attr = "spark_rapids_pytools.__version__"}
repository = "https://github.com/NVIDIA/spark-rapids-tools/tree/main"
[project.optional-dependencies]
test = [
"tox", 'pytest', 'cli_test_helpers', 'behave'
"tox", 'pytest', 'cli_test_helpers', 'behave',
# use flake8 plugin for pydantic
'flake8-pydantic',
# use pylint specific version
'pylint==3.2.7',
]
qualx = [
"holoviews",
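
The pydantic pin is bumped to 2.9.2, and the comment notes that it is used to apply validators on objects and models. A minimal sketch of that usage pattern in pydantic 2.x follows; the model and field names are hypothetical, not the repository's actual classes.

# Minimal sketch of the pydantic 2.x validator pattern referenced above;
# the model and field names are hypothetical.
from pydantic import BaseModel, field_validator


class DependencyVerification(BaseModel):
    size: int = 0
    hash_value: str = ''

    @field_validator('size')
    @classmethod
    def size_must_be_non_negative(cls, value: int) -> int:
        if value < 0:
            raise ValueError('size must be non-negative')
        return value


# pydantic 2.x can build a validated object directly from a JSON string
verification = DependencyVerification.model_validate_json('{"size": 962685}')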
2 changes: 1 addition & 1 deletion user_tools/src/spark_rapids_pytools/rapids/rapids_job.py
@@ -166,7 +166,7 @@ def _get_hadoop_classpath(self) -> Optional[str]:
conf_dir_path = LocalPath(conf_dir)
if conf_dir_path.is_dir() and conf_dir_path.exists():
# return the first valid directory found without the URI prefix
return conf_dir_path.no_prefix
return conf_dir_path.no_scheme
except Exception as e: # pylint: disable=broad-except
self.logger.debug(
'Could not build hadoop classpath from %s. Reason: %s', dir_key, e)
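
This file only picks up the rename of the path property from no_prefix to no_scheme, i.e. the path value with the URI scheme (such as file://) stripped. A rough usage sketch, assuming the behavior implied by the diff; the example directory is hypothetical.

# Sketch of the renamed property; the exact returned string is an assumption
# based on the rename from no_prefix to no_scheme.
from spark_rapids_tools.storagelib import LocalPath

conf_dir_path = LocalPath('/etc/hadoop/conf')  # hypothetical local directory
if conf_dir_path.is_dir() and conf_dir_path.exists():
    # expected to yield '/etc/hadoop/conf' rather than 'file:///etc/hadoop/conf'
    print(conf_dir_path.no_scheme)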
73 changes: 36 additions & 37 deletions user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
@@ -20,7 +20,6 @@
import os
import re
import sys
import tarfile
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
@@ -34,13 +33,16 @@
from spark_rapids_pytools.cloud_api.sp_types import get_platform, \
ClusterBase, DeployMode, NodeHWInfo
from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer, AbstractPropertiesContainer
from spark_rapids_pytools.common.sys_storage import FSUtil, FileVerifier
from spark_rapids_pytools.common.sys_storage import FSUtil
from spark_rapids_pytools.common.utilities import ToolLogging, Utils, ToolsSpinner
from spark_rapids_pytools.rapids.rapids_job import RapidsJobPropContainer
from spark_rapids_pytools.rapids.tool_ctxt import ToolContext
from spark_rapids_tools import CspEnv
from spark_rapids_tools.storagelib import LocalPath
from spark_rapids_tools.enums import HashAlgorithm
from spark_rapids_tools.storagelib import LocalPath, CspFs
from spark_rapids_tools.storagelib.tools.fs_utils import untar_file, FileHashAlgorithm
from spark_rapids_tools.utils import Utilities
from spark_rapids_tools.utils.net_utils import DownloadTask


@dataclass
@@ -139,7 +141,7 @@ def _process_output_args(self):
self.output_folder = Utils.get_rapids_tools_env('OUTPUT_DIRECTORY', os.getcwd())
try:
output_folder_path = LocalPath(self.output_folder)
self.output_folder = output_folder_path.no_prefix
self.output_folder = output_folder_path.no_scheme
except Exception as ex: # pylint: disable=broad-except
self.logger.error('Failed in processing output arguments. Output_folder must be a local directory')
raise ex
@@ -411,6 +413,7 @@ class RapidsJarTool(RapidsTool):
"""

def _process_jar_arg(self):
# TODO: use the StorageLib to download the jar file
jar_path = ''
tools_jar_url = self.wrapper_options.get('toolsJar')
try:
@@ -533,43 +536,40 @@ def cache_single_dependency(dep: dict) -> str:
"""
Downloads the specified URL and saves it to disk
"""
start_time = time.monotonic()
self.logger.info('Checking dependency %s', dep['name'])
dest_folder = self.ctxt.get_cache_folder()
resource_file_name = FSUtil.get_resource_name(dep['uri'])
resource_file = FSUtil.build_path(dest_folder, resource_file_name)
file_check_dict = {'size': dep['size']}
signature_file = FileVerifier.get_signature_file(dep['uri'], dest_folder)
if signature_file is not None:
file_check_dict['signatureFile'] = signature_file
algorithm = FileVerifier.get_integrity_algorithm(dep)
if algorithm is not None:
file_check_dict['hashlib'] = {
'algorithm': algorithm,
'hash': dep[algorithm]
}
is_created = FSUtil.cache_from_url(dep['uri'], resource_file, file_checks=file_check_dict)
if is_created:
self.logger.info('The dependency %s has been downloaded into %s', dep['uri'],
resource_file)
# check if we need to decompress files
verify_opts = {}
dep_verification = dep.get('verification')
if dep_verification is not None:
if 'size' in dep_verification:
verify_opts['size'] = dep_verification['size']
hash_lib_alg = dep_verification.get('hashLib')
if hash_lib_alg:
verify_opts['file_hash'] = FileHashAlgorithm(HashAlgorithm(hash_lib_alg['type']),
hash_lib_alg['value'])
download_task = DownloadTask(src_url=dep['uri'], # pylint: disable=no-value-for-parameter)
dest_folder=dest_folder,
verification=verify_opts)
download_result = download_task.run_task()
self.logger.info('Completed downloading of dependency [%s] => %s',
dep['name'],
f'{download_result.pretty_print()}')
if not download_result.success:
msg = f'Failed to download dependency {dep["name"]}, reason: {download_result.download_error}'
raise RuntimeError(f'Could not download all dependencies. Aborting Executions.\n\t{msg}')
destination_path = self.ctxt.get_local_work_dir()
destination_cspath = LocalPath(destination_path)
if dep['type'] == 'archive':
destination_path = self.ctxt.get_local_work_dir()
with tarfile.open(resource_file, mode='r:*') as tar:
tar.extractall(destination_path)
tar.close()
dep_item = FSUtil.remove_ext(resource_file_name)
if dep.get('relativePath') is not None:
dep_item = FSUtil.build_path(dep_item, dep.get('relativePath'))
dep_item = FSUtil.build_path(destination_path, dep_item)
uncompressed_cspath = untar_file(download_result.resource, destination_cspath)
dep_item = uncompressed_cspath.no_scheme
relative_path = dep.get('relativePath')
if relative_path is not None:
dep_item = f'{dep_item}/{relative_path}'
else:
# copy the jar into dependency folder
dep_item = self.ctxt.platform.storage.download_resource(resource_file,
self.ctxt.get_local_work_dir())
end_time = time.monotonic()
self.logger.info('Completed downloading of dependency [%s] => %s seconds',
dep['name'],
f'{(end_time-start_time):,.3f}')
CspFs.copy_resources(download_result.resource, destination_cspath)
final_dep_csp = destination_cspath.create_sub_path(download_result.resource.base_name())
dep_item = final_dep_csp.no_scheme
return dep_item

def cache_all_dependencies(dep_arr: List[dict]):
@@ -593,7 +593,6 @@ def cache_all_dependencies(dep_arr: List[dict]):
raise ex
return results

# TODO: Verify the downloaded file by checking their MD5
deploy_mode = DeployMode.tostring(self.ctxt.get_deploy_mode())
depend_arr = self.get_rapids_tools_dependencies(deploy_mode, self.ctxt.platform.configs)
if depend_arr:
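
Because the removed FSUtil/tarfile-based code and the new StorageLib-based code are interleaved in the hunk above, the new download flow is easier to follow in isolation. The sketch below uses only the calls visible in this diff (DownloadTask, untar_file, CspFs.copy_resources, LocalPath); any other argument names, defaults, or behavior are assumptions, and logging is trimmed.

# Sketch of the new StorageLib-based download flow for a single dependency,
# reconstructed from the added lines of this diff.
from spark_rapids_tools.enums import HashAlgorithm
from spark_rapids_tools.storagelib import LocalPath, CspFs
from spark_rapids_tools.storagelib.tools.fs_utils import untar_file, FileHashAlgorithm
from spark_rapids_tools.utils.net_utils import DownloadTask


def download_dependency(dep: dict, dest_folder: str, work_dir: str) -> str:
    # build the optional verification options from the dependency's config entry
    verify_opts = {}
    dep_verification = dep.get('verification')
    if dep_verification is not None:
        if 'size' in dep_verification:
            verify_opts['size'] = dep_verification['size']
        hash_lib_alg = dep_verification.get('hashLib')
        if hash_lib_alg:
            verify_opts['file_hash'] = FileHashAlgorithm(HashAlgorithm(hash_lib_alg['type']),
                                                         hash_lib_alg['value'])
    # download (and verify) the resource into the cache folder
    download_task = DownloadTask(src_url=dep['uri'],  # pylint: disable=no-value-for-parameter
                                 dest_folder=dest_folder,
                                 verification=verify_opts)
    download_result = download_task.run_task()
    if not download_result.success:
        raise RuntimeError(f'Failed to download {dep["name"]}: {download_result.download_error}')
    destination_cspath = LocalPath(work_dir)
    if dep['type'] == 'archive':
        # archives are unpacked into the local work directory
        uncompressed_cspath = untar_file(download_result.resource, destination_cspath)
        dep_item = uncompressed_cspath.no_scheme
        if dep.get('relativePath') is not None:
            dep_item = f'{dep_item}/{dep["relativePath"]}'
    else:
        # jars are copied into the local work directory as-is
        CspFs.copy_resources(download_result.resource, destination_cspath)
        final_dep_csp = destination_cspath.create_sub_path(download_result.resource.base_name())
        dep_item = final_dep_csp.no_scheme
    return dep_item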
@@ -7,52 +7,78 @@
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"verification": {
"hashLib": {
"type": "sha512",
"value": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
},
"size": 400395283
},
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
"relativePath": "jars/*"
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
"verification": {
"hashLib": {
"type": "sha1",
"value": "a65839fbf1869f81a1632e09f415e586922e4f80"
},
"size": 962685
},
"type": "jar"
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
"verification": {
"hashLib": {
"type": "sha1",
"value": "02deec3a0ad83d13d032b1812421b23d7a961eea"
},
"size": 280645251
},
"type": "jar"
}
],
"333": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"verification": {
"hashLib": {
"type": "sha512",
"value": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9"
},
"size": 299426263
},
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
"relativePath": "jars/*"
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
"verification": {
"hashLib": {
"type": "sha1",
"value": "a65839fbf1869f81a1632e09f415e586922e4f80"
},
"size": 962685
},
"type": "jar"
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
"verification": {
"hashLib": {
"type": "sha1",
"value": "02deec3a0ad83d13d032b1812421b23d7a961eea"
},
"size": 280645251
},
"type": "jar"
}
]
}
Expand Down Expand Up @@ -205,34 +231,6 @@
}
}
},
"pricing": {
"catalog": {
"onlineResources": [
{
"resourceKey": "databricks-aws-catalog",
"onlineURL": "https://www.databricks.com/en-website-assets/data/pricing/AWS.json",
"//localFile": "the name of the file after downloading",
"localFile": "databricks-aws-catalog.json",
"backupArchive": {
"//description-1": "In case the file is stuck, we use this archive as a backup.",
"//description-2": "It is stored in the resources",
"archiveName": "databricks-aws-catalog.json"
}
},
{
"resourceKey": "ec2-catalog",
"onlineURL": "https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/us-west-2/index.json",
"//localFile": "the name of the file after downloading",
"localFile": "aws_ec2_catalog_ec2_us-west-2.json",
"backupArchive": {
"//description-1": "In case the file is stuck, we use this archive as a backup.",
"//description-2": "It is stored in the resources",
"archiveName": "aws_ec2_catalog_ec2_us-west-2.json"
}
}
]
}
},
"gpuConfigs": {
"user-tools": {
"supportedGpuInstances": {
@@ -7,36 +7,54 @@
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"verification": {
"hashLib": {
"type": "sha512",
"value": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
},
"size": 400395283
},
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
"relativePath": "jars/*"
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
"verification": {
"hashLib": {
"type": "sha1",
"value": "a23f621bca9b2100554150f6b0b521f94b8b419e"
},
"size": 574116
},
"type": "jar"
}
],
"333": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"verification": {
"hashLib": {
"type": "sha512",
"value": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9"
},
"size": 299426263
},
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
"relativePath": "jars/*"
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
"verification": {
"hashLib": {
"type": "sha1",
"value": "a23f621bca9b2100554150f6b0b521f94b8b419e"
},
"size": 574116
},
"type": "jar"
}
]
}
Expand Down Expand Up @@ -154,22 +172,6 @@
}
}
},
"pricing": {
"catalog": {
"onlineResources": [
{
"resourceKey": "premium-databricks-azure-catalog",
"onlineURL": "https://azure.microsoft.com/en-us/pricing/details/databricks/",
"//localFile": "the name of the local file",
"localFile": "premium-databricks-azure-catalog.json",
"backupArchive": {
"//description": "We use this archive as a backup. It is stored in the resources",
"archiveName": "premium-databricks-azure-catalog.json"
}
}
]
}
},
"gpuConfigs": {
"user-tools": {
"supportedGpuInstances": {