diff --git a/metaflow/metaflow_environment.py b/metaflow/metaflow_environment.py index 586ea7a740..f4093eb682 100644 --- a/metaflow/metaflow_environment.py +++ b/metaflow/metaflow_environment.py @@ -126,6 +126,7 @@ def get_package_commands( "after 6 tries. Exiting...' && exit 1; " "fi" % code_package_url, "TAR_OPTIONS='--warning=no-timestamp' tar xf job.tar", + f"mflog {code_package_url=}", "mflog 'Task is starting.'", ] return cmds diff --git a/metaflow/plugins/kfp/kfp.py b/metaflow/plugins/kfp/kfp.py index 460ab3f0dd..54a37482fa 100644 --- a/metaflow/plugins/kfp/kfp.py +++ b/metaflow/plugins/kfp/kfp.py @@ -18,6 +18,7 @@ V1EnvVar, V1EnvVarSource, V1EmptyDirVolumeSource, + V1NFSVolumeSource, V1NodeAffinity, V1NodeSelector, V1NodeSelectorRequirement, @@ -297,11 +298,11 @@ def to_k8s_resource_format(resource: str, value: Union[int, float, str]) -> str: resource_requirements = {} for deco in node.decorators: if isinstance(deco, ResourcesDecorator): - if deco.attributes.get("local_storage") is not None: - raise ValueError( # Not using DeprecationWarning to hard block the run before triggering. - "`local_storage` option is deprecated over cluster stability concerns. " - "Please use `volume` for storage request." - ) + # if deco.attributes.get("local_storage") is not None: + # raise ValueError( # Not using DeprecationWarning to hard block the run before triggering. + # "`local_storage` option is deprecated over cluster stability concerns. " + # "Please use `volume` for storage request." + # ) for attr_key, attr_value in deco.attributes.items(): if attr_value is not None: @@ -538,6 +539,14 @@ def _set_container_resources( resource_requirements["gpu"], vendor=gpu_vendor if gpu_vendor else "nvidia", ) + if "local_storage" in resource_requirements: + # TODO(talebz): remove, as this is for RMX EFS persisted GPU training + container_op.container.set_ephemeral_storage_request( + resource_requirements["local_storage"] + ) + container_op.container.set_ephemeral_storage_limit( + resource_requirements["local_storage"] + ) if "shared_memory" in resource_requirements: memory_volume = PipelineVolume( @@ -554,6 +563,22 @@ def _set_container_resources( ) container_op.add_pvolumes({"dev/shm": memory_volume}) + if "nfs_server" in resource_requirements: + nfs_volume = PipelineVolume( + volume=V1Volume( + # k8s volume name must consist of lower case alphanumeric characters or '-', + # and must start and end with an alphanumeric character, + # but step name is python function name that tends to be alphanumeric chars with '_' + name=f"{kfp_component.step_name.lower().replace('_', '-')}-nfs", + nfs=V1NFSVolumeSource( + read_only=True, + path=resource_requirements["nfs_path"], + server=resource_requirements["nfs_server"], + ), + ) + ) + container_op.add_pvolumes({"/mnt/nfs": nfs_volume}) + affinity_match_expressions: List[V1NodeSelectorRequirement] = [] if kfp_component.accelerator_decorator: @@ -714,6 +739,9 @@ def _set_container_labels(self, container_op: ContainerOp, metaflow_run_id: str) ) container_op.add_pod_annotation(annotation_name, annotation_value) + # TODO(talebz): remove, as this is for RMX EFS persisted GPU training + container_op.add_pod_label("aip.zillowgroup.net/performance", "efs_test") + # tags.ledger.zgtools.net/* pod labels required for the ZGCP Costs Ledger container_op.add_pod_label("tags.ledger.zgtools.net/ai-flow-name", self.name) container_op.add_pod_label( diff --git a/metaflow/plugins/resources_decorator.py b/metaflow/plugins/resources_decorator.py index cfe23f4196..6c4a1ae238 100644 --- a/metaflow/plugins/resources_decorator.py +++ b/metaflow/plugins/resources_decorator.py @@ -95,4 +95,6 @@ def myStep(self): "volume_type": None, # Deprecated - kept only to show a meaningful error message "local_storage": None, + "nfs_path": None, + "nfs_server": None, }