diff --git a/clarifai/runners/dockerfile_template/Dockerfile.nim.template b/clarifai/runners/dockerfile_template/Dockerfile.nim.template
new file mode 100644
index 00000000..7fe37a48
--- /dev/null
+++ b/clarifai/runners/dockerfile_template/Dockerfile.nim.template
@@ -0,0 +1,122 @@
+# Use an intermediate image to install pip and other dependencies
+FROM --platform=$TARGETPLATFORM public.ecr.aws/docker/library/python:${PYTHON_VERSION}-slim-bookworm as deps
+ENV DEBIAN_FRONTEND=noninteractive
+
+
+RUN python${PYTHON_VERSION} -m venv /venv && \
+  /venv/bin/pip install --disable-pip-version-check --upgrade pip setuptools wheel && \
+  ln -sf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 && \
+  apt-get clean && rm -rf /var/lib/apt/lists/*;
+
+# NGC API key (filled in by the templating system) used to authenticate with NVIDIA NGC
+ENV NGC_API_KEY=${NGC_API_KEY}
+
+# Use the NIM base image as another build stage
+FROM --platform=$TARGETPLATFORM ${BASE_IMAGE} as build
+
+# Final image based on distroless
+FROM gcr.io/distroless/python3-debian12:debug
+
+# virtual env
+COPY --from=deps /venv /venv
+# we have to overwrite the python3 binary that the distroless image uses
+COPY --from=deps /usr/local/bin/python${PYTHON_VERSION} /usr/bin/python3
+COPY --from=deps /usr/local/bin/python${PYTHON_VERSION} /usr/local/bin/python${PYTHON_VERSION}
+
+# Copy NIM files
+COPY --from=build /opt /opt
+COPY --from=build /etc/nim /etc/nim
+
+# Copy necessary binaries and libraries from the NIM base image
+COPY --from=build /bin/bash /bin/bash
+COPY --from=build /bin/ssh /bin/ssh
+COPY --from=build /usr/bin/ln /usr/bin/ln
+
+# also copy in all the lib files for it.
+COPY --from=build /lib /lib
+COPY --from=build /lib64 /lib64
+COPY --from=build /usr/lib/ /usr/lib/
+COPY --from=build /usr/local/lib/ /usr/local/lib/
+# ldconfig is needed to update the shared library cache so system libraries (like CUDA) can be found
+COPY --from=build /usr/sbin/ldconfig /sbin/ldconfig
+COPY --from=build /usr/sbin/ldconfig.real /sbin/ldconfig.real
+COPY --from=build /etc/ld.so.conf /etc/ld.so.conf
+COPY --from=build /etc/ld.so.cache /etc/ld.so.cache
+COPY --from=build /etc/ld.so.conf.d/ /etc/ld.so.conf.d/
+
+
+# Set environment variables
+ENV PYTHONPATH=/venv/lib/python3.10/site-packages:/opt/nim/llm/.venv/lib/python3.10/site-packages:/opt/nim/llm
+ENV PATH="/usr/local/bin:/venv/bin:/opt/nim/llm/.venv/bin:/opt/hpcx/ucc/bin:/opt/hpcx/ucx/bin:/opt/hpcx/ompi/bin:$PATH"
+
+ENV LD_LIBRARY_PATH="/opt/hpcx/ucc/lib/ucc:/opt/hpcx/ucc/lib:/opt/hpcx/ucx/lib/ucx:/opt/hpcx/ucx/lib:/opt/hpcx/ompi/lib:/opt/hpcx/ompi/lib/openmpi:/opt/nim/llm/.venv/lib/python3.10/site-packages/tensorrt_llm/libs:/opt/nim/llm/.venv/lib/python3.10/site-packages/nvidia/cublas/lib:/opt/nim/llm/.venv/lib/python3.10/site-packages/tensorrt_libs:/opt/nim/llm/.venv/lib/python3.10/site-packages/nvidia/nccl/lib:$LD_LIBRARY_PATH"
+
+ENV LIBRARY_PATH=/opt/hpcx/ucc/lib:/opt/hpcx/ucx/lib:/opt/hpcx/ompi/lib:$LIBRARY_PATH
+
+ENV CPATH=/opt/hpcx/ompi/include:/opt/hpcx/ucc/include:/opt/hpcx/ucx/include:$CPATH
+ENV LLM_PROJECT_DIR=/opt/nim/llm
+
+# Set environment variables for MPI
+ENV OMPI_HOME=/opt/hpcx/ompi
+ENV HPCX_MPI_DIR=/opt/hpcx/ompi
+ENV MPI_HOME=/opt/hpcx/ompi
+ENV OPAL_PREFIX=/opt/hpcx/ompi
+
+# Set environment variables for UCC
+ENV UCC_DIR=/opt/hpcx/ucc/lib/cmake/ucc
+ENV UCC_HOME=/opt/hpcx/ucc
+ENV HPCX_UCC_DIR=/opt/hpcx/ucc
+ENV USE_UCC=1
+ENV USE_SYSTEM_UCC=1
+
+# Set environment variables for HPC-X
+ENV HPCX_DIR=/opt/hpcx
+ENV HPCX_UCX_DIR=/opt/hpcx/ucx
+ENV HPCX_MPI_DIR=/opt/hpcx/ompi
+
+# Set environment variables for UCX
+ENV UCX_DIR=/opt/hpcx/ucx/lib/cmake/ucx
+ENV UCX_HOME=/opt/hpcx/ucx
+
+ENV HOME=/opt/nim/llm
+
+SHELL ["/bin/bash", "-c"]
+
+# These will be set by the templating system.
+ENV CLARIFAI_PAT=${CLARIFAI_PAT}
+ENV CLARIFAI_USER_ID=${CLARIFAI_USER_ID}
+ENV CLARIFAI_RUNNER_ID=${CLARIFAI_RUNNER_ID}
+ENV CLARIFAI_NODEPOOL_ID=${CLARIFAI_NODEPOOL_ID}
+ENV CLARIFAI_COMPUTE_CLUSTER_ID=${CLARIFAI_COMPUTE_CLUSTER_ID}
+ENV CLARIFAI_API_BASE=${CLARIFAI_API_BASE}
+
+#############################
+# User specific requirements
+#############################
+COPY requirements.txt .
+
+# Install requirements and the clarifai package, and clean up before leaving this layer.
+# Note(zeiler): this could be in a future template as {{model_python_deps}}
+RUN pip install --no-cache-dir -r requirements.txt && \
+  pip install --no-cache-dir clarifai
+
+# Set the NUMBA cache dir to /tmp
+ENV NUMBA_CACHE_DIR=/tmp/numba_cache
+ENV LOCAL_NIM_CACHE=/tmp/nim_cache
+
+
+# Set the working directory to /app
+WORKDIR /app
+
+# Copy the current folder into /app/model_dir that the SDK will expect.
+# Note(zeiler): would be nice to exclude checkpoints in case they were pre-downloaded.
+COPY . /app/model_dir/${name}
+
+# Add the model directory to the python path.
+ENV PYTHONPATH=${PYTHONPATH}:/app/model_dir/${name}
+
+ENTRYPOINT ["python", "-m", "clarifai.runners.server"]
+
+# Finally run the clarifai entrypoint to start the runner loop and local dev server.
+# Note(zeiler): we may want to make this a clarifai CLI call.
+CMD ["--model_path", "/app/model_dir/main"]
diff --git a/clarifai/runners/models/model_upload.py b/clarifai/runners/models/model_upload.py
index 014f8a20..f0228300 100644
--- a/clarifai/runners/models/model_upload.py
+++ b/clarifai/runners/models/model_upload.py
@@ -101,9 +101,10 @@ def _validate_config_checkpoints(self):
     assert "type" in self.config.get("checkpoints"), "No loader type specified in the config file"
     loader_type = self.config.get("checkpoints").get("type")
+    loader_type = loader_type.lower() if loader_type else None
     if not loader_type:
       logger.info("No loader type specified in the config file for checkpoints")
-    assert loader_type == "huggingface", "Only huggingface loader supported for now"
+    if loader_type == "huggingface":
       assert "repo_id" in self.config.get("checkpoints"), "No repo_id specified in the config file"
       repo_id = self.config.get("checkpoints").get("repo_id")
@@ -114,6 +115,15 @@
       else:
         hf_token = self.config.get("checkpoints").get("hf_token", None)
       return repo_id, hf_token
+    elif loader_type in ("nvidia-nim", "nim"):
+      assert "nim_image" in self.config.get(
+          "checkpoints"), "No nim_image specified in the config file"
+      assert "ngc_api_key" in self.config.get(
+          "checkpoints"), "No ngc_api_key specified in the config file"
+
+      nim_image = self.config.get("checkpoints").get("nim_image")
+      ngc_api_key = self.config.get("checkpoints").get("ngc_api_key")
+      return nim_image, ngc_api_key
 
   @property
   def client(self):
@@ -218,11 +228,21 @@ def _parse_requirements(self):
     return deendencies_version
 
   def create_dockerfile(self):
-    dockerfile_template = os.path.join(
-        os.path.dirname(os.path.dirname(__file__)),
-        'dockerfile_template',
-        'Dockerfile.template',
-    )
+    loader_type = None
+    if self.config.get("checkpoints"):
+      loader_type = (self.config.get("checkpoints").get("type") or "").lower()
+    if loader_type in ("nvidia-nim", "nim"):
+      dockerfile_template = os.path.join(
+          os.path.dirname(os.path.dirname(__file__)),
+          'dockerfile_template',
+          'Dockerfile.nim.template',
+      )
+    else:
+      dockerfile_template = os.path.join(
+          os.path.dirname(os.path.dirname(__file__)),
+          'dockerfile_template',
+          'Dockerfile.template',
+      )
 
     with open(dockerfile_template, 'r') as template_file:
       dockerfile_template = template_file.read()
@@ -248,24 +268,32 @@ def create_dockerfile(self):
 
     base_image = self.PYTHON_BASE_IMAGE.format(python_version=python_version)
 
-    # Parse the requirements.txt file to determine the base image
-    dependencies = self._parse_requirements()
-    if 'torch' in dependencies and dependencies['torch']:
-      torch_version = dependencies['torch']
-
-      for image in self.AVAILABLE_TORCH_IMAGES:
-        if torch_version in image and f'py{python_version}' in image:
-          base_image = self.TORCH_BASE_IMAGE.format(
-              torch_version=torch_version,
-              python_version=python_version,
-              cuda_version=self.DEFAULT_CUDA_VERSION)
-          logger.info(f"Using Torch version {torch_version} base image to build the Docker image")
-          break
+    if loader_type in ("nvidia-nim", "nim"):
+
+      base_image, ngc_api_key = self._validate_config_checkpoints()
+    else:
+
+      # Parse the requirements.txt file to determine the base image
+      dependencies = self._parse_requirements()
+      if 'torch' in dependencies and dependencies['torch']:
+        torch_version = dependencies['torch']
+
+        for image in self.AVAILABLE_TORCH_IMAGES:
+          if torch_version in image and f'py{python_version}' in image:
+            base_image = self.TORCH_BASE_IMAGE.format(
+                torch_version=torch_version,
+                python_version=python_version,
+                cuda_version=self.DEFAULT_CUDA_VERSION)
+            logger.info(
+                f"Using Torch version {torch_version} base image to build the Docker image")
+            break
 
     # Replace placeholders with actual values
     dockerfile_content = dockerfile_template.safe_substitute(
         name='main',
         BASE_IMAGE=base_image,
+        NGC_API_KEY=ngc_api_key if loader_type in ("nvidia-nim", "nim") else None,
+        PYTHON_VERSION=python_version,
     )
 
     # Write Dockerfile
@@ -363,7 +391,8 @@ def upload_model_version(self, download_checkpoints):
       logger.info(
          f"Model type {model_type_id} requires concepts to be specified in the config.yaml file.."
       )
-    if self.config.get("checkpoints"):
+    if self.config.get("checkpoints") and self.config.get("checkpoints").get(
+        "type") == "huggingface":
       logger.info(
           "Checkpoints specified in the config.yaml file, will download the HF model's config.json file to infer the concepts."
       )
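
For reference, the `checkpoints` section of `config.yaml` that routes a model through the new NIM path would look roughly like the sketch below. The keys (`type`, `nim_image`, `ngc_api_key`) are the ones `_validate_config_checkpoints` asserts on; the image reference and key value are hypothetical placeholders, not values taken from this diff:

```yaml
checkpoints:
  type: nim  # "nvidia-nim" also accepted; lowercased before comparison
  nim_image: nvcr.io/nim/example/llm:latest  # hypothetical image reference; becomes BASE_IMAGE
  ngc_api_key: <your-ngc-api-key>  # templated into the NGC_API_KEY env var
```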
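
`create_dockerfile` fills the template via `safe_substitute`, which replaces only the keys it is given and leaves unknown placeholders intact instead of raising `KeyError` (assuming the elided code wraps the file contents in `string.Template`, which the `.safe_substitute` call implies). A minimal, self-contained sketch of that behavior with hypothetical values:

```python
from string import Template

# Toy excerpt standing in for Dockerfile.nim.template (not the full file).
template = Template("FROM --platform=$TARGETPLATFORM ${BASE_IMAGE} as build\n"
                    "ENV NGC_API_KEY=${NGC_API_KEY}\n"
                    "ENV CLARIFAI_PAT=${CLARIFAI_PAT}\n"
                    "COPY . /app/model_dir/${name}\n")

# safe_substitute() fills only the keys it is given; $TARGETPLATFORM and
# ${CLARIFAI_PAT} are left untouched rather than raising KeyError, so
# BuildKit and the runner environment can resolve them later.
print(template.safe_substitute(
    name='main',
    BASE_IMAGE='nvcr.io/nim/example/llm:latest',  # hypothetical NIM image
    NGC_API_KEY='<ngc-api-key>',
))
```

This is also why the `CLARIFAI_*` `ENV` lines survive templating even though they are absent from the `safe_substitute` call: they remain as placeholders for values injected when the container is deployed.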