From 1b8f4dec13b59136d0a79d3ad1136aed9183af09 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 21 Mar 2024 13:16:49 -0400 Subject: [PATCH] apply code review suggestions --- .github/workflows/_e2e_tests.yml | 1 + chart/templates/_env/_envWorker.tpl | 2 ++ chart/templates/services/search/_container.tpl | 2 ++ e2e/Makefile | 2 ++ libs/libcommon/src/libcommon/utils.py | 7 +++---- services/search/Makefile | 1 + services/worker/Makefile | 2 +- tools/docker-compose-datasets-server.yml | 2 ++ tools/docker-compose-dev-datasets-server.yml | 2 ++ 9 files changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/workflows/_e2e_tests.yml b/.github/workflows/_e2e_tests.yml index 40bb9c166a..5559972579 100644 --- a/.github/workflows/_e2e_tests.yml +++ b/.github/workflows/_e2e_tests.yml @@ -66,6 +66,7 @@ jobs: S3_REGION_NAME: "us-east-1" CLOUDFRONT_KEY_PAIR_ID: "K3814DK2QUJ71H" CLOUDFRONT_PRIVATE_KEY: ${{ secrets.CLOUDFRONT_PRIVATE_KEY }} + HF_HUB_ENABLE_HF_TRANSFER: "1" run: docker compose -f docker-compose-datasets-server.yml up -d --wait --wait-timeout 20 working-directory: ./tools - name: Install poetry diff --git a/chart/templates/_env/_envWorker.tpl b/chart/templates/_env/_envWorker.tpl index 4648ed678a..af9cbf6580 100644 --- a/chart/templates/_env/_envWorker.tpl +++ b/chart/templates/_env/_envWorker.tpl @@ -112,4 +112,6 @@ value: {{ .Values.descriptiveStatistics.maxSplitSizeBytes | quote }} - name: DESCRIPTIVE_STATISTICS_CACHE_DIRECTORY value: {{ .Values.descriptiveStatistics.cacheDirectory | quote }} +- name: HF_HUB_ENABLE_HF_TRANSFER + value: 1 {{- end -}} diff --git a/chart/templates/services/search/_container.tpl b/chart/templates/services/search/_container.tpl index 0e66020f3f..60a997d95f 100644 --- a/chart/templates/services/search/_container.tpl +++ b/chart/templates/services/search/_container.tpl @@ -37,6 +37,8 @@ value: {{ .Values.duckDBIndex.cacheDirectory | quote }} - name: DUCKDB_INDEX_EXTENSIONS_DIRECTORY value: "/tmp/duckdb-extensions" + - name: HF_HUB_ENABLE_HF_TRANSFER + value: 1 volumeMounts: {{ include "volumeMountDuckDBIndexRW" . | nindent 2 }} securityContext: diff --git a/e2e/Makefile b/e2e/Makefile index 06245e09da..fbc720ac8e 100644 --- a/e2e/Makefile +++ b/e2e/Makefile @@ -46,6 +46,8 @@ export S3_REGION_NAME := us-east-1 # S3_ACCESS_KEY_ID <- secret # S3_SECRET_ACCESS_KEY <- secret +export HF_HUB_ENABLE_HF_TRANSFER := 1 + # makefile variables DOCKER_COMPOSE := ../tools/docker-compose-datasets-server.yml diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index a4d17f44ce..266183b233 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -11,7 +11,7 @@ from fnmatch import fnmatch from pathlib import Path from typing import Any, Optional, TypeVar, Union, cast - +from huggingface_hub import constants import orjson import pandas as pd from huggingface_hub import constants, hf_hub_download @@ -204,10 +204,9 @@ def download_file_from_hub( force_download: bool = False, resume_download: bool = False, ) -> None: - # Force hf_transfer usage - constants.HF_HUB_ENABLE_HF_TRANSFER = True logging.debug(f"Using {constants.HF_HUB_ENABLE_HF_TRANSFER} for hf_transfer") - retry_download_hub_file = retry(on=[ReadTimeout], sleeps=HF_HUB_HTTP_ERROR_RETRY_SLEEPS)(hf_hub_download) + retry_on = [RuntimeError] if constants.HF_HUB_ENABLE_HF_TRANSFER else [ReadTimeout] + retry_download_hub_file = retry(on=retry_on, sleeps=HF_HUB_HTTP_ERROR_RETRY_SLEEPS)(hf_hub_download) retry_download_hub_file( repo_type=repo_type, revision=revision, diff --git a/services/search/Makefile b/services/search/Makefile index ef267eefe6..fd795b8def 100644 --- a/services/search/Makefile +++ b/services/search/Makefile @@ -3,6 +3,7 @@ export COMPOSE_PROJECT_NAME := search export MONGO_PORT := 27033 export CACHE_MONGO_URL := mongodb://localhost:${MONGO_PORT} export QUEUE_MONGO_URL := mongodb://localhost:${MONGO_PORT} +export HF_HUB_ENABLE_HF_TRANSFER := 1 # makefile variables DOCKER_COMPOSE := ../../tools/docker-compose-mongo.yml TEST_PATH ?= tests diff --git a/services/worker/Makefile b/services/worker/Makefile index 6472cc0dc7..ee91b14307 100644 --- a/services/worker/Makefile +++ b/services/worker/Makefile @@ -3,7 +3,7 @@ export COMPOSE_PROJECT_NAME := worker export MONGO_PORT := 27040 export CACHE_MONGO_URL := mongodb://localhost:${MONGO_PORT} export QUEUE_MONGO_URL := mongodb://localhost:${MONGO_PORT} - +export HF_HUB_ENABLE_HF_TRANSFER := 1 # makefile variables DOCKER_COMPOSE := ../../tools/docker-compose-mongo.yml diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index 0a730b9cc5..146594695d 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -126,6 +126,7 @@ services: API_UVICORN_HOSTNAME: 0.0.0.0 # required for docker compose API_UVICORN_NUM_WORKERS: ${SEARCH_UVICORN_NUM_WORKERS-2} API_UVICORN_PORT: ${SEARCH_UVICORN_PORT-8083} + HF_HUB_ENABLE_HF_TRANSFER: 1 ports: # for debug - ${SEARCH_UVICORN_PORT-8083}:${SEARCH_UVICORN_PORT-8083} @@ -181,6 +182,7 @@ services: FIRST_ROWS_MIN_CELL_BYTES: ${FIRST_ROWS_MIN_CELL_BYTES-100} FIRST_ROWS_MIN_NUMBER: ${FIRST_ROWS_MIN_NUMBER-10} FIRST_ROWS_COLUMNS_MAX_NUMBER: ${FIRST_ROWS_COLUMNS_MAX_NUMBER-1_000} + HF_HUB_ENABLE_HF_TRANSFER: 1 OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} OPT_IN_OUT_URLS_SCAN_MAX_CONCURRENT_REQUESTS_NUMBER: ${OPT_IN_OUT_URLS_SCAN_MAX_CONCURRENT_REQUESTS_NUMBER-100} OPT_IN_OUT_URLS_SCAN_MAX_REQUESTS_PER_SECOND: ${OPT_IN_OUT_URLS_SCAN_MAX_REQUESTS_PER_SECOND-50} diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index 63ff9084a1..18e4de981d 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -134,6 +134,7 @@ services: API_UVICORN_HOSTNAME: 0.0.0.0 # required for docker compose API_UVICORN_NUM_WORKERS: ${SEARCH_UVICORN_NUM_WORKERS-2} API_UVICORN_PORT: ${SEARCH_UVICORN_PORT-8083} + HF_HUB_ENABLE_HF_TRANSFER: 1 ports: # for debug - ${SEARCH_UVICORN_PORT-8083}:${SEARCH_UVICORN_PORT-8083} @@ -196,6 +197,7 @@ services: FIRST_ROWS_MIN_CELL_BYTES: ${FIRST_ROWS_MIN_CELL_BYTES-100} FIRST_ROWS_MIN_NUMBER: ${FIRST_ROWS_MIN_NUMBER-10} FIRST_ROWS_COLUMNS_MAX_NUMBER: ${FIRST_ROWS_COLUMNS_MAX_NUMBER-1_000} + HF_HUB_ENABLE_HF_TRANSFER: 1 OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER: ${OPT_IN_OUT_URLS_SCAN_COLUMNS_MAX_NUMBER-10} OPT_IN_OUT_URLS_SCAN_MAX_CONCURRENT_REQUESTS_NUMBER: ${OPT_IN_OUT_URLS_SCAN_MAX_CONCURRENT_REQUESTS_NUMBER-100} OPT_IN_OUT_URLS_SCAN_MAX_REQUESTS_PER_SECOND: ${OPT_IN_OUT_URLS_SCAN_MAX_REQUESTS_PER_SECOND-50}