From 2547434b2d1e09e654fc8430c1bdd9f3a9d25de3 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 5 Oct 2023 15:47:08 -0400 Subject: [PATCH 1/2] force download when previous file is obsolete --- libs/libapi/src/libapi/duckdb.py | 1 + .../src/worker/job_runners/split/descriptive_statistics.py | 1 + services/worker/src/worker/job_runners/split/duckdb_index.py | 1 + 3 files changed, 3 insertions(+) diff --git a/libs/libapi/src/libapi/duckdb.py b/libs/libapi/src/libapi/duckdb.py index aea824bfdf..148cd48ed8 100644 --- a/libs/libapi/src/libapi/duckdb.py +++ b/libs/libapi/src/libapi/duckdb.py @@ -83,6 +83,7 @@ def download_index_file( local_dir_use_symlinks=False, token=hf_token, cache_dir=cache_folder, + force_download=True, ) diff --git a/services/worker/src/worker/job_runners/split/descriptive_statistics.py b/services/worker/src/worker/job_runners/split/descriptive_statistics.py index baf8fb4aba..a8b7714062 100644 --- a/services/worker/src/worker/job_runners/split/descriptive_statistics.py +++ b/services/worker/src/worker/job_runners/split/descriptive_statistics.py @@ -443,6 +443,7 @@ def compute_descriptive_statistics_response( local_dir_use_symlinks=False, token=hf_token, cache_dir=local_parquet_directory, + force_download=True, ) local_parquet_glob_path = Path(local_parquet_directory) / config / f"{split}/*.parquet" diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index be45646670..51326695a1 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -171,6 +171,7 @@ def compute_index_rows( local_dir_use_symlinks=False, token=hf_token, cache_dir=duckdb_index_file_directory, + force_download=True, ) all_split_parquets = f"{duckdb_index_file_directory}/{config}/{split_directory}/*.parquet" From 1784dc15fc6648955eefdeeb438ba12fd0f2cc68 Mon Sep 17 00:00:00 2001 From: Andrea Soria Date: Thu, 5 Oct 2023 15:53:09 -0400 Subject: [PATCH 2/2] resume download false --- libs/libapi/src/libapi/duckdb.py | 1 - .../src/worker/job_runners/split/descriptive_statistics.py | 1 + services/worker/src/worker/job_runners/split/duckdb_index.py | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/libapi/src/libapi/duckdb.py b/libs/libapi/src/libapi/duckdb.py index 148cd48ed8..aea824bfdf 100644 --- a/libs/libapi/src/libapi/duckdb.py +++ b/libs/libapi/src/libapi/duckdb.py @@ -83,7 +83,6 @@ def download_index_file( local_dir_use_symlinks=False, token=hf_token, cache_dir=cache_folder, - force_download=True, ) diff --git a/services/worker/src/worker/job_runners/split/descriptive_statistics.py b/services/worker/src/worker/job_runners/split/descriptive_statistics.py index a8b7714062..5051a9b6f5 100644 --- a/services/worker/src/worker/job_runners/split/descriptive_statistics.py +++ b/services/worker/src/worker/job_runners/split/descriptive_statistics.py @@ -444,6 +444,7 @@ def compute_descriptive_statistics_response( token=hf_token, cache_dir=local_parquet_directory, force_download=True, + resume_download=False, ) local_parquet_glob_path = Path(local_parquet_directory) / config / f"{split}/*.parquet" diff --git a/services/worker/src/worker/job_runners/split/duckdb_index.py b/services/worker/src/worker/job_runners/split/duckdb_index.py index 51326695a1..47fc34c03a 100644 --- a/services/worker/src/worker/job_runners/split/duckdb_index.py +++ b/services/worker/src/worker/job_runners/split/duckdb_index.py @@ -172,6 +172,7 @@ def compute_index_rows( token=hf_token, cache_dir=duckdb_index_file_directory, force_download=True, + resume_download=False, ) all_split_parquets = f"{duckdb_index_file_directory}/{config}/{split_directory}/*.parquet"