From 4e2532902bfa74bba9759ac79daa4e851ec95b90 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Wed, 22 Apr 2020 14:11:12 +0300 Subject: [PATCH 01/30] handle http 408 in blob file upload by retrying in small chunk size --- storage/google_cloud/google_cloud_utils.py | 27 ++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 6b15f43..5adb1e2 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -2,6 +2,7 @@ from google.cloud import storage from core_data_modules.logging import Logger +from googleapiclient.errors import HttpError log = Logger(__name__) @@ -85,8 +86,30 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f): :param f: File to upload, opened in binary mode. :type f: file-like """ + log.info(f"Uploading file to blob '{target_blob_url}'...") storage_client = storage.Client.from_service_account_json(bucket_credentials_file_path) blob = _blob_at_url(storage_client, target_blob_url) - blob.upload_from_file(f) - log.info(f"Uploaded file to blob") + + try: + blob.upload_from_file(f) + log.info(f"Uploaded file to blob") + + except HttpError as ex: + if ex.resp.status != 408: + raise ex + + num_retries = 0 + if num_retries > 7: # retry up-to the default of deprecated num_tries=6 + + log.info(f"Retrying no. {num_retries} to upload file to blob '{target_blob_url}") + + # lower the default chunk size and retry uploading + blob.chunk_size = 50 * 1024 * 1024 # Set the chunks size to half the default size (100Mb) + blob.upload_from_file(f,) + num_retries +=1 + log.info(f"Uploaded file to blob") + + else: + log.error(f"Retried the {num_retries} of times") + raise ex From 2a54cee3262f783d8db851759fb92fe395d98a59 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Wed, 22 Apr 2020 16:09:54 +0300 Subject: [PATCH 02/30] retry to upload by half the previous run chunk_size --- storage/google_cloud/google_cloud_utils.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 5adb1e2..659c50f 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -86,7 +86,6 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f): :param f: File to upload, opened in binary mode. :type f: file-like """ - log.info(f"Uploading file to blob '{target_blob_url}'...") storage_client = storage.Client.from_service_account_json(bucket_credentials_file_path) blob = _blob_at_url(storage_client, target_blob_url) @@ -100,16 +99,15 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f): raise ex num_retries = 0 - if num_retries > 7: # retry up-to the default of deprecated num_tries=6 - - log.info(f"Retrying no. {num_retries} to upload file to blob '{target_blob_url}") - - # lower the default chunk size and retry uploading - blob.chunk_size = 50 * 1024 * 1024 # Set the chunks size to half the default size (100Mb) - blob.upload_from_file(f,) - num_retries +=1 - log.info(f"Uploaded file to blob") - + chunk_sizes= [50, 25, 12.5, 6.25] + if num_retries != 5: + for chunk_size in chunk_sizes: + log.info(f"Retrying to upload file to blob '{target_blob_url}") + # lower the chunk size and retry uploading + blob.chunk_size = chunk_size * 1024 * 1024 + blob.upload_from_file(f,) + num_retries +=1 + log.info(f"Uploaded file to blob") else: - log.error(f"Retried the {num_retries} of times") + log.error(f"Retried {num_retries} of times") raise ex From 1cd1ce7d18fb983ed70340138edf5adb044f3c90 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Wed, 22 Apr 2020 18:40:40 +0300 Subject: [PATCH 03/30] retry to upload 5 times only --- storage/google_cloud/google_cloud_utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 659c50f..aa0c25d 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -98,16 +98,18 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f): if ex.resp.status != 408: raise ex - num_retries = 0 - chunk_sizes= [50, 25, 12.5, 6.25] - if num_retries != 5: + chunk_sizes = [50, 25, 12.5, 6.25] + num_tries = 0 + if num_tries != 5: for chunk_size in chunk_sizes: log.info(f"Retrying to upload file to blob '{target_blob_url}") # lower the chunk size and retry uploading blob.chunk_size = chunk_size * 1024 * 1024 - blob.upload_from_file(f,) - num_retries +=1 - log.info(f"Uploaded file to blob") - else: - log.error(f"Retried {num_retries} of times") + blob.upload_from_file(f) + if ex.resp.status == 200: + log.info(f"Uploaded file to blob") + break + num_tries += 1 + else : + log.error(f"Retried 5 times") raise ex From e002ee19c9f5af6bff705fa7872b4847f4beca26 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Thu, 23 Apr 2020 11:09:11 +0300 Subject: [PATCH 04/30] catch socket timeout error --- storage/google_cloud/google_cloud_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index aa0c25d..1b60b0e 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -3,6 +3,7 @@ from google.cloud import storage from core_data_modules.logging import Logger from googleapiclient.errors import HttpError +import socket log = Logger(__name__) @@ -94,10 +95,11 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f): blob.upload_from_file(f) log.info(f"Uploaded file to blob") - except HttpError as ex: - if ex.resp.status != 408: + except HttpError or socket.timeout as ex: + if ex.resp.status not in [408, 504]: raise ex + log.warning(f"Failed to upload due to {ex.resp.status}") chunk_sizes = [50, 25, 12.5, 6.25] num_tries = 0 if num_tries != 5: From 7b31b1fcf1ab3d26c4120e16be1acf24ad1df0e2 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Thu, 23 Apr 2020 15:42:34 +0300 Subject: [PATCH 05/30] handle request related error --- storage/google_cloud/google_cloud_utils.py | 38 ++++++++-------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 1b60b0e..4bfcb1b 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -2,7 +2,7 @@ from google.cloud import storage from core_data_modules.logging import Logger -from googleapiclient.errors import HttpError +from requests import ConnectionError, Timeout import socket log = Logger(__name__) @@ -76,7 +76,7 @@ def download_blob_to_file(bucket_credentials_file_path, blob_url, f): log.info(f"Downloaded blob to file") -def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f): +def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=5, blob_chunk_size=100): """ Uploads a file to a Google Cloud Storage blob. @@ -87,31 +87,21 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f): :param f: File to upload, opened in binary mode. :type f: file-like """ - log.info(f"Uploading file to blob '{target_blob_url}'...") - storage_client = storage.Client.from_service_account_json(bucket_credentials_file_path) - blob = _blob_at_url(storage_client, target_blob_url) - try: + log.info(f"Uploading file to blob '{target_blob_url}'...") + storage_client = storage.Client.from_service_account_json(bucket_credentials_file_path) + blob = _blob_at_url(storage_client, target_blob_url) + blob.chunk_size = blob_chunk_size * 1024 * 1024 blob.upload_from_file(f) log.info(f"Uploaded file to blob") - except HttpError or socket.timeout as ex: - if ex.resp.status not in [408, 504]: - raise ex - - log.warning(f"Failed to upload due to {ex.resp.status}") - chunk_sizes = [50, 25, 12.5, 6.25] - num_tries = 0 - if num_tries != 5: - for chunk_size in chunk_sizes: - log.info(f"Retrying to upload file to blob '{target_blob_url}") - # lower the chunk size and retry uploading - blob.chunk_size = chunk_size * 1024 * 1024 - blob.upload_from_file(f) - if ex.resp.status == 200: - log.info(f"Uploaded file to blob") - break - num_tries += 1 - else : + except ConnectionError or socket.timeout or Timeout as ex: + log.warning(f"Failed to upload due to {ex.resp.status} (connection error)") + if max_retries > 0: + log.info(f"Retrying {max_retries} more times with a reduced chunk_size of 10MiB") + # lower the chunk size and retry uploading + upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, + max_retries - 1, blob_chunk_size=10) + else: log.error(f"Retried 5 times") raise ex From bcac7f216c1a5ea6679f1a8b306d0da16ad7fe81 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Thu, 23 Apr 2020 16:50:24 +0300 Subject: [PATCH 06/30] fix attribute error --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 4bfcb1b..96b3e99 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -96,7 +96,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re log.info(f"Uploaded file to blob") except ConnectionError or socket.timeout or Timeout as ex: - log.warning(f"Failed to upload due to {ex.resp.status} (connection error)") + log.warning("Failed to upload due to connection error") if max_retries > 0: log.info(f"Retrying {max_retries} more times with a reduced chunk_size of 10MiB") # lower the chunk size and retry uploading From 743786045dfa8a6eea3ce5685346e38aee355f4b Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Thu, 23 Apr 2020 18:02:14 +0300 Subject: [PATCH 07/30] resume uploading from beginning while retrying --- storage/google_cloud/google_cloud_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 96b3e99..c07183b 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -99,8 +99,8 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re log.warning("Failed to upload due to connection error") if max_retries > 0: log.info(f"Retrying {max_retries} more times with a reduced chunk_size of 10MiB") - # lower the chunk size and retry uploading - upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, + # lower the chunk size and start uploading from beginning resumable_media requires so + upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f.seek(0), max_retries - 1, blob_chunk_size=10) else: log.error(f"Retried 5 times") From 573296ca85b65c2bcbf9717562cb9cefb9393dfd Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Thu, 23 Apr 2020 18:56:00 +0300 Subject: [PATCH 08/30] offset file position before retrying --- storage/google_cloud/google_cloud_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index c07183b..d83a37e 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -99,8 +99,9 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re log.warning("Failed to upload due to connection error") if max_retries > 0: log.info(f"Retrying {max_retries} more times with a reduced chunk_size of 10MiB") - # lower the chunk size and start uploading from beginning resumable_media requires so - upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f.seek(0), + # lower the chunk size and start uploading from beginning because resumable_media requires so + f.seek(0) + upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries - 1, blob_chunk_size=10) else: log.error(f"Retried 5 times") From 80af07f802db34303edd3ab84d3e3a2d0d926a7a Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Fri, 24 Apr 2020 10:51:28 +0300 Subject: [PATCH 09/30] retry 3 times at half the blob size each time --- storage/google_cloud/google_cloud_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index d83a37e..6f40ab9 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -76,7 +76,7 @@ def download_blob_to_file(bucket_credentials_file_path, blob_url, f): log.info(f"Downloaded blob to file") -def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=5, blob_chunk_size=100): +def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=3, blob_chunk_size=100.0): """ Uploads a file to a Google Cloud Storage blob. @@ -86,6 +86,10 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re :type target_blob_url: str :param f: File to upload, opened in binary mode. :type f: file-like + :param max_retries: maximum number of times to retry uploading the file. + :type max_retries: int + :param blob_chunk_size: the size of a chunk of data whenever iterating (in MiB). + :type blob_chunk_size: float """ try: log.info(f"Uploading file to blob '{target_blob_url}'...") @@ -98,11 +102,11 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re except ConnectionError or socket.timeout or Timeout as ex: log.warning("Failed to upload due to connection error") if max_retries > 0: - log.info(f"Retrying {max_retries} more times with a reduced chunk_size of 10MiB") + log.info(f"Retrying {max_retries} more times with a reduced chunk_size of {blob_chunk_size}MiB") # lower the chunk size and start uploading from beginning because resumable_media requires so f.seek(0) upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, - max_retries - 1, blob_chunk_size=10) + max_retries - 1, blob_chunk_size/2) else: - log.error(f"Retried 5 times") + log.error(f"Retried 3 times") raise ex From 69c17340bbb61adcc88c70508b1168c2ae613a66 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Fri, 24 Apr 2020 12:04:02 +0300 Subject: [PATCH 10/30] change blob_chunk_size to int - resumable media expects an int --- storage/google_cloud/google_cloud_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 6f40ab9..a98021a 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -76,7 +76,7 @@ def download_blob_to_file(bucket_credentials_file_path, blob_url, f): log.info(f"Downloaded blob to file") -def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=3, blob_chunk_size=100.0): +def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=3, blob_chunk_size=100): """ Uploads a file to a Google Cloud Storage blob. @@ -106,7 +106,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re # lower the chunk size and start uploading from beginning because resumable_media requires so f.seek(0) upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, - max_retries - 1, blob_chunk_size/2) + max_retries - 1, blob_chunk_size - 30) else: log.error(f"Retried 3 times") raise ex From 9763dd54268912f5b430129aace81708cf053837 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Fri, 24 Apr 2020 13:13:11 +0300 Subject: [PATCH 11/30] fix log variable --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index a98021a..1620fc9 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -102,7 +102,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re except ConnectionError or socket.timeout or Timeout as ex: log.warning("Failed to upload due to connection error") if max_retries > 0: - log.info(f"Retrying {max_retries} more times with a reduced chunk_size of {blob_chunk_size}MiB") + log.info(f"Retrying {max_retries} more times with a reduced chunk_size of {blob_chunk_size - 30}MiB") # lower the chunk size and start uploading from beginning because resumable_media requires so f.seek(0) upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, From bfdfa6b4785816c7993cee08c515f75c561fca2c Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Fri, 24 Apr 2020 14:30:47 +0300 Subject: [PATCH 12/30] append FAILED to upload failed files --- storage/google_cloud/google_cloud_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 1620fc9..e74c91b 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -4,6 +4,7 @@ from core_data_modules.logging import Logger from requests import ConnectionError, Timeout import socket +import os log = Logger(__name__) @@ -109,4 +110,5 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re max_retries - 1, blob_chunk_size - 30) else: log.error(f"Retried 3 times") + os.rename(f, f'FAILED_{f}') raise ex From a8a6337c2048b4a5e305464c490abd576d50287e Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Fri, 24 Apr 2020 15:52:06 +0300 Subject: [PATCH 13/30] fetch the correct file name before renaming --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index e74c91b..4e87c67 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -110,5 +110,5 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re max_retries - 1, blob_chunk_size - 30) else: log.error(f"Retried 3 times") - os.rename(f, f'FAILED_{f}') + os.rename(f.name, f'FAILED_{f}') raise ex From bf6cfb4e82db82339fc48a028438d7c79737abe6 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Fri, 24 Apr 2020 18:00:53 +0300 Subject: [PATCH 14/30] reduce max_retries to 2 - (testing purposes only) --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 4e87c67..dc95e95 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -77,7 +77,7 @@ def download_blob_to_file(bucket_credentials_file_path, blob_url, f): log.info(f"Downloaded blob to file") -def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=3, blob_chunk_size=100): +def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=2, blob_chunk_size=100): """ Uploads a file to a Google Cloud Storage blob. From 4db7dd4a816c8ad45a2e1a4e55d6040fb2c763ad Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Fri, 24 Apr 2020 23:07:08 +0300 Subject: [PATCH 15/30] fix file name attribute --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index dc95e95..4b65967 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -110,5 +110,5 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re max_retries - 1, blob_chunk_size - 30) else: log.error(f"Retried 3 times") - os.rename(f.name, f'FAILED_{f}') + os.rename(f.name, f'FAILED_{f.name}') raise ex From 4b6b319f7a0456a8847dbebcd7dd77f75a684a1f Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Sat, 25 Apr 2020 18:28:54 +0300 Subject: [PATCH 16/30] return a custom upload status to aid in post upload events --- storage/google_cloud/google_cloud_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 4b65967..e5bf242 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -99,6 +99,9 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re blob.chunk_size = blob_chunk_size * 1024 * 1024 blob.upload_from_file(f) log.info(f"Uploaded file to blob") + upload_status = "success" + + return upload_status except ConnectionError or socket.timeout or Timeout as ex: log.warning("Failed to upload due to connection error") @@ -110,5 +113,6 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re max_retries - 1, blob_chunk_size - 30) else: log.error(f"Retried 3 times") - os.rename(f.name, f'FAILED_{f.name}') - raise ex + upload_status = "failed" + + return upload_status From 64782ff5e7cc4c018ab5ad69671b51f914ebad0e Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Sat, 25 Apr 2020 18:39:45 +0300 Subject: [PATCH 17/30] update error log texts -> "Failed to upload after retrying 3 times! " --- storage/google_cloud/google_cloud_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index e5bf242..d753d60 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -104,7 +104,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re return upload_status except ConnectionError or socket.timeout or Timeout as ex: - log.warning("Failed to upload due to connection error") + log.warning("Failed to upload due to connection error!") if max_retries > 0: log.info(f"Retrying {max_retries} more times with a reduced chunk_size of {blob_chunk_size - 30}MiB") # lower the chunk size and start uploading from beginning because resumable_media requires so @@ -112,7 +112,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries - 1, blob_chunk_size - 30) else: - log.error(f"Retried 3 times") + log.error(f"Failed to upload after retrying 3 times!") upload_status = "failed" return upload_status From f6de3080e001eaddaca15acabb0df34c4cba145b Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Sat, 25 Apr 2020 19:26:09 +0300 Subject: [PATCH 18/30] fix return statement --- storage/google_cloud/google_cloud_utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index d753d60..e61ed18 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -92,6 +92,9 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re :param blob_chunk_size: the size of a chunk of data whenever iterating (in MiB). :type blob_chunk_size: float """ + + upload_status = None + try: log.info(f"Uploading file to blob '{target_blob_url}'...") storage_client = storage.Client.from_service_account_json(bucket_credentials_file_path) @@ -101,8 +104,6 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re log.info(f"Uploaded file to blob") upload_status = "success" - return upload_status - except ConnectionError or socket.timeout or Timeout as ex: log.warning("Failed to upload due to connection error!") if max_retries > 0: @@ -115,4 +116,4 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re log.error(f"Failed to upload after retrying 3 times!") upload_status = "failed" - return upload_status + return upload_status From 233790a4f5e3f24c4254eb8288d36075c1493fef Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Sat, 25 Apr 2020 21:12:57 +0300 Subject: [PATCH 19/30] use a tuple of exception classes --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index e61ed18..50548aa 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -104,7 +104,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re log.info(f"Uploaded file to blob") upload_status = "success" - except ConnectionError or socket.timeout or Timeout as ex: + except (ConnectionError, socket.timeout, Timeout) as ex: log.warning("Failed to upload due to connection error!") if max_retries > 0: log.info(f"Retrying {max_retries} more times with a reduced chunk_size of {blob_chunk_size - 30}MiB") From 857819e92e04564a4d7d16fc2ac56903d83b4c55 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Sat, 25 Apr 2020 21:13:33 +0300 Subject: [PATCH 20/30] remove unused variable --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 50548aa..53871cb 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -104,7 +104,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re log.info(f"Uploaded file to blob") upload_status = "success" - except (ConnectionError, socket.timeout, Timeout) as ex: + except (ConnectionError, socket.timeout, Timeout): log.warning("Failed to upload due to connection error!") if max_retries > 0: log.info(f"Retrying {max_retries} more times with a reduced chunk_size of {blob_chunk_size - 30}MiB") From 838f9059e78f9269a2d5d9d81ffec3d7d57a6ae4 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Sat, 25 Apr 2020 21:26:03 +0300 Subject: [PATCH 21/30] remove unused imports & variables --- storage/google_cloud/google_cloud_utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 53871cb..052f390 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -4,7 +4,6 @@ from core_data_modules.logging import Logger from requests import ConnectionError, Timeout import socket -import os log = Logger(__name__) @@ -92,9 +91,6 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re :param blob_chunk_size: the size of a chunk of data whenever iterating (in MiB). :type blob_chunk_size: float """ - - upload_status = None - try: log.info(f"Uploading file to blob '{target_blob_url}'...") storage_client = storage.Client.from_service_account_json(bucket_credentials_file_path) @@ -114,6 +110,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re max_retries - 1, blob_chunk_size - 30) else: log.error(f"Failed to upload after retrying 3 times!") - upload_status = "failed" + + upload_status = "failed" return upload_status From 28c1e9c3ec208a90611da25cbe5129ae674f2570 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Tue, 28 Apr 2020 15:14:30 +0300 Subject: [PATCH 22/30] update maximum - > maximum in docstring Co-Authored-By: Alexander Simpson --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 052f390..5341c0b 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -86,7 +86,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re :type target_blob_url: str :param f: File to upload, opened in binary mode. :type f: file-like - :param max_retries: maximum number of times to retry uploading the file. + :param max_retries: Maximum number of times to retry uploading the file. :type max_retries: int :param blob_chunk_size: the size of a chunk of data whenever iterating (in MiB). :type blob_chunk_size: float From 44cf55e85e980d760d5e60153165e365bc287ea3 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Tue, 28 Apr 2020 15:43:11 +0300 Subject: [PATCH 23/30] update docstring for param blob_chunk_size Co-Authored-By: Alexander Simpson --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 5341c0b..e371cb5 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -88,7 +88,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re :type f: file-like :param max_retries: Maximum number of times to retry uploading the file. :type max_retries: int - :param blob_chunk_size: the size of a chunk of data whenever iterating (in MiB). + :param blob_chunk_size: The chunk size to use for resumable uploads, in MiB :type blob_chunk_size: float """ try: From 5d514644e13c1b706416d9c124220af631cda2c2 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Tue, 28 Apr 2020 16:03:17 +0300 Subject: [PATCH 24/30] raise an exception if it fails --- storage/google_cloud/google_cloud_utils.py | 28 ++++++++++++---------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 052f390..92b696a 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -88,29 +88,31 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re :type f: file-like :param max_retries: maximum number of times to retry uploading the file. :type max_retries: int - :param blob_chunk_size: the size of a chunk of data whenever iterating (in MiB). - :type blob_chunk_size: float + :param blob_chunk_size: the size of a chunk of data whenever iterating (in MiB). Default is 100 MiB. + :type blob_chunk_size: int """ try: log.info(f"Uploading file to blob '{target_blob_url}'...") storage_client = storage.Client.from_service_account_json(bucket_credentials_file_path) blob = _blob_at_url(storage_client, target_blob_url) - blob.chunk_size = blob_chunk_size * 1024 * 1024 + + # Check if blob_chunk_size is below the minimum threshold + if blob_chunk_size > 0.256: + blob.chunk_size = blob_chunk_size * 1024 * 1024 + else: + blob.chunk_size = 0.256 * 1024 * 1024 + blob.upload_from_file(f) log.info(f"Uploaded file to blob") - upload_status = "success" - except (ConnectionError, socket.timeout, Timeout): - log.warning("Failed to upload due to connection error!") + except (ConnectionError, socket.timeout, Timeout) as ex: + log.warning("Failed to upload due to connection/timeout error") if max_retries > 0: - log.info(f"Retrying {max_retries} more times with a reduced chunk_size of {blob_chunk_size - 30}MiB") + log.info(f"Retrying {max_retries} more times with a reduced chunk_size of {blob_chunk_size/2}MiB") # lower the chunk size and start uploading from beginning because resumable_media requires so f.seek(0) upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, - max_retries - 1, blob_chunk_size - 30) + max_retries - 1, int(round(blob_chunk_size/2))) else: - log.error(f"Failed to upload after retrying 3 times!") - - upload_status = "failed" - - return upload_status + log.error(f"Failed to upload after retrying 3 times") + raise ex From 1ce05378799d0d3464ec9ce11b8757c845b8f6a2 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Tue, 28 Apr 2020 16:06:04 +0300 Subject: [PATCH 25/30] minor log update --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 7019e90..ba3641e 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -108,7 +108,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re except (ConnectionError, socket.timeout, Timeout) as ex: log.warning("Failed to upload due to connection/timeout error") if max_retries > 0: - log.info(f"Retrying {max_retries} more times with a reduced chunk_size of {blob_chunk_size/2}MiB") + log.info(f"Retrying up to{max_retries} more times with a reduced chunk_size of {int(round(blob_chunk_size/2))}MiB") # lower the chunk size and start uploading from beginning because resumable_media requires so f.seek(0) upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, From 2a3c81682d4dd092b08ce3da6d2f66da7f52932f Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Tue, 28 Apr 2020 17:53:14 +0300 Subject: [PATCH 26/30] set the min blob_chunk_size = 256KB --- storage/google_cloud/google_cloud_utils.py | 23 ++++++++++------------ 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index ba3641e..2d1c65e 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -76,7 +76,7 @@ def download_blob_to_file(bucket_credentials_file_path, blob_url, f): log.info(f"Downloaded blob to file") -def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=2, blob_chunk_size=100): +def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=2, blob_chunk_size=102400): """ Uploads a file to a Google Cloud Storage blob. @@ -88,31 +88,28 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re :type f: file-like :param max_retries: Maximum number of times to retry uploading the file. :type max_retries: int - :param blob_chunk_size: The chunk size to use for resumable uploads, in MiB + :param blob_chunk_size: The chunk size to use for resumable uploads, in KiB. :type blob_chunk_size: float """ try: log.info(f"Uploading file to blob '{target_blob_url}'...") storage_client = storage.Client.from_service_account_json(bucket_credentials_file_path) blob = _blob_at_url(storage_client, target_blob_url) - - # Check if blob_chunk_size is below the minimum threshold - if blob_chunk_size > 0.256: - blob.chunk_size = blob_chunk_size * 1024 * 1024 - else: - blob.chunk_size = 0.256 * 1024 * 1024 - + blob.chunk_size = blob_chunk_size * 1024 blob.upload_from_file(f) log.info(f"Uploaded file to blob") except (ConnectionError, socket.timeout, Timeout) as ex: log.warning("Failed to upload due to connection/timeout error") - if max_retries > 0: - log.info(f"Retrying up to{max_retries} more times with a reduced chunk_size of {int(round(blob_chunk_size/2))}MiB") + if max_retries > 0 and blob_chunk_size > 256: + log.info(f"Retrying up to{max_retries} more times with a reduced chunk_size of {blob_chunk_size/2}KB") # lower the chunk size and start uploading from beginning because resumable_media requires so f.seek(0) upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, - max_retries - 1, int(round(blob_chunk_size/2))) + max_retries - 1, blob_chunk_size/2) + elif max_retries > 0 and blob_chunk_size < 256: + log.error(f"Not retrying because the blob_chunk_size {blob_chunk_size} is below the minimum allowed (256KB") + raise ex else: - log.error(f"Failed to upload after retrying 3 times") + log.error(f"Failed to upload file to blob") raise ex From 296bcd9126b27ea284c45d456bb66ba40083ed02 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Wed, 29 Apr 2020 09:04:43 +0300 Subject: [PATCH 27/30] minor except refactor and log update --- storage/google_cloud/google_cloud_utils.py | 24 ++++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 2d1c65e..109c4a6 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -76,7 +76,7 @@ def download_blob_to_file(bucket_credentials_file_path, blob_url, f): log.info(f"Downloaded blob to file") -def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=2, blob_chunk_size=102400): +def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=2, blob_chunk_size=100 * 1024): """ Uploads a file to a Google Cloud Storage blob. @@ -101,15 +101,17 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re except (ConnectionError, socket.timeout, Timeout) as ex: log.warning("Failed to upload due to connection/timeout error") - if max_retries > 0 and blob_chunk_size > 256: - log.info(f"Retrying up to{max_retries} more times with a reduced chunk_size of {blob_chunk_size/2}KB") - # lower the chunk size and start uploading from beginning because resumable_media requires so - f.seek(0) - upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, - max_retries - 1, blob_chunk_size/2) - elif max_retries > 0 and blob_chunk_size < 256: - log.error(f"Not retrying because the blob_chunk_size {blob_chunk_size} is below the minimum allowed (256KB") - raise ex - else: + + if max_retries <= 0: log.error(f"Failed to upload file to blob") raise ex + + if blob_chunk_size < 256: + log.error(f"Not retrying because the blob_chunk_size {blob_chunk_size} is below the minimum allowed (256KB)") + raise ex + + log.info(f"Retrying up to{max_retries} more times with a reduced chunk_size of {blob_chunk_size / 2}KB") + # lower the chunk size and start uploading from beginning because resumable_media requires so + f.seek(0) + upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, + max_retries - 1, blob_chunk_size / 2) From 1620d5e4a4bbd809e324f78242784e810587cf6a Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Thu, 30 Apr 2020 16:34:28 +0300 Subject: [PATCH 28/30] update default max_retries to 4 --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index 109c4a6..a8d2e83 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -76,7 +76,7 @@ def download_blob_to_file(bucket_credentials_file_path, blob_url, f): log.info(f"Downloaded blob to file") -def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=2, blob_chunk_size=100 * 1024): +def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_retries=4, blob_chunk_size=100 * 1024): """ Uploads a file to a Google Cloud Storage blob. From 380dbea8e84329b9fed5cb07c056f0ebb2665e9f Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Thu, 30 Apr 2020 17:26:23 +0300 Subject: [PATCH 29/30] convert blob_chunk_size to an integer --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index a8d2e83..f47e86e 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -95,7 +95,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re log.info(f"Uploading file to blob '{target_blob_url}'...") storage_client = storage.Client.from_service_account_json(bucket_credentials_file_path) blob = _blob_at_url(storage_client, target_blob_url) - blob.chunk_size = blob_chunk_size * 1024 + blob.chunk_size = int(blob_chunk_size * 1024) # resumable expects an integer blob.upload_from_file(f) log.info(f"Uploaded file to blob") From 7464f11318a5f25b1aa9817be9af0b7d0a248803 Mon Sep 17 00:00:00 2001 From: Isaack Mwenda Date: Mon, 4 May 2020 16:27:27 +0300 Subject: [PATCH 30/30] minor log statement fix --- storage/google_cloud/google_cloud_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage/google_cloud/google_cloud_utils.py b/storage/google_cloud/google_cloud_utils.py index f47e86e..1adc95d 100644 --- a/storage/google_cloud/google_cloud_utils.py +++ b/storage/google_cloud/google_cloud_utils.py @@ -110,7 +110,7 @@ def upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f, max_re log.error(f"Not retrying because the blob_chunk_size {blob_chunk_size} is below the minimum allowed (256KB)") raise ex - log.info(f"Retrying up to{max_retries} more times with a reduced chunk_size of {blob_chunk_size / 2}KB") + log.info(f"Retrying up to {max_retries} more times with a reduced chunk_size of {blob_chunk_size / 2}KB") # lower the chunk size and start uploading from beginning because resumable_media requires so f.seek(0) upload_file_to_blob(bucket_credentials_file_path, target_blob_url, f,