From 2cf7d0e6ceb8eac2d7fa101363a0f79c895a724d Mon Sep 17 00:00:00 2001
From: Ludovic DEHON
Date: Fri, 25 Feb 2022 18:49:50 +0100
Subject: [PATCH 1/4] feat: get the dataset instead of always creating it, to avoid requiring a create role

---
 target_bigquery/utils.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/target_bigquery/utils.py b/target_bigquery/utils.py
index 577d2cb..cf330c3 100644
--- a/target_bigquery/utils.py
+++ b/target_bigquery/utils.py
@@ -6,6 +6,7 @@
 from google.api_core import exceptions
 from google.cloud import bigquery
 from google.cloud.bigquery import Dataset
+from google.cloud.exceptions import NotFound
 
 logger = singer.get_logger()
 
@@ -30,7 +31,7 @@ def emit_state(state):
 
 def ensure_dataset(project_id, dataset_id, location):
     """
-    Given a project id, dataset id and location, creates BigQuery dataset
+    Given a project id, dataset id and location, creates the BigQuery dataset if it does not exist
 
     https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html
@@ -43,13 +44,18 @@ def ensure_dataset(project_id, dataset_id, location):
     client = bigquery.Client(project=project_id, location=location)
     dataset_ref = DatasetReference(project_id, dataset_id)
+
     try:
-        client.create_dataset(dataset_ref)
-    except exceptions.GoogleAPICallError as e:
-        if e.response.status_code == 409:  # dataset exists
-            pass
-        else:
-            logger.critical(f"unable to create dataset {dataset_id} in project {project_id}; Exception {e}")
-            return 2  # sys.exit(2)
-
-    return client, Dataset(dataset_ref)
+        dataset = client.get_dataset(dataset_ref)
+        return client, dataset
+    except NotFound:
+        try:
+            client.create_dataset(dataset_ref)
+        except exceptions.GoogleAPICallError as e:
+            if e.response.status_code == 409:  # dataset exists
+                pass
+            else:
+                logger.critical(f"unable to create dataset {dataset_id} in project {project_id}; Exception {e}")
+                return 2  # sys.exit(2)
+
+        return client, Dataset(dataset_ref)
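Reviewer note: this patch swaps the unconditional create_dataset call for a get-first flow, so when the dataset already exists the service account should only need read access (e.g. the bigquery.datasets.get permission) rather than a create role. A minimal caller-side sketch, not part of the patch; the project and dataset names are placeholders, and the bare `2` return value is the function's own error convention:

    from target_bigquery.utils import ensure_dataset

    result = ensure_dataset("my-project", "my_dataset", "US")
    if result == 2:  # ensure_dataset signals failure by returning 2
        raise SystemExit(2)

    client, dataset = result  # on success: (bigquery.Client, bigquery.Dataset)
    print(dataset.dataset_id)
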
config["dataset_id"] + + self.location = config.get("location", "US") + + self.client = bigquery.Client(project=self.project_id) + + def test_ensure_dataset(self): + """ + the purpose of this test is to show that the dataset is obtained, if it already exists + if it doesn't exist then it's created and then it is obtained + """ + + + # PART 1 + # create dataset and get dataset + client_1, dataset_newly_created = ensure_dataset(project_id=self.project_id, + dataset_id=self.dataset_id, + location=self.location) + + + # PART 2 (identical code to part 1, but now the dataset already exists) + # get dataset if dataset already exists + client_2, dataset_already_exists = ensure_dataset(project_id=self.project_id, + dataset_id=self.dataset_id, + location=self.location) + # PART 3: checks + dataset_list = [dataset_newly_created, dataset_already_exists] + + for next_dataset in dataset_list: + dataset_dict = next_dataset.__dict__ + + assert type(next_dataset) == Dataset + assert dataset_dict["_properties"]["datasetReference"]["projectId"] == self.project_id + assert dataset_dict["_properties"]["datasetReference"]["datasetId"] == self.dataset_id + + + def tearDown(self): + self.delete_dataset() + + def delete_dataset(self): + try: + self.client.delete_dataset( + dataset=self.dataset_id, + delete_contents=True + ) + except Exception as e: + print(e) + pass + From 6f29066c3b9c4c58726ac24c052876ef6ad2ca48 Mon Sep 17 00:00:00 2001 From: RuslanBergenov Date: Wed, 18 May 2022 19:37:21 -0600 Subject: [PATCH 3/4] refactor: ensure_dataset --- target_bigquery/utils.py | 11 +++++------ tests/test_target_bigquery_utils.py | 23 +++++++++-------------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/target_bigquery/utils.py b/target_bigquery/utils.py index cf330c3..98c9fb9 100644 --- a/target_bigquery/utils.py +++ b/target_bigquery/utils.py @@ -7,6 +7,7 @@ from google.cloud import bigquery from google.cloud.bigquery import Dataset from google.cloud.exceptions import NotFound +from google.cloud.bigquery import DatasetReference logger = singer.get_logger() @@ -40,7 +41,7 @@ def ensure_dataset(project_id, dataset_id, location): :param location, str: location for the dataset (US). Passed to bigquery.Client(). 
From 6f29066c3b9c4c58726ac24c052876ef6ad2ca48 Mon Sep 17 00:00:00 2001
From: RuslanBergenov
Date: Wed, 18 May 2022 19:37:21 -0600
Subject: [PATCH 3/4] refactor: ensure_dataset

---
 target_bigquery/utils.py            | 11 +++++------
 tests/test_target_bigquery_utils.py | 23 +++++++++--------------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/target_bigquery/utils.py b/target_bigquery/utils.py
index cf330c3..98c9fb9 100644
--- a/target_bigquery/utils.py
+++ b/target_bigquery/utils.py
@@ -7,6 +7,7 @@
 from google.cloud import bigquery
 from google.cloud.bigquery import Dataset
 from google.cloud.exceptions import NotFound
+from google.cloud.bigquery import DatasetReference
 
 logger = singer.get_logger()
 
@@ -40,7 +41,7 @@ def ensure_dataset(project_id, dataset_id, location):
     :param location, str: location for the dataset (US). Passed to bigquery.Client().
     :return: client (BigQuery Client Object) and Dataset (BigQuery dataset)
     """
-    from google.cloud.bigquery import DatasetReference
+
     client = bigquery.Client(project=project_id, location=location)
     dataset_ref = DatasetReference(project_id, dataset_id)
 
@@ -48,14 +49,12 @@ def ensure_dataset(project_id, dataset_id, location):
     try:
         dataset = client.get_dataset(dataset_ref)
         return client, dataset
+
     except NotFound:
         try:
             client.create_dataset(dataset_ref)
         except exceptions.GoogleAPICallError as e:
-            if e.response.status_code == 409:  # dataset exists
-                pass
-            else:
-                logger.critical(f"unable to create dataset {dataset_id} in project {project_id}; Exception {e}")
-                return 2  # sys.exit(2)
+            logger.critical(f"unable to create dataset {dataset_id} in project {project_id}; Exception {e}")
+            return 2  # sys.exit(2)
 
         return client, Dataset(dataset_ref)

diff --git a/tests/test_target_bigquery_utils.py b/tests/test_target_bigquery_utils.py
index 6ad785f..1938fed 100644
--- a/tests/test_target_bigquery_utils.py
+++ b/tests/test_target_bigquery_utils.py
@@ -1,26 +1,25 @@
 import os
 import json
 from google.cloud.bigquery import Dataset
+from google.cloud import bigquery
 
 from tests import unittestcore
-
 from target_bigquery.utils import ensure_dataset
 
-from google.cloud import bigquery
 
 class TestTargetBigQueryUtils(unittestcore.BaseUnitTest):
 
     def setUp(self):
-
-
-        self.config_file = os.path.join(os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'sandbox'),
-                                        'target-config.json')
+        self.config_file = os.path.join(
+            os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'sandbox'),
+            'target-config.json')
 
         config = json.load(open(self.config_file))
 
         self.project_id = config["project_id"]
 
-        self.dataset_id = "target_bigquery_unit_test_ensure_dataset_function" # config["dataset_id"]
+        self.dataset_id = "target_bigquery_unit_test_ensure_dataset_function"  # config["dataset_id"]
 
         self.location = config.get("location", "US")
 
@@ -32,19 +31,17 @@ def test_ensure_dataset(self):
         if it doesn't exist, it is created and then fetched
         """
 
-
         # PART 1
         # create dataset and get dataset
         client_1, dataset_newly_created = ensure_dataset(project_id=self.project_id,
-                                                  dataset_id=self.dataset_id,
-                                                  location=self.location)
-
+                                                         dataset_id=self.dataset_id,
+                                                         location=self.location)
 
         # PART 2 (identical code to part 1, but now the dataset already exists)
         # get dataset if dataset already exists
         client_2, dataset_already_exists = ensure_dataset(project_id=self.project_id,
-                                                  dataset_id=self.dataset_id,
-                                                  location=self.location)
+                                                          dataset_id=self.dataset_id,
+                                                          location=self.location)
         # PART 3: checks
         dataset_list = [dataset_newly_created, dataset_already_exists]
 
@@ -55,7 +52,6 @@ def test_ensure_dataset(self):
             assert dataset_dict["_properties"]["datasetReference"]["projectId"] == self.project_id
             assert dataset_dict["_properties"]["datasetReference"]["datasetId"] == self.dataset_id
 
-
     def tearDown(self):
         self.delete_dataset()
 
@@ -68,4 +64,3 @@ def delete_dataset(self):
         except Exception as e:
             print(e)
             pass
-
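Reviewer note: after this refactor, patches 1 and 3 compose into the function below. This is a reconstruction from the hunks above, shown whole for readability; exact whitespace is inferred from the diff, and the docstring is elided:

    import singer
    from google.api_core import exceptions
    from google.cloud import bigquery
    from google.cloud.bigquery import Dataset, DatasetReference
    from google.cloud.exceptions import NotFound

    logger = singer.get_logger()

    def ensure_dataset(project_id, dataset_id, location):
        client = bigquery.Client(project=project_id, location=location)
        dataset_ref = DatasetReference(project_id, dataset_id)

        try:
            # fetch first: succeeds without any create permission
            dataset = client.get_dataset(dataset_ref)
            return client, dataset
        except NotFound:
            try:
                client.create_dataset(dataset_ref)
            except exceptions.GoogleAPICallError as e:
                logger.critical(f"unable to create dataset {dataset_id} in project {project_id}; Exception {e}")
                return 2  # sys.exit(2)

            return client, Dataset(dataset_ref)

The 409 branch is gone because get_dataset now establishes non-existence before create_dataset is ever attempted, so a "dataset exists" conflict is no longer an expected outcome.
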
From a7b90b40cdf7321ed4e63b1e618eae16c666b757 Mon Sep 17 00:00:00 2001
From: RuslanBergenov
Date: Thu, 19 May 2022 17:46:36 -0600
Subject: [PATCH 4/4] test: ensure_dataset

---
 tests/test_target_bigquery_utils.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/test_target_bigquery_utils.py b/tests/test_target_bigquery_utils.py
index 1938fed..4c7c1c4 100644
--- a/tests/test_target_bigquery_utils.py
+++ b/tests/test_target_bigquery_utils.py
@@ -2,10 +2,16 @@
 import json
 from google.cloud.bigquery import Dataset
 from google.cloud import bigquery
+from google.cloud.exceptions import NotFound
+import pytest
+import logging
 
 from tests import unittestcore
 from target_bigquery.utils import ensure_dataset
 
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
 
 class TestTargetBigQueryUtils(unittestcore.BaseUnitTest):
 
@@ -31,6 +37,14 @@ def test_ensure_dataset(self):
         if it doesn't exist, it is created and then fetched
         """
 
+        # make sure dataset doesn't exist yet
+        logger.info("Dataset doesn't exist yet")
+        self.delete_dataset()
+
+        # assert that dataset doesn't exist yet
+        with pytest.raises(NotFound):
+            self.client.get_dataset(self.dataset_id)  # Make an API request.
+
         # PART 1
         # create dataset and get dataset
         client_1, dataset_newly_created = ensure_dataset(project_id=self.project_id,
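Reviewer note: patch 4 makes the test's precondition explicit — delete the dataset, then prove it is gone with pytest.raises(NotFound) before exercising both branches of ensure_dataset. The same probe can be phrased as a reusable boolean helper; an illustrative sketch (the helper is hypothetical, not part of the patch):

    from google.cloud import bigquery
    from google.cloud.exceptions import NotFound

    def dataset_exists(client: bigquery.Client, dataset_id: str) -> bool:
        """Return True if the dataset is visible to the client."""
        try:
            client.get_dataset(dataset_id)  # accepts an ID string or a DatasetReference
            return True
        except NotFound:
            return False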