From 178247e2a7526cdccf66accd6267ec377fa7b901 Mon Sep 17 00:00:00 2001 From: RuslanBergenov Date: Wed, 18 Aug 2021 16:35:51 -0600 Subject: [PATCH] refactor: simplified create_valid_bigquery_field_name() function --- .github/workflows/python-package.yml | 2 +- target_bigquery/schema.py | 61 ++++++++----------------- target_bigquery/validate_json_schema.py | 6 +-- 3 files changed, 23 insertions(+), 46 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 29ec00c..83bf842 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -24,7 +24,7 @@ env: on: push: - branches: [ development-ci-cd-workflow, development, master ] + branches: [ development-ci-cd-workflow, development, master, development-refactor-bigquery-field-name-function ] pull_request: branches: [ master ] diff --git a/target_bigquery/schema.py b/target_bigquery/schema.py index ccce3be..01124cb 100755 --- a/target_bigquery/schema.py +++ b/target_bigquery/schema.py @@ -31,7 +31,7 @@ def cleanup_record(schema, record): elif isinstance(record, dict): nr = {} for key, value in record.items(): - nkey = bigquery_transformed_key(key) + nkey = create_valid_bigquery_field_name(key) nr[nkey] = cleanup_record(schema, value) return nr @@ -39,7 +39,7 @@ def cleanup_record(schema, record): raise Exception(f"unhandled instance of record: {record}") -def bigquery_transformed_key(key): +def create_valid_bigquery_field_name(field_name): """ Clean up / prettify field names, make sure they match BigQuery naming conventions. @@ -56,45 +56,22 @@ def bigquery_transformed_key(key): :param key: JSON field name :return: cleaned up JSON field name """ - remove_list = [" ", - "!", - "\"", - "#", - "$", - "%", - "&", - "'", - "(", - ")", - "*", - "+", - ",", - "-", - ".", - "/", - ":", - ";", - "<", - "=", - ">", - "?", - "@", - "\\", - "]", - "^", - "`", - "|", - "}", - "~"] - - for c in remove_list: - key = key.replace(c, "_") - - if re.match(r"^\d", key): - key = "_" + key - - return key + cleaned_up_field_name = "" + + # if char is alphanumeric (either letters or numbers), append char to our string + for char in field_name: + if char.isalnum(): + cleaned_up_field_name += char + else: + # otherwise, replace it with underscore + cleaned_up_field_name += "_" + + # if field starts with digit, prepend it with underscore + if cleaned_up_field_name[0].isdigit(): + cleaned_up_field_name = "_%s" % cleaned_up_field_name + + return cleaned_up_field_name[:300] # trim the string to the first x chars def prioritize_one_data_type_from_multiple_ones_in_any_of(field_property): """ @@ -248,7 +225,7 @@ def build_field(field_name, field_property): if not ("items" in field_property and "properties" in field_property["items"]) and not ( "properties" in field_property): - return (SchemaField(name=bigquery_transformed_key(field_name), + return (SchemaField(name=create_valid_bigquery_field_name(field_name), field_type=convert_field_type(field_property), mode=determine_field_mode(field_name, field_property), description=None, @@ -266,7 +243,7 @@ def build_field(field_name, field_property): ).items(): processed_subfields.append(build_field(subfield_name, subfield_property)) - return (SchemaField(name=bigquery_transformed_key(field_name), + return (SchemaField(name=create_valid_bigquery_field_name(field_name), field_type=convert_field_type(field_property), mode=determine_field_mode(field_name, field_property), description=None, diff --git a/target_bigquery/validate_json_schema.py b/target_bigquery/validate_json_schema.py index 89316b5..0146c0a 100644 --- a/target_bigquery/validate_json_schema.py +++ b/target_bigquery/validate_json_schema.py @@ -1,6 +1,6 @@ import re import singer -from target_bigquery.schema import bigquery_transformed_key +from target_bigquery.schema import create_valid_bigquery_field_name LOGGER = singer.get_logger() @@ -107,7 +107,7 @@ def build_field_list(schema): for field_name, field_property in schema.get("properties", schema.get("items", {}).get("properties", {})).items(): if not ("items" in field_property and "properties" in field_property["items"]) \ and not ("properties" in field_property): - key = bigquery_transformed_key(field_name.upper()) + key = create_valid_bigquery_field_name(field_name.upper()) if not f_dict.get(key): f_dict[key] = [field_name] else: @@ -116,7 +116,7 @@ def build_field_list(schema): elif ("items" in field_property and "properties" in field_property["items"]) \ or ("properties" in field_property): nd = build_field_list(field_property) - key = bigquery_transformed_key(field_name.upper()) + key = create_valid_bigquery_field_name(field_name.upper()) for k, v in nd.items(): if not f_dict.get(f"{key}.{k}"): f_dict[f"{key}.{k}"] = [f"{field_name}.{i}" for i in v]