Skip to content

Commit

Permalink
docs: docstring in functions check_schema_for_dupes_in_field_names & …
Browse files Browse the repository at this point in the history
…build_field_list
  • Loading branch information
RuslanBergenov committed Aug 5, 2021
1 parent 705311f commit ef621c6
Showing 1 changed file with 26 additions and 2 deletions.
28 changes: 26 additions & 2 deletions target_bigquery/validate_json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,39 @@ def check_schema_for_dupes_in_field_names(stream_name, schema):
Alerts user if there are duplicate field names in JSON schema.
For example, if JSON schema contains:
"Name" and "name"
"Name" and "name" (this will be considered a dupe field in BigQuery and it'll throw an error)
or
"first name" and "first_name" (this example is also a dupe because "first name" will be converted to "first_name" by schema.py)
:param stream_name: name of stream
:param schema: JSON schema of the stream
:return:
"""
def build_field_list(schema):
"""
:param schema:
:return: a dictionary, where:
every key is the upper-cased BigQuery-transformed field name (the field name cleaned up for loading into BigQuery, then upper-cased)
every value is a list of the original field names from the JSON schema that map to that key
This dictionary is flat, not nested.
Nested JSON fields are represented in the dictionary by joining the parent and child names with a dot (e.g. 'person.name').
This dict makes it easy to detect dupes and tell the user exactly where the dupe is located
(what its parent field is).
Sample output:
f_dict / fields = {'OBJECT': ['object'], 'ID': ['id'],
'PERSON._SOURCE': ['person.$source'],
'PERSON.NAME': ['person.name', 'person.Name']}
dupes = {'PERSON.NAME': ['person.name', 'person.Name']}
"""
f_dict = {}
for field_name, field_property in schema.get("properties", schema.get("items", {}).get("properties", {})).items():
if not ("items" in field_property and "properties" in field_property["items"]) \
Expand All @@ -98,7 +123,6 @@ def build_field_list(schema):
else:
f_dict[f"{key}.{k}"].extend([f"{field_name}.{i}" for i in v])

# sample f_dict: {"BQ_FIELD_NAME.BQ_NESTED_FIELD": ["json_schema_field_name.$nested_name", "json_schema_field_name.nested name"]}
return f_dict

fields = build_field_list(schema)
Expand Down

0 comments on commit ef621c6

Please sign in to comment.