From 243e4a7e7bc376bf3cdf61ca03bad014415c84dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Anz=CC=8Ce=20Kravanja?= Date: Thu, 7 Apr 2022 12:50:25 -0700 Subject: [PATCH] fix: allow date, date-time and array be standalone types in JSON schema for simplicity --- target_bigquery/simplify_json_schema.py | 32 +++++++++++---- tests/test_schema_conversion.py | 53 +++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 8 deletions(-) diff --git a/target_bigquery/simplify_json_schema.py b/target_bigquery/simplify_json_schema.py index 654c9fc..265e292 100644 --- a/target_bigquery/simplify_json_schema.py +++ b/target_bigquery/simplify_json_schema.py @@ -189,8 +189,8 @@ def is_iterable(schema): """ return not _is_ref(schema) \ - and ARRAY in get_type(schema) \ - and 'items' in schema + and ARRAY in get_type(schema) + # and 'items' in schema # commented out to allow "members": {"type": "array"} def is_nullable(schema): @@ -217,21 +217,31 @@ def is_literal(schema): def is_datetime(schema): """ Given a JSON Schema compatible dict, returns True when schema's type allows being a date-time + Two cases make a datetime type: + a) string in type and format date-time (this is per JSON schema standards) + b) date-time is in type (this is for simplicity) :param schema: dict, JSON Schema :return: Boolean """ - return STRING in get_type(schema) and schema.get('format') == DATE_TIME_FORMAT + return \ + (STRING in get_type(schema) and schema.get('format') == DATE_TIME_FORMAT) \ + or (DATE_TIME_FORMAT in get_type(schema) and schema.get('format') is None) def is_date(schema): """ - Given a JSON Schema compatible dict, returns True when schema's type allows being a date-time + Given a JSON Schema compatible dict, returns True when schema's type allows being a date + Two cases make a date type: + a) string in type and format date (this is per JSON schema standards) + b) date is in type (this is for simplicity) :param schema: dict, JSON Schema :return: Boolean """ - return STRING in 
get_type(schema) and schema.get('format') == DATE_FORMAT + return \ + (STRING in get_type(schema) and schema.get('format') == DATE_FORMAT) \ + or (DATE_FORMAT in get_type(schema) and schema.get('format') is None) def is_bq_geography(schema): @@ -417,7 +427,10 @@ def _simplify__implicit_anyof(root_schema, schema): 'format': DATE_TIME_FORMAT })) - types.remove(STRING) + if DATE_TIME_FORMAT in types: + types.remove(DATE_TIME_FORMAT) + else: + types.remove(STRING) if is_date(schema): schemas.append(Cachable({ @@ -425,7 +438,10 @@ def _simplify__implicit_anyof(root_schema, schema): 'format': DATE_FORMAT })) - types.remove(STRING) + if DATE_FORMAT in types: + types.remove(DATE_FORMAT) + else: + types.remove(STRING) if is_bq_geography(schema): schemas.append(Cachable({ @@ -485,7 +501,7 @@ def _simplify__implicit_anyof(root_schema, schema): if is_iterable(schema): schemas.append({ 'type': [ARRAY], - 'items': _helper_simplify(root_schema, schema.get('items', {})) + 'items': _helper_simplify(root_schema, schema.get('items', {"type": STRING})) }) types.remove(ARRAY) diff --git a/tests/test_schema_conversion.py b/tests/test_schema_conversion.py index b45f336..ba42e4c 100644 --- a/tests/test_schema_conversion.py +++ b/tests/test_schema_conversion.py @@ -235,6 +235,59 @@ class TestSchemaConversion(unittestcore.BaseUnitTest): def setUp(self): super(TestSchemaConversion, self).setUp() + def test_flat_simplify_and_build(self): + schema = { + "properties": { + "new_status": { + "type": ["string", "null"] + }, + "previous_status": { + "type": ["number", "null"] + }, + "new_assignee": { + "type": ["integer", "null"] + }, + "previous_assignee": { + "type": ["boolean", "null"] + }, + "new_due_date": { + "type": ["date", "null"] + }, + "previous_due_date": { + "type": ["date-time", "null"] + }, + "members": { + "type": "array" # shorthand array definition, by default we treat this as an array of strings + } + } + } + + schema_simplified = simplify(schema) + schema_bq = 
build_schema(schema_simplified, key_properties={}, add_metadata=False) + + for f in schema_bq: + if f.name == "new_status": + self.assertEqual(f.field_type.upper(), "STRING") + + elif f.name == "previous_status": + self.assertEqual(f.field_type.upper(), "FLOAT") + + elif f.name == "new_assignee": + self.assertEqual(f.field_type.upper(), "INTEGER") + + elif f.name == "previous_assignee": + self.assertEqual(f.field_type.upper(), "BOOLEAN") + + elif f.name == "new_due_date": + self.assertEqual(f.field_type.upper(), "DATE") + + elif f.name == "previous_due_date": + self.assertEqual(f.field_type.upper(), "TIMESTAMP") + + elif f.name == "members": + self.assertEqual(f.field_type.upper(), "STRING") + self.assertEqual(f.mode, "REPEATED") + def test_flat_schema(self): schema_0_input = schema_simple_1