diff --git a/sample_config.json b/sample_config.json index 89acf67..8d8abab 100644 --- a/sample_config.json +++ b/sample_config.json @@ -14,7 +14,7 @@ "prefer_number_vs_integer": true, "schema_overrides": { "column_name": { - "type": "integer", + "type": "integer" } } }, @@ -27,7 +27,7 @@ "format": "excel", "schema_overrides": { "id": { - "type": "integer", + "type": "integer" } } } diff --git a/tap_spreadsheets_anywhere/__init__.py b/tap_spreadsheets_anywhere/__init__.py index 00d29bb..32be6a6 100644 --- a/tap_spreadsheets_anywhere/__init__.py +++ b/tap_spreadsheets_anywhere/__init__.py @@ -47,7 +47,8 @@ def generate_schema(table_spec, samples): '_smart_source_lineno': {'type': 'integer'}, } prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer', False) - data_schema = conversion.generate_schema(samples, prefer_number_vs_integer=prefer_number_vs_integer) + prefer_schema_as_string = table_spec.get('prefer_schema_as_string', False) + data_schema = conversion.generate_schema(samples, prefer_number_vs_integer=prefer_number_vs_integer, prefer_schema_as_string=prefer_schema_as_string) inferred_schema = { 'type': 'object', 'properties': merge_dicts(data_schema, metadata_schema) diff --git a/tap_spreadsheets_anywhere/configuration.py b/tap_spreadsheets_anywhere/configuration.py index 86fd984..fdb187b 100644 --- a/tap_spreadsheets_anywhere/configuration.py +++ b/tap_spreadsheets_anywhere/configuration.py @@ -27,6 +27,7 @@ Optional('max_records_per_run'): int, Optional('max_sampled_files'): int, Optional('prefer_number_vs_integer'): bool, + Optional('prefer_schema_as_string'): bool, Optional('schema_overrides'): { str: { Required('type'): Any(Any('null','string','integer','number','date-time'), diff --git a/tap_spreadsheets_anywhere/conversion.py b/tap_spreadsheets_anywhere/conversion.py index b2294a6..e333bec 100644 --- a/tap_spreadsheets_anywhere/conversion.py +++ b/tap_spreadsheets_anywhere/conversion.py @@ -130,23 +130,28 @@ def pick_datatype(counts,prefer_number_vs_integer=False): return to_return -def generate_schema(samples,prefer_number_vs_integer=False): +def generate_schema(samples,prefer_number_vs_integer=False, prefer_schema_as_string=False): to_return = {} counts = count_samples(samples) for key, value in counts.items(): - datatype = pick_datatype(value,prefer_number_vs_integer) - # if "survey_responses_count" == key: - # LOGGER.error(f"Key '{key}' held {value} and was typed as {datatype} with prefer_number_vs_integer={prefer_number_vs_integer}") - - if datatype == 'date-time': + if(prefer_schema_as_string): to_return[key] = { - 'type': ['null', 'string'], - 'format': 'date-time', + 'type':['null','string'] } else: - to_return[key] = { - 'type': ['null', datatype], - } + datatype = pick_datatype(value,prefer_number_vs_integer) + # if "survey_responses_count" == key: + # LOGGER.error(f"Key '{key}' held {value} and was typed as {datatype} with prefer_number_vs_integer={prefer_number_vs_integer}") + + if datatype == 'date-time': + to_return[key] = { + 'type': ['null', 'string'], + 'format': 'date-time', + } + else: + to_return[key] = { + 'type': ['null', datatype], + } return to_return diff --git a/tap_spreadsheets_anywhere/file_utils.py b/tap_spreadsheets_anywhere/file_utils.py index e626888..b82067e 100644 --- a/tap_spreadsheets_anywhere/file_utils.py +++ b/tap_spreadsheets_anywhere/file_utils.py @@ -320,6 +320,7 @@ def config_by_crawl(crawl_config): "max_sampling_read": source.get('max_sampling_read', 1000), "universal_newlines": source.get('universal_newlines', True), "prefer_number_vs_integer": source.get('prefer_number_vs_integer', False), + "prefer_schema_as_string": source.get('prefer_schema_as_string', False), "start_date": modified_since.isoformat() } elif abs_pattern != entries[table]["pattern"]: @@ -340,6 +341,7 @@ def config_by_crawl(crawl_config): "max_sampling_read": source.get('max_sampling_read', 1000), "universal_newlines": source.get('universal_newlines', True), "prefer_number_vs_integer": source.get('prefer_number_vs_integer', False), + "prefer_schema_as_string": source.get('prefer_schema_as_string', False), "start_date": modified_since.isoformat() } diff --git a/tap_spreadsheets_anywhere/test/test_format.py b/tap_spreadsheets_anywhere/test/test_format.py index 54fd656..7a0f6d8 100644 --- a/tap_spreadsheets_anywhere/test/test_format.py +++ b/tap_spreadsheets_anywhere/test/test_format.py @@ -24,6 +24,7 @@ "key_properties": ["id"], "format": "csv", "prefer_number_vs_integer": True, + "prefer_schema_as_string": True, "universal_newlines": False, "sample_rate": 5, "max_sampling_read": 2000,