diff --git a/README.md b/README.md index e544859..3f700ed 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ The configuration is also captured in [tables_config_util.py](tap_spreadsheets_a "max_sampling_read": 2000, "max_sampled_files": 3, "prefer_number_vs_integer": true, + "prefer_schema_as_string": true, "selected": true, // for any field in the table, you can hardcode the json schema datatype to override @@ -106,6 +107,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet - **max_sampled_files**: (optional) The maximum number of files in the targeted set that will be sampled. The default is 5. - **max_records_per_run**: (optional) The maximum number of records that should be written to this stream in a single sync run. The default is unlimited. - **prefer_number_vs_integer**: (optional) If the discovery mode sampling process sees only integer values for a field, should `number` be used anyway so that floats are not considered errors? The default is false but true can help in situations where floats only appear rarely in sources and may not be detected through discovery sampling. +- **prefer_schema_as_string**: (optional) If true, the discovery mode sampling process skips datatype inference and types every sampled field as a nullable `string`. The default is false. - **selected**: (optional) Should this table be synced. Defaults to true. Setting to false will skip this table on a sync run. - **worksheet_name**: (optional) the worksheet name to pull from in the targeted xls file(s). Only required when format is excel - **delimiter**: (optional) the delimiter to use when format is 'csv'. Defaults to a comma ',' but you can set delimiter to 'detect' to leverage the csv "Sniffer" for auto-detecting delimiter. 
diff --git a/sample_config.json b/sample_config.json index 89acf67..8d8abab 100644 --- a/sample_config.json +++ b/sample_config.json @@ -14,7 +14,7 @@ "prefer_number_vs_integer": true, "schema_overrides": { "column_name": { - "type": "integer", + "type": "integer" } } }, @@ -27,7 +27,7 @@ "format": "excel", "schema_overrides": { "id": { - "type": "integer", + "type": "integer" } } } diff --git a/tap_spreadsheets_anywhere/__init__.py b/tap_spreadsheets_anywhere/__init__.py index 00d29bb..32be6a6 100644 --- a/tap_spreadsheets_anywhere/__init__.py +++ b/tap_spreadsheets_anywhere/__init__.py @@ -47,7 +47,8 @@ def generate_schema(table_spec, samples): '_smart_source_lineno': {'type': 'integer'}, } prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer', False) - data_schema = conversion.generate_schema(samples, prefer_number_vs_integer=prefer_number_vs_integer) + prefer_schema_as_string = table_spec.get('prefer_schema_as_string', False) + data_schema = conversion.generate_schema(samples, prefer_number_vs_integer=prefer_number_vs_integer, prefer_schema_as_string=prefer_schema_as_string) inferred_schema = { 'type': 'object', 'properties': merge_dicts(data_schema, metadata_schema) diff --git a/tap_spreadsheets_anywhere/configuration.py b/tap_spreadsheets_anywhere/configuration.py index 86fd984..fdb187b 100644 --- a/tap_spreadsheets_anywhere/configuration.py +++ b/tap_spreadsheets_anywhere/configuration.py @@ -27,6 +27,7 @@ Optional('max_records_per_run'): int, Optional('max_sampled_files'): int, Optional('prefer_number_vs_integer'): bool, + Optional('prefer_schema_as_string'): bool, Optional('schema_overrides'): { str: { Required('type'): Any(Any('null','string','integer','number','date-time'), diff --git a/tap_spreadsheets_anywhere/conversion.py b/tap_spreadsheets_anywhere/conversion.py index b2294a6..e333bec 100644 --- a/tap_spreadsheets_anywhere/conversion.py +++ b/tap_spreadsheets_anywhere/conversion.py @@ -130,23 +130,28 @@ def 
pick_datatype(counts,prefer_number_vs_integer=False): return to_return -def generate_schema(samples,prefer_number_vs_integer=False): +def generate_schema(samples,prefer_number_vs_integer=False, prefer_schema_as_string=False): to_return = {} counts = count_samples(samples) for key, value in counts.items(): - datatype = pick_datatype(value,prefer_number_vs_integer) - # if "survey_responses_count" == key: - # LOGGER.error(f"Key '{key}' held {value} and was typed as {datatype} with prefer_number_vs_integer={prefer_number_vs_integer}") - - if datatype == 'date-time': + if(prefer_schema_as_string): to_return[key] = { - 'type': ['null', 'string'], - 'format': 'date-time', + 'type':['null','string'] } else: - to_return[key] = { - 'type': ['null', datatype], - } + datatype = pick_datatype(value,prefer_number_vs_integer) + # if "survey_responses_count" == key: + # LOGGER.error(f"Key '{key}' held {value} and was typed as {datatype} with prefer_number_vs_integer={prefer_number_vs_integer}") + + if datatype == 'date-time': + to_return[key] = { + 'type': ['null', 'string'], + 'format': 'date-time', + } + else: + to_return[key] = { + 'type': ['null', datatype], + } return to_return diff --git a/tap_spreadsheets_anywhere/file_utils.py b/tap_spreadsheets_anywhere/file_utils.py index e626888..b82067e 100644 --- a/tap_spreadsheets_anywhere/file_utils.py +++ b/tap_spreadsheets_anywhere/file_utils.py @@ -320,6 +320,7 @@ def config_by_crawl(crawl_config): "max_sampling_read": source.get('max_sampling_read', 1000), "universal_newlines": source.get('universal_newlines', True), "prefer_number_vs_integer": source.get('prefer_number_vs_integer', False), + "prefer_schema_as_string": source.get('prefer_schema_as_string', False), "start_date": modified_since.isoformat() } elif abs_pattern != entries[table]["pattern"]: @@ -340,6 +341,7 @@ def config_by_crawl(crawl_config): "max_sampling_read": source.get('max_sampling_read', 1000), "universal_newlines": source.get('universal_newlines', True), 
"prefer_number_vs_integer": source.get('prefer_number_vs_integer', False), + "prefer_schema_as_string": source.get('prefer_schema_as_string', False), "start_date": modified_since.isoformat() } diff --git a/tap_spreadsheets_anywhere/test/test_format.py b/tap_spreadsheets_anywhere/test/test_format.py index 54fd656..7a0f6d8 100644 --- a/tap_spreadsheets_anywhere/test/test_format.py +++ b/tap_spreadsheets_anywhere/test/test_format.py @@ -24,6 +24,7 @@ "key_properties": ["id"], "format": "csv", "prefer_number_vs_integer": True, + "prefer_schema_as_string": True, "universal_newlines": False, "sample_rate": 5, "max_sampling_read": 2000,