Skip to content

Commit

Permalink
feat: add a new configuration that allows users to preferably get schema as all string
Browse files Browse the repository at this point in the history
  • Loading branch information
dnz-ssathar committed Sep 10, 2021
1 parent 5d91159 commit cd07d88
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 14 deletions.
4 changes: 2 additions & 2 deletions sample_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"prefer_number_vs_integer": true,
"schema_overrides": {
"column_name": {
"type": "integer",
"type": "integer"
}
}
},
Expand All @@ -27,7 +27,7 @@
"format": "excel",
"schema_overrides": {
"id": {
"type": "integer",
"type": "integer"
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion tap_spreadsheets_anywhere/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def generate_schema(table_spec, samples):
'_smart_source_lineno': {'type': 'integer'},
}
prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer', False)
data_schema = conversion.generate_schema(samples, prefer_number_vs_integer=prefer_number_vs_integer)
prefer_schema_as_string = table_spec.get('prefer_schema_as_string', False)
data_schema = conversion.generate_schema(samples, prefer_number_vs_integer=prefer_number_vs_integer, prefer_schema_as_string=prefer_schema_as_string)
inferred_schema = {
'type': 'object',
'properties': merge_dicts(data_schema, metadata_schema)
Expand Down
1 change: 1 addition & 0 deletions tap_spreadsheets_anywhere/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
Optional('max_records_per_run'): int,
Optional('max_sampled_files'): int,
Optional('prefer_number_vs_integer'): bool,
Optional('prefer_schema_as_string'): bool,
Optional('schema_overrides'): {
str: {
Required('type'): Any(Any('null','string','integer','number','date-time'),
Expand Down
27 changes: 16 additions & 11 deletions tap_spreadsheets_anywhere/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,23 +130,28 @@ def pick_datatype(counts,prefer_number_vs_integer=False):
return to_return


def generate_schema(samples,prefer_number_vs_integer=False):
def generate_schema(samples,prefer_number_vs_integer=False, prefer_schema_as_string=False):
to_return = {}
counts = count_samples(samples)

for key, value in counts.items():
datatype = pick_datatype(value,prefer_number_vs_integer)
# if "survey_responses_count" == key:
# LOGGER.error(f"Key '{key}' held {value} and was typed as {datatype} with prefer_number_vs_integer={prefer_number_vs_integer}")

if datatype == 'date-time':
if(prefer_schema_as_string):
to_return[key] = {
'type': ['null', 'string'],
'format': 'date-time',
'type':['null','string']
}
else:
to_return[key] = {
'type': ['null', datatype],
}
datatype = pick_datatype(value,prefer_number_vs_integer)
# if "survey_responses_count" == key:
# LOGGER.error(f"Key '{key}' held {value} and was typed as {datatype} with prefer_number_vs_integer={prefer_number_vs_integer}")

if datatype == 'date-time':
to_return[key] = {
'type': ['null', 'string'],
'format': 'date-time',
}
else:
to_return[key] = {
'type': ['null', datatype],
}

return to_return
2 changes: 2 additions & 0 deletions tap_spreadsheets_anywhere/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ def config_by_crawl(crawl_config):
"max_sampling_read": source.get('max_sampling_read', 1000),
"universal_newlines": source.get('universal_newlines', True),
"prefer_number_vs_integer": source.get('prefer_number_vs_integer', False),
"prefer_schema_as_string": source.get('prefer_schema_as_string', False),
"start_date": modified_since.isoformat()
}
elif abs_pattern != entries[table]["pattern"]:
Expand All @@ -340,6 +341,7 @@ def config_by_crawl(crawl_config):
"max_sampling_read": source.get('max_sampling_read', 1000),
"universal_newlines": source.get('universal_newlines', True),
"prefer_number_vs_integer": source.get('prefer_number_vs_integer', False),
"prefer_schema_as_string": source.get('prefer_schema_as_string', False),
"start_date": modified_since.isoformat()
}

Expand Down
1 change: 1 addition & 0 deletions tap_spreadsheets_anywhere/test/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"key_properties": ["id"],
"format": "csv",
"prefer_number_vs_integer": True,
"prefer_schema_as_string": True,
"universal_newlines": False,
"sample_rate": 5,
"max_sampling_read": 2000,
Expand Down

0 comments on commit cd07d88

Please sign in to comment.