From 32e769fc44a62fdc2cd109df2d677e472f794c1e Mon Sep 17 00:00:00 2001 From: Denis Date: Sat, 25 Mar 2023 20:50:31 +0300 Subject: [PATCH 1/2] Add JSONPath parser support --- README.md | 14 +++++++++++++- setup.py | 3 ++- tap_spreadsheets_anywhere/json_handler.py | 11 ++++++----- tap_spreadsheets_anywhere/test/test_json.py | 18 ++++++++++++------ 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 7b7ed66..42bb21c 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet - **worksheet_name**: (optional) the worksheet name to pull from in the targeted xls file(s). Only required when format is excel - **delimiter**: (optional) the delimiter to use when format is 'csv'. Defaults to a comma ',' but you can set delimiter to 'detect' to leverage the csv "Sniffer" for auto-detecting delimiter. - **quotechar**: (optional) the character used to surround values that may contain delimiters - defaults to a double quote '"' -- **json_path**: (optional) the JSON key under which the list of objets to use is located. Defaults to None, corresponding to an array at the top level of the JSON tree. +- **json_path**: (optional) either the JSON key under which the list of objects to use is located, or a [JSONPath](https://pypi.org/project/jsonpath-ng/) expression that returns an array of objects (expressions can be tested at https://jsonpath.com). Defaults to None, corresponding to an array at the top level of the JSON tree.
### Automatic Config Generation @@ -152,6 +152,18 @@ JSON files are expected to parse as a root-level array of objects where each obj ] ``` +JSONPath can be used to parse a deeply nested array of objects, e.g., `json_path: response.data[*]` can be used to parse the following JSON file: +```json +{ + "response": { + "data": [ + { "name": "row one", "key": 42 }, + { "name": "row two", "key": 43 } + ] + } +} +``` + ### JSONL (JSON Lines) support JSONL files are expected to parse as one object per line, where each row in a file is a set of key-value pairs. diff --git a/setup.py b/setup.py index 3dc9109..ebaff26 100755 --- a/setup.py +++ b/setup.py @@ -19,7 +19,8 @@ 'openpyxl', 'xlrd', 'paramiko', - 'azure-storage-blob>=12.14.0' + 'azure-storage-blob>=12.14.0', + 'jsonpath-ng>=1.5.3' ], entry_points=""" [console_scripts] diff --git a/tap_spreadsheets_anywhere/json_handler.py b/tap_spreadsheets_anywhere/json_handler.py index add6e33..61b2d25 100644 --- a/tap_spreadsheets_anywhere/json_handler.py +++ b/tap_spreadsheets_anywhere/json_handler.py @@ -1,4 +1,5 @@ import json +from jsonpath_ng.ext import parse import re from json import JSONDecodeError import logging @@ -25,8 +26,12 @@ def get_row_iterator(table_spec, reader): try: json_array = json.load(reader) json_path = table_spec.get('json_path', None) + if json_path is not None: - json_array = json_array[json_path] + if json_path in json_array: + json_array = json_array[json_path] + else: + return generator_wrapper(match.value for match in parse(json_path).find(json_array)) # throw a TypeError if the root json object can not be iterated return generator_wrapper(iter(json_array)) @@ -39,7 +44,3 @@ def get_row_iterator(table_spec, reader): return generator_wrapper(json_objects) else: raise jde - - - - diff --git a/tap_spreadsheets_anywhere/test/test_json.py b/tap_spreadsheets_anywhere/test/test_json.py index 37cff1f..9fbc836 100644 --- a/tap_spreadsheets_anywhere/test/test_json.py +++
b/tap_spreadsheets_anywhere/test/test_json.py @@ -9,27 +9,27 @@ { "path": "file://./tap_spreadsheets_anywhere/test", "name": "badnewlines", - "pattern": ".*\\.xlsx", + "pattern": ".*\\.json", "start_date": "2017-05-01T00:00:00Z", "key_properties": [], - "format": "excel", - "worksheet_name": "sample_with_bad_newlines" + "format": "detect" }, { "path": "file://./tap_spreadsheets_anywhere/test", - "name": "badnewlines", + "name": "nestedlist", "pattern": ".*\\.json", "start_date": "2017-05-01T00:00:00Z", "key_properties": [], + "json_path": "someKey", "format": "detect" }, { "path": "file://./tap_spreadsheets_anywhere/test", - "name": "nestedlist", + "name": "deepnestedlist", "pattern": ".*\\.json", "start_date": "2017-05-01T00:00:00Z", "key_properties": [], - "json_path": "someKey", + "json_path": "response.data[*]", "format": "detect" } ] @@ -48,6 +48,12 @@ def test_json_object_lists(self): def test_json_nested_array(self): reader = StringIO('{"someKey": [{"k":"v"},{"k":"v"},{"k":"v"}]}') + iterator = json_handler.get_row_iterator(TEST_TABLE_SPEC['tables'][1], reader) + for row in iterator: + self.assertEqual(row['k'], 'v') + + def test_json_deep_nested_array(self): + reader = StringIO('{"response": {"data": [{"k":"v"},{"k":"v"},{"k":"v"}]}}') iterator = json_handler.get_row_iterator(TEST_TABLE_SPEC['tables'][2], reader) for row in iterator: self.assertEqual(row['k'], 'v') From dd27e058cd0422db03d57578f68c0ea78655c97f Mon Sep 17 00:00:00 2001 From: Denis Date: Thu, 30 Mar 2023 20:10:12 +0300 Subject: [PATCH 2/2] Cleanup json format tests --- tap_spreadsheets_anywhere/test/test_json.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tap_spreadsheets_anywhere/test/test_json.py b/tap_spreadsheets_anywhere/test/test_json.py index 9fbc836..9ba81dd 100644 --- a/tap_spreadsheets_anywhere/test/test_json.py +++ b/tap_spreadsheets_anywhere/test/test_json.py @@ -8,11 +8,11 @@ "tables": [ { "path": "file://./tap_spreadsheets_anywhere/test", - "name": 
"badnewlines", + "name": "list", "pattern": ".*\\.json", "start_date": "2017-05-01T00:00:00Z", "key_properties": [], - "format": "detect" + "format": "json" }, { "path": "file://./tap_spreadsheets_anywhere/test", @@ -21,7 +21,7 @@ "start_date": "2017-05-01T00:00:00Z", "key_properties": [], "json_path": "someKey", - "format": "detect" + "format": "json" }, { "path": "file://./tap_spreadsheets_anywhere/test", @@ -30,7 +30,7 @@ "start_date": "2017-05-01T00:00:00Z", "key_properties": [], "json_path": "response.data[*]", - "format": "detect" + "format": "json" } ] }