ets · TyShkan · Mar 25, 2023 · Mar 30, 2023 · menzenski · Mar 29, 2023
diff --git a/README.md b/README.md
@@ -114,7 +114,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet
 - **worksheet_name**: (optional) the worksheet name to pull from in the targeted xls file(s). Only required when format is excel
 - **delimiter**: (optional) the delimiter to use when format is 'csv'. Defaults to a comma ',' but you can set delimiter to 'detect' to leverage the csv "Sniffer" for auto-detecting delimiter. 
 - **quotechar**: (optional) the character used to surround values that may contain delimiters - defaults to a double quote '"'
-- **json_path**: (optional) the JSON key under which the list of objets to use is located. Defaults to None, corresponding to an array at the top level of the JSON tree.
+- **json_path**: (optional) either the top-level JSON key under which the list of objects to use is located, or a [JSONPath](https://pypi.org/project/jsonpath-ng/) expression that selects those objects (the expression should return an array of objects; you can try expressions out at https://jsonpath.com). Defaults to None, which corresponds to an array at the top level of the JSON tree.
 
 ### Automatic Config Generation
 
@@ -152,6 +152,18 @@ JSON files are expected to parse as a root-level array of objects where each obj
 ]
 ``` 
 
+JSONPath can be used to extract a deeply nested array of objects; e.g., `json_path: response.data[*]` can be used to parse the following JSON file:
+```json
+{
+  "response": {
+    "data": [
+      { "name": "row one", "key": 42 },
+      { "name": "row two", "key": 43 }
+    ]
+  }
+}
+```
+
 ### JSONL (JSON Lines) support
 
 JSONL files are expected to parse as one object per line, where each row in a file is a set of key-value pairs.

diff --git a/setup.py b/setup.py
@@ -19,7 +19,8 @@
         'openpyxl',
         'xlrd',
         'paramiko',
-        'azure-storage-blob>=12.14.0'
+        'azure-storage-blob>=12.14.0',
+        'jsonpath-ng>=1.5.3'
     ],
     entry_points="""
     [console_scripts]

diff --git a/tap_spreadsheets_anywhere/json_handler.py b/tap_spreadsheets_anywhere/json_handler.py
@@ -1,4 +1,5 @@
 import json
+from jsonpath_ng.ext import parse
 import re
 from json import JSONDecodeError
 import logging
@@ -25,8 +26,12 @@ def get_row_iterator(table_spec, reader):
     try:
         json_array = json.load(reader)
         json_path = table_spec.get('json_path', None)
+
         if json_path is not None:
-            json_array = json_array[json_path]
+            if json_path in json_array:
+                json_array = json_array[json_path]
+            else:
+                return generator_wrapper(match.value for match in parse(json_path).find(json_array))
 
         # throw a TypeError if the root json object can not be iterated
         return generator_wrapper(iter(json_array))
@@ -39,7 +44,3 @@ def get_row_iterator(table_spec, reader):
             return generator_wrapper(json_objects)
         else:
             raise jde
-
-
-
-
diff --git a/tap_spreadsheets_anywhere/test/test_json.py b/tap_spreadsheets_anywhere/test/test_json.py
@@ -8,29 +8,29 @@
     "tables": [
         {
             "path": "file://./tap_spreadsheets_anywhere/test",
-            "name": "badnewlines",
-            "pattern": ".*\\.xlsx",
+            "name": "list",
+            "pattern": ".*\\.json",
             "start_date": "2017-05-01T00:00:00Z",
             "key_properties": [],
-            "format": "excel",
-            "worksheet_name": "sample_with_bad_newlines"
+            "format": "json"
         },
         {
             "path": "file://./tap_spreadsheets_anywhere/test",
-            "name": "badnewlines",
+            "name": "nestedlist",
             "pattern": ".*\\.json",
             "start_date": "2017-05-01T00:00:00Z",
             "key_properties": [],
-            "format": "detect"
+            "json_path": "someKey",
+            "format": "json"
         },
         {
             "path": "file://./tap_spreadsheets_anywhere/test",
-            "name": "nestedlist",
+            "name": "deepnestedlist",
             "pattern": ".*\\.json",
             "start_date": "2017-05-01T00:00:00Z",
             "key_properties": [],
-            "json_path": "someKey",
-            "format": "detect"
+            "json_path": "response.data[*]",
+            "format": "json"
         }
     ]
 }
@@ -48,6 +48,12 @@ def test_json_object_lists(self):
 
     def test_json_nested_array(self):
         reader = StringIO('{"someKey": [{"k":"v"},{"k":"v"},{"k":"v"}]}')
+        iterator = json_handler.get_row_iterator(TEST_TABLE_SPEC['tables'][1], reader)
+        for row in iterator:
+            self.assertEqual(row['k'], 'v')
+
+    def test_json_deep_nested_array(self):
+        reader = StringIO('{"response": {"data": [{"k":"v"},{"k":"v"},{"k":"v"}]}}')
         iterator = json_handler.get_row_iterator(TEST_TABLE_SPEC['tables'][2], reader)
         for row in iterator:
             self.assertEqual(row['k'], 'v')