From 32e769fc44a62fdc2cd109df2d677e472f794c1e Mon Sep 17 00:00:00 2001
From: Denis <denis@tyshkan.ru>
Date: Sat, 25 Mar 2023 20:50:31 +0300
Subject: [PATCH 1/2] Add JSONPath parser support

---
 README.md                                   | 14 +++++++++++++-
 setup.py                                    |  3 ++-
 tap_spreadsheets_anywhere/json_handler.py   | 11 ++++++-----
 tap_spreadsheets_anywhere/test/test_json.py | 18 ++++++++++++------
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 7b7ed66..42bb21c 100644
--- a/README.md
+++ b/README.md
@@ -114,7 +114,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet
 - **worksheet_name**: (optional) the worksheet name to pull from in the targeted xls file(s). Only required when format is excel
 - **delimiter**: (optional) the delimiter to use when format is 'csv'. Defaults to a comma ',' but you can set delimiter to 'detect' to leverage the csv "Sniffer" for auto-detecting delimiter. 
 - **quotechar**: (optional) the character used to surround values that may contain delimiters - defaults to a double quote '"'
-- **json_path**: (optional) the JSON key under which the list of objets to use is located. Defaults to None, corresponding to an array at the top level of the JSON tree.
+- **json_path**: (optional) either the top-level JSON key under which the list of objects to use is located, or a [JSONPath](https://pypi.org/project/jsonpath-ng/) expression whose matches are the objects to use (expressions can be tested at https://jsonpath.com). Defaults to None, which corresponds to an array at the top level of the JSON tree.
 
 ### Automatic Config Generation
 
@@ -152,6 +152,18 @@ JSON files are expected to parse as a root-level array of objects where each obj
 ]
 ``` 
 
+JSONPath can be used to parse a deeply nested array of objects, e.g., `json_path: response.data[*]` can be used to parse the following JSON file:
+```json
+{
+  "response": {
+    "data": [
+      { "name": "row one", "key": 42 },
+      { "name": "row two", "key": 43 }
+    ]
+  }
+}
+```
+
 ### JSONL (JSON Lines) support
 
 JSONL files are expected to parse as one object per line, where each row in a file is a set of key-value pairs.
diff --git a/setup.py b/setup.py
index 3dc9109..ebaff26 100755
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,8 @@
         'openpyxl',
         'xlrd',
         'paramiko',
-        'azure-storage-blob>=12.14.0'
+        'azure-storage-blob>=12.14.0',
+        'jsonpath-ng>=1.5.3'
     ],
     entry_points="""
     [console_scripts]
diff --git a/tap_spreadsheets_anywhere/json_handler.py b/tap_spreadsheets_anywhere/json_handler.py
index add6e33..61b2d25 100644
--- a/tap_spreadsheets_anywhere/json_handler.py
+++ b/tap_spreadsheets_anywhere/json_handler.py
@@ -1,4 +1,5 @@
 import json
+from jsonpath_ng.ext import parse
 import re
 from json import JSONDecodeError
 import logging
@@ -25,8 +26,12 @@ def get_row_iterator(table_spec, reader):
     try:
         json_array = json.load(reader)
         json_path = table_spec.get('json_path', None)
+
         if json_path is not None:
-            json_array = json_array[json_path]
+            if json_path in json_array:
+                json_array = json_array[json_path]
+            else:
+                return generator_wrapper(match.value for match in parse(json_path).find(json_array))
 
         # throw a TypeError if the root json object can not be iterated
         return generator_wrapper(iter(json_array))
@@ -39,7 +44,3 @@ def get_row_iterator(table_spec, reader):
             return generator_wrapper(json_objects)
         else:
             raise jde
-
-
-
-
diff --git a/tap_spreadsheets_anywhere/test/test_json.py b/tap_spreadsheets_anywhere/test/test_json.py
index 37cff1f..9fbc836 100644
--- a/tap_spreadsheets_anywhere/test/test_json.py
+++ b/tap_spreadsheets_anywhere/test/test_json.py
@@ -9,27 +9,27 @@
         {
             "path": "file://./tap_spreadsheets_anywhere/test",
             "name": "badnewlines",
-            "pattern": ".*\\.xlsx",
+            "pattern": ".*\\.json",
             "start_date": "2017-05-01T00:00:00Z",
             "key_properties": [],
-            "format": "excel",
-            "worksheet_name": "sample_with_bad_newlines"
+            "format": "detect"
         },
         {
             "path": "file://./tap_spreadsheets_anywhere/test",
-            "name": "badnewlines",
+            "name": "nestedlist",
             "pattern": ".*\\.json",
             "start_date": "2017-05-01T00:00:00Z",
             "key_properties": [],
+            "json_path": "someKey",
             "format": "detect"
         },
         {
             "path": "file://./tap_spreadsheets_anywhere/test",
-            "name": "nestedlist",
+            "name": "deepnestedlist",
             "pattern": ".*\\.json",
             "start_date": "2017-05-01T00:00:00Z",
             "key_properties": [],
-            "json_path": "someKey",
+            "json_path": "response.data[*]",
             "format": "detect"
         }
     ]
@@ -48,6 +48,12 @@ def test_json_object_lists(self):
 
     def test_json_nested_array(self):
         reader = StringIO('{"someKey": [{"k":"v"},{"k":"v"},{"k":"v"}]}')
+        iterator = json_handler.get_row_iterator(TEST_TABLE_SPEC['tables'][1], reader)
+        for row in iterator:
+            self.assertEqual(row['k'], 'v')
+    
+    def test_json_deep_nested_array(self):
+        reader = StringIO('{"response": {"data": [{"k":"v"},{"k":"v"},{"k":"v"}]}}')
         iterator = json_handler.get_row_iterator(TEST_TABLE_SPEC['tables'][2], reader)
         for row in iterator:
             self.assertEqual(row['k'], 'v')

From dd27e058cd0422db03d57578f68c0ea78655c97f Mon Sep 17 00:00:00 2001
From: Denis <denis@tyshkan.ru>
Date: Thu, 30 Mar 2023 20:10:12 +0300
Subject: [PATCH 2/2] Cleanup json format tests

---
 tap_spreadsheets_anywhere/test/test_json.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tap_spreadsheets_anywhere/test/test_json.py b/tap_spreadsheets_anywhere/test/test_json.py
index 9fbc836..9ba81dd 100644
--- a/tap_spreadsheets_anywhere/test/test_json.py
+++ b/tap_spreadsheets_anywhere/test/test_json.py
@@ -8,11 +8,11 @@
     "tables": [
         {
             "path": "file://./tap_spreadsheets_anywhere/test",
-            "name": "badnewlines",
+            "name": "list",
             "pattern": ".*\\.json",
             "start_date": "2017-05-01T00:00:00Z",
             "key_properties": [],
-            "format": "detect"
+            "format": "json"
         },
         {
             "path": "file://./tap_spreadsheets_anywhere/test",
@@ -21,7 +21,7 @@
             "start_date": "2017-05-01T00:00:00Z",
             "key_properties": [],
             "json_path": "someKey",
-            "format": "detect"
+            "format": "json"
         },
         {
             "path": "file://./tap_spreadsheets_anywhere/test",
@@ -30,7 +30,7 @@
             "start_date": "2017-05-01T00:00:00Z",
             "key_properties": [],
             "json_path": "response.data[*]",
-            "format": "detect"
+            "format": "json"
         }
     ]
 }