Skip to content

Commit

Permalink
Merge pull request #3 from Matatika/fix/fix-dynamic-sheet-discovery
Browse files Browse the repository at this point in the history
Fix/fix dynamic sheet discovery
  • Loading branch information
DanielPDWalker authored Jul 4, 2022
2 parents 1876058 + 286bbd0 commit 53701f4
Show file tree
Hide file tree
Showing 10 changed files with 249 additions and 58 deletions.
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ Setting | Required | Type | Description |
`oauth_credentials.client_secret` | Required | String | Your google client secret
`oauth_credentials.refresh_token` | Required | String | Your google refresh token
`sheet_id` | Required | String | Your target google sheet id
`stream_name` | Optional | String | Optionally rename the stream and output file or table from the tap
`child_sheet_name` | Optional | String | Optionally choose a different sheet from your Google Sheet file

### Environment Variable

Expand All @@ -62,22 +64,26 @@ These settings expand into environment variables of:
- `TAP_GOOGLE_SHEETS_OAUTH_CREDENTIALS_CLIENT_SECRET`
- `TAP_GOOGLE_SHEETS_OAUTH_CREDENTIALS_REFRESH_TOKEN`
- `TAP_GOOGLE_SHEETS_SHEET_ID`
- `TAP_GOOGLE_SHEETS_STREAM_NAME`
- `TAP_GOOGLE_SHEETS_CHILD_SHEET_NAME`

---

## FAQ / Things to Note

* You need to provide all the settings for this tap to run it. These settings are used to generate the stream and schema for the tap to use from your Google Sheet.
* If you do not provide a `child_sheet_name`, the tap will find the first visible sheet in your Google Sheet and try to sync the data from there.

* You need to provide all the required settings for this tap to run it. These settings are used to generate the stream and schema for the tap to use from your Google Sheet.

* Currently the tap supports sheets that have the column name in the first row. (The tap builds a usable json object up by using these column names).

* The tap will skip all columns without a name. (The tap builds a usable json object up by using these column names).

* If syncing to a database it will not respect duplicated column names. The last column with the same name will be the only one synced along with its data.

* The tap will use your Google Sheet's name as output file or table name. It will lowercase the file name, and replace any spaces with underscores.
* The tap will use your Google Sheet's name as output file or table name unless you set a `stream_name`. It will replace any spaces with underscores.

* The tap will not lower case the column names, but will again replace any spaces with underscores.
* The tap will again replace any spaces in column names with underscores.

### Loaders Tested

Expand All @@ -91,7 +97,6 @@ These settings expand into environment variables of:

## Roadmap

- [ ] Add setting to optionally allow renaming the sheet stream name. (File or table name output by stream).
- [ ] Add setting to optionally allow the selection of a range of data from a sheet. (Add an optional range setting).
- [ ] Add setting to enable primary key, and select primary key(s) column(s).

Expand Down
52 changes: 31 additions & 21 deletions meltano.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,35 @@ send_anonymous_usage_stats: true
project_id: 04da77a3-af12-49a4-b9bf-3c22845918ba
plugins:
extractors:
- name: tap-google-sheets
namespace: tap_google_sheets
pip_url: -e .
capabilities:
- state
- catalog
- discover
settings:
- name: oauth_credentials.client_id
kind: password
- name: oauth_credentials.client_secret
kind: password
- name: oauth_credentials.refresh_token
kind: password
- name: sheet_id
- name: tap-google-sheets
namespace: tap_google_sheets
pip_url: -e .
capabilities:
- state
- catalog
- discover
select:
- spreadsheet.*
settings:
- name: oauth_credentials.client_id
kind: password
- name: oauth_credentials.client_secret
kind: password
- name: oauth_credentials.refresh_token
kind: password
- name: sheet_id
- name: stream_name
- name: child_sheet_name
loaders:
- name: target-jsonl
variant: andyh1203
pip_url: target-jsonl
- name: target-postgres
variant: transferwise
pip_url: pipelinewise-target-postgres
- name: target-jsonl
variant: andyh1203
pip_url: target-jsonl
- name: target-postgres
variant: transferwise
pip_url: pipelinewise-target-postgres
- name: target-csv
variant: hotgluexyz
pip_url: git+https://github.com/hotgluexyz/[email protected]
- name: target-snowflake
variant: meltano
pip_url: git+https://github.com/Matatika/[email protected]
6 changes: 4 additions & 2 deletions tap_google_sheets/streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,16 @@
class GoogleSheetsStream(GoogleSheetsBaseStream):
"""Google sheets stream."""

child_sheet_name = None
primary_key = None

@property
def path(self):
"""Set the path for the stream."""
self.url_base = "https://sheets.googleapis.com/v4/spreadsheets/"
path = self.url_base + self.config.get("sheet_id") + "/"
path = path + "values/" + "Sheet1" # self.config.get("sheet_name")
path = (
self.url_base + self.config["sheet_id"] + "/values/" + self.child_sheet_name
)
return path

def parse_response(self, response: requests.Response) -> Iterable[dict]:
Expand Down
60 changes: 47 additions & 13 deletions tap_google_sheets/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,41 @@ class TapGoogleSheets(Tap):
description="Your google refresh token",
),
th.Property("sheet_id", th.StringType, description="Your google sheet id"),
# Declared as "stream_name" so the schema matches the key that
# discover_streams() reads and that the README / meltano.yml document.
th.Property(
    "stream_name",
    th.StringType,
    description="Optionally rename your output file or table",
    required=False,
),
th.Property(
"child_sheet_name",
th.StringType,
description="Optionally sync data from a different sheet in"
+ " your Google Sheet",
required=False,
),
).to_dict()

def discover_streams(self) -> List[Stream]:
"""Return a list of discovered streams."""
streams: List[Stream] = []

stream_name = self.get_sheet_name()
stream_schema = self.get_schema()
stream_name = self.config.get("stream_name") or self.get_sheet_name()
stream_name = stream_name.replace(" ", "_")

google_sheet_data = self.get_sheet_data()

stream_schema = self.get_schema(google_sheet_data)

child_sheet_name = self.config.get(
"child_sheet_name"
) or self.get_first_visible_child_sheet_name(google_sheet_data)

if stream_name:
stream = GoogleSheetsStream(
tap=self, name=stream_name, schema=stream_schema
)
stream.child_sheet_name = child_sheet_name
stream.selected
streams.append(stream)

Expand All @@ -63,28 +85,40 @@ def get_sheet_name(self):

response: requests.Response = config_stream._request(prepared_request, None)

return response.json().get("title").lower().replace(" ", "_")
return response.json().get("title")

def get_schema(self):
def get_schema(self, google_sheet_data: requests.Response):
    """Build the stream schema from the heading row of the sheet data.

    Every non-empty heading becomes a string-typed property whose name is
    the heading with spaces replaced by underscores. Columns with an empty
    heading are skipped.

    Args:
        google_sheet_data: Response whose JSON body holds the sheet rows
            under the ``values`` key; only the first row is used.

    Returns:
        The schema as a dict, as produced by ``PropertiesList.to_dict()``.
        The property list is empty when the response carries no rows.
    """
    # The Sheets API omits "values" altogether when the requested range is
    # empty, so fall back to "no rows" instead of raising KeyError.
    rows = google_sheet_data.json().get("values") or []
    headings = rows[0] if rows else []

    schema = th.PropertiesList()
    for column in headings:
        if column:
            schema.append(th.Property(column.replace(" ", "_"), th.StringType))

    return schema.to_dict()

def get_first_visible_child_sheet_name(self, google_sheet_data: requests.Response):
    """Return the sheet-name part of the range reported in the response.

    The ``range`` value of the response JSON is split on its last ``!``
    and the text before it is returned. When the value contains no ``!``
    it is returned unchanged.
    """
    reported_range = google_sheet_data.json()["range"]
    child_sheet_name, *_ = reported_range.rsplit("!", 1)
    return child_sheet_name

def get_sheet_data(self):
"""Get the data from the selected or first visible sheet in the google sheet."""
config_stream = GoogleSheetsBaseStream(
tap=self,
name="config",
schema={"not": "null"},
path="https://sheets.googleapis.com/v4/spreadsheets/"
+ self.config["sheet_id"]
+ "/values/Sheet1!1:1",
+ "/values/"
+ self.config.get("child_sheet_name", "")
+ "!1:1",
)

prepared_request = config_stream.prepare_request(None, None)

response: requests.Response = config_stream._request(prepared_request, None)

headings, *data = response.json()["values"]

schema = th.PropertiesList()
for column in headings:
if column:
schema.append(th.Property(column.replace(" ", "_"), th.StringType))

return schema.to_dict()
return response
78 changes: 78 additions & 0 deletions tap_google_sheets/tests/test_child_sheet_name_setting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Tests tap setting child_sheet_name."""

import unittest

import responses
import singer

import tap_google_sheets.tests.utils as test_utils
from tap_google_sheets.tap import TapGoogleSheets


class TestChildSheetNameSetting(unittest.TestCase):
    """Tests that the `child_sheet_name` setting selects the sheet to sync."""

    def setUp(self):
        """Build a config with `child_sheet_name` set and reset shared mocks."""
        self.mock_config = {
            "oauth_credentials": {
                "client_id": "123",
                "client_secret": "123",
                "refresh_token": "123",
            },
            "sheet_id": "12345",
            "child_sheet_name": "Test Sheet",
        }

        responses.reset()
        del test_utils.SINGER_MESSAGES[:]

        # Route singer output through the test helper so the assertions
        # below can inspect test_utils.SINGER_MESSAGES.
        singer.write_message = test_utils.accumulate_singer_messages

    @responses.activate()
    def test_discovered_stream_name(self):
        """Sync from the configured child sheet and check the emitted record."""
        self.column_response = {"values": [["Column One", "Column Two"], ["1", "1"]]}

        # Each mock is registered as its own statement; no trailing commas,
        # which would only build a throwaway tuple of the return values.
        responses.add(
            responses.POST,
            "https://oauth2.googleapis.com/token",
            json={"access_token": "new_token"},
            status=200,
        )
        responses.add(
            responses.GET,
            "https://www.googleapis.com/drive/v2/files/12345",
            json={"title": "File Name One"},
            status=200,
        )
        # Heading-row request issued during discovery for the named sheet.
        responses.add(
            responses.GET,
            "https://sheets.googleapis.com/v4/spreadsheets/12345/values/"
            + "Test%20Sheet!1:1",
            json={
                "range": "Test%20Sheet!1:1",
                "values": [["Column One", "Column Two"]],
            },
            status=200,
        )
        # Full-sheet request issued during sync for the named sheet.
        responses.add(
            responses.GET,
            "https://sheets.googleapis.com/v4/spreadsheets/12345/values/Test%20Sheet",
            json=self.column_response,
            status=200,
        )

        tap = TapGoogleSheets(config=self.mock_config)

        tap.sync_all()

        self.assertEqual(len(test_utils.SINGER_MESSAGES), 4)
        self.assertIsInstance(test_utils.SINGER_MESSAGES[0], singer.SchemaMessage)
        self.assertIsInstance(test_utils.SINGER_MESSAGES[1], singer.SchemaMessage)
        self.assertIsInstance(test_utils.SINGER_MESSAGES[2], singer.RecordMessage)
        self.assertIsInstance(test_utils.SINGER_MESSAGES[3], singer.StateMessage)

        # Assert that data is synced from the mocked response.
        # assertEqual is used because the assertEquals alias was removed
        # in Python 3.12.
        self.assertEqual(
            test_utils.SINGER_MESSAGES[2].record, {"Column_One": "1", "Column_Two": "1"}
        )
13 changes: 8 additions & 5 deletions tap_google_sheets/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def test_base_credentials_discovery(self):
),
responses.add(
responses.GET,
"https://sheets.googleapis.com/v4/spreadsheets/12345/values/Sheet1!1:1",
json={"values": [["column_one", "column_two"]]},
"https://sheets.googleapis.com/v4/spreadsheets/12345/values/!1:1",
json={"range": "Sheet1!1:1", "values": [["column_one", "column_two"]]},
status=200,
)

Expand All @@ -63,14 +63,17 @@ def test_standard_tap_tests(self):
),
responses.add(
responses.GET,
"https://sheets.googleapis.com/v4/spreadsheets/12345/values/Sheet1!1:1",
json={"values": [["column_one", "column_two"]]},
"https://sheets.googleapis.com/v4/spreadsheets/12345/values/!1:1",
json={"range": "Sheet1!1:1", "values": [["column_one", "column_two"]]},
status=200,
),
responses.add(
responses.GET,
"https://sheets.googleapis.com/v4/spreadsheets/12345/values/Sheet1",
json={"values": [["column_one", "column_two"], ["1", "2"]]},
json={
"range": "Sheet1",
"values": [["column_one", "column_two"], ["1", "2"]],
},
status=200,
)

Expand Down
10 changes: 5 additions & 5 deletions tap_google_sheets/tests/test_discovered_stream_name.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Tests standard tap features using the built-in SDK tests library."""
"""Tests discovered stream names are returned underscored."""

import unittest

Expand All @@ -10,7 +10,7 @@


class TestDiscoveredStreamName(unittest.TestCase):
"""Test class for core tap tests."""
"""Test class for discovered stream name."""

def setUp(self):
self.mock_config = test_utils.MOCK_CONFIG
Expand Down Expand Up @@ -39,8 +39,8 @@ def test_discovered_stream_name(self):
),
responses.add(
responses.GET,
"https://sheets.googleapis.com/v4/spreadsheets/12345/values/Sheet1!1:1",
json={"values": [["Column One", "Column Two"]]},
"https://sheets.googleapis.com/v4/spreadsheets/12345/values/!1:1",
json={"range": "Sheet1!1:1", "values": [["Column One", "Column Two"]]},
status=200,
),
responses.add(
Expand All @@ -55,4 +55,4 @@ def test_discovered_stream_name(self):
tap.sync_all()

# Assert the returned stream name is underscored (original casing is kept)
self.assertEqual(tap.discover_streams()[0].name, "file_name_one")
self.assertEqual(tap.discover_streams()[0].name, "File_Name_One")
8 changes: 4 additions & 4 deletions tap_google_sheets/tests/test_ignoring_unnamed_columns.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Tests standard tap features using the built-in SDK tests library."""
"""Tests that the tap ignores columns with no name"""

import unittest

Expand All @@ -10,7 +10,7 @@


class TestIgnoringUnnamedColumns(unittest.TestCase):
"""Test class for core tap tests."""
"""Test class for ignoring unnamed columns."""

def setUp(self):
self.mock_config = test_utils.MOCK_CONFIG
Expand Down Expand Up @@ -45,8 +45,8 @@ def test_ignoring_unnamed_columns(self):
),
responses.add(
responses.GET,
"https://sheets.googleapis.com/v4/spreadsheets/12345/values/Sheet1!1:1",
json={"values": [["Column_One", "", "Column_Two"]]},
"https://sheets.googleapis.com/v4/spreadsheets/12345/values/!1:1",
json={"range": "Sheet1!1:1", "values": [["Column_One", "", "Column_Two"]]},
status=200,
),
responses.add(
Expand Down
Loading

0 comments on commit 53701f4

Please sign in to comment.