diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..69fbfa8 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +root = true + +[*] +indent_style = space +indent_size = 4 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.yml] +indent_size = 2 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8fc3004 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Brooklyn Data Company LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..243b27c --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ +# Meltano on GitHub Actions + +This template uses [cookiecutter](https://github.com/cookiecutter/cookiecutter) to generate a [GitHub Actions](https://github.com/features/actions)-orchestrated [Meltano](https://meltano.com/) project. 
+ +For a list of pre-made Singer taps and targets, see the [Meltano Hub](https://hub.meltano.com/singer/taps/). + +## Benefits of using GitHub Actions + +- Requires only a GitHub account to deploy a fully automated Meltano project with logging, alerting, and incremental state +- Configuration is version-controlled and maintained as code +- Low barrier to entry: a working project can be deployed on a schedule in a couple of hours +- Easy management of secrets +- Low cost. See the free minutes for your plan [here](https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions#included-storage-and-minutes), and the incremental cost outside of your free minutes [here](https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions#per-minute-rates). + +## Limitations of using GitHub Actions + +- There is a [6-hour maximum run time](https://docs.github.com/en/actions/learn-github-actions/usage-limits-billing-and-administration#usage-limits) for any isolated individual job in a workflow. This often means that backfills need to be performed outside of GitHub Actions. +- No way of exposing the Meltano UI + +## Usage prerequisites: +- Python >= 3.7 +- pipx +- meltano +- cookiecutter + +Install pipx with: +```bash +pip install pipx +pipx ensurepath +``` + +Install Meltano with: +```bash +pipx install meltano +``` + +Install Cookiecutter with: +```bash +pipx install cookiecutter +``` + +## Instructions + +1. In your terminal, navigate to the parent folder in which you'd like the project to be created. +2. Run `cookiecutter https://github.com/brooklyn-data/meltano-on-github-actions` and follow the prompts. +3. From inside the newly generated project, search for all 'TODO' strings, and complete any actions required. +4. Once ready to publish, initialize Git with `git init`. +5. Create an empty repository in GitHub. +6. 
Take the `.git` URL of the newly created remote repository, and run `git remote add origin <.git url>`. +7. Stage and commit the generated project files with `git add -A` and `git commit -m 'Initial commit'`. +8. Make sure the branch is named `main` by running `git branch -M main`. +9. Finally, push the created project to the remote repository with `git push -u origin main`. +10. Configure any required secrets in the GitHub repo. + +## Slack alerts +Slack alerts on failure are sent [using the official Slack GitHub action](https://github.com/slackapi/slack-github-action), following '[Technique 2: Slack App](https://github.com/slackapi/slack-github-action#technique-2-slack-app)'. To configure: + +1. [Create a Slack App](https://api.slack.com/apps) for the workspace, with a suitable name (e.g. Meltano). +2. Add the [`chat:write`](https://api.slack.com/scopes/chat:write) bot scope under OAuth & Permissions. +3. Install the app to the workspace. +4. Copy the app's Bot Token from the OAuth & Permissions page and add it as a secret in the repo settings named `SLACK_BOT_TOKEN`. +5. Invite the bot user into the channel you wish to post messages to (/invite @bot_user_name). +6. Copy the Slack channel's Channel ID (from the channel's About section, accessed by clicking the drop-down arrow next to the channel's name) into another repository secret named `SLACK_CHANNEL_ID`. + +# About Brooklyn Data Co. +We are a full-stack data and analytics team, focused on leadership, process improvement, implementation, and advanced analytics. 
[Read more about what we do](https://brooklyndata.co) and [check out our open roles!](https://brooklyndata.co/careers) diff --git a/cookiecutter.json b/cookiecutter.json new file mode 100644 index 0000000..5cc8d4c --- /dev/null +++ b/cookiecutter.json @@ -0,0 +1,9 @@ +{ + "org_name": "Correctly capitalized organization name", + "project_folder_name": "meltano-on-github-actions", + "add_target": [ + "No", + "Snowflake", + "BigQuery" + ] +} diff --git a/{{ cookiecutter.project_folder_name }}/.editorconfig b/{{ cookiecutter.project_folder_name }}/.editorconfig new file mode 100644 index 0000000..69fbfa8 --- /dev/null +++ b/{{ cookiecutter.project_folder_name }}/.editorconfig @@ -0,0 +1,12 @@ +root = true + +[*] +indent_style = space +indent_size = 4 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.yml] +indent_size = 2 diff --git a/{{ cookiecutter.project_folder_name }}/.github/workflows/README.md b/{{ cookiecutter.project_folder_name }}/.github/workflows/README.md new file mode 100644 index 0000000..28ba154 --- /dev/null +++ b/{{ cookiecutter.project_folder_name }}/.github/workflows/README.md @@ -0,0 +1 @@ +Create new tap-target workflow files using the existing as a guide. 
diff --git a/{{ cookiecutter.project_folder_name }}/.github/workflows/example-workflow.yml b/{{ cookiecutter.project_folder_name }}/.github/workflows/example-workflow.yml
new file mode 100644
index 0000000..047cfb9
--- /dev/null
+++ b/{{ cookiecutter.project_folder_name }}/.github/workflows/example-workflow.yml
@@ -0,0 +1,73 @@
+{%- raw %}
+# Runs a Meltano EL pipeline on a schedule (or manually) using the Docker action at the repo root.
+# Incremental state lives in a SQLite database that is restored from, and saved back to, a workflow artifact.
+# NOTE: do not add a top-level `name:` to this workflow; the artifact download step looks the workflow up via github.workflow.
+on:
+  schedule:
+    - cron: '0 8 * * *' # Every day at 8AM UTC - TODO update as needed
+  workflow_dispatch: # workflow_dispatch enables the workflow to be triggered manually
+
+jobs:
+  meltano:
+    runs-on: ubuntu-latest
+    name: Meltano
+    steps:
+      # Checkout the repository so that a path to the action.yml can be used
+      - uses: actions/checkout@v2
+{% endraw %}{%- if cookiecutter.add_target == "BigQuery" %}{% raw %}
+      # Authenticate to Google SDK for target-bigquery
+      - name: Set up Cloud SDK
+        uses: google-github-actions/setup-gcloud@v0.5.1
+        with:
+          # TODO add GCP_PROJECT_ID and GCP_SA_KEY as GitHub Secrets
+          project_id: ${{ secrets.GCP_PROJECT_ID }}
+          service_account_key: ${{ secrets.GCP_SA_KEY }}
+          export_default_credentials: true
+{% endraw %}{%- endif %}{% raw %}
+      # Get the database files from the previous run, if exists
+      - name: Download latest SQLite database artifact
+        uses: dawidd6/action-download-artifact@v2
+        continue-on-error: true
+        with:
+          workflow: ${{ github.workflow }}
+          branch: main
+          name: meltano.db
+          path: ${{ github.workspace }}
+          workflow_conclusion: completed # Download artifacts from the latest completed run, even if it failed. This enables Meltano to start off from where a tap failed.
+      - name: Meltano
+        if: always() # Always run this step even if the prior failed due to no artifact found (this should only take effect if first time running)
+        uses: ./
+        env:
+          # TODO add and modify as needed and add to the repo's secrets
+{% endraw %}{%- if cookiecutter.add_target == "Snowflake" %}{% raw %}
+          TARGET_SNOWFLAKE_ACCOUNT: ${{ secrets.TARGET_SNOWFLAKE_ACCOUNT }}
+          TARGET_SNOWFLAKE_USERNAME: MELTANO
+          TARGET_SNOWFLAKE_PASSWORD: ${{ secrets.TARGET_SNOWFLAKE_PASSWORD }}
+          TARGET_SNOWFLAKE_DBNAME: MELTANO
+          TARGET_SNOWFLAKE_WAREHOUSE: MELTANO
+          TARGET_SNOWFLAKE_FILE_FORMAT: meltano.public.csv
+{% endraw %}{%- elif cookiecutter.add_target == "BigQuery" %}{% raw %}
+          TARGET_BIGQUERY_PROJECT_ID: # TODO add the production BigQuery project ID here
+{% endraw %}{%- endif %}{% raw %}
+          MELTANO_DATABASE_URI: sqlite:////github/workspace/meltano.db
+        with:
+          # TODO set correct tap and target names
+          args: meltano elt tap-example {% endraw %}{% if cookiecutter.add_target == "Snowflake" %}target-snowflake{% elif cookiecutter.add_target == "BigQuery" %}target-bigquery{% else %}target-example{% endif %}{% raw %} --job_id=meltano # job_id required to use incremental state, can be anything
+      - name: Meltano Slack alert
+        # TODO configure SLACK_CHANNEL_ID and SLACK_BOT_TOKEN secrets in the repo
+        if: failure() # Only run if one of the previous steps fails
+        id: slack
+        uses: slackapi/slack-github-action@v1.16.0
+        with:
+          channel-id: ${{ secrets.SLACK_CHANNEL_ID }}
+          slack-message: "Meltano workflow ${{ github.workflow }} failed: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+      # Upload the sqlite database containing incremental, bookmark state
+      - name: Upload SQLite database artifact
+        if: always() # Always run this step even if the prior failed
+        uses: actions/upload-artifact@v2
+        with:
+          name: meltano.db
+          path: ${{ github.workspace }}/meltano.db*
+{%- endraw %}
diff --git a/{{ cookiecutter.project_folder_name }}/.gitignore b/{{ cookiecutter.project_folder_name }}/.gitignore
new file mode 100644
index
0000000..37cd3e3 --- /dev/null +++ b/{{ cookiecutter.project_folder_name }}/.gitignore @@ -0,0 +1,137 @@ +# Secrets and internal config files +**/.secrets/* +gcp.json + +# Ignore meltano internal cache and sqlite systemdb + +.meltano/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/{{ cookiecutter.project_folder_name }}/Dockerfile b/{{ cookiecutter.project_folder_name }}/Dockerfile new file mode 100644 index 0000000..6499a1d --- /dev/null +++ b/{{ cookiecutter.project_folder_name }}/Dockerfile @@ -0,0 +1,10 @@ +FROM meltano/meltano:v1.96.0-python3.8 + +# Copy the Meltano project into the image at /project and run `meltano install` at build time +COPY . /project/ +RUN meltano install + +# GitHub Actions sets the working directory to the repo root when running, so +# ./entrypoint.sh is resolved from there. The script changes into /project (the +# Meltano project copied above) and runs the container arguments as a command. +ENTRYPOINT ["./entrypoint.sh"] diff --git a/{{ cookiecutter.project_folder_name }}/README.md b/{{ cookiecutter.project_folder_name }}/README.md new file mode 100644 index 0000000..7466ef5 --- /dev/null +++ b/{{ cookiecutter.project_folder_name }}/README.md @@ -0,0 +1,87 @@ +# {{ cookiecutter.org_name }}'s Meltano project + +This project has been generated using [Brooklyn Data Co.](https://brooklyndata.co/)'s [Meltano on GitHub Actions template](https://github.com/brooklyn-data/meltano-on-github-actions). + +## What's Meltano? + +[Meltano](https://meltano.com/) is a configuration and orchestration manager for Singer taps and targets. A tap extracts data from a data source, and a target moves data into a destination. + +## In this repo + +This repo contains the configuration for a Meltano project and GitHub Action workflows which run Meltano jobs on schedules. The configured taps and targets can be seen within `meltano.yml`. Schedules are configured within `.github/workflows`. 
+ +## Local development + +Prerequisites: +- Python >= 3.7 +- `pipx` +- `meltano` + +Install pipx with: +```bash +pip install pipx +pipx ensurepath +``` + +Install Meltano with: +```bash +pipx install meltano +``` + +### Local Meltano invocation + +1. Create a `.env` file in this directory using `example.env` as a template and populate the blank values +2. Initialize the Meltano project: +```bash +meltano install +``` +3. Test using Meltano: +```bash +# Test invocation: +meltano invoke tap-example --version +# Run a test `elt` pipeline: +meltano elt tap-example target-jsonl --job_id=test +``` + +The output streams from the tap will be written to `.jsonl` files within the `output` directory. + +{%- if cookiecutter.add_target == "Snowflake" %} +## Snowflake configuration + +We use [pipelinewise-target-snowflake](https://github.com/transferwise/pipelinewise-target-snowflake) to load data into Snowflake. See the pre-requirements for the target here: https://github.com/transferwise/pipelinewise-target-snowflake#pre-requirements + +Use development credentials and database of `MELTANO_DEV` configured as default in `meltano.yml` for local testing. +{%- elif cookiecutter.add_target == "BigQuery" %} +## BigQuery configuration + +We use [target-bigquery](https://github.com/adswerve/target-bigquery) to load data into BigQuery. See the pre-requirements for the target here: https://github.com/adswerve/target-bigquery#how-to-use-it + +Use development credentials and project configured as default in `meltano.yml` for local testing. +{%- endif %} +## Deployment + +This pipeline is deployed on GitHub Actions, and orchestrated by Meltano. Secrets are configured in GitHub actions per [this guide](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository). Note that configuration in `meltano.yml` is overridden by environment variables. 
The environment variables set are: + +<!-- TODO - add to or modify as needed --> +{%- if cookiecutter.add_target == "Snowflake" %} +- `TARGET_SNOWFLAKE_ACCOUNT` +- `TARGET_SNOWFLAKE_USERNAME` +- `TARGET_SNOWFLAKE_PASSWORD` +- `TARGET_SNOWFLAKE_DBNAME` +- `TARGET_SNOWFLAKE_WAREHOUSE` +- `TARGET_SNOWFLAKE_FILE_FORMAT` +{%- endif %} + +[Artifacts](https://docs.github.com/en/actions/advanced-guides/storing-workflow-data-as-artifacts) are used to persist Meltano's SQLite database between runs. Doing so enables pipelines to run incrementally, i.e. only load new or updated data in each run. + +GitHub Actions configuration can be seen in `.github/workflows`. A new workflow is created for each pipeline, with independent state. Note that the `action-download-artifact` step requires that the workflow is not given a 'name' parameter. + +## Slack alerts +Slack alerts on failure are sent [using the official Slack GitHub action](https://github.com/slackapi/slack-github-action), following '[Technique 2: Slack App](https://github.com/slackapi/slack-github-action#technique-2-slack-app)'. Configuration: + +1. [Create a Slack App](https://api.slack.com/apps) for the workspace, with a suitable name (e.g. Meltano). +2. Add the [`chat:write`](https://api.slack.com/scopes/chat:write) bot scope under OAuth & Permissions. +3. Install the app to the workspace. +4. Copy the app's Bot Token from the OAuth & Permissions page and add it as a secret in the repo settings named `SLACK_BOT_TOKEN`. +5. Invite the bot user into the channel you wish to post messages to (/invite @bot_user_name). +6. Copy the Slack channel's Channel ID (from the channel's About window, accessed by clicking the drop-down arrow next to the channel's name) into another repository secret named `SLACK_CHANNEL_ID`. 
diff --git a/{{ cookiecutter.project_folder_name }}/action.yml b/{{ cookiecutter.project_folder_name }}/action.yml
new file mode 100644
index 0000000..d556f72
--- /dev/null
+++ b/{{ cookiecutter.project_folder_name }}/action.yml
@@ -0,0 +1,5 @@
+name: 'Run Meltano'
+description: 'Runs a Meltano command in the Docker image'
+runs:
+  using: 'docker'
+  image: 'Dockerfile'
diff --git a/{{ cookiecutter.project_folder_name }}/entrypoint.sh b/{{ cookiecutter.project_folder_name }}/entrypoint.sh
new file mode 100755
index 0000000..810b334
--- /dev/null
+++ b/{{ cookiecutter.project_folder_name }}/entrypoint.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# Container entrypoint: runs the arguments passed to the container as a shell
+# command from inside the Meltano project directory baked into the image.
+
+# Abort if the project directory is missing rather than running elsewhere.
+cd /project || exit 1
+echo "Running '$*'"
+# Quote "$*" so the arguments reach eval as one string, without an extra round
+# of word splitting and glob expansion before eval parses the command.
+eval "$*"
diff --git a/{{ cookiecutter.project_folder_name }}/example.env b/{{ cookiecutter.project_folder_name }}/example.env
new file mode 100644
index 0000000..311f169
--- /dev/null
+++ b/{{ cookiecutter.project_folder_name }}/example.env
@@ -0,0 +1,10 @@
+# TODO - Add any other required environment variables here for development
+# Copy and paste me, renaming result to .env
+{%- if cookiecutter.add_target == "Snowflake" %}
+TARGET_SNOWFLAKE_ACCOUNT=""
+TARGET_SNOWFLAKE_USERNAME=""
+TARGET_SNOWFLAKE_PASSWORD=""
+TARGET_SNOWFLAKE_WAREHOUSE=""
+{%- elif cookiecutter.add_target == "BigQuery" %}
+GOOGLE_APPLICATION_CREDENTIALS="gcp.json"
+{%- endif %}
diff --git a/{{ cookiecutter.project_folder_name }}/meltano.yml b/{{ cookiecutter.project_folder_name }}/meltano.yml
new file mode 100644
index 0000000..0ca9a11
--- /dev/null
+++ b/{{ cookiecutter.project_folder_name }}/meltano.yml
@@ -0,0 +1,36 @@
+version: 1
+plugins:
+  extractors:
+    - name: tap-example # TODO add a real tap's configuration here https://docs.meltano.com/guide/plugin-management#adding-a-plugin-to-your-project
+{%- if cookiecutter.add_target == "Snowflake" %}
+      # Load into the example schema in Snowflake TODO edit as needed
+{%- elif cookiecutter.add_target == "BigQuery" %}
+      # Load into the example dataset in BigQuery TODO edit as needed
+{%- endif %}
+      load_schema: example
+  loaders:
+    # target-jsonl is used for testing, files are stored in the output directory
+    - name: target-jsonl
+      variant: andyh1203
+      pip_url: target-jsonl
+{%- if cookiecutter.add_target == "Snowflake" %}
+    - name: target-snowflake
+      variant: transferwise
+      pip_url: pipelinewise-target-snowflake
+      config:
+        # dbname and file_format are overridden using environment variables in production
+        # TODO create required Snowflake objects per https://github.com/transferwise/pipelinewise-target-snowflake#pre-requirements
+        dbname: meltano_dev
+        file_format: meltano_dev.public.csv
+        add_metadata_columns: true
+{%- elif cookiecutter.add_target == "BigQuery" %}
+    - name: target-bigquery
+      variant: adswerve
+      pip_url: git+https://github.com/adswerve/target-bigquery.git
+      config:
+        # project_id is overridden using an environment variable in production
+        # TODO configure BigQuery per https://github.com/adswerve/target-bigquery#how-to-use-it
+        project_id: # TODO add the GCP project ID to use for local testing here
+        dataset_id: $MELTANO_EXTRACT__LOAD_SCHEMA
+        add_metadata_columns: true
+{%- endif %}
diff --git a/{{ cookiecutter.project_folder_name }}/output/.gitignore b/{{ cookiecutter.project_folder_name }}/output/.gitignore
new file mode 100644
index 0000000..b165efc
--- /dev/null
+++ b/{{ cookiecutter.project_folder_name }}/output/.gitignore
@@ -0,0 +1,4 @@
+# This directory is used as a target by target-jsonl, so ignore all files
+
+*
+!.gitignore
\ No newline at end of file