diff --git a/modules/ROOT/images/airflow/airflow-connection.png b/modules/ROOT/images/airflow/airflow-connection.png deleted file mode 100644 index d82a951c2..000000000 Binary files a/modules/ROOT/images/airflow/airflow-connection.png and /dev/null differ diff --git a/modules/ROOT/nav.adoc b/modules/ROOT/nav.adoc index 3ef4e5a3f..454c439d3 100644 --- a/modules/ROOT/nav.adoc +++ b/modules/ROOT/nav.adoc @@ -24,7 +24,10 @@ * Manage data ** xref::nos.adoc[] ** xref::select-the-right-data-ingestion-tools-for-teradata-vantage.adoc[] -** xref::airflow.adoc[] +** xref:airflow:airflow.adoc[] +** xref:airflow:airflow-azure-to-teradata-transfer-operator-doc.adoc[] +** xref:airflow:airflow-s3-to-teradata-transfer-operator-doc.adoc[] +** xref:other-integrations:execute-airflow-workflows-that-use-dbt-with-teradata-vantage.adoc[] ** xref::dbt.adoc[] ** xref::advanced-dbt.adoc[] ** xref:modelops:using-feast-feature-store-with-teradata-vantage.adoc[] @@ -34,7 +37,6 @@ ** xref:elt:transforming-external-data-loaded-via-airbyte-in-teradata-vantage-using-dbt.adoc[] ** xref:tools-and-utilities:run-bulkloads-efficiently-with-teradata-parallel-transporter.adoc[Load data with TPT] ** xref::create-parquet-files-in-object-storage.adoc[] -** xref:other-integrations:execute-airflow-workflows-that-use-dbt-with-teradata-vantage.adoc[] ** xref:cloud-guides:integrate-teradata-vantage-to-salesforce-using-amazon-appflow.adoc[] ** xref::segment.adoc[] ** xref:cloud-guides:connect-azure-data-share-to-teradata-vantage.adoc[] diff --git a/modules/ROOT/pages/airflow.adoc b/modules/ROOT/pages/airflow.adoc deleted file mode 100644 index 1c1b456bf..000000000 --- a/modules/ROOT/pages/airflow.adoc +++ /dev/null @@ -1,190 +0,0 @@ -= Use Apache Airflow with Teradata Vantage -:experimental: -:page-author: Satish Chinthanippu -:page-email: satish.chinthanippu@teradata.com -:page-revdate: February 06th, 2024 -:description: Use Apache Airflow with Teradata Vantage. 
-:keywords: data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, elt, airflow, workflow. -:tabs: -:dir: airflow - -== Overview - -This tutorial demonstrates how to use airflow with Teradata Vantage. Airflow will be installed on Ubuntu System. - -== Prerequisites - -* Ubuntu 22.x -* Access to a Teradata Vantage instance. -+ -include::ROOT:partial$vantage_clearscape_analytics.adoc[] -* Python *3.8*, *3.9*, *3.10* or *3.11* installed. -* pip - -== Install Apache Airflow - -1. Set the AIRFLOW_HOME environment variable. Airflow requires a home directory and uses ~/airflow by default, but you can set a different location if you prefer. The AIRFLOW_HOME environment variable is used to inform Airflow of the desired location. -+ -[source, bash] ----- -export AIRFLOW_HOME=~/airflow ----- -2. Install `apache-airflow` stable version 2.8.1 from PyPI repository.: -+ -[source, bash] ----- -AIRFLOW_VERSION=2.8.2 -PYTHON_VERSION="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)" -CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" -pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" ----- -3. Install the Airflow Teradata provider stable version from PyPI repository. -+ -[source, bash] ----- -pip install "apache-airflow-providers-teradata" ----- - -+ -NOTE: For security reasons, the test connection functionality is disabled by default across Airflow UI, API and CLI. -The availability of the functionality can be controlled by the test_connection flag in the core section of the Airflow configuration ($AIRFLOW_HOME/airflow.cfg) or Define below environment variable before starting airflow server. -export AIRFLOW__CORE__TEST_CONNECTION=Enabled -+ - - -== Start Airflow Standalone - -1. Run Airflow Standalone -+ -[source, bash] ----- -airflow standalone ----- -2. 
Access the Airflow UI. Visit https://localhost:8080 in the browser and log in with the admin account details shown in the terminal. - - -Teradata Connections may be defined in Airflow in the following ways: - -1. Using Airflow Web UI -2. Using Environment Variable - -== Define a Teradata connection in Airflow Web UI - -1. Open the Admin -> Connections section of the UI. Click the Create link to create a new connection. -+ -image::{dir}/airflow-connection.png[Airflow admin dropdown, width=75%] -2. Fill in input details in New Connection Page. -+ -image::{dir}/airflow-newconnection.png[Airflow New Connection, width=75%] -* Connection Id: Unique ID of Teradata Connection. -* Connection Type: Type of the system. Select Teradata. -* Database Server URL (required): Teradata instance hostname to connect to. -* Database (optional): Specify the name of the database to connect to -* Login (required): Specify the user name to connect. -* Password (required): Specify the password to connect. -* Click on Test and Save. - -== Define a Teradata connection in Environment Variable -Airflow connections may be defined in environment variables in either of one below formats. - -1. JSON format -2. URI format - -+ -NOTE: The naming convention is AIRFLOW_CONN_{CONN_ID}, all uppercase (note the single underscores surrounding CONN). 
-So if your connection id is teradata_conn_id then the variable name should be AIRFLOW_CONN_TERADATA_CONN_ID -+ - - -== JSON format example - - -[source, bash] ----- -export AIRFLOW_CONN_TERADATA_CONN_ID='{ - "conn_type": "teradata", - "login": "teradata_user", - "password": "my-password", - "host": "my-host", - "schema": "my-schema", - "extra": { - "tmode": "TERA", - "sslmode": "verify-ca" - } -}' - ----- - -== URI format example - - -[source, bash] ----- -export AIRFLOW_CONN_TERADATA_CONN_ID='teradata://teradata_user:my-password@my-host/my-schema?tmode=TERA&sslmode=verify-ca' ----- - -Refer https://airflow.apache.org/docs/apache-airflow-providers-teradata/stable/connections/teradata.html[Teradata Hook] for detailed information on Teradata Connection in Airflow. - -== Define a DAG in Airflow - -1. In Airflow, DAGs are defined as Python code. -2. Create a DAG as a python file like sample.py under DAG_FOLDER - $AIRFLOW_HOME/files/dags directory. -+ -[source, python] ----- -from datetime import datetime -from airflow import DAG -from airflow.providers.teradata.operators.teradata import TeradataOperator -CONN_ID = "Teradata_TestConn" -with DAG( - dag_id="example_teradata_operator", - max_active_runs=1, - max_active_tasks=3, - catchup=False, - start_date=datetime(2023, 1, 1), -) as dag: - create = TeradataOperator( - task_id="table_create", - conn_id=CONN_ID, - sql=""" - CREATE TABLE my_users, - FALLBACK ( - user_id decimal(10,0) NOT NULL GENERATED ALWAYS AS IDENTITY ( - START WITH 1 - INCREMENT BY 1 - MINVALUE 1 - MAXVALUE 2147483647 - NO CYCLE), - user_name VARCHAR(30) - ) PRIMARY INDEX (user_id); - """, - ) ----- - -== Load DAG - -Airflow loads DAGs from Python source files, which it looks for inside its configured DAG_FOLDER - $AIRFLOW_HOME/files/dags directory. - -== Run DAG -DAGs will run in one of two ways: -1. When they are triggered either manually or via the API -2. 
On a defined schedule, which is defined as part of the DAG -`example_teradata_operator` is defined to trigger as manually. To define a schedule, any valid link:https://en.wikipedia.org/wiki/Cron[Crontab, window="_blank"] schedule value can be passed to the schedule argument. -[source, python] ----- -with DAG( - dag_id="my_daily_dag", - schedule="0 0 * * *" - ) as dag: ----- - -== Summary - -This tutorial demonstrated how to use Airflow and the Airflow Teradata provider with a Teradata Vantage instance. The example DAG provided creates `my_users` table in the Teradata Vantage instance defined in Connection UI. - -== Further reading -* link:https://airflow.apache.org/docs/apache-airflow/stable/start.html[airflow documentation] -* link:https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html[airflow DAGs] - - -include::ROOT:partial$community_link.adoc[] diff --git a/modules/airflow/images/airflow-azure-to-teradata-transfer-operator-doc/airflow-console-password.png b/modules/airflow/images/airflow-azure-to-teradata-transfer-operator-doc/airflow-console-password.png index e45630de8..d2a38d97a 100644 Binary files a/modules/airflow/images/airflow-azure-to-teradata-transfer-operator-doc/airflow-console-password.png and b/modules/airflow/images/airflow-azure-to-teradata-transfer-operator-doc/airflow-console-password.png differ diff --git a/modules/airflow/images/airflow-s3-to-teradata-transfer-operator-doc/airflow-console-password.png b/modules/airflow/images/airflow-s3-to-teradata-transfer-operator-doc/airflow-console-password.png index e45630de8..d2a38d97a 100644 Binary files a/modules/airflow/images/airflow-s3-to-teradata-transfer-operator-doc/airflow-console-password.png and b/modules/airflow/images/airflow-s3-to-teradata-transfer-operator-doc/airflow-console-password.png differ diff --git a/modules/airflow/images/airflow/airflow-connection.png b/modules/airflow/images/airflow/airflow-connection.png new file mode 100644 index 000000000..f898e2aeb 
Binary files /dev/null and b/modules/airflow/images/airflow/airflow-connection.png differ diff --git a/modules/airflow/images/airflow/airflow-console-password.png b/modules/airflow/images/airflow/airflow-console-password.png new file mode 100644 index 000000000..d2a38d97a Binary files /dev/null and b/modules/airflow/images/airflow/airflow-console-password.png differ diff --git a/modules/ROOT/images/airflow/airflow-newconnection.png b/modules/airflow/images/airflow/airflow-newconnection.png similarity index 100% rename from modules/ROOT/images/airflow/airflow-newconnection.png rename to modules/airflow/images/airflow/airflow-newconnection.png diff --git a/modules/airflow/images/airflow/dag.png b/modules/airflow/images/airflow/dag.png new file mode 100644 index 000000000..78c77e4c2 Binary files /dev/null and b/modules/airflow/images/airflow/dag.png differ diff --git a/modules/airflow/pages/airflow.adoc b/modules/airflow/pages/airflow.adoc new file mode 100644 index 000000000..84f562cb2 --- /dev/null +++ b/modules/airflow/pages/airflow.adoc @@ -0,0 +1,237 @@ += Use Apache Airflow with Teradata Vantage +:experimental: +:page-author: Satish Chinthanippu +:page-email: satish.chinthanippu@teradata.com +:page-revdate: February 06th, 2024 +:description: Use Apache Airflow with Teradata Vantage. +:keywords: data warehouses, compute storage separation, teradata, vantage, cloud data platform, object storage, business intelligence, enterprise analytics, elt, airflow, workflow. +:tabs: +:dir: airflow + +== Overview + +This document provides detailed instructions and guidance on using Apache Airflow with Teradata Vantage to create a table using the `TeradataOperator`. It covers the setup, configuration, and execution steps required to create a table within Teradata Vantage. The `TeradataOperator` is specifically designed for executing queries on Teradata databases. 
+
+NOTE: Use https://learn.microsoft.com/en-us/windows/wsl/install[The Windows Subsystem for Linux (WSL)] on `Windows` to try this quickstart example.
+
+== Prerequisites
+* Access to a Teradata Vantage instance, version 17.10 or higher.
++
+include::ROOT:partial$vantage_clearscape_analytics.adoc[]
+* Python 3.8, 3.9, 3.10 or 3.11 and python3-venv, python3-pip installed.
++
+[tabs, id="python_install"]
+====
+Linux::
++
+[source,bash]
+----
+sudo apt install -y python3-venv python3-pip
+----
+WSL::
++
+[source,bash]
+----
+sudo apt install -y python3-venv python3-pip
+----
+macOS::
++
+[source,bash]
+----
+brew install python
+----
+Refer https://docs.python-guide.org/starting/install3/osx/[Installation Guide] if you face any issues.
+====
+
+== Install Apache Airflow
+1. Create a new python environment to manage airflow and its dependencies. Activate the environment.
++
+[source, bash]
+----
+python3 -m venv airflow_env
+source airflow_env/bin/activate
+AIRFLOW_VERSION=2.9.3
+PYTHON_VERSION="$(python3 --version | cut -d " " -f 2 | cut -d "." -f 1-2)"
+CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt"
+pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}"
+----
+
++
+2. Install the Apache Airflow Teradata provider package.
++
+[source, bash]
+----
+pip install "apache-airflow-providers-teradata"
+----
+3. Set the AIRFLOW_HOME environment variable.
++
+[source, bash]
+----
+export AIRFLOW_HOME=~/airflow
+----
+
+== Configure Apache Airflow
+1. Switch to the virtual environment where Apache Airflow was installed at <<Install Apache Airflow>>
++
+[source, bash]
+----
+source airflow_env/bin/activate
+----
+2. Configure the listed environment variables to activate the test connection button, preventing the loading of sample DAGs and default connections in the Airflow UI. 
+
+[source, bash]
+----
+export AIRFLOW__CORE__TEST_CONNECTION=Enabled
+export AIRFLOW__CORE__LOAD_EXAMPLES=false
+export AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=false
+----
+
+== Start the Apache Airflow web server
+1. Run airflow's web server
++
+[source, bash]
+----
+airflow standalone
+----
+2. Access the airflow UI. Visit http://localhost:8080 in the browser and log in with the admin account details shown in the terminal.
++
+image::{dir}/airflow-console-password.png[Airflow Password,align="left" width=75%]
+
+== Define the Apache Airflow connection to Vantage
+
+The Teradata connection in airflow can be defined in the following ways:
+
+1. Using the Airflow Web UI
+2. Using Environment Variables
+
+=== Define a Teradata connection in Apache Airflow Web UI
+
+1. Open the Admin -> Connections section of the UI. Click the Create link to create a new connection.
++
+image::{dir}/airflow-connection.png[Airflow admin dropdown, width=75%]
+2. Fill in input details in New Connection Page.
++
+image::{dir}/airflow-newconnection.png[Airflow New Connection, width=75%]
+* Connection Id: Unique ID of Teradata Connection.
+* Connection Type: Type of the system. Select Teradata.
+* Database Server URL (required): Teradata instance hostname to connect to.
+* Database (optional): Specify the name of the database to connect to
+* Login (required): Specify the user name to connect.
+* Password (required): Specify the password to connect.
+* Click on Test and Save.
+
+Refer https://airflow.apache.org/docs/apache-airflow-providers-teradata/stable/connections/teradata.html[Teradata Connection] for more details.
+
+
+=== Define a Teradata connection as Environment Variable
+The Teradata connection can be defined as environment variables in one of the following formats.
+
+1. JSON format
+2. URI format
+
++
+NOTE: The naming convention for environment variables is `AIRFLOW_CONN_{CONN_ID}` with all uppercase letters (note the single underscore surrounding `CONN`). 
For example, if your connection ID is `teradata_default`, the environment variable should be named `AIRFLOW_CONN_TERADATA_DEFAULT`. ++ + + +==== JSON format example + + +[source, bash] +---- +export AIRFLOW_CONN_TERADATA_DEFAULT='{ + "conn_type": "teradata", + "login": "teradata_user", + "password": "my-password", + "host": "my-host", + "schema": "my-schema", + "extra": { + "tmode": "TERA", + "sslmode": "verify-ca" + } +}' + +---- + +==== URI format example + + +[source, bash] +---- +export AIRFLOW_CONN_TERADATA_DEFAULT='teradata://teradata_user:my-password@my-host/my-schema?tmode=TERA&sslmode=verify-ca' +---- + +Refer https://airflow.apache.org/docs/apache-airflow-providers-teradata/stable/connections/teradata.html[Teradata Hook] for detailed information on Teradata Connection in Airflow. + +== Define a DAG in Airflow + +1. In airflow, DAGs are defined as Python code. +2. Create a DAG as a python file like sample.py under DAG_FOLDER - $AIRFLOW_HOME/dags directory. ++ +[source, python] +---- +from datetime import datetime +from airflow import DAG +from airflow.providers.teradata.operators.teradata import TeradataOperator +CONN_ID = "teradata_default" +with DAG( + dag_id="example_teradata_operator", + max_active_runs=1, + max_active_tasks=3, + catchup=False, + start_date=datetime(2023, 1, 1), +) as dag: + create = TeradataOperator( + task_id="table_create", + teradata_conn_id=CONN_ID, + sql=""" + CREATE TABLE my_users, + FALLBACK ( + user_id decimal(10,0) NOT NULL GENERATED ALWAYS AS IDENTITY ( + START WITH 1 + INCREMENT BY 1 + MINVALUE 1 + MAXVALUE 2147483647 + NO CYCLE), + user_name VARCHAR(30) + ) PRIMARY INDEX (user_id); + """, + ) +---- + +== Load DAG + +When the DAG file is copied to $AIRFLOW_HOME/dags, Apache Airflow displays the DAG in the UI under the DAGs section. It will take 2 to 3 minutes to load the DAG in the Apache Airflow UI. + + +== Run DAG +DAGs can be executed in one of two ways: + +1. 
Manually or via the API: You can trigger DAGs manually or through API calls.
+2. On a scheduled basis: DAGs can be set to run according to a schedule defined within their configuration.
+
+The `example_teradata_operator` DAG is configured to be triggered manually. To define a schedule, you can use any valid link:https://en.wikipedia.org/wiki/Cron[Crontab, window="_blank"] schedule value for the schedule argument.
+[source, python]
+----
+with DAG(
+    dag_id="example_teradata_operator",
+    max_active_runs=1,
+    max_active_tasks=3,
+    catchup=False,
+    schedule="0 0 * * *",
+    start_date=datetime(2023, 1, 1),
+) as dag:
+----
+
+Run the DAG manually as shown in the image below.
+
+image::{dir}/dag.png[Run DAG,align="left" width=75%]
+
+== Summary
+
+This tutorial demonstrated how to use Apache Airflow along with the Airflow Teradata provider to interact with a Teradata Vantage instance. The example DAG provided shows how to create the `my_users` table in the Teradata Vantage instance specified in the Connection UI.
+
+== Further reading
+* link:https://airflow.apache.org/docs/apache-airflow/stable/start.html[airflow documentation]
+* link:https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/dags.html[airflow DAGs]
+
+
+include::ROOT:partial$community_link.adoc[]