From c9a332893f4f40786741ecc9f7ca545572eba0e8 Mon Sep 17 00:00:00 2001
From: Muhammet Orazov
Date: Thu, 7 Mar 2019 11:04:37 +0100
Subject: [PATCH] Prepare release of GCS and Azure export feature

Add new commands to Readme.
---
 README.md | 110 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 82 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 7c636a6c..b7569717 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,10 @@ can contact our support team.
 ## Table of Contents
 
 * [Overview](#overview)
-* [Usage](#usage)
-* [Building from Source](#building-from-source)
+* [Getting started](#getting-started)
+* [Import](#import)
+* [Export](#export)
+* [Building from source](#building-from-source)
 
 ## Overview
 
@@ -22,25 +24,39 @@ This repository contains helper code to create [Exasol][exasol] ETL UDFs in
 order to transfer data to/from public cloud storage services such as [AWS
 S3][s3], [Google Cloud Storage][gcs] and [Azure Blob Storage][azure].
 
-**Currently only [Apache Parquet][parquet] format and export to AWS S3 is
-supported.**
+**Currently only [Apache Parquet][parquet] format is supported.**
 
 Please be aware that Exasol already supports natively [loading CSV format from
 AWS S3][sol-594] (but not GCS or Azure) and similarly transfering data to/from
 [Apache Hive][hadoop-etl-udfs].
 
-## Usage
+## Getting started
 
-Please follow the steps described below in order to setup the UDFs.
+Please follow the steps described below in order to set up the `IMPORT` and
+`EXPORT` UDF scripts.
 
-### Download the JAR file
+### Download the file
 
 Download the latest jar file from [releases][jars].
 
 Additionally, you can also build it from the source by following the [build from
-source](#building-from-source) steps.
+source](#building-from-source) guide. This will allow you to use the latest
+commits that are not yet released.
 
-### Upload the JAR file to Exasol BucketFS
+### Create an Exasol bucket
+
+In order to use the import or export functionality of `cloud-storage-etl-udfs`,
+you have to upload the jar to a bucket in the Exasol bucket file system
+(BucketFS).
+
+For this overview we are using an example bucket named `bucket1`.
+
+### Upload the JAR file to the bucket
+
+This will allow the ETL UDF scripts to use the jar later on. Before uploading
+the jar, please make sure that the BucketFS ports are open.
+
+Here we use the port number `2580` for HTTP.
 
 ```bash
 curl \
@@ -49,9 +65,11 @@ curl \
   -X PUT -T path/to/jar/cloud-storage-etl-udfs-{VERSION}.jar \
   http://w:MY-PASSWORD@EXA-NODE-ID:2580/bucket1/cloud-storage-etl-udfs-{VERSION}.jar
 ```
 
-Please change required parameters.
+Please change the other required parameters, such as `VERSION` and
+`EXA-NODE-ID`, according to your setup.
+
+### Create ETL UDF scripts
 
-### Create UDFs scripts
+Run the following SQL commands to create the Exasol UDF scripts.
 
 ```sql
 CREATE SCHEMA ETL;
@@ -89,11 +107,18 @@ CREATE OR REPLACE JAVA SET SCRIPT EXPORT_TABLE(...) EMITS (ROWS_AFFECTED INT) AS
 /
 ```
 
-### Import data from cloud storages
+Please do not forget to change the bucket name and the jar version according
+to your setup.
+
+## Import
+
+Please follow the instructions below in order to import data from different
+cloud storages.
 
-Please follow steps below in order to import from cloud strorages.
+### Make sure an Exasol table is available
 
-#### Create an Exasol schema and table
+In this walkthrough we use the `SALES_POSITIONS` table, both to import data
+into it and to export its contents to cloud storage.
 
 ```sql
 CREATE SCHEMA RETAIL;
@@ -112,25 +137,26 @@ CREATE TABLE SALES_POSITIONS (
 );
 ```
 
-#### Import from AWS S3
+Similarly, please do not forget to change the bucket paths accordingly in the
+import commands below.
 
-```sql
--- ALTER SESSION SET SCRIPT_OUTPUT_ADDRESS='10.0.2.162:3000';
+### Import from AWS S3
 
+```sql
 IMPORT INTO SALES_POSITIONS
 FROM SCRIPT ETL.IMPORT_PATH WITH
-  BUCKET_PATH = 's3a://exa-mo-frankfurt/test/retail/sales_positions/*'
+  BUCKET_PATH = 's3a://exa-bucket/data/parquet/retail/sales_positions/*'
   S3_ACCESS_KEY = 'MY_AWS_ACCESS_KEY'
   S3_SECRET_KEY = 'MY_AWS_SECRET_KEY'
   S3_ENDPOINT = 's3.MY_REGION.amazonaws.com'
-  PARALLELISM = 'nproc()*10';
+  PARALLELISM = 'nproc()';
 
 -- MY_REGION is one of AWS regions, for example, eu-central-1
 
 SELECT * FROM SALES_POSITIONS LIMIT 10;
 ```
 
-#### Import from Google GCS
+### Import from Google GCS
 
 In order to read data from [Google GCS][gcs], you need to provide a service
 account key file. This should be uploaded to a secure Exasol bucket in advance.
@@ -149,40 +175,68 @@ And then run import,
 
 ```sql
 IMPORT INTO SALES_POSITIONS
 FROM SCRIPT ETL.IMPORT_PATH WITH
-  BUCKET_PATH = 'gs://exa-test-bucket/data/parquet/sales_positions/*'
+  BUCKET_PATH = 'gs://exa-bucket/data/parquet/retail/sales_positions/*'
   GCS_PROJECT_ID = 'MY_GCS_PORJECT_ID'
   GCS_KEYFILE_PATH = 'MY_BUCKETFS_PATH/project-id-service-keyfile.json'
-  PARALLELISM = 'nproc()*10';
+  PARALLELISM = 'nproc()';
 
 SELECT * FROM SALES_POSITIONS LIMIT 10;
 ```
 
-#### Import from Azure Blob Store
+### Import from Azure Blob Store
 
 ```sql
 IMPORT INTO SALES_POSITIONS
 FROM SCRIPT ETL.IMPORT_PATH WITH
-  BUCKET_PATH = 'wasbs://@.blob.core.windows.net/sales-positions/*'
+  BUCKET_PATH = 'wasbs://@.blob.core.windows.net/data/parquet/sales-positions/*'
   AZURE_ACCOUNT_NAME = 'MY_AZURE_STORAGE_ACCOUNT_NAME'
   AZURE_SECRET_KEY = 'MY_AZURE_STORAGE_SECRET_KEY'
-  PARALLELISM = 'nproc()*10';
+  PARALLELISM = 'nproc()';
 
 SELECT * FROM SALES_POSITIONS LIMIT 10;
 ```
 
-#### Export to AWS S3
+## Export
+
+Please follow the steps below in order to export data to various cloud
+storages.
+
+As in the import examples, we will export the `SALES_POSITIONS` table.
+
+### Export to AWS S3
 
 ```sql
 EXPORT SALES_POSITIONS
 INTO SCRIPT ETL.EXPORT_PATH WITH
-  BUCKET_PATH = 's3a://exa-mo-frankfurt/export/retail/sales_positions/'
+  BUCKET_PATH = 's3a://exa-bucket/data/parquet/retail/sales_positions/'
   S3_ACCESS_KEY = 'MY_AWS_ACCESS_KEY'
   S3_SECRET_KEY = 'MY_AWS_SECRET_KEY'
   S3_ENDPOINT = 's3.MY_REGION.amazonaws.com'
   PARALLELISM = 'nproc()';
 ```
 
-## Building from Source
+### Export to Google GCS
+
+```sql
+EXPORT SALES_POSITIONS
+INTO SCRIPT ETL.EXPORT_PATH WITH
+  BUCKET_PATH = 'gs://exa-bucket/data/parquet/retail/sales_positions/'
+  GCS_PROJECT_ID = 'MY_GCS_PROJECT_ID'
+  GCS_KEYFILE_PATH = 'MY_BUCKETFS_PATH/project-id-service-keyfile.json'
+  PARALLELISM = 'nproc()';
+```
+
+### Export to Azure Blob Store
+
+```sql
+EXPORT SALES_POSITIONS
+INTO SCRIPT ETL.EXPORT_PATH WITH
+  BUCKET_PATH = 'wasbs://@.blob.core.windows.net/data/parquet/sales-positions/'
+  AZURE_ACCOUNT_NAME = 'MY_AZURE_STORAGE_ACCOUNT_NAME'
+  AZURE_SECRET_KEY = 'MY_AZURE_STORAGE_SECRET_KEY'
+  PARALLELISM = 'nproc()';
+```
+
+## Building from source
 
 Clone the repository,