From 957de4e27d3a243ccb9beccd2ee72ab231e9dcf4 Mon Sep 17 00:00:00 2001 From: normanj-bitquill <78755797+normanj-bitquill@users.noreply.github.com> Date: Tue, 17 Dec 2024 13:18:27 -0800 Subject: [PATCH 1/2] Created docker files for an integ test cluster (#601) (#986) * Created docker files for an integ test cluster (#601) Cluster contains: * Spark master * Spark worker * OpenSearch server * OpenSearch dashboards * Minio server Signed-off-by: Norman Jordan * Updated to start Spark Connect on the Spark master container Signed-off-by: Norman Jordan * Can run integration tests against the docker cluster The Python script for integration tests was updated to run queries against the docker cluster. The required indices are created as part of the script. The queries for the Python script were likely out of date. These have been updated when the fix for the query was obvious. There are still 6 tests that fail. Signed-off-by: Norman Jordan * Fixed up the documentation for docker integration tests Signed-off-by: Norman Jordan * Added a link in the top-level README Signed-off-by: Norman Jordan * Described creation of test indices Signed-off-by: Norman Jordan --------- Signed-off-by: Norman Jordan --- README.md | 2 + docker/integ-test/.env | 13 + docker/integ-test/docker-compose.yml | 143 ++++ docker/integ-test/log4j2.properties | 69 ++ docker/integ-test/prepare_scala_queries.py | 23 + docker/integ-test/queries.scala | 619 +++++++++++++++++ docker/integ-test/spark-defaults.conf | 35 + docker/integ-test/spark-master-entrypoint.sh | 17 + integ-test/script/README.md | 73 +- integ-test/script/SanityTest.py | 202 ++---- integ-test/script/data/customer.mapping.json | 30 + integ-test/script/data/http_logs.json | 12 + integ-test/script/data/http_logs.mapping.json | 30 + integ-test/script/data/lineitem.mapping.json | 54 ++ integ-test/script/data/nation.mapping.json | 18 + integ-test/script/data/nested.json | 10 + integ-test/script/data/nested.mapping.json | 37 + 
integ-test/script/data/orders.mapping.json | 33 + integ-test/script/data/part.mapping.json | 33 + integ-test/script/data/partsupp.mapping.json | 21 + integ-test/script/data/people.json | 12 + integ-test/script/data/people.mapping.json | 24 + integ-test/script/data/region.mapping.json | 15 + integ-test/script/data/supplier.mapping.json | 27 + integ-test/script/data/work_info.json | 10 + integ-test/script/data/work_info.mapping.json | 18 + integ-test/script/test_cases.csv | 644 +++++++++--------- 27 files changed, 1748 insertions(+), 476 deletions(-) create mode 100644 docker/integ-test/.env create mode 100644 docker/integ-test/docker-compose.yml create mode 100644 docker/integ-test/log4j2.properties create mode 100755 docker/integ-test/prepare_scala_queries.py create mode 100644 docker/integ-test/queries.scala create mode 100644 docker/integ-test/spark-defaults.conf create mode 100755 docker/integ-test/spark-master-entrypoint.sh create mode 100644 integ-test/script/data/customer.mapping.json create mode 100644 integ-test/script/data/http_logs.json create mode 100644 integ-test/script/data/http_logs.mapping.json create mode 100644 integ-test/script/data/lineitem.mapping.json create mode 100644 integ-test/script/data/nation.mapping.json create mode 100644 integ-test/script/data/nested.json create mode 100644 integ-test/script/data/nested.mapping.json create mode 100644 integ-test/script/data/orders.mapping.json create mode 100644 integ-test/script/data/part.mapping.json create mode 100644 integ-test/script/data/partsupp.mapping.json create mode 100644 integ-test/script/data/people.json create mode 100644 integ-test/script/data/people.mapping.json create mode 100644 integ-test/script/data/region.mapping.json create mode 100644 integ-test/script/data/supplier.mapping.json create mode 100644 integ-test/script/data/work_info.json create mode 100644 integ-test/script/data/work_info.mapping.json diff --git a/README.md b/README.md index db3790e64..6732db3af 100644 --- 
a/README.md +++ b/README.md @@ -90,6 +90,8 @@ bin/spark-shell --packages "org.opensearch:opensearch-spark-ppl_2.12:0.7.0-SNAPS ### PPL Run queries on a local spark cluster See ppl usage sample on local spark cluster [PPL on local spark ](docs/ppl-lang/local-spark-ppl-test-instruction.md) +### Running integration tests on a local spark cluster +See integration test documentation [Docker Integration Tests](integ-test/script/README.md) ## Code of Conduct diff --git a/docker/integ-test/.env b/docker/integ-test/.env new file mode 100644 index 000000000..cf73bdc89 --- /dev/null +++ b/docker/integ-test/.env @@ -0,0 +1,13 @@ +SPARK_VERSION=3.5.3 +OPENSEARCH_VERSION=latest +DASHBOARDS_VERSION=latest +MASTER_UI_PORT=8080 +MASTER_PORT=7077 +UI_PORT=4040 +SPARK_CONNECT_PORT=15002 +PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar +FLINT_JAR=../../flint-spark-integration/target/scala-2.12/flint-spark-integration-assembly-0.7.0-SNAPSHOT.jar +OPENSEARCH_NODE_MEMORY=512m +OPENSEARCH_ADMIN_PASSWORD=C0rrecthorsebatterystaple. 
+OPENSEARCH_PORT=9200 +OPENSEARCH_DASHBOARDS_PORT=5601 diff --git a/docker/integ-test/docker-compose.yml b/docker/integ-test/docker-compose.yml new file mode 100644 index 000000000..c5ee53d7d --- /dev/null +++ b/docker/integ-test/docker-compose.yml @@ -0,0 +1,143 @@ +services: + spark: + image: bitnami/spark:${SPARK_VERSION:-3.5.3} + container_name: spark + ports: + - "${MASTER_UI_PORT:-8080}:8080" + - "${MASTER_PORT:-7077}:7077" + - "${UI_PORT:-4040}:4040" + - "${SPARK_CONNECT_PORT}:15002" + entrypoint: /opt/bitnami/scripts/spark/master-entrypoint.sh + environment: + - SPARK_MODE=master + - SPARK_RPC_AUTHENTICATION_ENABLED=no + - SPARK_RPC_ENCRYPTION_ENABLED=no + - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no + - SPARK_SSL_ENABLED=no + - SPARK_PUBLIC_DNS=localhost + volumes: + - type: bind + source: ./spark-master-entrypoint.sh + target: /opt/bitnami/scripts/spark/master-entrypoint.sh + - type: bind + source: ./spark-defaults.conf + target: /opt/bitnami/spark/conf/spark-defaults.conf + - type: bind + source: ./log4j2.properties + target: /opt/bitnami/spark/conf/log4j2.properties + - type: bind + source: $PPL_JAR + target: /opt/bitnami/spark/jars/ppl-spark-integration.jar + - type: bind + source: $FLINT_JAR + target: /opt/bitnami/spark/jars/flint-spark-integration.jar + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8080/"] + interval: 1m + timeout: 5s + retries: 3 + start_period: 30s + start_interval: 5s + networks: + - opensearch-net + + spark-worker: + image: bitnami/spark:${SPARK_VERSION:-3.5.3} + container_name: spark-worker + environment: + - SPARK_MODE=worker + - SPARK_MASTER_URL=spark://spark:7077 + - SPARK_WORKER_MEMORY=${WORKER_MEMORY:-1G} + - SPARK_WORKER_CORES=${WORKER_CORES:-1} + - SPARK_RPC_AUTHENTICATION_ENABLED=no + - SPARK_RPC_ENCRYPTION_ENABLED=no + - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no + - SPARK_SSL_ENABLED=no + - SPARK_PUBLIC_DNS=localhost + volumes: + - type: bind + source: ./spark-defaults.conf + target: 
/opt/bitnami/spark/conf/spark-defaults.conf + - type: bind + source: ./log4j2.properties + target: /opt/bitnami/spark/conf/log4j2.properties + - type: bind + source: $PPL_JAR + target: /opt/bitnami/spark/jars/ppl-spark-integration.jar + - type: bind + source: $FLINT_JAR + target: /opt/bitnami/spark/jars/flint-spark-integration.jar + networks: + - opensearch-net + depends_on: + - spark + + opensearch: + image: opensearchproject/opensearch:${OPENSEARCH_VERSION:-latest} + container_name: opensearch + environment: + - cluster.name=opensearch-cluster + - node.name=opensearch + - discovery.seed_hosts=opensearch + - cluster.initial_cluster_manager_nodes=opensearch + - bootstrap.memory_lock=true + - plugins.security.ssl.http.enabled=false + - OPENSEARCH_JAVA_OPTS=-Xms${OPENSEARCH_NODE_MEMORY:-512m} -Xmx${OPENSEARCH_NODE_MEMORY:-512m} + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_ADMIN_PASSWORD} + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 + hard: 65536 + volumes: + - opensearch-data:/usr/share/opensearch/data + ports: + - ${OPENSEARCH_PORT:-9200}:9200 + - 9600:9600 + expose: + - "${OPENSEARCH_PORT:-9200}" + healthcheck: + test: ["CMD", "curl", "-f", "-u", "admin:${OPENSEARCH_ADMIN_PASSWORD}", "http://localhost:9200/_cluster/health"] + interval: 1m + timeout: 5s + retries: 3 + start_period: 30s + start_interval: 5s + networks: + - opensearch-net + + opensearch-dashboards: + image: opensearchproject/opensearch-dashboards:${DASHBOARDS_VERSION} + container_name: opensearch-dashboards + ports: + - ${OPENSEARCH_DASHBOARDS_PORT:-5601}:5601 + expose: + - "${OPENSEARCH_DASHBOARDS_PORT:-5601}" + environment: + OPENSEARCH_HOSTS: '["http://opensearch:9200"]' + networks: + - opensearch-net + depends_on: + - opensearch + + minio: + image: minio/minio + container_name: minio-S3 + # See original entrypoint/command under https://github.com/minio/minio/blob/master/Dockerfile + entrypoint: sh -c 'mkdir -p /data/test && minio server /data --console-address ":9001"' 
+ ports: + - "9000:9000" + - "9001:9001" + volumes: + - minio-data:/data + networks: + - opensearch-net + +volumes: + opensearch-data: + minio-data: + +networks: + opensearch-net: diff --git a/docker/integ-test/log4j2.properties b/docker/integ-test/log4j2.properties new file mode 100644 index 000000000..ab96e03ba --- /dev/null +++ b/docker/integ-test/log4j2.properties @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +rootLogger.level = info +rootLogger.appenderRef.stdout.ref = console + +# In the pattern layout configuration below, we specify an explicit `%ex` conversion +# pattern for logging Throwables. If this was omitted, then (by default) Log4J would +# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional +# class packaging information. That extra information can sometimes add a substantial +# performance overhead, so we disable it in our default logging config. +# For more information, see SPARK-39361. 
+appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex + +# Set the default spark-shell/spark-sql log level to WARN. When running the +# spark-shell/spark-sql, the log level for these classes is used to overwrite +# the root logger's log level, so that the user can have different defaults +# for the shell and regular Spark apps. +logger.repl.name = org.apache.spark.repl.Main +logger.repl.level = warn + +logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver +logger.thriftserver.level = warn + +# Settings to quiet third party logs that are too verbose +logger.jetty1.name = org.sparkproject.jetty +logger.jetty1.level = warn +logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle +logger.jetty2.level = error +logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper +logger.replexprTyper.level = info +logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter +logger.replSparkILoopInterpreter.level = info +logger.parquet1.name = org.apache.parquet +logger.parquet1.level = error +logger.parquet2.name = parquet +logger.parquet2.level = error + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler +logger.RetryingHMSHandler.level = fatal +logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry +logger.FunctionRegistry.level = error + +# For deploying Spark ThriftServer +# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 +appender.console.filter.1.type = RegexFilter +appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* +appender.console.filter.1.onMatch = deny 
+appender.console.filter.1.onMismatch = neutral diff --git a/docker/integ-test/prepare_scala_queries.py b/docker/integ-test/prepare_scala_queries.py new file mode 100755 index 000000000..dec62593b --- /dev/null +++ b/docker/integ-test/prepare_scala_queries.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import csv + +queries = None +with open('../../integ-test/script/test_cases.csv', 'r') as f: + reader = csv.DictReader(f) + queries = [(row['query'], i, row.get('expected_status', None)) for i, row in enumerate(reader, start=1) if row['query'].strip()] + +print('try {') +for query in queries: + query_str = query[0].replace('\n', '').replace('"', '\\"') + if 'FAILED' == query[2]: + print(' try {') + print(f' spark.sql("{query_str}")') + print(' throw new Error') + print(' } catch {') + print(' case e: Exception => null') + print(' }\n') + else: + print(f' spark.sql("{query_str}")\n') +print('}') + diff --git a/docker/integ-test/queries.scala b/docker/integ-test/queries.scala new file mode 100644 index 000000000..7d6ee78c1 --- /dev/null +++ b/docker/integ-test/queries.scala @@ -0,0 +1,619 @@ +{ + try { + spark.sql("describe myglue_test.default.http_logs") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("describe `myglue_test`.`default`.`http_logs`") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | dedup 1 status | fields @timestamp, clientip, status, size | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup status, size | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup 1 status keepempty=true | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup status, size keepempty=true | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup 2 status | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup 2 status, size | head 10") + + spark.sql("source = 
myglue_test.default.http_logs | dedup 2 status, size keepempty=true | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | dedup status CONSECUTIVE=true | fields status") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | dedup 2 status, size CONSECUTIVE=true | fields status") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | sort stat | fields @timestamp, clientip, status | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | fields @timestamp, notexisted | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.nested | fields int_col, struct_col.field1, struct_col2.field1 | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.nested | where struct_col2.field1.subfield > 'valueA' | sort int_col | fields int_col, struct_col.field1.subfield, struct_col2.field1.subfield") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | fields - @timestamp, clientip, status | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval new_time = @timestamp, new_clientip = clientip | fields - new_time, new_clientip, status | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval new_clientip = lower(clientip) | fields - new_clientip | head 10") + + spark.sql("source = myglue_test.default.http_logs | fields + @timestamp, clientip, status | fields - clientip, status | head 10") + + spark.sql("source = myglue_test.default.http_logs | fields - clientip, status | fields + @timestamp, clientip, status| head 10") + + spark.sql("source = myglue_test.default.http_logs | where status = 200 | head 10") + + spark.sql("source = myglue_test.default.http_logs | where status != 200 | 
head 10") + + spark.sql("source = myglue_test.default.http_logs | where size > 0 | head 10") + + spark.sql("source = myglue_test.default.http_logs | where size <= 0 | head 10") + + spark.sql("source = myglue_test.default.http_logs | where clientip = '236.14.2.0' | head 10") + + spark.sql("source = myglue_test.default.http_logs | where size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100") + + spark.sql("source = myglue_test.default.http_logs | where size <= 0 AND like(request, 'GET%') | head 10") + + spark.sql("source = myglue_test.default.http_logs status = 200 | head 10") + + spark.sql("source = myglue_test.default.http_logs size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100") + + spark.sql("source = myglue_test.default.http_logs size <= 0 AND like(request, 'GET%') | head 10") + + spark.sql("source = myglue_test.default.http_logs substring(clientip, 5, 2) = \"12\" | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | where isempty(size)") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | where ispresent(size)") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | where isnull(size) | head 10") + + spark.sql("source = myglue_test.default.http_logs | where isnotnull(size) | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | where isnotnull(coalesce(size, status)) | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | where like(request, 'GET%') | head 10") + + spark.sql("source = myglue_test.default.http_logs | where like(request, '%bordeaux%') | head 10") + + spark.sql("source = myglue_test.default.http_logs | where substring(clientip, 5, 2) = \"12\" | head 10") + + spark.sql("source = myglue_test.default.http_logs | where lower(request) = \"get /images/backnews.gif http/1.0\" | head 
10") + + spark.sql("source = myglue_test.default.http_logs | where length(request) = 38 | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | where case(status = 200, 'success' else 'failed') = 'success' | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | eval h = \"Hello\", w = \"World\" | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval @h = \"Hello\" | eval @w = \"World\" | fields @timestamp, @h, @w") + + spark.sql("source = myglue_test.default.http_logs | eval newF = clientip | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval newF = clientip | fields clientip, newF | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval f = size | where f > 1 | sort f | fields size, clientip, status | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval f = status * 2 | eval h = f * 2 | fields status, f, h | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval f = size * 2, h = status | stats sum(f) by h") + + spark.sql("source = myglue_test.default.http_logs | eval f = UPPER(request) | eval h = 40 | fields f, h | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | eval request = \"test\" | fields request | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | eval size = abs(size) | where size < 500") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | eval status_string = case(status = 200, 'success' else 'failed') | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | eval n = now() | eval t = unix_timestamp(@timestamp) | fields n, t | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | eval e = isempty(size) | 
eval p = ispresent(size) | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | eval c = coalesce(size, status) | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | eval c = coalesce(request) | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | eval col1 = ln(size) | eval col2 = unix_timestamp(@timestamp) | sort - col1 | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval col1 = 1 | sort col1 | head 4 | eval col2 = 2 | sort - col2 | sort - size | head 2 | fields @timestamp, clientip, col2") + + spark.sql("source = myglue_test.default.mini_http_logs | eval stat = status | where stat > 300 | sort stat | fields @timestamp,clientip,status | head 5") + + spark.sql("source = myglue_test.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size) by clientip") + + spark.sql("source = myglue_test.default.http_logs | eval new_request = upper(request) | eval compound_field = concat('Hello ', if(like(new_request, '%bordeaux%'), 'World', clientip)) | fields new_request, compound_field | head 10") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size)") + + spark.sql("source = myglue_test.default.nested | stats max(int_col) by struct_col.field2") + + spark.sql("source = myglue_test.default.nested | stats distinct_count(int_col)") + + spark.sql("source = myglue_test.default.nested | stats stddev_samp(int_col)") + + spark.sql("source = myglue_test.default.nested | stats stddev_pop(int_col)") + + spark.sql("source = myglue_test.default.nested | stats percentile(int_col)") + + spark.sql("source = myglue_test.default.nested | stats percentile_approx(int_col)") + + spark.sql("source = 
myglue_test.default.mini_http_logs | stats stddev_samp(status)") + + spark.sql("source = myglue_test.default.mini_http_logs | where stats > 200 | stats percentile_approx(status, 99)") + + spark.sql("source = myglue_test.default.nested | stats count(int_col) by span(struct_col.field2, 10) as a_span") + + spark.sql("source = myglue_test.default.nested | stats avg(int_col) by span(struct_col.field2, 10) as a_span, struct_col2.field2") + + spark.sql("source = myglue_test.default.http_logs | stats sum(size) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10") + + spark.sql("source = myglue_test.default.http_logs | stats distinct_count(clientip) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size) as avg_size by status, year | stats avg(avg_size) as avg_avg_size by year") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size) as avg_size by status, year, month | stats avg(avg_size) as avg_avg_size by year, month | stats avg(avg_avg_size) as avg_avg_avg_size by year") + + try { + spark.sql("source = myglue_test.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | stats avg(avg_int) as avg_avg_int by struct_col2.field2") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | eval new_col = avg_int | stats avg(avg_int) as avg_avg_int by new_col") + + spark.sql("source = myglue_test.default.nested | rare int_col") + + spark.sql("source = myglue_test.default.nested | rare int_col by struct_col.field2") + + spark.sql("source = myglue_test.default.http_logs | rare request") + + spark.sql("source = myglue_test.default.http_logs | where status > 300 | rare request by status") + + spark.sql("source = myglue_test.default.http_logs | rare clientip") + + spark.sql("source 
= myglue_test.default.http_logs | where status > 300 | rare clientip") + + spark.sql("source = myglue_test.default.http_logs | where status > 300 | rare clientip by day") + + spark.sql("source = myglue_test.default.nested | top int_col by struct_col.field2") + + spark.sql("source = myglue_test.default.nested | top 1 int_col by struct_col.field2") + + spark.sql("source = myglue_test.default.nested | top 2 int_col by struct_col.field2") + + spark.sql("source = myglue_test.default.nested | top int_col") + + try { + spark.sql("source = myglue_test.default.http_logs | inner join left=l right=r on l.status = r.int_col myglue_test.default.nested | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | fields request, domain | head 10") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | top 1 domain") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | stats count() by domain") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | eval a = 1 | fields a, domain | head 10") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | where size > 0 | sort - size | fields size, domain | head 10") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/(?<picName>[a-zA-Z]+)/.*' | where domain = 'english' | sort - picName | fields domain, picName | head 10") + + spark.sql("source = myglue_test.default.http_logs | patterns request | fields patterns_field | head 10") + + spark.sql("source = myglue_test.default.http_logs | patterns request | where size > 0 | fields patterns_field | head 10") + + spark.sql("source = myglue_test.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | fields request, no_letter | head 10") + + spark.sql("source = 
myglue_test.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | stats count() by no_letter") + + try { + spark.sql("source = myglue_test.default.http_logs | patterns new_field='status' pattern='[a-zA-Z]' request | fields request, status | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | rename @timestamp as timestamp | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | sort size | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort + size | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort - size | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort + size, + @timestamp | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort - size, - @timestamp | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort - size, @timestamp | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval c1 = upper(request) | eval c2 = concat('Hello ', if(like(c1, '%bordeaux%'), 'World', clientip)) | eval c3 = length(request) | eval c4 = ltrim(request) | eval c5 = rtrim(request) | eval c6 = substring(clientip, 5, 2) | eval c7 = trim(request) | eval c8 = upper(request) | eval c9 = position('bordeaux' IN request) | eval c10 = replace(request, 'GET', 'GGG') | fields c1, c2, c3, c4, c5, c6, c7, c8, c9, c10 | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval c1 = unix_timestamp(@timestamp) | eval c2 = now() | eval c3 =DAY_OF_WEEK(@timestamp) | eval c4 =DAY_OF_MONTH(@timestamp) | eval c5 =DAY_OF_YEAR(@timestamp) | eval c6 =WEEK_OF_YEAR(@timestamp) | eval c7 =WEEK(@timestamp) | eval c8 =MONTH_OF_YEAR(@timestamp) | eval c9 =HOUR_OF_DAY(@timestamp) | eval c10 =MINUTE_OF_HOUR(@timestamp) | eval c11 =SECOND_OF_MINUTE(@timestamp) | eval c12 =LOCALTIME() | fields c1, c2, c3, c4, c5, c6, c7, 
c8, c9, c10, c11, c12 | head 10") + + spark.sql("source=myglue_test.default.people | eval c1 = adddate(@timestamp, 1) | fields c1 | head 10") + + spark.sql("source=myglue_test.default.people | eval c2 = subdate(@timestamp, 1) | fields c2 | head 10") + + spark.sql("source=myglue_test.default.people | eval c1 = date_add(@timestamp INTERVAL 1 DAY) | fields c1 | head 10") + + spark.sql("source=myglue_test.default.people | eval c1 = date_sub(@timestamp INTERVAL 1 DAY) | fields c1 | head 10") + + spark.sql("source=myglue_test.default.people | eval `CURDATE()` = CURDATE() | fields `CURDATE()`") + + spark.sql("source=myglue_test.default.people | eval `CURRENT_DATE()` = CURRENT_DATE() | fields `CURRENT_DATE()`") + + spark.sql("source=myglue_test.default.people | eval `CURRENT_TIMESTAMP()` = CURRENT_TIMESTAMP() | fields `CURRENT_TIMESTAMP()`") + + spark.sql("source=myglue_test.default.people | eval `DATE('2020-08-26')` = DATE('2020-08-26') | fields `DATE('2020-08-26')`") + + spark.sql("source=myglue_test.default.people | eval `DATE(TIMESTAMP('2020-08-26 13:49:00'))` = DATE(TIMESTAMP('2020-08-26 13:49:00')) | fields `DATE(TIMESTAMP('2020-08-26 13:49:00'))`") + + spark.sql("source=myglue_test.default.people | eval `DATE('2020-08-26 13:49')` = DATE('2020-08-26 13:49') | fields `DATE('2020-08-26 13:49')`") + + spark.sql("source=myglue_test.default.people | eval `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')` = DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS'), `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')` = DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a') | fields `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')`, `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')`") + + spark.sql("source=myglue_test.default.people | eval `'2000-01-02' - '2000-01-01'` = DATEDIFF(TIMESTAMP('2000-01-02 00:00:00'), TIMESTAMP('2000-01-01 23:59:59')), `'2001-02-01' - 
'2004-01-01'` = DATEDIFF(DATE('2001-02-01'), TIMESTAMP('2004-01-01 00:00:00')) | fields `'2000-01-02' - '2000-01-01'`, `'2001-02-01' - '2004-01-01'`") + + spark.sql("source=myglue_test.default.people | eval `DAY(DATE('2020-08-26'))` = DAY(DATE('2020-08-26')) | fields `DAY(DATE('2020-08-26'))`") + + try { + spark.sql("source=myglue_test.default.people | eval `DAYNAME(DATE('2020-08-26'))` = DAYNAME(DATE('2020-08-26')) | fields `DAYNAME(DATE('2020-08-26'))`") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source=myglue_test.default.people | eval `CURRENT_TIMEZONE()` = CURRENT_TIMEZONE() | fields `CURRENT_TIMEZONE()`") + + spark.sql("source=myglue_test.default.people | eval `UTC_TIMESTAMP()` = UTC_TIMESTAMP() | fields `UTC_TIMESTAMP()`") + + spark.sql("source=myglue_test.default.people | eval `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')` = TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | eval `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))` = TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00')) | fields `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')`, `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))`") + + spark.sql("source=myglue_test.default.people | eval `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')` = TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | eval `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` = TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') | fields `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')`, `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')`") + + spark.sql(" source = myglue_test.default.http_logs | stats count()") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size) as c1, max(size) as c2, min(size) as c3, sum(size) as c4, percentile(size, 50) as c5, stddev_pop(size) as c6, stddev_samp(size) as c7, distinct_count(size) 
as c8") + + spark.sql("source = myglue_test.default.http_logs | eval c1 = abs(size) | eval c2 = ceil(size) | eval c3 = floor(size) | eval c4 = sqrt(size) | eval c5 = ln(size) | eval c6 = pow(size, 2) | eval c7 = mod(size, 2) | fields c1, c2, c3, c4, c5, c6, c7 | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval c1 = isnull(request) | eval c2 = isnotnull(request) | eval c3 = ifnull(request,\"Unknown\") | eval c4 = nullif(request,\"Unknown\") | eval c5 = isnull(size) | eval c6 = if(like(request, '%bordeaux%'), 'hello', 'world') | fields c1, c2, c3, c4, c5, c6 | head 10") + + spark.sql("/* this is block comment */ source = myglue_test.tpch_csv.orders | head 1 // this is line comment") + + spark.sql("/* test in tpch q16, q18, q20 */ source = myglue_test.tpch_csv.orders | head 1 // add source=xx to avoid failure in automation") + + spark.sql("/* test in tpch q4, q21, q22 */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("/* test in tpch q2, q11, q15, q17, q20, q22 */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("/* test in tpch q7, q8, q9, q13, q15, q22 */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("/* lots of inner join tests in tpch */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("/* left join test in tpch q13 */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("source = myglue_test.tpch_csv.orders | right outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') myglue_test.tpch_csv.customer| stats count(o_orderkey) as c_count by c_custkey| sort - c_count") + + spark.sql("source = myglue_test.tpch_csv.orders | full outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') myglue_test.tpch_csv.customer| stats count(o_orderkey) as c_count by c_custkey| sort - c_count") + + spark.sql("source = myglue_test.tpch_csv.customer| semi join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| where c_mktsegment = 'BUILDING' | 
sort - c_custkey| head 10") + + spark.sql("source = myglue_test.tpch_csv.customer| anti join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| where c_mktsegment = 'BUILDING' | sort - c_custkey| head 10") + + spark.sql("source = myglue_test.tpch_csv.supplier| where like(s_comment, '%Customer%Complaints%')| join ON s_nationkey > n_nationkey [ source = myglue_test.tpch_csv.nation | where n_name = 'SAUDI ARABIA' ]| sort - s_name| head 10") + + spark.sql("source = myglue_test.tpch_csv.supplier| where like(s_comment, '%Customer%Complaints%')| join [ source = myglue_test.tpch_csv.nation | where n_name = 'SAUDI ARABIA' ]| sort - s_name| head 10") + + spark.sql("source=myglue_test.default.people | LOOKUP myglue_test.default.work_info uid AS id REPLACE department | stats distinct_count(department)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id APPEND department | stats distinct_count(department)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id REPLACE department AS country | stats distinct_count(country)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id APPEND department AS country | stats distinct_count(country)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uID AS id, name REPLACE department | stats distinct_count(department)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS ID, name APPEND department | stats distinct_count(department)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uID AS id, name | head 10") + + spark.sql("source = myglue_test.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP myglue_test.default.work_info name REPLACE occupation AS major | stats distinct_count(major)") + + spark.sql("source = myglue_test.default.people | 
eval major = occupation | fields id, name, major, country, salary | LOOKUP myglue_test.default.work_info name APPEND occupation AS major | stats distinct_count(major)") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('{\"account_number\":1,\"balance\":39225,\"age\":32,\"gender\":\"M\"}') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('{\"f1\":\"abc\",\"f2\":{\"f3\":\"a\",\"f4\":\"b\"}}') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('{\"invalid\": \"json\"') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[1,2,3]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[1,2') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[invalid json]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('invalid json') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json(null) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array('this', 'is', 'a', 'string', 'array') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array() | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array(1, 2, 0, -1, 1.1, -0.11) | head 1 | fields res") + + spark.sql("source = 
myglue_test.default.http_logs | eval res = json_array('this', 'is', 1.1, -0.11, true, false) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = array_length(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = array_length(json_array()) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array_length('[]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array_length('[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array_length('{\"key\": 1}') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array_length('[1,2') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', 'string_value')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', 123.45)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', true)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object(\"a\", 1, \"b\", 2, \"c\", 3)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', array())) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', array(1, 2, 3))) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('outer', json_object('inner', 123.45))) | head 1 | fields 
res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object(\"array\", json_array(1,2,0,-1,1.1,-0.11))) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | where json_valid('{\"account_number\":1,\"balance\":39225,\"age\":32,\"gender\":\"M\"}') | head 1") + + spark.sql("source = myglue_test.default.http_logs | where not json_valid('{\"account_number\":1,\"balance\":39225,\"age\":32,\"gender\":\"M\"}') | head 1") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('{\"account_number\":1,\"balance\":39225,\"age\":32,\"gender\":\"M\"}')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('{\"f1\":\"abc\",\"f2\":{\"f3\":\"a\",\"f4\":\"b\"}}')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[]')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('{\"invalid\": \"json\"')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2,3]')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[invalid json]')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('invalid json')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json(null)) 
| head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.teacher') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[*]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[0]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[*].name') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[1].name') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[0].not_exist_key') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = 
json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[10]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > -10) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(json_object(\"a\",1,\"b\",-1),json_object(\"a\",-1,\"b\",-1)), result = forall(array, x -> x.a > 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(json_object(\"a\",1,\"b\",-1),json_object(\"a\",-1,\"b\",-1)), result = exists(array, x -> x.b < 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 10) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 10) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,3), result = transform(array, x -> x + 1) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,3), result = transform(array, (x, y) -> x + y) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x) | head 1 | fields result") + + 
spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x, acc -> acc * 10) | head 1 | fields result") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eventstats avg(salary) by country | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by country | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) as age_span, country | sort id | head 10") + + spark.sql("source=myglue_test.default.people | where country != 'USA' | eventstats stddev_samp(salary), stddev_pop(salary), percentile_approx(salary, 60) by span(salary, 1000) as salary_span | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age by occupation, country | eventstats avg(avg_age) as avg_state_age by country | sort id | head 10") + + try { + spark.sql("source=myglue_test.default.people | eventstats distinct_count(salary) by span(salary, 1000) as age_span") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.tpch_csv.lineitem| where l_shipdate <= subdate(date('1998-12-01'), 90)| stats sum(l_quantity) as sum_qty, sum(l_extendedprice) as sum_base_price, 
sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, avg(l_quantity) as avg_qty, avg(l_extendedprice) as avg_price, avg(l_discount) as avg_disc, count() as count_order by l_returnflag, l_linestatus| sort l_returnflag, l_linestatus") + + spark.sql("source = myglue_test.tpch_csv.part| join ON p_partkey = ps_partkey myglue_test.tpch_csv.partsupp| join ON s_suppkey = ps_suppkey myglue_test.tpch_csv.supplier| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region| where p_size = 15 AND like(p_type, '%BRASS') AND r_name = 'EUROPE' AND ps_supplycost = [ source = myglue_test.tpch_csv.partsupp | join ON s_suppkey = ps_suppkey myglue_test.tpch_csv.supplier | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation | join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region | where r_name = 'EUROPE' | stats MIN(ps_supplycost) ]| sort - s_acctbal, n_name, s_name, p_partkey| head 100") + + spark.sql("source = myglue_test.tpch_csv.customer| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem| where c_mktsegment = 'BUILDING' AND o_orderdate < date('1995-03-15') AND l_shipdate > date('1995-03-15')| stats sum(l_extendedprice * (1 - l_discount)) as revenue by l_orderkey, o_orderdate, o_shippriority | sort - revenue, o_orderdate| head 10") + + spark.sql("source = myglue_test.tpch_csv.orders| where o_orderdate >= date('1993-07-01') and o_orderdate < date_add(date('1993-07-01'), interval 3 month) and exists [ source = myglue_test.tpch_csv.lineitem | where l_orderkey = o_orderkey and l_commitdate < l_receiptdate ]| stats count() as order_count by o_orderpriority| sort o_orderpriority") + + spark.sql("source = myglue_test.tpch_csv.customer| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem| join ON 
l_suppkey = s_suppkey AND c_nationkey = s_nationkey myglue_test.tpch_csv.supplier| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region| where r_name = 'ASIA' AND o_orderdate >= date('1994-01-01') AND o_orderdate < date_add(date('1994-01-01'), interval 1 year)| stats sum(l_extendedprice * (1 - l_discount)) as revenue by n_name| sort - revenue") + + spark.sql("source = myglue_test.tpch_csv.lineitem| where l_shipdate >= date('1994-01-01') and l_shipdate < adddate(date('1994-01-01'), 365) and l_discount between .06 - 0.01 and .06 + 0.01 and l_quantity < 24| stats sum(l_extendedprice * l_discount) as revenue") + + spark.sql("source = [ source = myglue_test.tpch_csv.supplier | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.lineitem | join ON o_orderkey = l_orderkey myglue_test.tpch_csv.orders | join ON c_custkey = o_custkey myglue_test.tpch_csv.customer | join ON s_nationkey = n1.n_nationkey myglue_test.tpch_csv.nation as n1 | join ON c_nationkey = n2.n_nationkey myglue_test.tpch_csv.nation as n2 | where l_shipdate between date('1995-01-01') and date('1996-12-31') and n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY' or n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE' | eval supp_nation = n1.n_name, cust_nation = n2.n_name, l_year = year(l_shipdate), volume = l_extendedprice * (1 - l_discount) | fields supp_nation, cust_nation, l_year, volume ] as shipping| stats sum(volume) as revenue by supp_nation, cust_nation, l_year| sort supp_nation, cust_nation, l_year") + + spark.sql("source = [ source = myglue_test.tpch_csv.part | join ON p_partkey = l_partkey myglue_test.tpch_csv.lineitem | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.supplier | join ON l_orderkey = o_orderkey myglue_test.tpch_csv.orders | join ON o_custkey = c_custkey myglue_test.tpch_csv.customer | join ON c_nationkey = n1.n_nationkey myglue_test.tpch_csv.nation as n1 | join ON s_nationkey = n2.n_nationkey 
myglue_test.tpch_csv.nation as n2 | join ON n1.n_regionkey = r_regionkey myglue_test.tpch_csv.region | where r_name = 'AMERICA' AND p_type = 'ECONOMY ANODIZED STEEL' and o_orderdate between date('1995-01-01') and date('1996-12-31') | eval o_year = year(o_orderdate) | eval volume = l_extendedprice * (1 - l_discount) | eval nation = n2.n_name | fields o_year, volume, nation ] as all_nations| stats sum(case(nation = 'BRAZIL', volume else 0)) as sum_case, sum(volume) as sum_volume by o_year| eval mkt_share = sum_case / sum_volume| fields mkt_share, o_year| sort o_year") + + spark.sql("source = [ source = myglue_test.tpch_csv.part | join ON p_partkey = l_partkey myglue_test.tpch_csv.lineitem | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.supplier | join ON ps_partkey = l_partkey and ps_suppkey = l_suppkey myglue_test.tpch_csv.partsupp | join ON o_orderkey = l_orderkey myglue_test.tpch_csv.orders | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation | where like(p_name, '%green%') | eval nation = n_name | eval o_year = year(o_orderdate) | eval amount = l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity | fields nation, o_year, amount ] as profit| stats sum(amount) as sum_profit by nation, o_year| sort nation, - o_year") + + spark.sql("source = myglue_test.tpch_csv.customer| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem| join ON c_nationkey = n_nationkey myglue_test.tpch_csv.nation| where o_orderdate >= date('1993-10-01') AND o_orderdate < date_add(date('1993-10-01'), interval 3 month) AND l_returnflag = 'R'| stats sum(l_extendedprice * (1 - l_discount)) as revenue by c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment| sort - revenue| head 20") + + spark.sql("source = myglue_test.tpch_csv.partsupp| join ON ps_suppkey = s_suppkey myglue_test.tpch_csv.supplier| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| where n_name = 'GERMANY'| stats 
sum(ps_supplycost * ps_availqty) as value by ps_partkey| where value > [ source = myglue_test.tpch_csv.partsupp | join ON ps_suppkey = s_suppkey myglue_test.tpch_csv.supplier | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation | where n_name = 'GERMANY' | stats sum(ps_supplycost * ps_availqty) as check | eval threshold = check * 0.0001000000 | fields threshold ]| sort - value") + + spark.sql("source = myglue_test.tpch_csv.orders| join ON o_orderkey = l_orderkey myglue_test.tpch_csv.lineitem| where l_commitdate < l_receiptdate and l_shipdate < l_commitdate and l_shipmode in ('MAIL', 'SHIP') and l_receiptdate >= date('1994-01-01') and l_receiptdate < date_add(date('1994-01-01'), interval 1 year)| stats sum(case(o_orderpriority = '1-URGENT' or o_orderpriority = '2-HIGH', 1 else 0)) as high_line_count, sum(case(o_orderpriority != '1-URGENT' and o_orderpriority != '2-HIGH', 1 else 0)) as low_line_countby by l_shipmode| sort l_shipmode") + + spark.sql("source = [ source = myglue_test.tpch_csv.customer | left outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') myglue_test.tpch_csv.orders | stats count(o_orderkey) as c_count by c_custkey ] as c_orders| stats count() as custdist by c_count| sort - custdist, - c_count") + + spark.sql("source = myglue_test.tpch_csv.lineitem| join ON l_partkey = p_partkey AND l_shipdate >= date('1995-09-01') AND l_shipdate < date_add(date('1995-09-01'), interval 1 month) myglue_test.tpch_csv.part| stats sum(case(like(p_type, 'PROMO%'), l_extendedprice * (1 - l_discount) else 0)) as sum1, sum(l_extendedprice * (1 - l_discount)) as sum2| eval promo_revenue = 100.00 * sum1 / sum2 // Stats and Eval commands can combine when issues/819 resolved| fields promo_revenue") + + spark.sql("source = myglue_test.tpch_csv.supplier| join right = revenue0 ON s_suppkey = supplier_no [ source = myglue_test.tpch_csv.lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 
month) | eval supplier_no = l_suppkey | stats sum(l_extendedprice * (1 - l_discount)) as total_revenue by supplier_no ]| where total_revenue = [ source = [ source = myglue_test.tpch_csv.lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 month) | eval supplier_no = l_suppkey | stats sum(l_extendedprice * (1 - l_discount)) as total_revenue by supplier_no ] | stats max(total_revenue) ]| sort s_suppkey| fields s_suppkey, s_name, s_address, s_phone, total_revenue") + + spark.sql("source = myglue_test.tpch_csv.partsupp| join ON p_partkey = ps_partkey myglue_test.tpch_csv.part| where p_brand != 'Brand#45' and not like(p_type, 'MEDIUM POLISHED%') and p_size in (49, 14, 23, 45, 19, 3, 36, 9) and ps_suppkey not in [ source = myglue_test.tpch_csv.supplier | where like(s_comment, '%Customer%Complaints%') | fields s_suppkey ]| stats distinct_count(ps_suppkey) as supplier_cnt by p_brand, p_type, p_size| sort - supplier_cnt, p_brand, p_type, p_size") + + spark.sql("source = myglue_test.tpch_csv.lineitem| join ON p_partkey = l_partkey myglue_test.tpch_csv.part| where p_brand = 'Brand#23' and p_container = 'MED BOX' and l_quantity < [ source = myglue_test.tpch_csv.lineitem | where l_partkey = p_partkey | stats avg(l_quantity) as avg | eval `0.2 * avg` = 0.2 * avg | fields `0.2 * avg` ]| stats sum(l_extendedprice) as sum| eval avg_yearly = sum / 7.0| fields avg_yearly") + + spark.sql("source = myglue_test.tpch_csv.customer| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| join ON o_orderkey = l_orderkey myglue_test.tpch_csv.lineitem| where o_orderkey in [ source = myglue_test.tpch_csv.lineitem | stats sum(l_quantity) as sum by l_orderkey | where sum > 300 | fields l_orderkey ]| stats sum(l_quantity) by c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice| sort - o_totalprice, o_orderdate| head 100") + + spark.sql("source = myglue_test.tpch_csv.lineitem| join ON p_partkey = l_partkey and p_brand = 'Brand#12' and 
p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') and l_quantity >= 1 and l_quantity <= 1 + 10 and p_size between 1 and 5 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' OR p_partkey = l_partkey and p_brand = 'Brand#23' and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') and l_quantity >= 10 and l_quantity <= 10 + 10 and p_size between 1 and 10 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' OR p_partkey = l_partkey and p_brand = 'Brand#34' and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') and l_quantity >= 20 and l_quantity <= 20 + 10 and p_size between 1 and 15 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' myglue_test.tpch_csv.part") + + spark.sql("source = myglue_test.tpch_csv.supplier| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| where n_name = 'CANADA' and s_suppkey in [ source = myglue_test.tpch_csv.partsupp | where ps_partkey in [ source = myglue_test.tpch_csv.part | where like(p_name, 'forest%') | fields p_partkey ] and ps_availqty > [ source = myglue_test.tpch_csv.lineitem | where l_partkey = ps_partkey and l_suppkey = ps_suppkey and l_shipdate >= date('1994-01-01') and l_shipdate < date_add(date('1994-01-01'), interval 1 year) | stats sum(l_quantity) as sum_l_quantity | eval half_sum_l_quantity = 0.5 * sum_l_quantity | fields half_sum_l_quantity ] | fields ps_suppkey ]") + + spark.sql("source = myglue_test.tpch_csv.supplier| join ON s_suppkey = l1.l_suppkey myglue_test.tpch_csv.lineitem as l1| join ON o_orderkey = l1.l_orderkey myglue_test.tpch_csv.orders| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| where o_orderstatus = 'F' and l1.l_receiptdate > l1.l_commitdate and exists [ source = myglue_test.tpch_csv.lineitem as l2 | where l2.l_orderkey = l1.l_orderkey and l2.l_suppkey != l1.l_suppkey ] and not exists [ source = myglue_test.tpch_csv.lineitem as l3 | where l3.l_orderkey = l1.l_orderkey and 
l3.l_suppkey != l1.l_suppkey and l3.l_receiptdate > l3.l_commitdate ] and n_name = 'SAUDI ARABIA'| stats count() as numwait by s_name| sort - numwait, s_name| head 100") + + spark.sql("source = [ source = myglue_test.tpch_csv.customer | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') and c_acctbal > [ source = myglue_test.tpch_csv.customer | where c_acctbal > 0.00 and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') | stats avg(c_acctbal) ] and not exists [ source = myglue_test.tpch_csv.orders | where o_custkey = c_custkey ] | eval cntrycode = substring(c_phone, 1, 2) | fields cntrycode, c_acctbal ] as custsale| stats count() as numcust, sum(c_acctbal) as totacctbal by cntrycode| sort cntrycode") + +} diff --git a/docker/integ-test/spark-defaults.conf b/docker/integ-test/spark-defaults.conf new file mode 100644 index 000000000..19b9e4ec1 --- /dev/null +++ b/docker/integ-test/spark-defaults.conf @@ -0,0 +1,35 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. 
+ +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" +spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions,org.opensearch.flint.spark.FlintSparkExtensions +spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog +spark.datasource.flint.host opensearch +spark.datasource.flint.port 9200 +spark.datasource.flint.scheme http +spark.datasource.flint.auth basic +spark.datasource.flint.auth.username admin +spark.datasource.flint.auth.password C0rrecthorsebatterystaple. diff --git a/docker/integ-test/spark-master-entrypoint.sh b/docker/integ-test/spark-master-entrypoint.sh new file mode 100755 index 000000000..a21c20643 --- /dev/null +++ b/docker/integ-test/spark-master-entrypoint.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +function start_spark_connect() { + sc_version=$(ls -1 /opt/bitnami/spark/jars/spark-core_*.jar | sed -e 's/^.*\/spark-core_//' -e 's/\.jar$//' -e 's/-/:/') + + attempt=1 + while [ -e "/tmp/spark_master_running" -a "$attempt" -le 10 ]; do + sleep 1 + /opt/bitnami/spark/sbin/start-connect-server.sh --master spark://spark:7077 --packages org.apache.spark:spark-connect_${sc_version} + attempt=$(($attempt+1)) + done +} + +touch /tmp/spark_master_running +start_spark_connect & +/opt/bitnami/scripts/spark/entrypoint.sh /opt/bitnami/scripts/spark/run.sh +rm /tmp/spark_master_running diff --git a/integ-test/script/README.md b/integ-test/script/README.md index 7ce0c6886..f9e9a8e93 100644 --- a/integ-test/script/README.md +++ b/integ-test/script/README.md @@ -17,41 +17,55 @@ Apart from the basic feature, it also has some advanced functionality includes: ### Usage To use this script, you need to have Python **3.6** or higher installed. 
It also requires the following Python libraries: ```shell -pip install requests pandas openpyxl +pip install requests pandas openpyxl pyspark setuptools pyarrow grpcio grpcio-status protobuf +``` + +Build the Flint and PPL extensions for Spark. +```shell +sbt clean +sbt sparkSqlApplicationCosmetic/assembly sparkPPLCosmetic/assembly +``` + +Next, start the Docker containers that will be used for the tests. From the directory `docker/integ-test`, run: +```shell +docker compose up -d +``` + +After the tests are finished, the Docker containers can be stopped from the directory `docker/integ-test` with: +```shell +docker compose down ``` After getting the requisite libraries, you can run the script with the following command line parameters in your shell: ```shell -python SanityTest.py --base-url ${URL_ADDRESS} --username *** --password *** --datasource ${DATASOURCE_NAME} --input-csv test_cases.csv --output-file test_report --max-workers 2 --check-interval 10 --timeout 600 +python SanityTest.py --spark-url ${SPARK_URL} --username *** --password *** --opensearch-url ${OPENSEARCH_URL} --input-csv test_cases.csv --output-file test_report ``` -You need to replace the placeholders with your actual values of URL_ADDRESS, DATASOURCE_NAME and USERNAME, PASSWORD for authentication to your endpoint. +You need to replace the placeholders with your actual values of SPARK_URL, OPENSEARCH_URL and USERNAME, PASSWORD for authentication to your endpoint. 
+ +Running against the docker cluster, `SPARK_URL` should be set to `sc://localhost:15002` and `OPENSEARCH_URL` should be set +to `http://localhost:9200`. For more details of the command line parameters, you can see the help manual via command: ```shell python SanityTest.py --help -usage: SanityTest.py [-h] --base-url BASE_URL --username USERNAME --password PASSWORD --datasource DATASOURCE --input-csv INPUT_CSV - --output-file OUTPUT_FILE [--max-workers MAX_WORKERS] [--check-interval CHECK_INTERVAL] [--timeout TIMEOUT] +usage: SanityTest.py [-h] --spark-url SPARK_URL --username USERNAME --password PASSWORD --opensearch-url OPENSEARCH_URL --input-csv INPUT_CSV + --output-file OUTPUT_FILE [--start-row START_ROW] [--end-row END_ROW] Run tests from a CSV file and generate a report. options: -h, --help show this help message and exit - --base-url BASE_URL Base URL of the service + --spark-url SPARK_URL Spark Connect URL of the service --username USERNAME Username for authentication --password PASSWORD Password for authentication - --datasource DATASOURCE - Datasource name + --opensearch-url OPENSEARCH_URL + URL of the OpenSearch service --input-csv INPUT_CSV Path to the CSV file containing test queries --output-file OUTPUT_FILE Path to the output report file - --max-workers MAX_WORKERS - optional, Maximum number of worker threads (default: 2) - --check-interval CHECK_INTERVAL - optional, Check interval in seconds (default: 10) - --timeout TIMEOUT optional, Timeout in seconds (default: 600) --start-row START_ROW optional, The start row of the query to run, start from 1 --end-row END_ROW optional, The end row of the query to run, not included @@ -64,7 +78,20 @@ As claimed in the description, the input CSV file should at least have the colum We also provide a sample input CSV file `test_cases.csv` for reference. It includes all sanity test cases we have currently in the Flint.
-**TODO**: the prerequisite data of the test cases and ingesting process +### Indices and Data for Testing +After the docker containers have started, the test script will try to create indices that are needed for testing. It will look in the directory `data`. It will start by +looking for all files with names ending with `.mapping.json`. The start of the filename is the name of the index to create. The contents of the file are the field mappings. + +[Supported field types](https://opensearch.org/docs/latest/field-types/supported-field-types/index/) + +[Example mapping](https://opensearch.org/docs/latest/field-types/supported-field-types/index/#example) + +After the indices have been created, the script will look for all other files ending with `.json`. These are the files for bulk inserting data into the indices. The start +of the filename is the index to insert data into. The contents of the file are used as the body of the bulk insert request. + +[Bulk Insert](https://opensearch.org/docs/latest/api-reference/document-apis/bulk/) + +[Example Body](https://opensearch.org/docs/latest/api-reference/document-apis/bulk/) ### Report Explanation The generated report contains two files: @@ -78,12 +105,12 @@ It also provides the query_id, session_id and start/end time for each query, whi An example of Excel report: -| query_name | query | expected_status | status | check_status | error | result | Duration (s) | query_id | session_id | Start Time | End Time | 
-|------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|---------|--------------|------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------|-------------------------------|------------------------------|----------------------|---------------------| -| 1 | describe myglue_test.default.http_logs | SUCCESS | SUCCESS | TRUE | | {'status': 'SUCCESS', 'schema': [{...}, ...], 'datarows': [[...], ...], 'total': 31, 'size': 31} | 37.51 | SHFEVWxDNnZjem15Z2x1ZV90ZXN0 | RkgzZm0xNlA5MG15Z2x1ZV90ZXN0 | 2024-11-07 13:34:10 | 2024-11-07 13:34:47 | -| 2 | source = myglue_test.default.http_logs \| dedup status CONSECUTIVE=true | SUCCESS | FAILED | FALSE | {"Message":"Fail to run query. Cause: Consecutive deduplication is not supported"} | | 39.53 | dVNlaVVxOFZrZW15Z2x1ZV90ZXN0 | ZGU2MllVYmI4dG15Z2x1ZV90ZXN0 | 2024-11-07 13:34:10 | 2024-11-07 13:34:49 | -| 3 | source = myglue_test.default.http_logs \| eval res = json_keys(json('{"account_number":1,"balance":39225,"age":32,"gender":"M"}')) \| head 1 \| fields res | SUCCESS | SUCCESS | TRUE | | {'status': 'SUCCESS', 'schema': [{'name': 'res', 'type': 'array'}], 'datarows': [[['account_number', 'balance', 'age', 'gender']]], 'total': 1, 'size': 1} | 12.77 | WHQxaXlVSGtGUm15Z2x1ZV90ZXN0 | RkgzZm0xNlA5MG15Z2x1ZV90ZXN0 | 2024-11-07 13:34:47 | 2024-11-07 13:38:45 | -| ... | ... | ... | ... | ... | | | ... | ... | ... | ... | ... 
| +| query_name | query | expected_status | status | check_status | error | result | duration (s) | Start Time | End Time | +|------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|---------|--------------|------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------|----------------------|---------------------| +| 1 | describe myglue_test.default.http_logs | SUCCESS | SUCCESS | TRUE | | {'status': 'SUCCESS', 'schema': [{...}, ...], 'datarows': [[...], ...], 'total': 31, 'size': 31} | 37.51 | 2024-11-07 13:34:10 | 2024-11-07 13:34:47 | +| 2 | source = myglue_test.default.http_logs \| dedup status CONSECUTIVE=true | SUCCESS | FAILED | FALSE | {"Message":"Fail to run query. Cause: Consecutive deduplication is not supported"} | | 39.53 | 2024-11-07 13:34:10 | 2024-11-07 13:34:49 | +| 3 | source = myglue_test.default.http_logs \| eval res = json_keys(json('{"account_number":1,"balance":39225,"age":32,"gender":"M"}')) \| head 1 \| fields res | SUCCESS | SUCCESS | TRUE | | {'status': 'SUCCESS', 'schema': [{'name': 'res', 'type': 'array'}], 'datarows': [[['account_number', 'balance', 'age', 'gender']]], 'total': 1, 'size': 1} | 12.77 | 2024-11-07 13:34:47 | 2024-11-07 13:38:45 | +| ... | ... | ... | ... | ... | | | ... | ... | ... 
| #### JSON Report @@ -103,7 +130,7 @@ An example of JSON report: "detailed_results": [ { "query_name": 1, - "query": "source = myglue_test.default.http_logs | stats avg(size)", + "query": "source = dev.default.http_logs | stats avg(size)", "query_id": "eFZmTlpTa3EyTW15Z2x1ZV90ZXN0", "session_id": "bFJDMWxzb2NVUm15Z2x1ZV90ZXN0", "status": "SUCCESS", @@ -130,7 +157,7 @@ An example of JSON report: }, { "query_name": 2, - "query": "source = myglue_test.default.http_logs | eval res = json_keys(json(\u2018{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}')) | head 1 | fields res", + "query": "source = dev.default.http_logs | eval res = json_keys(json(\u2018{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}')) | head 1 | fields res", "query_id": "bjF4Y1VnbXdFYm15Z2x1ZV90ZXN0", "session_id": "c3pvU1V6OW8xM215Z2x1ZV90ZXN0", "status": "FAILED", @@ -142,7 +169,7 @@ An example of JSON report: }, { "query_name": 2, - "query": "source = myglue_test.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2", + "query": "source = dev.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2", "query_id": "azVyMFFORnBFRW15Z2x1ZV90ZXN0", "session_id": "VWF0SEtrNWM3bm15Z2x1ZV90ZXN0", "status": "TIMEOUT", diff --git a/integ-test/script/SanityTest.py b/integ-test/script/SanityTest.py index eb97752b4..b4e6210bb 100644 --- a/integ-test/script/SanityTest.py +++ b/integ-test/script/SanityTest.py @@ -3,6 +3,7 @@ SPDX-License-Identifier: Apache-2.0 """ +import glob import signal import sys import requests @@ -11,18 +12,18 @@ import time import logging from datetime import datetime -import pandas as pd import argparse from requests.auth import HTTPBasicAuth -from concurrent.futures import ThreadPoolExecutor, as_completed +from pyspark.sql import SparkSession import threading +import pandas as pd """ Environment: python3 Example to use this script: 
-python SanityTest.py --base-url ${URL_ADDRESS} --username *** --password *** --datasource ${DATASOURCE_NAME} --input-csv test_queries.csv --output-file test_report --max-workers 2 --check-interval 10 --timeout 600 +python SanityTest.py --spark-url ${SPARK_URL} --username *** --password *** --opensearch-url ${OPENSEARCH_URL} --input-csv test_queries.csv --output-file test_report The input file test_queries.csv should contain column: `query` @@ -33,24 +34,19 @@ """ class FlintTester: - def __init__(self, base_url, username, password, datasource, max_workers, check_interval, timeout, output_file, start_row, end_row, log_level): - self.base_url = base_url + def __init__(self, spark_url, username, password, opensearch_url, output_file, start_row, end_row, log_level): + self.spark_url = spark_url self.auth = HTTPBasicAuth(username, password) - self.datasource = datasource - self.headers = { 'Content-Type': 'application/json' } - self.max_workers = max_workers - self.check_interval = check_interval - self.timeout = timeout + self.opensearch_url = opensearch_url self.output_file = output_file self.start = start_row - 1 if start_row else None self.end = end_row - 1 if end_row else None self.log_level = log_level - self.max_attempts = (int)(timeout / check_interval) self.logger = self._setup_logger() - self.executor = ThreadPoolExecutor(max_workers=self.max_workers) - self.thread_local = threading.local() self.test_results = [] + self.spark_client = SparkSession.builder.remote(spark_url).appName("integ-test").getOrCreate() + def _setup_logger(self): logger = logging.getLogger('FlintTester') logger.setLevel(self.log_level) @@ -72,126 +68,80 @@ def _setup_logger(self): return logger + # Create the indices needed for the tests + def create_indices(self): + self.logger.info("Creating indices") - def get_session_id(self): - if not hasattr(self.thread_local, 'session_id'): - self.thread_local.session_id = "empty_session_id" - self.logger.debug(f"get session id 
{self.thread_local.session_id}") - return self.thread_local.session_id + json_files = glob.glob('data/*.json') + mapping_files = [f for f in json_files if f.endswith('.mapping.json')] + data_files = [f for f in json_files if not f.endswith('.mapping.json')] + existing_indices = set() - def set_session_id(self, session_id): - """Reuse the session id for the same thread""" - self.logger.debug(f"set session id {session_id}") - self.thread_local.session_id = session_id + for mapping_file in mapping_files: + index_name = mapping_file[5 : mapping_file.index('.')] - # Call submit API to submit the query - def submit_query(self, query, session_id="Empty"): - url = f"{self.base_url}/_plugins/_async_query" - payload = { - "datasource": self.datasource, - "lang": "ppl", - "query": query, - "sessionId": session_id - } - self.logger.debug(f"Submit query with payload: {payload}") - response_json = None - try: - response = requests.post(url, auth=self.auth, json=payload, headers=self.headers) - response_json = response.json() - response.raise_for_status() - return response_json - except Exception as e: - return {"error": f"{str(e)}, got response {response_json}"} + self.logger.info(f"Checking if index exists: {index_name}") + response = requests.get(f'{self.opensearch_url}/{index_name}', auth=self.auth) + if response.status_code == 200: + existing_indices.add(index_name) + continue - # Call get API to check the query status - def get_query_result(self, query_id): - url = f"{self.base_url}/_plugins/_async_query/{query_id}" - response_json = None - try: - response = requests.get(url, auth=self.auth) - response_json = response.json() - response.raise_for_status() - return response_json - except Exception as e: - return {"status": "FAILED", "error": f"{str(e)}, got response {response_json}"} + self.logger.info(f"Creating index: {index_name}") - # Call delete API to cancel the query - def cancel_query(self, query_id): - url = f"{self.base_url}/_plugins/_async_query/{query_id}" - 
response_json = None - try: - response = requests.delete(url, auth=self.auth) - response_json = response.json() - response.raise_for_status() - self.logger.info(f"Cancelled query [{query_id}] with info {response.json()}") - return response_json - except Exception as e: - self.logger.warning(f"Cancel query [{query_id}] error: {str(e)}, got response {response_json}") + file_data = open(mapping_file, 'rb').read() + headers = {'Content-Type': 'application/json'} + + response = requests.put(f'{self.opensearch_url}/{index_name}', auth=self.auth, headers=headers, data=file_data) + if response.status_code != 200: + self.logger.error(f'Failed to create index: {index_name}') + response.raise_for_status() + + for data_file in data_files: + index_name = data_file[5 : data_file.index('.')] + if index_name in existing_indices: + continue + + self.logger.info(f"Populating index: {index_name}") + + file_data = open(data_file, 'rb').read() + headers = {'Content-Type': 'application/x-ndjson'} + + response = requests.post(f'{self.opensearch_url}/{index_name}/_bulk', auth=self.auth, headers=headers, data=file_data) + if response.status_code != 200: + response.raise_for_status() # Run the test and return the result def run_test(self, query, seq_id, expected_status): self.logger.info(f"Starting test: {seq_id}, {query}") start_time = datetime.now() - pre_session_id = self.get_session_id() - submit_result = self.submit_query(query, pre_session_id) - if "error" in submit_result: - self.logger.warning(f"Submit error: {submit_result}") - return { - "query_name": seq_id, - "query": query, - "expected_status": expected_status, - "status": "SUBMIT_FAILED", - "check_status": "SUBMIT_FAILED" == expected_status if expected_status else None, - "error": submit_result["error"], - "duration": 0, - "start_time": start_time, - "end_time": datetime.now() - } - - query_id = submit_result["queryId"] - session_id = submit_result["sessionId"] - self.logger.info(f"Submit return: {submit_result}") - if 
(session_id != pre_session_id): - self.logger.info(f"Update session id from {pre_session_id} to {session_id}") - self.set_session_id(session_id) - - test_result = self.check_query_status(query_id) + + query_str = query.replace('\n', ' ') + status = None + result = None + error_str = None + try: + result = self.spark_client.sql(query_str) + status = 'SUCCESS' + except Exception as e: + status = 'FAILED' + error_str = str(e) + end_time = datetime.now() duration = (end_time - start_time).total_seconds() return { "query_name": seq_id, "query": query, - "query_id": query_id, - "session_id": session_id, "expected_status": expected_status, - "status": test_result["status"], - "check_status": test_result["status"] == expected_status if expected_status else None, - "error": test_result.get("error", ""), - "result": test_result if test_result["status"] == "SUCCESS" else None, + "status": status, + "check_status": status == expected_status if expected_status else None, + "error": error_str if error_str else None, + "result": result, "duration": duration, "start_time": start_time, "end_time": end_time } - # Check the status of the query periodically until it is completed or failed or exceeded the timeout - def check_query_status(self, query_id): - query_id = query_id - - for attempt in range(self.max_attempts): - time.sleep(self.check_interval) - result = self.get_query_result(query_id) - - if result["status"] == "FAILED" or result["status"] == "SUCCESS": - return result - - # Cancel the query if it exceeds the timeout - self.cancel_query(query_id) - return { - "status": "TIMEOUT", - "error": "Query execution exceeded " + str(self.timeout) + " seconds with last status: " + result["status"], - } - def run_tests_from_csv(self, csv_file): with open(csv_file, 'r') as f: reader = csv.DictReader(f) @@ -200,20 +150,15 @@ def run_tests_from_csv(self, csv_file): # Filtering queries based on start and end queries = queries[self.start:self.end] - # Parallel execution - futures = 
[self.executor.submit(self.run_test, query, seq_id, expected_status) for query, seq_id, expected_status in queries] - for future in as_completed(futures): - result = future.result() - self.logger.info(f"Completed test: {result["query_name"]}, {result["query"]}, got result status: {result["status"]}") - self.test_results.append(result) + self.test_results = [] + for query in queries: + self.test_results.append(self.run_test(query[0], query[1], query[2])) def generate_report(self): self.logger.info("Generating report...") total_queries = len(self.test_results) successful_queries = sum(1 for r in self.test_results if r['status'] == 'SUCCESS') failed_queries = sum(1 for r in self.test_results if r['status'] == 'FAILED') - submit_failed_queries = sum(1 for r in self.test_results if r['status'] == 'SUBMIT_FAILED') - timeout_queries = sum(1 for r in self.test_results if r['status'] == 'TIMEOUT') # Create report report = { @@ -221,8 +166,6 @@ def generate_report(self): "total_queries": total_queries, "successful_queries": successful_queries, "failed_queries": failed_queries, - "submit_failed_queries": submit_failed_queries, - "timeout_queries": timeout_queries, "execution_time": sum(r['duration'] for r in self.test_results) }, "detailed_results": self.test_results @@ -249,15 +192,12 @@ def signal_handler(sig, frame, tester): def main(): # Parse command line arguments parser = argparse.ArgumentParser(description="Run tests from a CSV file and generate a report.") - parser.add_argument("--base-url", required=True, help="Base URL of the service") + parser.add_argument("--spark-url", required=True, help="URL of the Spark service") parser.add_argument("--username", required=True, help="Username for authentication") parser.add_argument("--password", required=True, help="Password for authentication") - parser.add_argument("--datasource", required=True, help="Datasource name") + parser.add_argument("--opensearch-url", required=True, help="URL of the OpenSearch service") 
parser.add_argument("--input-csv", required=True, help="Path to the CSV file containing test queries") parser.add_argument("--output-file", required=True, help="Path to the output report file") - parser.add_argument("--max-workers", type=int, default=2, help="optional, Maximum number of worker threads (default: 2)") - parser.add_argument("--check-interval", type=int, default=5, help="optional, Check interval in seconds (default: 5)") - parser.add_argument("--timeout", type=int, default=600, help="optional, Timeout in seconds (default: 600)") parser.add_argument("--start-row", type=int, default=None, help="optional, The start row of the query to run, start from 1") parser.add_argument("--end-row", type=int, default=None, help="optional, The end row of the query to run, not included") parser.add_argument("--log-level", default="INFO", help="optional, Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL, default: INFO)") @@ -265,13 +205,10 @@ def main(): args = parser.parse_args() tester = FlintTester( - base_url=args.base_url, + spark_url=args.spark_url, username=args.username, password=args.password, - datasource=args.datasource, - max_workers=args.max_workers, - check_interval=args.check_interval, - timeout=args.timeout, + opensearch_url=args.opensearch_url, output_file=args.output_file, start_row=args.start_row, end_row=args.end_row, @@ -282,6 +219,9 @@ def main(): signal.signal(signal.SIGINT, lambda sig, frame: signal_handler(sig, frame, tester)) signal.signal(signal.SIGTERM, lambda sig, frame: signal_handler(sig, frame, tester)) + # Create indices + tester.create_indices() + # Running tests tester.run_tests_from_csv(args.input_csv) diff --git a/integ-test/script/data/customer.mapping.json b/integ-test/script/data/customer.mapping.json new file mode 100644 index 000000000..a98d473a2 --- /dev/null +++ b/integ-test/script/data/customer.mapping.json @@ -0,0 +1,30 @@ +{ + "mappings": { + "properties": { + "c_custkey": { + "type": "integer" + }, + "c_name": { + "type": 
"text" + }, + "c_address": { + "type": "text" + }, + "c_nationkey": { + "type": "integer" + }, + "c_phone": { + "type": "text" + }, + "c_acctbal": { + "type": "double" + }, + "c_mktsegment": { + "type": "text" + }, + "c_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/http_logs.json b/integ-test/script/data/http_logs.json new file mode 100644 index 000000000..ff2aa2fca --- /dev/null +++ b/integ-test/script/data/http_logs.json @@ -0,0 +1,12 @@ +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696154400000, "year": 2023, "month": 10, "day": 1, "clientip": "40.135.0.0", "request": "GET /images/hm_bg.jpg HTTP/1.0", "status": 200, "size": 24736} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696154700000, "year": 2023, "month": 10, "day": 1, "clientip": "232.0.0.0", "request": "GET /images/hm_bg.jpg HTTP/1.0", "status": 200, "size": 24736} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696155000000, "year": 2023, "month": 10, "day": 1, "clientip": "26.1.0.0", "request": "GET /images/hm_bg.jpg HTTP/1.0", "status": 200, "size": 24736} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696155300000, "year": 2023, "month": 10, "day": 1, "clientip": "247.37.0.0", "request": "GET /french/splash_inet.html HTTP/1.0", "status": 200, "size": 3781} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696155600000, "year": 2023, "month": 10, "day": 1, "clientip": "247.37.0.0", "request": "GET /images/hm_nbg.jpg HTTP/1.0", "status": 304, "size": 0} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696155900000, "year": 2023, "month": 10, "day": 1, "clientip": "252.0.0.0", "request": "GET /images/hm_bg.jpg HTTP/1.0", "status": 200, "size": 24736} diff --git a/integ-test/script/data/http_logs.mapping.json b/integ-test/script/data/http_logs.mapping.json new file mode 100644 index 000000000..b944fbd4b --- /dev/null +++ b/integ-test/script/data/http_logs.mapping.json @@ -0,0 +1,30 @@ +{ + "mappings": { + 
"properties": { + "@timestamp": { + "type": "date" + }, + "year": { + "type": "integer" + }, + "month": { + "type": "integer" + }, + "day": { + "type": "integer" + }, + "clientip": { + "type": "keyword" + }, + "request": { + "type": "text" + }, + "status": { + "type": "integer" + }, + "size": { + "type": "integer" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/lineitem.mapping.json b/integ-test/script/data/lineitem.mapping.json new file mode 100644 index 000000000..2fb1cdb40 --- /dev/null +++ b/integ-test/script/data/lineitem.mapping.json @@ -0,0 +1,54 @@ +{ + "mappings": { + "properties": { + "l_orderkey": { + "type": "integer" + }, + "l_partkey": { + "type": "text" + }, + "l_suppkey": { + "type": "integer" + }, + "l_linenumber": { + "type": "integer" + }, + "l_quantity": { + "type": "double" + }, + "l_extendedprice": { + "type": "double" + }, + "l_discount": { + "type": "double" + }, + "l_tax": { + "type": "double" + }, + "l_returnflag": { + "type": "text" + }, + "l_linestatus": { + "type": "text" + }, + "l_shipdate": { + "type": "date" + }, + "l_commitdate": { + "type": "date" + }, + "l_receiptdate": { + "type": "date" + }, + "l_shipinstruct": { + "type": "text" + }, + "l_shipmode": { + "type": "text" + }, + "l_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/nation.mapping.json b/integ-test/script/data/nation.mapping.json new file mode 100644 index 000000000..d0e82e559 --- /dev/null +++ b/integ-test/script/data/nation.mapping.json @@ -0,0 +1,18 @@ +{ + "mappings": { + "properties": { + "n_nationkey": { + "type": "integer" + }, + "n_name": { + "type": "text" + }, + "n_regionkey": { + "type": "integer" + }, + "n_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/nested.json b/integ-test/script/data/nested.json new file mode 100644 index 000000000..eb8af683b --- /dev/null +++ b/integ-test/script/data/nested.json @@ 
-0,0 +1,10 @@ +{"index": {"_index": "nested"}} +{"int_col": 30, "struct_col": {"field1": {"subfield": "value1"}, "field2": 123}, "struct_col2": {"field1": {"subfield": "valueA"}, "field2": 23}} +{"index": {"_index": "nested"}} +{"int_col": 40, "struct_col": {"field1": {"subfield": "value5"}, "field2": 123}, "struct_col2": {"field1": {"subfield": "valueB"}, "field2": 33}} +{"index": {"_index": "nested"}} +{"int_col": 30, "struct_col": {"field1": {"subfield": "value4"}, "field2": 823}, "struct_col2": {"field1": {"subfield": "valueC"}, "field2": 83}} +{"index": {"_index": "nested"}} +{"int_col": 40, "struct_col": {"field1": {"subfield": "value2"}, "field2": 456}, "struct_col2": {"field1": {"subfield": "valueD"}, "field2": 46}} +{"index": {"_index": "nested"}} +{"int_col": 50, "struct_col": {"field1": {"subfield": "value3"}, "field2": 789}, "struct_col2": {"field1": {"subfield": "valueE"}, "field2": 89}} diff --git a/integ-test/script/data/nested.mapping.json b/integ-test/script/data/nested.mapping.json new file mode 100644 index 000000000..1aa189415 --- /dev/null +++ b/integ-test/script/data/nested.mapping.json @@ -0,0 +1,37 @@ +{ + "mappings": { + "properties": { + "int_col": { + "type": "integer" + }, + "struct_col": { + "properties": { + "field1": { + "properties": { + "subfield": { + "type": "text" + } + } + }, + "field2": { + "type": "integer" + } + } + }, + "struct_col2": { + "properties": { + "field1": { + "properties": { + "subfield": { + "type": "text" + } + } + }, + "field2": { + "type": "integer" + } + } + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/orders.mapping.json b/integ-test/script/data/orders.mapping.json new file mode 100644 index 000000000..59b3cecdd --- /dev/null +++ b/integ-test/script/data/orders.mapping.json @@ -0,0 +1,33 @@ +{ + "mappings": { + "properties": { + "o_orderkey": { + "type": "integer" + }, + "o_custkey": { + "type": "integer" + }, + "o_orderstatus": { + "type": "text" + }, + "o_totalprice": { + 
"type": "double" + }, + "o_orderdate": { + "type": "date" + }, + "o_orderpriority": { + "type": "text" + }, + "o_clerk": { + "type": "text" + }, + "o_shippriority": { + "type": "integer" + }, + "o_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/part.mapping.json b/integ-test/script/data/part.mapping.json new file mode 100644 index 000000000..8be7e9aa0 --- /dev/null +++ b/integ-test/script/data/part.mapping.json @@ -0,0 +1,33 @@ +{ + "mappings": { + "properties": { + "p_partkey": { + "type": "integer" + }, + "p_name": { + "type": "text" + }, + "p_mfgr": { + "type": "text" + }, + "p_brand": { + "type": "text" + }, + "p_type": { + "type": "text" + }, + "p_size": { + "type": "integer" + }, + "p_container": { + "type": "text" + }, + "p_retailprice": { + "type": "double" + }, + "p_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/partsupp.mapping.json b/integ-test/script/data/partsupp.mapping.json new file mode 100644 index 000000000..13509ad46 --- /dev/null +++ b/integ-test/script/data/partsupp.mapping.json @@ -0,0 +1,21 @@ +{ + "mappings": { + "properties": { + "ps_partkey": { + "type": "integer" + }, + "ps_suppkey": { + "type": "integer" + }, + "ps_availqty": { + "type": "integer" + }, + "ps_supplycost": { + "type": "double" + }, + "ps_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/people.json b/integ-test/script/data/people.json new file mode 100644 index 000000000..4563a2c4b --- /dev/null +++ b/integ-test/script/data/people.json @@ -0,0 +1,12 @@ +{"index": {"_index": "people"}} +{"@timestamp": 1718458823000, "id": 1000, "name": "Jake", "occupation": "Engineer", "country": "England", "salary": 100000} +{"index": {"_index": "people"}} +{"@timestamp": 1718458833000, "id": 1001, "name": "Hello", "occupation": "Artist", "country": "USA", "salary": 70000} +{"index": {"_index": "people"}} 
+{"@timestamp": 1718458843000, "id": 1002, "name": "John", "occupation": "Doctor", "country": "Canada", "salary": 120000} +{"index": {"_index": "people"}} +{"@timestamp": 1718458853000, "id": 1003, "name": "David", "occupation": "Doctor", "country": null, "salary": 120000} +{"index": {"_index": "people"}} +{"@timestamp": 1718458863000, "id": 1004, "name": "David", "occupation": null, "country": "Canada", "salary": 0} +{"index": {"_index": "people"}} +{"@timestamp": 1718458873000, "id": 1005, "name": "Jane", "occupation": "Scientist", "country": "Canada", "salary": 90000} diff --git a/integ-test/script/data/people.mapping.json b/integ-test/script/data/people.mapping.json new file mode 100644 index 000000000..b5dde8ff6 --- /dev/null +++ b/integ-test/script/data/people.mapping.json @@ -0,0 +1,24 @@ +{ + "mappings": { + "properties": { + "@timestamp": { + "type": "date" + }, + "id": { + "type": "integer" + }, + "name": { + "type": "text" + }, + "occupation": { + "type": "text" + }, + "country": { + "type": "text" + }, + "salary": { + "type": "integer" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/region.mapping.json b/integ-test/script/data/region.mapping.json new file mode 100644 index 000000000..3dddbc580 --- /dev/null +++ b/integ-test/script/data/region.mapping.json @@ -0,0 +1,15 @@ +{ + "mappings": { + "properties": { + "r_regionkey": { + "type": "integer" + }, + "r_name": { + "type": "text" + }, + "r_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/supplier.mapping.json b/integ-test/script/data/supplier.mapping.json new file mode 100644 index 000000000..bdcb933b6 --- /dev/null +++ b/integ-test/script/data/supplier.mapping.json @@ -0,0 +1,27 @@ +{ + "mappings": { + "properties": { + "s_suppkey": { + "type": "integer" + }, + "s_name": { + "type": "text" + }, + "s_address": { + "type": "text" + }, + "s_nationkey": { + "type": "integer" + }, + "s_phone": { + "type": "text" + }, 
+ "s_acctbal": { + "type": "double" + }, + "s_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/work_info.json b/integ-test/script/data/work_info.json new file mode 100644 index 000000000..64802bdad --- /dev/null +++ b/integ-test/script/data/work_info.json @@ -0,0 +1,10 @@ +{"index": {"_index": "work_info"}} +{"uid": 1000, "name": "Jake", "department": "IT", "occupation": "Engineer"} +{"index": {"_index": "work_info"}} +{"uid": 1002, "name": "John", "department": "DATA", "occupation": "Scientist"} +{"index": {"_index": "work_info"}} +{"uid": 1003, "name": "David", "department": "HR", "occupation": "Doctor"} +{"index": {"_index": "work_info"}} +{"uid": 1005, "name": "Jane", "department": "DATA", "occupation": "Engineer"} +{"index": {"_index": "work_info"}} +{"uid": 1006, "name": "Tom", "department": "SALES", "occupation": "Artist"} diff --git a/integ-test/script/data/work_info.mapping.json b/integ-test/script/data/work_info.mapping.json new file mode 100644 index 000000000..3fb5e2c28 --- /dev/null +++ b/integ-test/script/data/work_info.mapping.json @@ -0,0 +1,18 @@ +{ + "mappings": { + "properties": { + "uid": { + "type": "integer" + }, + "name": { + "type": "text" + }, + "department": { + "type": "text" + }, + "occupation": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/test_cases.csv b/integ-test/script/test_cases.csv index 7df05f5a3..91500efea 100644 --- a/integ-test/script/test_cases.csv +++ b/integ-test/script/test_cases.csv @@ -1,116 +1,116 @@ query,expected_status -describe myglue_test.default.http_logs,FAILED -describe `myglue_test`.`default`.`http_logs`,FAILED -"source = myglue_test.default.http_logs | dedup 1 status | fields @timestamp, clientip, status, size | head 10",SUCCESS -"source = myglue_test.default.http_logs | dedup status, size | head 10",SUCCESS -source = myglue_test.default.http_logs | dedup 1 status keepempty=true | head 10,SUCCESS -"source 
= myglue_test.default.http_logs | dedup status, size keepempty=true | head 10",SUCCESS -source = myglue_test.default.http_logs | dedup 2 status | head 10,SUCCESS -"source = myglue_test.default.http_logs | dedup 2 status, size | head 10",SUCCESS -"source = myglue_test.default.http_logs | dedup 2 status, size keepempty=true | head 10",SUCCESS -source = myglue_test.default.http_logs | dedup status CONSECUTIVE=true | fields status,FAILED -"source = myglue_test.default.http_logs | dedup 2 status, size CONSECUTIVE=true | fields status",FAILED -"source = myglue_test.default.http_logs | sort stat | fields @timestamp, clientip, status | head 10",SUCCESS -"source = myglue_test.default.http_logs | fields @timestamp, notexisted | head 10",FAILED -"source = myglue_test.default.nested | fields int_col, struct_col.field1, struct_col2.field1 | head 10",FAILED -"source = myglue_test.default.nested | where struct_col2.field1.subfield > 'valueA' | sort int_col | fields int_col, struct_col.field1.subfield, struct_col2.field1.subfield",FAILED -"source = myglue_test.default.http_logs | fields - @timestamp, clientip, status | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval new_time = @timestamp, new_clientip = clientip | fields - new_time, new_clientip, status | head 10",SUCCESS -source = myglue_test.default.http_logs | eval new_clientip = lower(clientip) | fields - new_clientip | head 10,SUCCESS -"source = myglue_test.default.http_logs | fields + @timestamp, clientip, status | fields - clientip, status | head 10",SUCCESS -"source = myglue_test.default.http_logs | fields - clientip, status | fields + @timestamp, clientip, status| head 10",SUCCESS -source = myglue_test.default.http_logs | where status = 200 | head 10,SUCCESS -source = myglue_test.default.http_logs | where status != 200 | head 10,SUCCESS -source = myglue_test.default.http_logs | where size > 0 | head 10,SUCCESS -source = myglue_test.default.http_logs | where size <= 0 | head 10,SUCCESS -source = 
myglue_test.default.http_logs | where clientip = '236.14.2.0' | head 10,SUCCESS -source = myglue_test.default.http_logs | where size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100,SUCCESS -"source = myglue_test.default.http_logs | where size <= 0 AND like(request, 'GET%') | head 10",SUCCESS -source = myglue_test.default.http_logs status = 200 | head 10,SUCCESS -source = myglue_test.default.http_logs size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100,SUCCESS -"source = myglue_test.default.http_logs size <= 0 AND like(request, 'GET%') | head 10",SUCCESS -"source = myglue_test.default.http_logs substring(clientip, 5, 2) = ""12"" | head 10",SUCCESS -source = myglue_test.default.http_logs | where isempty(size),FAILED -source = myglue_test.default.http_logs | where ispresent(size),FAILED -source = myglue_test.default.http_logs | where isnull(size) | head 10,SUCCESS -source = myglue_test.default.http_logs | where isnotnull(size) | head 10,SUCCESS -"source = myglue_test.default.http_logs | where isnotnull(coalesce(size, status)) | head 10",FAILED -"source = myglue_test.default.http_logs | where like(request, 'GET%') | head 10",SUCCESS -"source = myglue_test.default.http_logs | where like(request, '%bordeaux%') | head 10",SUCCESS -"source = myglue_test.default.http_logs | where substring(clientip, 5, 2) = ""12"" | head 10",SUCCESS -"source = myglue_test.default.http_logs | where lower(request) = ""get /images/backnews.gif http/1.0"" | head 10",SUCCESS -source = myglue_test.default.http_logs | where length(request) = 38 | head 10,SUCCESS -"source = myglue_test.default.http_logs | where case(status = 200, 'success' else 'failed') = 'success' | head 10",FAILED -"source = myglue_test.default.http_logs | eval h = ""Hello"", w = ""World"" | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval @h = ""Hello"" | eval @w = ""World"" | fields @timestamp, @h, @w",SUCCESS -source = myglue_test.default.http_logs | eval newF = clientip | head 
10,SUCCESS -"source = myglue_test.default.http_logs | eval newF = clientip | fields clientip, newF | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval f = size | where f > 1 | sort f | fields size, clientip, status | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval f = status * 2 | eval h = f * 2 | fields status, f, h | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval f = size * 2, h = status | stats sum(f) by h",SUCCESS -"source = myglue_test.default.http_logs | eval f = UPPER(request) | eval h = 40 | fields f, h | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval request = ""test"" | fields request | head 10",FAILED -source = myglue_test.default.http_logs | eval size = abs(size) | where size < 500,FAILED -"source = myglue_test.default.http_logs | eval status_string = case(status = 200, 'success' else 'failed') | head 10",FAILED -"source = myglue_test.default.http_logs | eval n = now() | eval t = unix_timestamp(@timestamp) | fields n, t | head 10",SUCCESS -source = myglue_test.default.http_logs | eval e = isempty(size) | eval p = ispresent(size) | head 10,FAILED -"source = myglue_test.default.http_logs | eval c = coalesce(size, status) | head 10",FAILED -source = myglue_test.default.http_logs | eval c = coalesce(request) | head 10,FAILED -source = myglue_test.default.http_logs | eval col1 = ln(size) | eval col2 = unix_timestamp(@timestamp) | sort - col1 | head 10,SUCCESS -"source = myglue_test.default.http_logs | eval col1 = 1 | sort col1 | head 4 | eval col2 = 2 | sort - col2 | sort - size | head 2 | fields @timestamp, clientip, col2",SUCCESS -"source = myglue_test.default.mini_http_logs | eval stat = status | where stat > 300 | sort stat | fields @timestamp,clientip,status | head 5",SUCCESS -"source = myglue_test.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2",SUCCESS -source = myglue_test.default.http_logs | stats avg(size) by clientip,SUCCESS -"source = 
myglue_test.default.http_logs | eval new_request = upper(request) | eval compound_field = concat('Hello ', if(like(new_request, '%bordeaux%'), 'World', clientip)) | fields new_request, compound_field | head 10",SUCCESS -source = myglue_test.default.http_logs | stats avg(size),SUCCESS -source = myglue_test.default.nested | stats max(int_col) by struct_col.field2,SUCCESS -source = myglue_test.default.nested | stats distinct_count(int_col),SUCCESS -source = myglue_test.default.nested | stats stddev_samp(int_col),SUCCESS -source = myglue_test.default.nested | stats stddev_pop(int_col),SUCCESS -source = myglue_test.default.nested | stats percentile(int_col),SUCCESS -source = myglue_test.default.nested | stats percentile_approx(int_col),SUCCESS -source = myglue_test.default.mini_http_logs | stats stddev_samp(status),SUCCESS -"source = myglue_test.default.mini_http_logs | where stats > 200 | stats percentile_approx(status, 99)",SUCCESS -"source = myglue_test.default.nested | stats count(int_col) by span(struct_col.field2, 10) as a_span",SUCCESS -"source = myglue_test.default.nested | stats avg(int_col) by span(struct_col.field2, 10) as a_span, struct_col2.field2",SUCCESS -"source = myglue_test.default.http_logs | stats sum(size) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10",SUCCESS -"source = myglue_test.default.http_logs | stats distinct_count(clientip) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10",SUCCESS -"source = myglue_test.default.http_logs | stats avg(size) as avg_size by status, year | stats avg(avg_size) as avg_avg_size by year",SUCCESS -"source = myglue_test.default.http_logs | stats avg(size) as avg_size by status, year, month | stats avg(avg_size) as avg_avg_size by year, month | stats avg(avg_avg_size) as avg_avg_avg_size by year",SUCCESS -"source = myglue_test.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | stats avg(avg_int) as avg_avg_int by 
struct_col2.field2",FAILED -"source = myglue_test.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | eval new_col = avg_int | stats avg(avg_int) as avg_avg_int by new_col",SUCCESS -source = myglue_test.default.nested | rare int_col,SUCCESS -source = myglue_test.default.nested | rare int_col by struct_col.field2,SUCCESS -source = myglue_test.default.http_logs | rare request,SUCCESS -source = myglue_test.default.http_logs | where status > 300 | rare request by status,SUCCESS -source = myglue_test.default.http_logs | rare clientip,SUCCESS -source = myglue_test.default.http_logs | where status > 300 | rare clientip,SUCCESS -source = myglue_test.default.http_logs | where status > 300 | rare clientip by day,SUCCESS -source = myglue_test.default.nested | top int_col by struct_col.field2,SUCCESS -source = myglue_test.default.nested | top 1 int_col by struct_col.field2,SUCCESS -source = myglue_test.default.nested | top 2 int_col by struct_col.field2,SUCCESS -source = myglue_test.default.nested | top int_col,SUCCESS -source = myglue_test.default.http_logs | inner join left=l right=r on l.status = r.int_col myglue_test.default.nested | head 10,FAILED -"source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | fields request, domain | head 10",SUCCESS -source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | top 1 domain,SUCCESS -source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | stats count() by domain,SUCCESS -"source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | eval a = 1 | fields a, domain | head 10",SUCCESS -"source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | where size > 0 | sort - size | fields size, domain | head 10",SUCCESS -"source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/(?<picName>[a-zA-Z]+)/.*' | where domain = 'english' | sort - picName | fields domain, picName | head 10",SUCCESS 
-source = myglue_test.default.http_logs | patterns request | fields patterns_field | head 10,SUCCESS -source = myglue_test.default.http_logs | patterns request | where size > 0 | fields patterns_field | head 10,SUCCESS -"source = myglue_test.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | fields request, no_letter | head 10",SUCCESS -source = myglue_test.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | stats count() by no_letter,SUCCESS -"source = myglue_test.default.http_logs | patterns new_field='status' pattern='[a-zA-Z]' request | fields request, status | head 10",FAILED -source = myglue_test.default.http_logs | rename @timestamp as timestamp | head 10,FAILED -source = myglue_test.default.http_logs | sort size | head 10,SUCCESS -source = myglue_test.default.http_logs | sort + size | head 10,SUCCESS -source = myglue_test.default.http_logs | sort - size | head 10,SUCCESS -"source = myglue_test.default.http_logs | sort + size, + @timestamp | head 10",SUCCESS -"source = myglue_test.default.http_logs | sort - size, - @timestamp | head 10",SUCCESS -"source = myglue_test.default.http_logs | sort - size, @timestamp | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval c1 = upper(request) | eval c2 = concat('Hello ', if(like(c1, '%bordeaux%'), 'World', clientip)) | eval c3 = length(request) | eval c4 = ltrim(request) | eval c5 = rtrim(request) | eval c6 = substring(clientip, 5, 2) | eval c7 = trim(request) | eval c8 = upper(request) | eval c9 = position('bordeaux' IN request) | eval c10 = replace(request, 'GET', 'GGG') | fields c1, c2, c3, c4, c5, c6, c7, c8, c9, c10 | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval c1 = unix_timestamp(@timestamp) | eval c2 = now() | eval c3 = +describe dev.default.http_logs,FAILED +describe `dev`.`default`.`http_logs`,FAILED +"source = dev.default.http_logs | dedup 1 status | fields @timestamp, clientip, status, size | head 10",SUCCESS +"source 
= dev.default.http_logs | dedup status, size | head 10",SUCCESS +source = dev.default.http_logs | dedup 1 status keepempty=true | head 10,SUCCESS +"source = dev.default.http_logs | dedup status, size keepempty=true | head 10",SUCCESS +source = dev.default.http_logs | dedup 2 status | head 10,SUCCESS +"source = dev.default.http_logs | dedup 2 status, size | head 10",SUCCESS +"source = dev.default.http_logs | dedup 2 status, size keepempty=true | head 10",SUCCESS +source = dev.default.http_logs | dedup status CONSECUTIVE=true | fields status,FAILED +"source = dev.default.http_logs | dedup 2 status, size CONSECUTIVE=true | fields status",FAILED +"source = dev.default.http_logs | sort status | fields @timestamp, clientip, status | head 10",SUCCESS +"source = dev.default.http_logs | fields @timestamp, notexisted | head 10",FAILED +"source = dev.default.nested | fields int_col, struct_col.field1, struct_col2.field1 | head 10",SUCCESS +"source = dev.default.nested | where struct_col2.field1.subfield > 'valueA' | sort int_col | fields int_col, struct_col.field1.subfield, struct_col2.field1.subfield",SUCCESS +"source = dev.default.http_logs | fields - @timestamp, clientip, status | head 10",SUCCESS +"source = dev.default.http_logs | eval new_time = @timestamp, new_clientip = clientip | fields - new_time, new_clientip, status | head 10",SUCCESS +source = dev.default.http_logs | eval new_clientip = lower(clientip) | fields - new_clientip | head 10,SUCCESS +"source = dev.default.http_logs | fields + @timestamp, clientip, status | fields - clientip, status | head 10",SUCCESS +"source = dev.default.http_logs | fields - clientip, status | fields + @timestamp, clientip, status| head 10",SUCCESS +source = dev.default.http_logs | where status = 200 | head 10,SUCCESS +source = dev.default.http_logs | where status != 200 | head 10,SUCCESS +source = dev.default.http_logs | where size > 0 | head 10,SUCCESS +source = dev.default.http_logs | where size <= 0 | head 10,SUCCESS +source = 
dev.default.http_logs | where clientip = '236.14.2.0' | head 10,SUCCESS +source = dev.default.http_logs | where size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100,SUCCESS +"source = dev.default.http_logs | where size <= 0 AND like(request, 'GET%') | head 10",SUCCESS +source = dev.default.http_logs status = 200 | head 10,SUCCESS +source = dev.default.http_logs size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100,SUCCESS +"source = dev.default.http_logs size <= 0 AND like(request, 'GET%') | head 10",SUCCESS +"source = dev.default.http_logs substring(clientip, 5, 2) = ""12"" | head 10",SUCCESS +source = dev.default.http_logs | where isempty(size),SUCCESS +source = dev.default.http_logs | where ispresent(size),SUCCESS +source = dev.default.http_logs | where isnull(size) | head 10,SUCCESS +source = dev.default.http_logs | where isnotnull(size) | head 10,SUCCESS +"source = dev.default.http_logs | where isnotnull(coalesce(size, status)) | head 10",SUCCESS +"source = dev.default.http_logs | where like(request, 'GET%') | head 10",SUCCESS +"source = dev.default.http_logs | where like(request, '%bordeaux%') | head 10",SUCCESS +"source = dev.default.http_logs | where substring(clientip, 5, 2) = ""12"" | head 10",SUCCESS +"source = dev.default.http_logs | where lower(request) = ""get /images/backnews.gif http/1.0"" | head 10",SUCCESS +source = dev.default.http_logs | where length(request) = 38 | head 10,SUCCESS +"source = dev.default.http_logs | where case(status = 200, 'success' else 'failed') = 'success' | head 10",SUCCESS +"source = dev.default.http_logs | eval h = ""Hello"", w = ""World"" | head 10",SUCCESS +"source = dev.default.http_logs | eval @h = ""Hello"" | eval @w = ""World"" | fields @timestamp, @h, @w",SUCCESS +source = dev.default.http_logs | eval newF = clientip | head 10,SUCCESS +"source = dev.default.http_logs | eval newF = clientip | fields clientip, newF | head 10",SUCCESS +"source = dev.default.http_logs | eval f = size | where f > 1 
| sort f | fields size, clientip, status | head 10",SUCCESS +"source = dev.default.http_logs | eval f = status * 2 | eval h = f * 2 | fields status, f, h | head 10",SUCCESS +"source = dev.default.http_logs | eval f = size * 2, h = status | stats sum(f) by h",SUCCESS +"source = dev.default.http_logs | eval f = UPPER(request) | eval h = 40 | fields f, h | head 10",SUCCESS +"source = dev.default.http_logs | eval request = ""test"" | fields request | head 10",FAILED +source = dev.default.http_logs | eval size = abs(size) | where size < 500,FAILED +"source = dev.default.http_logs | eval status_string = case(status = 200, 'success' else 'failed') | head 10",SUCCESS +"source = dev.default.http_logs | eval n = now() | eval t = unix_timestamp(@timestamp) | fields n, t | head 10",SUCCESS +source = dev.default.http_logs | eval e = isempty(size) | eval p = ispresent(size) | head 10,SUCCESS +"source = dev.default.http_logs | eval c = coalesce(size, status) | head 10",SUCCESS +source = dev.default.http_logs | eval c = coalesce(request) | head 10,SUCCESS +source = dev.default.http_logs | eval col1 = ln(size) | eval col2 = unix_timestamp(@timestamp) | sort - col1 | head 10,SUCCESS +"source = dev.default.http_logs | eval col1 = 1 | sort col1 | head 4 | eval col2 = 2 | sort - col2 | sort - size | head 2 | fields @timestamp, clientip, col2",SUCCESS +"source = dev.default.http_logs | eval stat = status | where stat > 300 | sort stat | fields @timestamp,clientip,status | head 5",SUCCESS +"source = dev.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2",SUCCESS +source = dev.default.http_logs | stats avg(size) by clientip,SUCCESS +"source = dev.default.http_logs | eval new_request = upper(request) | eval compound_field = concat('Hello ', if(like(new_request, '%bordeaux%'), 'World', clientip)) | fields new_request, compound_field | head 10",SUCCESS +source = dev.default.http_logs | stats avg(size),SUCCESS +source = dev.default.nested | stats max(int_col) by 
struct_col.field2,SUCCESS +source = dev.default.nested | stats distinct_count(int_col),SUCCESS +source = dev.default.nested | stats stddev_samp(int_col),SUCCESS +source = dev.default.nested | stats stddev_pop(int_col),SUCCESS +"source = dev.default.nested | stats percentile(int_col, 90)",SUCCESS +"source = dev.default.nested | stats percentile_approx(int_col, 99)",SUCCESS +source = dev.default.http_logs | stats stddev_samp(status),SUCCESS +"source = dev.default.http_logs | where status > 200 | stats percentile_approx(status, 99)",SUCCESS +"source = dev.default.nested | stats count(int_col) by span(struct_col.field2, 10) as a_span",SUCCESS +"source = dev.default.nested | stats avg(int_col) by span(struct_col.field2, 10) as a_span, struct_col2.field2",SUCCESS +"source = dev.default.http_logs | stats sum(size) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10",SUCCESS +"source = dev.default.http_logs | stats distinct_count(clientip) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10",SUCCESS +"source = dev.default.http_logs | stats avg(size) as avg_size by status, year | stats avg(avg_size) as avg_avg_size by year",SUCCESS +"source = dev.default.http_logs | stats avg(size) as avg_size by status, year, month | stats avg(avg_size) as avg_avg_size by year, month | stats avg(avg_avg_size) as avg_avg_avg_size by year",SUCCESS +"source = dev.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | stats avg(avg_int) as avg_avg_int by struct_col2.field2",FAILED +"source = dev.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | eval new_col = avg_int | stats avg(avg_int) as avg_avg_int by new_col",SUCCESS +source = dev.default.nested | rare int_col,SUCCESS +source = dev.default.nested | rare int_col by struct_col.field2,SUCCESS +source = dev.default.http_logs | rare request,SUCCESS +source = dev.default.http_logs | where status > 300 | rare request 
by status,SUCCESS +source = dev.default.http_logs | rare clientip,SUCCESS +source = dev.default.http_logs | where status > 300 | rare clientip,SUCCESS +source = dev.default.http_logs | where status > 300 | rare clientip by day,SUCCESS +source = dev.default.nested | top int_col by struct_col.field2,SUCCESS +source = dev.default.nested | top 1 int_col by struct_col.field2,SUCCESS +source = dev.default.nested | top 2 int_col by struct_col.field2,SUCCESS +source = dev.default.nested | top int_col,SUCCESS +source = dev.default.http_logs | inner join left=l right=r on l.status = r.int_col dev.default.nested | head 10,SUCCESS +"source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | fields request, domain | head 10",SUCCESS +source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | top 1 domain,SUCCESS +source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | stats count() by domain,SUCCESS +"source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | eval a = 1 | fields a, domain | head 10",SUCCESS +"source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | where size > 0 | sort - size | fields size, domain | head 10",SUCCESS +"source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/(?<picName>[a-zA-Z]+)/.*' | where domain = 'english' | sort - picName | fields domain, picName | head 10",SUCCESS +source = dev.default.http_logs | patterns request | fields patterns_field | head 10,SUCCESS +source = dev.default.http_logs | patterns request | where size > 0 | fields patterns_field | head 10,SUCCESS +"source = dev.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | fields request, no_letter | head 10",SUCCESS +source = dev.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | stats count() by no_letter,SUCCESS +"source = dev.default.http_logs | patterns new_field='status' pattern='[a-zA-Z]' request | fields request, status | head 10",FAILED +source = 
dev.default.http_logs | rename @timestamp as timestamp | head 10,SUCCESS +source = dev.default.http_logs | sort size | head 10,SUCCESS +source = dev.default.http_logs | sort + size | head 10,SUCCESS +source = dev.default.http_logs | sort - size | head 10,SUCCESS +"source = dev.default.http_logs | sort + size, + @timestamp | head 10",SUCCESS +"source = dev.default.http_logs | sort - size, - @timestamp | head 10",SUCCESS +"source = dev.default.http_logs | sort - size, @timestamp | head 10",SUCCESS +"source = dev.default.http_logs | eval c1 = upper(request) | eval c2 = concat('Hello ', if(like(c1, '%bordeaux%'), 'World', clientip)) | eval c3 = length(request) | eval c4 = ltrim(request) | eval c5 = rtrim(request) | eval c6 = substring(clientip, 5, 2) | eval c7 = trim(request) | eval c8 = upper(request) | eval c9 = position('bordeaux' IN request) | eval c10 = replace(request, 'GET', 'GGG') | fields c1, c2, c3, c4, c5, c6, c7, c8, c9, c10 | head 10",SUCCESS +"source = dev.default.http_logs | eval c1 = unix_timestamp(@timestamp) | eval c2 = now() | eval c3 = DAY_OF_WEEK(@timestamp) | eval c4 = DAY_OF_MONTH(@timestamp) | eval c5 = DAY_OF_YEAR(@timestamp) | eval c6 = @@ -121,151 +121,151 @@ HOUR_OF_DAY(@timestamp) | eval c10 = MINUTE_OF_HOUR(@timestamp) | eval c11 = SECOND_OF_MINUTE(@timestamp) | eval c12 = LOCALTIME() | fields c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12 | head 10",SUCCESS -"source=myglue_test.default.people | eval c1 = adddate(@timestamp, 1) | fields c1 | head 10",SUCCESS -"source=myglue_test.default.people | eval c2 = subdate(@timestamp, 1) | fields c2 | head 10",SUCCESS -source=myglue_test.default.people | eval c1 = date_add(@timestamp INTERVAL 1 DAY) | fields c1 | head 10,SUCCESS -source=myglue_test.default.people | eval c1 = date_sub(@timestamp INTERVAL 1 DAY) | fields c1 | head 10,SUCCESS -source=myglue_test.default.people | eval `CURDATE()` = CURDATE() | fields `CURDATE()`,SUCCESS -source=myglue_test.default.people | eval `CURRENT_DATE()` = 
CURRENT_DATE() | fields `CURRENT_DATE()`,SUCCESS -source=myglue_test.default.people | eval `CURRENT_TIMESTAMP()` = CURRENT_TIMESTAMP() | fields `CURRENT_TIMESTAMP()`,SUCCESS -source=myglue_test.default.people | eval `DATE('2020-08-26')` = DATE('2020-08-26') | fields `DATE('2020-08-26')`,SUCCESS -source=myglue_test.default.people | eval `DATE(TIMESTAMP('2020-08-26 13:49:00'))` = DATE(TIMESTAMP('2020-08-26 13:49:00')) | fields `DATE(TIMESTAMP('2020-08-26 13:49:00'))`,SUCCESS -source=myglue_test.default.people | eval `DATE('2020-08-26 13:49')` = DATE('2020-08-26 13:49') | fields `DATE('2020-08-26 13:49')`,SUCCESS -"source=myglue_test.default.people | eval `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')` = DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS'), `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')` = DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a') | fields `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')`, `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')`",SUCCESS -"source=myglue_test.default.people | eval `'2000-01-02' - '2000-01-01'` = DATEDIFF(TIMESTAMP('2000-01-02 00:00:00'), TIMESTAMP('2000-01-01 23:59:59')), `'2001-02-01' - '2004-01-01'` = DATEDIFF(DATE('2001-02-01'), TIMESTAMP('2004-01-01 00:00:00')) | fields `'2000-01-02' - '2000-01-01'`, `'2001-02-01' - '2004-01-01'`", -source=myglue_test.default.people | eval `DAY(DATE('2020-08-26'))` = DAY(DATE('2020-08-26')) | fields `DAY(DATE('2020-08-26'))`, -source=myglue_test.default.people | eval `DAYNAME(DATE('2020-08-26'))` = DAYNAME(DATE('2020-08-26')) | fields `DAYNAME(DATE('2020-08-26'))`,FAILED -source=myglue_test.default.people | eval `CURRENT_TIMEZONE()` = CURRENT_TIMEZONE() | fields `CURRENT_TIMEZONE()`,SUCCESS -source=myglue_test.default.people | eval `UTC_TIMESTAMP()` = UTC_TIMESTAMP() | fields `UTC_TIMESTAMP()`,SUCCESS -"source=myglue_test.default.people | eval 
`TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')` = TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | eval `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))` = TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00')) | fields `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')`, `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))`",SUCCESS -"source=myglue_test.default.people | eval `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')` = TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | eval `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` = TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') | fields `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')`, `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')`",SUCCESS - source = myglue_test.default.http_logs | stats count(),SUCCESS -"source = myglue_test.default.http_logs | stats avg(size) as c1, max(size) as c2, min(size) as c3, sum(size) as c4, percentile(size, 50) as c5, stddev_pop(size) as c6, stddev_samp(size) as c7, distinct_count(size) as c8",SUCCESS -"source = myglue_test.default.http_logs | eval c1 = abs(size) | eval c2 = ceil(size) | eval c3 = floor(size) | eval c4 = sqrt(size) | eval c5 = ln(size) | eval c6 = pow(size, 2) | eval c7 = mod(size, 2) | fields c1, c2, c3, c4, c5, c6, c7 | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval c1 = isnull(request) | eval c2 = isnotnull(request) | eval c3 = ifnull(request, +"source=dev.default.people | eval c1 = adddate(@timestamp, 1) | fields c1 | head 10",SUCCESS +"source=dev.default.people | eval c2 = subdate(@timestamp, 1) | fields c2 | head 10",SUCCESS +source=dev.default.people | eval c1 = date_add(@timestamp INTERVAL 1 DAY) | fields c1 | head 10,SUCCESS +source=dev.default.people | eval c1 = date_sub(@timestamp INTERVAL 1 DAY) | fields c1 | head 10,SUCCESS +source=dev.default.people | eval `CURDATE()` = 
CURDATE() | fields `CURDATE()`,SUCCESS +source=dev.default.people | eval `CURRENT_DATE()` = CURRENT_DATE() | fields `CURRENT_DATE()`,SUCCESS +source=dev.default.people | eval `CURRENT_TIMESTAMP()` = CURRENT_TIMESTAMP() | fields `CURRENT_TIMESTAMP()`,SUCCESS +source=dev.default.people | eval `DATE('2020-08-26')` = DATE('2020-08-26') | fields `DATE('2020-08-26')`,SUCCESS +source=dev.default.people | eval `DATE(TIMESTAMP('2020-08-26 13:49:00'))` = DATE(TIMESTAMP('2020-08-26 13:49:00')) | fields `DATE(TIMESTAMP('2020-08-26 13:49:00'))`,SUCCESS +source=dev.default.people | eval `DATE('2020-08-26 13:49')` = DATE('2020-08-26 13:49') | fields `DATE('2020-08-26 13:49')`,SUCCESS +"source=dev.default.people | eval `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')` = DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS'), `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')` = DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a') | fields `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')`, `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')`",SUCCESS +"source=dev.default.people | eval `'2000-01-02' - '2000-01-01'` = DATEDIFF(TIMESTAMP('2000-01-02 00:00:00'), TIMESTAMP('2000-01-01 23:59:59')), `'2001-02-01' - '2004-01-01'` = DATEDIFF(DATE('2001-02-01'), TIMESTAMP('2004-01-01 00:00:00')) | fields `'2000-01-02' - '2000-01-01'`, `'2001-02-01' - '2004-01-01'`",SUCCESS +source=dev.default.people | eval `DAY(DATE('2020-08-26'))` = DAY(DATE('2020-08-26')) | fields `DAY(DATE('2020-08-26'))`,SUCCESS +source=dev.default.people | eval `DAYNAME(DATE('2020-08-26'))` = DAYNAME(DATE('2020-08-26')) | fields `DAYNAME(DATE('2020-08-26'))`,FAILED +source=dev.default.people | eval `CURRENT_TIMEZONE()` = CURRENT_TIMEZONE() | fields `CURRENT_TIMEZONE()`,SUCCESS +source=dev.default.people | eval `UTC_TIMESTAMP()` = UTC_TIMESTAMP() | fields `UTC_TIMESTAMP()`,SUCCESS +"source=dev.default.people 
| eval `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')` = TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | eval `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))` = TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00')) | fields `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')`, `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))`",SUCCESS +"source=dev.default.people | eval `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')` = TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | eval `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` = TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') | fields `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')`, `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')`",SUCCESS + source = dev.default.http_logs | stats count(),SUCCESS +"source = dev.default.http_logs | stats avg(size) as c1, max(size) as c2, min(size) as c3, sum(size) as c4, percentile(size, 50) as c5, stddev_pop(size) as c6, stddev_samp(size) as c7, distinct_count(size) as c8",SUCCESS +"source = dev.default.http_logs | eval c1 = abs(size) | eval c2 = ceil(size) | eval c3 = floor(size) | eval c4 = sqrt(size) | eval c5 = ln(size) | eval c6 = pow(size, 2) | eval c7 = mod(size, 2) | fields c1, c2, c3, c4, c5, c6, c7 | head 10",SUCCESS +"source = dev.default.http_logs | eval c1 = isnull(request) | eval c2 = isnotnull(request) | eval c3 = ifnull(request, ""Unknown"") | eval c4 = nullif(request, ""Unknown"") | eval c5 = isnull(size) | eval c6 = if(like(request, '%bordeaux%'), 'hello', 'world') | fields c1, c2, c3, c4, c5, c6 | head 10",SUCCESS -/* this is block comment */ source = myglue_test.tpch_csv.orders | head 1 // this is line comment,SUCCESS -"/* test in tpch q16, q18, q20 */ source = myglue_test.tpch_csv.orders | head 1 // add source=xx to avoid failure in automation",SUCCESS -"/* test in tpch q4, q21, q22 */ source = 
myglue_test.tpch_csv.orders | head 1",SUCCESS -"/* test in tpch q2, q11, q15, q17, q20, q22 */ source = myglue_test.tpch_csv.orders | head 1",SUCCESS -"/* test in tpch q7, q8, q9, q13, q15, q22 */ source = myglue_test.tpch_csv.orders | head 1",SUCCESS -/* lots of inner join tests in tpch */ source = myglue_test.tpch_csv.orders | head 1,SUCCESS -/* left join test in tpch q13 */ source = myglue_test.tpch_csv.orders | head 1,SUCCESS -"source = myglue_test.tpch_csv.orders +/* this is block comment */ source = dev.default.orders | head 1 // this is line comment,SUCCESS +"/* test in tpch q16, q18, q20 */ source = dev.default.orders | head 1 // add source=xx to avoid failure in automation",SUCCESS +"/* test in tpch q4, q21, q22 */ source = dev.default.orders | head 1",SUCCESS +"/* test in tpch q2, q11, q15, q17, q20, q22 */ source = dev.default.orders | head 1",SUCCESS +"/* test in tpch q7, q8, q9, q13, q15, q22 */ source = dev.default.orders | head 1",SUCCESS +/* lots of inner join tests in tpch */ source = dev.default.orders | head 1,SUCCESS +/* left join test in tpch q13 */ source = dev.default.orders | head 1,SUCCESS +"source = dev.default.orders | right outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') - myglue_test.tpch_csv.customer + dev.default.customer | stats count(o_orderkey) as c_count by c_custkey | sort - c_count",SUCCESS -"source = myglue_test.tpch_csv.orders +"source = dev.default.orders | full outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') - myglue_test.tpch_csv.customer + dev.default.customer | stats count(o_orderkey) as c_count by c_custkey | sort - c_count",SUCCESS -"source = myglue_test.tpch_csv.customer -| semi join ON c_custkey = o_custkey myglue_test.tpch_csv.orders +"source = dev.default.customer +| semi join ON c_custkey = o_custkey dev.default.orders | where c_mktsegment = 'BUILDING' | sort - c_custkey | head 10",SUCCESS -"source = myglue_test.tpch_csv.customer -| anti join ON 
c_custkey = o_custkey myglue_test.tpch_csv.orders +"source = dev.default.customer +| anti join ON c_custkey = o_custkey dev.default.orders | where c_mktsegment = 'BUILDING' | sort - c_custkey | head 10",SUCCESS -"source = myglue_test.tpch_csv.supplier +"source = dev.default.supplier | where like(s_comment, '%Customer%Complaints%') -| join ON s_nationkey > n_nationkey [ source = myglue_test.tpch_csv.nation | where n_name = 'SAUDI ARABIA' ] +| join ON s_nationkey > n_nationkey [ source = dev.default.nation | where n_name = 'SAUDI ARABIA' ] | sort - s_name | head 10",SUCCESS -"source = myglue_test.tpch_csv.supplier +"source = dev.default.supplier | where like(s_comment, '%Customer%Complaints%') -| join [ source = myglue_test.tpch_csv.nation | where n_name = 'SAUDI ARABIA' ] +| join [ source = dev.default.nation | where n_name = 'SAUDI ARABIA' ] | sort - s_name | head 10",SUCCESS -source=myglue_test.default.people | LOOKUP myglue_test.default.work_info uid AS id REPLACE department | stats distinct_count(department),SUCCESS -source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id APPEND department | stats distinct_count(department),SUCCESS -source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id REPLACE department AS country | stats distinct_count(country),SUCCESS -source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id APPEND department AS country | stats distinct_count(country),SUCCESS -"source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uID AS id, name REPLACE department | stats distinct_count(department)",SUCCESS -"source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS ID, name APPEND department | stats distinct_count(department)",SUCCESS -"source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uID AS id, name | head 10",SUCCESS -"source = myglue_test.default.people | eval major = occupation | fields id, name, major, 
country, salary | LOOKUP myglue_test.default.work_info name REPLACE occupation AS major | stats distinct_count(major)",SUCCESS -"source = myglue_test.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP myglue_test.default.work_info name APPEND occupation AS major | stats distinct_count(major)",SUCCESS -"source = myglue_test.default.http_logs | eval res = json('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json('{""f1"":""abc"",""f2"":{""f3"":""a"",""f4"":""b""}}') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json('[1,2,3,{""f1"":1,""f2"":[5,6]},4]') | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json('[]') | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json(‘{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json('{""invalid"": ""json""') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json('[1,2,3]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json(‘[1,2') | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json('[invalid json]') | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json('invalid json') | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json(null) | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array('this', 'is', 'a', 'string', 'array') | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json_array() | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array(1, 2, 0, -1, 1.1, 
-0.11) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array('this', 'is', 1.1, -0.11, true, false) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = array_length(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = array_length(json_array()) | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json_array_length('[]') | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array_length('[1,2,3,{""f1"":1,""f2"":[5,6]},4]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array_length('{\""key\"": 1}') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array_length('[1,2') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', 'string_value')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', 123.45)) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', true)) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object(""a"", 1, ""b"", 2, ""c"", 3)) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', array())) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', array(1, 2, 3))) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('outer', json_object('inner', 123.45))) | head 1 | fields res",SUCCESS -"source = 
myglue_test.default.http_logs | eval res = to_json_string(json_object(""array"", json_array(1,2,0,-1,1.1,-0.11))) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | where json_valid(('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1",SUCCESS -"source = myglue_test.default.http_logs | where not json_valid(('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('{""f1"":""abc"",""f2"":{""f3"":""a"",""f4"":""b""}}')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2,3,{""f1"":1,""f2"":[5,6]},4]')) | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json_keys(json('[]')) | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json(‘{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('{""invalid"": ""json""')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2,3]')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2')) | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json_keys(json('[invalid json]')) | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json_keys(json('invalid json')) | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json_keys(json(null)) | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = 
json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.teacher') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[*]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[0]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[*].name') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[1].name') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[0].not_exist_key') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[10]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.people | eval array = 
json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > -10) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(json_object(""a"",1,""b"",-1),json_object(""a"",-1,""b"",-1)), result = forall(array, x -> x.a > 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(json_object(""a"",1,""b"",-1),json_object(""a"",-1,""b"",-1)), result = exists(array, x -> x.b < 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 10) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 10) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,3), result = transform(array, x -> x + 1) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,3), result = transform(array, (x, y) -> x + y) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x, acc -> acc * 10) | head 1 | fields result",SUCCESS -source=myglue_test.default.people | eval age = salary | eventstats avg(age) | sort id | head 10,SUCCESS 
-"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count | sort id | head 10",SUCCESS -source=myglue_test.default.people | eventstats avg(salary) by country | sort id | head 10,SUCCESS -"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by country | sort id | head 10",SUCCESS -"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count +source=dev.default.people | LOOKUP dev.default.work_info uid AS id REPLACE department | stats distinct_count(department),SUCCESS +source = dev.default.people| LOOKUP dev.default.work_info uid AS id APPEND department | stats distinct_count(department),SUCCESS +source = dev.default.people| LOOKUP dev.default.work_info uid AS id REPLACE department AS country | stats distinct_count(country),SUCCESS +source = dev.default.people| LOOKUP dev.default.work_info uid AS id APPEND department AS country | stats distinct_count(country),SUCCESS +"source = dev.default.people| LOOKUP dev.default.work_info uID AS id, name REPLACE department | stats distinct_count(department)",SUCCESS +"source = dev.default.people| LOOKUP dev.default.work_info uid AS ID, name APPEND department | stats distinct_count(department)",SUCCESS +"source = dev.default.people| LOOKUP dev.default.work_info uID AS id, name | head 10",SUCCESS +"source = dev.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP dev.default.work_info name REPLACE occupation AS major | stats distinct_count(major)",SUCCESS +"source = dev.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP dev.default.work_info name APPEND occupation AS major | stats distinct_count(major)",SUCCESS +"source = dev.default.http_logs | eval res = 
json('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('{""f1"":""abc"",""f2"":{""f3"":""a"",""f4"":""b""}}') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('[1,2,3,{""f1"":1,""f2"":[5,6]},4]') | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json('[]') | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('{""invalid"": ""json""') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('[1,2,3]') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('[1,2') | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json('[invalid json]') | head 1 | fields res,SUCCESS +source = dev.default.http_logs | eval res = json('invalid json') | head 1 | fields res,SUCCESS +source = dev.default.http_logs | eval res = json(null) | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_array('this', 'is', 'a', 'string', 'array') | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json_array() | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_array(1, 2, 0, -1, 1.1, -0.11) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_array('this', 'is', 1.1, -0.11, true, false) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = array_length(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = array_length(json_array()) | head 1 | fields 
res,SUCCESS +source = dev.default.http_logs | eval res = json_array_length('[]') | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_array_length('[1,2,3,{""f1"":1,""f2"":[5,6]},4]') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_array_length('{\""key\"": 1}') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_array_length('[1,2') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('key', 'string_value')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('key', 123.45)) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('key', true)) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object(""a"", 1, ""b"", 2, ""c"", 3)) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('key', array())) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('key', array(1, 2, 3))) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('outer', json_object('inner', 123.45))) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object(""array"", json_array(1,2,0,-1,1.1,-0.11))) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | where json_valid('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1",SUCCESS +"source = dev.default.http_logs | where not json_valid('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = 
json_keys(json('{""f1"":""abc"",""f2"":{""f3"":""a"",""f4"":""b""}}')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('[1,2,3,{""f1"":1,""f2"":[5,6]},4]')) | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json_keys(json('[]')) | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('{""invalid"": ""json""')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('[1,2,3]')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('[1,2')) | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json_keys(json('[invalid json]')) | head 1 | fields res,SUCCESS +source = dev.default.http_logs | eval res = json_keys(json('invalid json')) | head 1 | fields res,SUCCESS +source = dev.default.http_logs | eval res = json_keys(json(null)) | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.teacher') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[*]') | head 1 | fields res",SUCCESS +"source 
= dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[0]') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[*].name') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[1].name') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[0].not_exist_key') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[10]') | head 1 | fields res",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > -10) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(json_object(""a"",1,""b"",-1),json_object(""a"",-1,""b"",-1)), result = forall(array, x -> x.a > 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(json_object(""a"",1,""b"",-1),json_object(""a"",-1,""b"",-1)), result = exists(array, x -> x.b < 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 
10) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 10) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,3), result = transform(array, x -> x + 1) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,3), result = transform(array, (x, y) -> x + y) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x, acc -> acc * 10) | head 1 | fields result",SUCCESS +source=dev.default.people | eval age = salary | eventstats avg(age) | sort id | head 10,SUCCESS +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count | sort id | head 10",SUCCESS +source=dev.default.people | eventstats avg(salary) by country | sort id | head 10,SUCCESS +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by country | sort id | head 10",SUCCESS +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) | sort id | head 10",SUCCESS -"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) as age_span, country | sort id | head 10",SUCCESS -"source=myglue_test.default.people | where country != 'USA' | eventstats stddev_samp(salary), stddev_pop(salary), 
percentile_approx(salary, 60) by span(salary, 1000) as salary_span | sort id | head 10",SUCCESS -"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age by occupation, country | eventstats avg(avg_age) as avg_state_age by country | sort id | head 10",SUCCESS -"source=myglue_test.default.people | eventstats distinct_count(salary) by span(salary, 1000) as age_span",FAILED -"source = myglue_test.tpch_csv.lineitem +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) as age_span, country | sort id | head 10",SUCCESS +"source=dev.default.people | where country != 'USA' | eventstats stddev_samp(salary), stddev_pop(salary), percentile_approx(salary, 60) by span(salary, 1000) as salary_span | sort id | head 10",SUCCESS +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age by occupation, country | eventstats avg(avg_age) as avg_state_age by country | sort id | head 10",SUCCESS +"source=dev.default.people | eventstats distinct_count(salary) by span(salary, 1000) as age_span",FAILED +"source = dev.default.lineitem | where l_shipdate <= subdate(date('1998-12-01'), 90) | stats sum(l_quantity) as sum_qty, sum(l_extendedprice) as sum_base_price, @@ -277,59 +277,59 @@ by span(age, 10) | sort id | head 10",SUCCESS count() as count_order by l_returnflag, l_linestatus | sort l_returnflag, l_linestatus",SUCCESS -"source = myglue_test.tpch_csv.part -| join ON p_partkey = ps_partkey myglue_test.tpch_csv.partsupp -| join ON s_suppkey = ps_suppkey myglue_test.tpch_csv.supplier -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation -| join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region +"source = dev.default.part +| join ON p_partkey = ps_partkey dev.default.partsupp +| join ON s_suppkey = ps_suppkey dev.default.supplier +| join ON s_nationkey = n_nationkey dev.default.nation +| join ON n_regionkey = 
r_regionkey dev.default.region | where p_size = 15 AND like(p_type, '%BRASS') AND r_name = 'EUROPE' AND ps_supplycost = [ - source = myglue_test.tpch_csv.partsupp - | join ON s_suppkey = ps_suppkey myglue_test.tpch_csv.supplier - | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation - | join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region + source = dev.default.partsupp + | join ON s_suppkey = ps_suppkey dev.default.supplier + | join ON s_nationkey = n_nationkey dev.default.nation + | join ON n_regionkey = r_regionkey dev.default.region | where r_name = 'EUROPE' | stats MIN(ps_supplycost) ] | sort - s_acctbal, n_name, s_name, p_partkey | head 100",SUCCESS -"source = myglue_test.tpch_csv.customer -| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders -| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem +"source = dev.default.customer +| join ON c_custkey = o_custkey dev.default.orders +| join ON l_orderkey = o_orderkey dev.default.lineitem | where c_mktsegment = 'BUILDING' AND o_orderdate < date('1995-03-15') AND l_shipdate > date('1995-03-15') | stats sum(l_extendedprice * (1 - l_discount)) as revenue by l_orderkey, o_orderdate, o_shippriority | sort - revenue, o_orderdate | head 10",SUCCESS -"source = myglue_test.tpch_csv.orders +"source = dev.default.orders | where o_orderdate >= date('1993-07-01') and o_orderdate < date_add(date('1993-07-01'), interval 3 month) and exists [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_orderkey = o_orderkey and l_commitdate < l_receiptdate ] | stats count() as order_count by o_orderpriority | sort o_orderpriority",SUCCESS -"source = myglue_test.tpch_csv.customer -| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders -| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem -| join ON l_suppkey = s_suppkey AND c_nationkey = s_nationkey myglue_test.tpch_csv.supplier -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation -| join ON 
n_regionkey = r_regionkey myglue_test.tpch_csv.region +"source = dev.default.customer +| join ON c_custkey = o_custkey dev.default.orders +| join ON l_orderkey = o_orderkey dev.default.lineitem +| join ON l_suppkey = s_suppkey AND c_nationkey = s_nationkey dev.default.supplier +| join ON s_nationkey = n_nationkey dev.default.nation +| join ON n_regionkey = r_regionkey dev.default.region | where r_name = 'ASIA' AND o_orderdate >= date('1994-01-01') AND o_orderdate < date_add(date('1994-01-01'), interval 1 year) | stats sum(l_extendedprice * (1 - l_discount)) as revenue by n_name | sort - revenue",SUCCESS -"source = myglue_test.tpch_csv.lineitem +"source = dev.default.lineitem | where l_shipdate >= date('1994-01-01') and l_shipdate < adddate(date('1994-01-01'), 365) and l_discount between .06 - 0.01 and .06 + 0.01 and l_quantity < 24 | stats sum(l_extendedprice * l_discount) as revenue",SUCCESS "source = [ - source = myglue_test.tpch_csv.supplier - | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.lineitem - | join ON o_orderkey = l_orderkey myglue_test.tpch_csv.orders - | join ON c_custkey = o_custkey myglue_test.tpch_csv.customer - | join ON s_nationkey = n1.n_nationkey myglue_test.tpch_csv.nation as n1 - | join ON c_nationkey = n2.n_nationkey myglue_test.tpch_csv.nation as n2 + source = dev.default.supplier + | join ON s_suppkey = l_suppkey dev.default.lineitem + | join ON o_orderkey = l_orderkey dev.default.orders + | join ON c_custkey = o_custkey dev.default.customer + | join ON s_nationkey = n1.n_nationkey dev.default.nation as n1 + | join ON c_nationkey = n2.n_nationkey dev.default.nation as n2 | where l_shipdate between date('1995-01-01') and date('1996-12-31') and n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY' or n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE' | eval supp_nation = n1.n_name, cust_nation = n2.n_name, l_year = year(l_shipdate), volume = l_extendedprice * (1 - l_discount) @@ -338,14 +338,14 @@ by span(age, 10) | sort id | head 10",SUCCESS | 
stats sum(volume) as revenue by supp_nation, cust_nation, l_year | sort supp_nation, cust_nation, l_year",SUCCESS "source = [ - source = myglue_test.tpch_csv.part - | join ON p_partkey = l_partkey myglue_test.tpch_csv.lineitem - | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.supplier - | join ON l_orderkey = o_orderkey myglue_test.tpch_csv.orders - | join ON o_custkey = c_custkey myglue_test.tpch_csv.customer - | join ON c_nationkey = n1.n_nationkey myglue_test.tpch_csv.nation as n1 - | join ON s_nationkey = n2.n_nationkey myglue_test.tpch_csv.nation as n2 - | join ON n1.n_regionkey = r_regionkey myglue_test.tpch_csv.region + source = dev.default.part + | join ON p_partkey = l_partkey dev.default.lineitem + | join ON s_suppkey = l_suppkey dev.default.supplier + | join ON l_orderkey = o_orderkey dev.default.orders + | join ON o_custkey = c_custkey dev.default.customer + | join ON c_nationkey = n1.n_nationkey dev.default.nation as n1 + | join ON s_nationkey = n2.n_nationkey dev.default.nation as n2 + | join ON n1.n_regionkey = r_regionkey dev.default.region | where r_name = 'AMERICA' AND p_type = 'ECONOMY ANODIZED STEEL' and o_orderdate between date('1995-01-01') and date('1996-12-31') | eval o_year = year(o_orderdate) @@ -358,12 +358,12 @@ by span(age, 10) | sort id | head 10",SUCCESS | fields mkt_share, o_year | sort o_year",SUCCESS "source = [ - source = myglue_test.tpch_csv.part - | join ON p_partkey = l_partkey myglue_test.tpch_csv.lineitem - | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.supplier - | join ON ps_partkey = l_partkey and ps_suppkey = l_suppkey myglue_test.tpch_csv.partsupp - | join ON o_orderkey = l_orderkey myglue_test.tpch_csv.orders - | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation + source = dev.default.part + | join ON p_partkey = l_partkey dev.default.lineitem + | join ON s_suppkey = l_suppkey dev.default.supplier + | join ON ps_partkey = l_partkey and ps_suppkey = l_suppkey dev.default.partsupp + | join ON 
o_orderkey = l_orderkey dev.default.orders + | join ON s_nationkey = n_nationkey dev.default.nation | where like(p_name, '%green%') | eval nation = n_name | eval o_year = year(o_orderdate) @@ -372,33 +372,33 @@ by span(age, 10) | sort id | head 10",SUCCESS ] as profit | stats sum(amount) as sum_profit by nation, o_year | sort nation, - o_year",SUCCESS -"source = myglue_test.tpch_csv.customer -| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders -| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem -| join ON c_nationkey = n_nationkey myglue_test.tpch_csv.nation +"source = dev.default.customer +| join ON c_custkey = o_custkey dev.default.orders +| join ON l_orderkey = o_orderkey dev.default.lineitem +| join ON c_nationkey = n_nationkey dev.default.nation | where o_orderdate >= date('1993-10-01') AND o_orderdate < date_add(date('1993-10-01'), interval 3 month) AND l_returnflag = 'R' | stats sum(l_extendedprice * (1 - l_discount)) as revenue by c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment | sort - revenue | head 20",SUCCESS -"source = myglue_test.tpch_csv.partsupp -| join ON ps_suppkey = s_suppkey myglue_test.tpch_csv.supplier -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation +"source = dev.default.partsupp +| join ON ps_suppkey = s_suppkey dev.default.supplier +| join ON s_nationkey = n_nationkey dev.default.nation | where n_name = 'GERMANY' | stats sum(ps_supplycost * ps_availqty) as value by ps_partkey | where value > [ - source = myglue_test.tpch_csv.partsupp - | join ON ps_suppkey = s_suppkey myglue_test.tpch_csv.supplier - | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation + source = dev.default.partsupp + | join ON ps_suppkey = s_suppkey dev.default.supplier + | join ON s_nationkey = n_nationkey dev.default.nation | where n_name = 'GERMANY' | stats sum(ps_supplycost * ps_availqty) as check | eval threshold = check * 0.0001000000 | fields threshold ] | sort - value",SUCCESS -"source = 
myglue_test.tpch_csv.orders -| join ON o_orderkey = l_orderkey myglue_test.tpch_csv.lineitem +"source = dev.default.orders +| join ON o_orderkey = l_orderkey dev.default.lineitem | where l_commitdate < l_receiptdate and l_shipdate < l_commitdate and l_shipmode in ('MAIL', 'SHIP') @@ -409,32 +409,32 @@ by span(age, 10) | sort id | head 10",SUCCESS by l_shipmode | sort l_shipmode",SUCCESS "source = [ - source = myglue_test.tpch_csv.customer + source = dev.default.customer | left outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') - myglue_test.tpch_csv.orders + dev.default.orders | stats count(o_orderkey) as c_count by c_custkey ] as c_orders | stats count() as custdist by c_count | sort - custdist, - c_count",SUCCESS -"source = myglue_test.tpch_csv.lineitem +"source = dev.default.lineitem | join ON l_partkey = p_partkey AND l_shipdate >= date('1995-09-01') AND l_shipdate < date_add(date('1995-09-01'), interval 1 month) - myglue_test.tpch_csv.part + dev.default.part | stats sum(case(like(p_type, 'PROMO%'), l_extendedprice * (1 - l_discount) else 0)) as sum1, sum(l_extendedprice * (1 - l_discount)) as sum2 | eval promo_revenue = 100.00 * sum1 / sum2 // Stats and Eval commands can combine when issues/819 resolved | fields promo_revenue",SUCCESS -"source = myglue_test.tpch_csv.supplier +"source = dev.default.supplier | join right = revenue0 ON s_suppkey = supplier_no [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 month) | eval supplier_no = l_suppkey | stats sum(l_extendedprice * (1 - l_discount)) as total_revenue by supplier_no ] | where total_revenue = [ source = [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 month) | eval supplier_no = l_suppkey | stats sum(l_extendedprice * (1 - 
l_discount)) as total_revenue by supplier_no @@ -443,24 +443,24 @@ by span(age, 10) | sort id | head 10",SUCCESS ] | sort s_suppkey | fields s_suppkey, s_name, s_address, s_phone, total_revenue",SUCCESS -"source = myglue_test.tpch_csv.partsupp -| join ON p_partkey = ps_partkey myglue_test.tpch_csv.part +"source = dev.default.partsupp +| join ON p_partkey = ps_partkey dev.default.part | where p_brand != 'Brand#45' and not like(p_type, 'MEDIUM POLISHED%') and p_size in (49, 14, 23, 45, 19, 3, 36, 9) and ps_suppkey not in [ - source = myglue_test.tpch_csv.supplier + source = dev.default.supplier | where like(s_comment, '%Customer%Complaints%') | fields s_suppkey ] | stats distinct_count(ps_suppkey) as supplier_cnt by p_brand, p_type, p_size | sort - supplier_cnt, p_brand, p_type, p_size",SUCCESS -"source = myglue_test.tpch_csv.lineitem -| join ON p_partkey = l_partkey myglue_test.tpch_csv.part +"source = dev.default.lineitem +| join ON p_partkey = l_partkey dev.default.part | where p_brand = 'Brand#23' and p_container = 'MED BOX' and l_quantity < [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_partkey = p_partkey | stats avg(l_quantity) as avg | eval `0.2 * avg` = 0.2 * avg @@ -469,11 +469,11 @@ by span(age, 10) | sort id | head 10",SUCCESS | stats sum(l_extendedprice) as sum | eval avg_yearly = sum / 7.0 | fields avg_yearly",SUCCESS -"source = myglue_test.tpch_csv.customer -| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders -| join ON o_orderkey = l_orderkey myglue_test.tpch_csv.lineitem +"source = dev.default.customer +| join ON c_custkey = o_custkey dev.default.orders +| join ON o_orderkey = l_orderkey dev.default.lineitem | where o_orderkey in [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | stats sum(l_quantity) as sum by l_orderkey | where sum > 300 | fields l_orderkey @@ -481,7 +481,7 @@ by span(age, 10) | sort id | head 10",SUCCESS | stats sum(l_quantity) by c_name, c_custkey, 
o_orderkey, o_orderdate, o_totalprice | sort - o_totalprice, o_orderdate | head 100",SUCCESS -"source = myglue_test.tpch_csv.lineitem +"source = dev.default.lineitem | join ON p_partkey = l_partkey and p_brand = 'Brand#12' and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') @@ -503,19 +503,19 @@ by span(age, 10) | sort id | head 10",SUCCESS and p_size between 1 and 15 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' - myglue_test.tpch_csv.part",SUCCESS -"source = myglue_test.tpch_csv.supplier -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation + dev.default.part",SUCCESS +"source = dev.default.supplier +| join ON s_nationkey = n_nationkey dev.default.nation | where n_name = 'CANADA' and s_suppkey in [ - source = myglue_test.tpch_csv.partsupp + source = dev.default.partsupp | where ps_partkey in [ - source = myglue_test.tpch_csv.part + source = dev.default.part | where like(p_name, 'forest%') | fields p_partkey ] and ps_availqty > [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_partkey = ps_partkey and l_suppkey = ps_suppkey and l_shipdate >= date('1994-01-01') @@ -526,19 +526,19 @@ by span(age, 10) | sort id | head 10",SUCCESS ] | fields ps_suppkey ]",SUCCESS -"source = myglue_test.tpch_csv.supplier -| join ON s_suppkey = l1.l_suppkey myglue_test.tpch_csv.lineitem as l1 -| join ON o_orderkey = l1.l_orderkey myglue_test.tpch_csv.orders -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation +"source = dev.default.supplier +| join ON s_suppkey = l1.l_suppkey dev.default.lineitem as l1 +| join ON o_orderkey = l1.l_orderkey dev.default.orders +| join ON s_nationkey = n_nationkey dev.default.nation | where o_orderstatus = 'F' and l1.l_receiptdate > l1.l_commitdate and exists [ - source = myglue_test.tpch_csv.lineitem as l2 + source = dev.default.lineitem as l2 | where l2.l_orderkey = l1.l_orderkey and l2.l_suppkey != l1.l_suppkey ] and not exists [ - source = 
myglue_test.tpch_csv.lineitem as l3 + source = dev.default.lineitem as l3 | where l3.l_orderkey = l1.l_orderkey and l3.l_suppkey != l1.l_suppkey and l3.l_receiptdate > l3.l_commitdate @@ -548,16 +548,16 @@ by span(age, 10) | sort id | head 10",SUCCESS | sort - numwait, s_name | head 100",SUCCESS "source = [ - source = myglue_test.tpch_csv.customer + source = dev.default.customer | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') and c_acctbal > [ - source = myglue_test.tpch_csv.customer + source = dev.default.customer | where c_acctbal > 0.00 and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') | stats avg(c_acctbal) ] and not exists [ - source = myglue_test.tpch_csv.orders + source = dev.default.orders | where o_custkey = c_custkey ] | eval cntrycode = substring(c_phone, 1, 2) From 20ef8906a5d507bc9e5f2b6d1b6fbd7a381b1df9 Mon Sep 17 00:00:00 2001 From: kenrickyap <121634635+kenrickyap@users.noreply.github.com> Date: Thu, 19 Dec 2024 23:23:09 +0000 Subject: [PATCH 2/2] PPL geoip function (#871) * geoip function implementation Signed-off-by: Kenrick Yap <14yapkc1@gmail.com> * Fixed integration tests Signed-off-by: Kenrick Yap * linting Signed-off-by: Kenrick Yap * addressing PR comments (added addtional integ tests, doc changes) Signed-off-by: Kenrick Yap <14yapkc1@gmail.com> * fixed new integ tests Signed-off-by: Kenrick Yap * addressing pr comments Signed-off-by: Kenrick Yap * address review comments Signed-off-by: Kenrick Yap * moved validateGeoIpProperty to relevant class Signed-off-by: Kenrick Yap * updated scalaudf function descriptions Signed-off-by: Kenrick Yap --------- Signed-off-by: Kenrick Yap <14yapkc1@gmail.com> Signed-off-by: Kenrick Yap Signed-off-by: kenrickyap <121634635+kenrickyap@users.noreply.github.com> Co-authored-by: Kenrick Yap <14yapkc1@gmail.com> --- docs/ppl-lang/functions/ppl-ip.md | 65 +++- docs/ppl-lang/planning/ppl-geoip-command.md | 59 ++++ .../flint/spark/FlintSparkSuite.scala | 73 
++++ .../spark/ppl/FlintSparkPPLGeoipITSuite.scala | 314 +++++++++++++++++ .../src/main/antlr4/OpenSearchPPLLexer.g4 | 15 +- .../src/main/antlr4/OpenSearchPPLParser.g4 | 31 +- .../sql/ast/AbstractNodeVisitor.java | 5 + .../org/opensearch/sql/ast/tree/Eval.java | 4 +- .../org/opensearch/sql/ast/tree/GeoIp.java | 47 +++ .../expression/function/SerializableUdf.java | 87 ++++- .../sql/ppl/CatalystQueryPlanVisitor.java | 69 +++- .../opensearch/sql/ppl/parser/AstBuilder.java | 10 +- .../sql/ppl/parser/AstExpressionBuilder.java | 15 + .../GeoIpCatalystLogicalPlanTranslator.java | 222 ++++++++++++ ...PlanGeoipFunctionTranslatorTestSuite.scala | 332 ++++++++++++++++++ 15 files changed, 1314 insertions(+), 34 deletions(-) create mode 100644 docs/ppl-lang/planning/ppl-geoip-command.md create mode 100644 integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGeoipITSuite.scala create mode 100644 ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java create mode 100644 ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoIpCatalystLogicalPlanTranslator.java create mode 100644 ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGeoipFunctionTranslatorTestSuite.scala diff --git a/docs/ppl-lang/functions/ppl-ip.md b/docs/ppl-lang/functions/ppl-ip.md index fb0b468ba..65cc9dac9 100644 --- a/docs/ppl-lang/functions/ppl-ip.md +++ b/docs/ppl-lang/functions/ppl-ip.md @@ -32,4 +32,67 @@ Note: - `ip` can be an IPv4 or an IPv6 address - `cidr` can be an IPv4 or an IPv6 block - `ip` and `cidr` must be either both IPv4 or both IPv6 - - `ip` and `cidr` must both be valid and non-empty/non-null \ No newline at end of file + - `ip` and `cidr` must both be valid and non-empty/non-null + +### `GEOIP` + +**Description** + +`GEOIP(ip[, property]...)` retrieves geospatial data corresponding to the provided `ip`. + +**Argument type:** +- `ip` is a **STRING** representing an IPv4 or an IPv6 address. 
+- `property` is **STRING** and must be one of the following: + - `COUNTRY_ISO_CODE` + - `COUNTRY_NAME` + - `CONTINENT_NAME` + - `REGION_ISO_CODE` + - `REGION_NAME` + - `CITY_NAME` + - `TIME_ZONE` + - `LOCATION` +- Return type: + - **STRING** if one property is given + - **STRUCT_TYPE** if more than one or no property is given + +Example: + +_Without properties:_ + + os> source=ips | eval a = geoip(ip) | fields ip, a + fetched rows / total rows = 2/2 + +---------------------+-------------------------------------------------------------------------------------------------------+ + |ip |a | + +---------------------+-------------------------------------------------------------------------------------------------------+ + |66.249.157.90 |{JM, Jamaica, North America, 14, Saint Catherine Parish, Portmore, America/Jamaica, 17.9686,-76.8827} | + |2a09:bac2:19f8:2ac3::|{CA, Canada, North America, PE, Prince Edward Island, Charlottetown, America/Halifax, 46.2396,-63.1355}| + +---------------------+-------------------------------------------------------------------------------------------------------+ + +_With one property:_ + + os> source=users | eval a = geoip(ip, COUNTRY_NAME) | fields ip, a + fetched rows / total rows = 2/2 + +---------------------+-------+ + |ip |a | + +---------------------+-------+ + |66.249.157.90 |Jamaica| + |2a09:bac2:19f8:2ac3::|Canada | + +---------------------+-------+ + +_With multiple properties:_ + + os> source=users | eval a = geoip(ip, COUNTRY_NAME, REGION_NAME, CITY_NAME) | fields ip, a + fetched rows / total rows = 2/2 + +---------------------+---------------------------------------------+ + |ip |a | + +---------------------+---------------------------------------------+ + |66.249.157.90 |{Jamaica, Saint Catherine Parish, Portmore} | + |2a09:bac2:19f8:2ac3::|{Canada, Prince Edward Island, Charlottetown}| + +---------------------+---------------------------------------------+ + +Note: +- To use `geoip`, the user must create a spark 
table containing geo ip location data. Instructions to create table can be found [here](../../opensearch-geoip.md). + - `geoip` command by default expects the created table to be called `geoip_ip_data`. + - if a different table name is desired, set the `spark.geoip.tablename` spark config to the new table name. +- `ip` can be an IPv4 or an IPv6 address. +- `geoip` commands will always be calculated first if used with other eval functions. diff --git a/docs/ppl-lang/planning/ppl-geoip-command.md b/docs/ppl-lang/planning/ppl-geoip-command.md new file mode 100644 index 000000000..aaed6c156 --- /dev/null +++ b/docs/ppl-lang/planning/ppl-geoip-command.md @@ -0,0 +1,59 @@ +## geoip syntax proposal + +geoip function to add information about the geographical location of an IPv4 or IPv6 address + +**Implementation syntax** +- `... | eval geoinfo = geoip(ipAddress *[,properties])` +- generic syntax +- `... | eval geoinfo = geoip(ipAddress)` +- retrieves all geo data +- `... | eval geoinfo = geoip(ipAddress, city, location)` +- retrieves only city and location + +**Implementation details** +- Current implementation requires user to have created a geoip table. Geoip table has the following schema: + + ```SQL + CREATE TABLE geoip ( + cidr STRING, + country_iso_code STRING, + country_name STRING, + continent_name STRING, + region_iso_code STRING, + region_name STRING, + city_name STRING, + time_zone STRING, + location STRING, + ip_range_start BIGINT, + ip_range_end BIGINT, + ipv4 BOOLEAN + ) + ``` + +- `geoip` is resolved by performing a join on said table and projecting the resulting geoip data as a struct. 
+- using `geoip` is equivalent to running the following SQL query: + + ```SQL + SELECT source.*, struct(geoip.country_name, geoip.city_name) AS a + FROM source, geoip + WHERE geoip.ip_range_start <= ip_to_int(source.ip) + AND geoip.ip_range_end > ip_to_int(source.ip) + AND geoip.ipv4 = is_ipv4(source.ip); + ``` +- in the case that only one property is provided in the function call, `geoip` returns the specified property as a string instead: + + ```SQL + SELECT source.*, geoip.country_name AS a + FROM source, geoip + WHERE geoip.ip_range_start <= ip_to_int(source.ip) + AND geoip.ip_range_end > ip_to_int(source.ip) + AND geoip.ipv4 = is_ipv4(source.ip); + ``` + +**Future plan for additional data-sources** + +- Currently, only a pre-existing geoip table defined within spark can be used. +- There are future plans to allow users to specify data sources: + - API data sources - if users have their own geoip provider, the ability to configure and call said endpoints will be added + - OpenSearch geospatial client - once geospatial client is published we can leverage client to utilize OpenSearch geo2ip functionality. +- Additional datasource connection params will be provided through spark config options. 
diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala index 7c19cab12..5ea123c9d 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala @@ -771,6 +771,79 @@ trait FlintSparkSuite extends QueryTest with FlintSuite with OpenSearchSuite wit | """.stripMargin) } + protected def createGeoIpTestTable(testTable: String): Unit = { + sql(s""" + | CREATE TABLE $testTable + | ( + | ip STRING, + | ipv4 STRING, + | isValid BOOLEAN + | ) + | USING $tableType $tableOptions + |""".stripMargin) + + sql(s""" + | INSERT INTO $testTable + | VALUES ('66.249.157.90', '66.249.157.90', true), + | ('2a09:bac2:19f8:2ac3::', 'Given IPv6 is not mapped to IPv4', true), + | ('192.168.2.', '192.168.2.', false), + | ('2001:db8::ff00:12:', 'Given IPv6 is not mapped to IPv4', false) + | """.stripMargin) + } + + protected def createGeoIpTable(): Unit = { + sql(s""" + | CREATE TABLE geoip + | ( + | cidr STRING, + | country_iso_code STRING, + | country_name STRING, + | continent_name STRING, + | region_iso_code STRING, + | region_name STRING, + | city_name STRING, + | time_zone STRING, + | location STRING, + | ip_range_start DECIMAL(38,0), + | ip_range_end DECIMAL(38,0), + | ipv4 BOOLEAN + | ) + | USING $tableType $tableOptions + |""".stripMargin) + + sql(s""" + | INSERT INTO geoip + | VALUES ( + | '66.249.157.0/24', + | 'JM', + | 'Jamaica', + | 'North America', + | '14', + | 'Saint Catherine Parish', + | 'Portmore', + | 'America/Jamaica', + | '17.9686,-76.8827', + | 1123654912, + | 1123655167, + | true + | ), + | ( + | '2a09:bac2:19f8::/45', + | 'CA', + | 'Canada', + | 'North America', + | 'PE', + | 'Prince Edward Island', + | 'Charlottetown', + | 'America/Halifax', + | '46.2396,-63.1355', + | 55878094401180025937395073088449675264, + | 
55878094401189697343951990121847324671, + | false + | ) + | """.stripMargin) + } + protected def createNestedJsonContentTable(tempFile: Path, testTable: String): Unit = { val json = """ diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGeoipITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGeoipITSuite.scala new file mode 100644 index 000000000..7031ab067 --- /dev/null +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGeoipITSuite.scala @@ -0,0 +1,314 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.ppl + +import java.util + +import org.opensearch.sql.expression.function.SerializableUdf.visit +import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq + +import org.apache.spark.SparkException +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar} +import org.apache.spark.sql.catalyst.expressions.{Alias, And, CreateNamedStruct, EqualTo, Expression, GreaterThanOrEqual, LessThan, Literal} +import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.catalyst.plans.LeftOuter +import org.apache.spark.sql.catalyst.plans.logical.{DataFrameDropColumns, Filter, Join, JoinHint, LogicalPlan, Project, SubqueryAlias} +import org.apache.spark.sql.streaming.StreamTest + +class FlintSparkPPLGeoipITSuite + extends QueryTest + with LogicalPlanTestUtils + with FlintPPLSuite + with StreamTest { + + /** Test table and index name */ + private val testTable = "spark_catalog.default.flint_ppl_test" + override def beforeAll(): Unit = { + super.beforeAll() + + // Create test table + createGeoIpTestTable(testTable) + createGeoIpTable() + } + + protected override def afterEach(): Unit = { + super.afterEach() + // Stop all streaming jobs if any + spark.streams.active.foreach { job 
=> + job.stop() + job.awaitTermination() + } + } + + private def getGeoIpQueryPlan( + ipAddress: UnresolvedAttribute, + left: LogicalPlan, + right: LogicalPlan, + projectionProperties: Alias): LogicalPlan = { + val joinPlan = getJoinPlan(ipAddress, left, right) + getProjection(joinPlan, projectionProperties) + } + + private def getJoinPlan( + ipAddress: UnresolvedAttribute, + left: LogicalPlan, + right: LogicalPlan): LogicalPlan = { + val is_ipv4 = visit("is_ipv4", util.List.of[Expression](ipAddress)) + val ip_to_int = visit("ip_to_int", util.List.of[Expression](ipAddress)) + + val t1 = SubqueryAlias("t1", left) + val t2 = SubqueryAlias("t2", right) + + val joinCondition = And( + And( + GreaterThanOrEqual(ip_to_int, UnresolvedAttribute("t2.ip_range_start")), + LessThan(ip_to_int, UnresolvedAttribute("t2.ip_range_end"))), + EqualTo(is_ipv4, UnresolvedAttribute("t2.ipv4"))) + Join(t1, t2, LeftOuter, Some(joinCondition), JoinHint.NONE) + } + + private def getProjection(joinPlan: LogicalPlan, projectionProperties: Alias): LogicalPlan = { + val projection = Project(Seq(UnresolvedStar(None), projectionProperties), joinPlan) + val dropList = Seq( + "t2.country_iso_code", + "t2.country_name", + "t2.continent_name", + "t2.region_iso_code", + "t2.region_name", + "t2.city_name", + "t2.time_zone", + "t2.location", + "t2.cidr", + "t2.ip_range_start", + "t2.ip_range_end", + "t2.ipv4").map(UnresolvedAttribute(_)) + DataFrameDropColumns(dropList, projection) + } + + test("test geoip with no parameters") { + val frame = sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip) | fields ip, a + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + + // Define the expected results + val expectedResults: Array[Row] = Array( + Row( + "66.249.157.90", + Row( + "JM", + "Jamaica", + "North America", + "14", + "Saint Catherine Parish", + "Portmore", + "America/Jamaica", + "17.9686,-76.8827")), + Row( + "2a09:bac2:19f8:2ac3::", + 
Row( + "CA", + "Canada", + "North America", + "PE", + "Prince Edward Island", + "Charlottetown", + "America/Halifax", + "46.2396,-63.1355"))) + + // Compare the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + val projectionStruct = CreateNamedStruct( + Seq( + Literal("country_iso_code"), + UnresolvedAttribute("t2.country_iso_code"), + Literal("country_name"), + UnresolvedAttribute("t2.country_name"), + Literal("continent_name"), + UnresolvedAttribute("t2.continent_name"), + Literal("region_iso_code"), + UnresolvedAttribute("t2.region_iso_code"), + Literal("region_name"), + UnresolvedAttribute("t2.region_name"), + Literal("city_name"), + UnresolvedAttribute("t2.city_name"), + Literal("time_zone"), + UnresolvedAttribute("t2.time_zone"), + Literal("location"), + UnresolvedAttribute("t2.location"))) + val structProjection = Alias(projectionStruct, "a")() + val geoIpPlan = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjection) + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ip"), UnresolvedAttribute("a")), geoIpPlan) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with one parameters") { + val frame = sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip, country_name) | fields ip, a + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + // Define the expected results + val expectedResults: Array[Row] = + Array(Row("66.249.157.90", "Jamaica"), Row("2a09:bac2:19f8:2ac3::", "Canada")) + + // Compare 
the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + val structProjection = Alias(UnresolvedAttribute("t2.country_name"), "a")() + val geoIpPlan = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjection) + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ip"), UnresolvedAttribute("a")), geoIpPlan) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with multiple parameters") { + val frame = sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip, country_name, city_name) | fields ip, a + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + // Define the expected results + val expectedResults: Array[Row] = Array( + Row("66.249.157.90", Row("Jamaica", "Portmore")), + Row("2a09:bac2:19f8:2ac3::", Row("Canada", "Charlottetown"))) + + // Compare the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + val projectionStruct = CreateNamedStruct( + Seq( + Literal("country_name"), + UnresolvedAttribute("t2.country_name"), + Literal("city_name"), + UnresolvedAttribute("t2.city_name"))) + val 
structProjection = Alias(projectionStruct, "a")() + val geoIpPlan = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjection) + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ip"), UnresolvedAttribute("a")), geoIpPlan) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with partial projection on evaluated fields") { + val frame = sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip, city_name), b = geoip(ip, country_name) | fields ip, b + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + // Define the expected results + val expectedResults: Array[Row] = + Array(Row("66.249.157.90", "Jamaica"), Row("2a09:bac2:19f8:2ac3::", "Canada")) + + // Compare the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + + val structProjectionA = Alias(UnresolvedAttribute("t2.city_name"), "a")() + val geoIpPlanA = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjectionA) + + val structProjectionB = Alias(UnresolvedAttribute("t2.country_name"), "b")() + val geoIpPlanB = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), geoIpPlanA, geoTable, structProjectionB) + + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ip"), UnresolvedAttribute("b")), geoIpPlanB) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with projection on field that exists in both source and geoip table") { + val frame = sql(s""" + | source = $testTable | where 
isValid = true | eval a = geoip(ip, country_name) | fields ipv4, a + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + // Define the expected results + val expectedResults: Array[Row] = + Array(Row("66.249.157.90", "Jamaica"), Row("Given IPv6 is not mapped to IPv4", "Canada")) + + // Compare the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + val structProjection = Alias(UnresolvedAttribute("t2.country_name"), "a")() + val geoIpPlan = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjection) + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ipv4"), UnresolvedAttribute("a")), geoIpPlan) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with invalid parameter") { + assertThrows[ParseException](sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip, invalid_param) | fields ip, a + | """.stripMargin)) + } + + test("test geoip with invalid ip address provided") { + val frame = sql(s""" + | source = $testTable | eval a = geoip(ip) | fields ip, a + | """.stripMargin) + + // Retrieve the results + assertThrows[SparkException](frame.collect()) + } +} diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 index b7d615980..a6ab4f1de 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 @@ -416,9 +416,6 @@ ISPRESENT: 'ISPRESENT'; BETWEEN: 
'BETWEEN'; CIDRMATCH: 'CIDRMATCH'; -// Geo Loction -GEOIP: 'GEOIP'; - // FLOWCONTROL FUNCTIONS IFNULL: 'IFNULL'; NULLIF: 'NULLIF'; @@ -428,6 +425,18 @@ TYPEOF: 'TYPEOF'; //OTHER CONDITIONAL EXPRESSIONS COALESCE: 'COALESCE'; +//GEOLOCATION FUNCTIONS +GEOIP: 'GEOIP'; + +//GEOLOCATION PROPERTIES +COUNTRY_ISO_CODE: 'COUNTRY_ISO_CODE'; +COUNTRY_NAME: 'COUNTRY_NAME'; +CONTINENT_NAME: 'CONTINENT_NAME'; +REGION_ISO_CODE: 'REGION_ISO_CODE'; +REGION_NAME: 'REGION_NAME'; +CITY_NAME: 'CITY_NAME'; +LOCATION: 'LOCATION'; + // RELEVANCE FUNCTIONS AND PARAMETERS MATCH: 'MATCH'; MATCH_PHRASE: 'MATCH_PHRASE'; diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 index b990fd549..0a2cdf1a0 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 @@ -387,6 +387,11 @@ sortbyClause evalClause : fieldExpression EQUAL expression + | geoipCommand + ; + +geoipCommand + : fieldExpression EQUAL GEOIP LT_PRTHS ipAddress = functionArg (COMMA properties = geoIpPropertyList)? RT_PRTHS ; // aggregation terms @@ -446,7 +451,6 @@ valueExpression | positionFunction # positionFunctionCall | caseFunction # caseExpr | timestampFunction # timestampFunctionCall - | geoipFunction # geoFunctionCall | LT_PRTHS valueExpression RT_PRTHS # parentheticValueExpr | LT_SQR_PRTHS subSearch RT_SQR_PRTHS # scalarSubqueryExpr | ident ARROW expression # lambda @@ -544,11 +548,6 @@ dataTypeFunctionCall : CAST LT_PRTHS expression AS convertedDataType RT_PRTHS ; -// geoip function -geoipFunction - : GEOIP LT_PRTHS (datasource = functionArg COMMA)? ipAddress = functionArg (COMMA properties = stringLiteral)? 
RT_PRTHS - ; - // boolean functions booleanFunctionCall : conditionFunctionBase LT_PRTHS functionArgs RT_PRTHS @@ -582,7 +581,6 @@ evalFunctionName | cryptographicFunctionName | jsonFunctionName | collectionFunctionName - | geoipFunctionName | lambdaFunctionName ; @@ -900,10 +898,6 @@ lambdaFunctionName | TRANSFORM | REDUCE ; - -geoipFunctionName - : GEOIP - ; positionFunctionName : POSITION @@ -913,6 +907,21 @@ coalesceFunctionName : COALESCE ; +geoIpPropertyList + : geoIpProperty (COMMA geoIpProperty)* + ; + +geoIpProperty + : COUNTRY_ISO_CODE + | COUNTRY_NAME + | CONTINENT_NAME + | REGION_ISO_CODE + | REGION_NAME + | CITY_NAME + | TIME_ZONE + | LOCATION + ; + // operators comparisonOperator : EQUAL diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java index dadf6b968..f9b333b26 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java @@ -341,10 +341,15 @@ public T visitExistsSubquery(ExistsSubquery node, C context) { public T visitWindow(Window node, C context) { return visitChildren(node, context); } + public T visitCidr(Cidr node, C context) { return visitChildren(node, context); } + public T visitGeoIp(GeoIp node, C context) { + return visitChildren(node, context); + } + public T visitFlatten(Flatten flatten, C context) { return visitChildren(flatten, context); } diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Eval.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Eval.java index 0cc27b6a9..c8482a4ff 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Eval.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Eval.java @@ -12,7 +12,7 @@ import lombok.Setter; import lombok.ToString; import 
org.opensearch.sql.ast.AbstractNodeVisitor; -import org.opensearch.sql.ast.expression.Let; +import org.opensearch.sql.ast.Node; import java.util.List; @@ -23,7 +23,7 @@ @EqualsAndHashCode(callSuper = false) @RequiredArgsConstructor public class Eval extends UnresolvedPlan { - private final List expressionList; + private final List expressionList; private UnresolvedPlan child; @Override diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java new file mode 100644 index 000000000..feefa6929 --- /dev/null +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java @@ -0,0 +1,47 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ast.tree; + +import com.google.common.collect.ImmutableList; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.ToString; +import org.opensearch.sql.ast.AbstractNodeVisitor; +import org.opensearch.sql.ast.Node; +import org.opensearch.sql.ast.expression.AttributeList; +import org.opensearch.sql.ast.expression.Field; +import org.opensearch.sql.ast.expression.UnresolvedExpression; + +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +@Getter +@RequiredArgsConstructor +@EqualsAndHashCode(callSuper = false) +public class GeoIp extends UnresolvedPlan { + private UnresolvedPlan child; + private final Field field; + private final UnresolvedExpression ipAddress; + private final AttributeList properties; + + @Override + public List getChild() { + return ImmutableList.of(child); + } + + @Override + public T accept(AbstractNodeVisitor nodeVisitor, C context) { + return nodeVisitor.visitGeoIp(this, context); + } + + @Override + public UnresolvedPlan attach(UnresolvedPlan child) { + this.child = child; + return this; + } +} \ No newline at end of file diff --git 
a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java index e80a26bc4..e931175ff 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java @@ -11,13 +11,18 @@ import org.apache.spark.sql.catalyst.expressions.Expression; import org.apache.spark.sql.catalyst.expressions.ScalaUDF; import org.apache.spark.sql.types.DataTypes; +import scala.Function1; import scala.Function2; import scala.Option; import scala.Serializable; +import scala.runtime.AbstractFunction1; +import scala.runtime.AbstractFunction2; import scala.collection.JavaConverters; import scala.collection.mutable.WrappedArray; -import scala.runtime.AbstractFunction2; +import java.math.BigInteger; +import java.net.InetAddress; +import java.net.UnknownHostException; import java.util.Collection; import java.util.List; import java.util.Map; @@ -28,7 +33,6 @@ import static org.opensearch.sql.expression.function.JsonUtils.removeNestedKey; import static org.opensearch.sql.ppl.utils.DataTypeTransformer.seq; - public interface SerializableUdf { @@ -142,11 +146,66 @@ public Boolean apply(String ipAddress, String cidrBlock) { } }; + class geoIpUtils { + /** + * Checks if provided ip string is ipv4 or ipv6. + * + * @param ipAddress The input ip string. + * @return true if ipAddress is ipv4, false if ipAddress is ipv6; throws RuntimeException if the ip is invalid. 
+ */ + public static Function1 isIpv4 = new SerializableAbstractFunction1<>() { + + IPAddressStringParameters valOptions = new IPAddressStringParameters.Builder() + .allowEmpty(false) + .setEmptyAsLoopback(false) + .allow_inet_aton(false) + .allowSingleSegment(false) + .toParams(); + + @Override + public Boolean apply(String ipAddress) { + IPAddressString parsedIpAddress = new IPAddressString(ipAddress, valOptions); + + try { + parsedIpAddress.validate(); + } catch (AddressStringException e) { + throw new RuntimeException("The given ipAddress '"+ipAddress+"' is invalid. It must be a valid IPv4 or IPv6 address. Error details: "+e.getMessage()); + } + + return parsedIpAddress.isIPv4(); + } + }; + + /** + * Convert ipAddress string to integer representation + * + * @param ipAddress The input ip string. + * @return converted BigInteger from ipAddress string, or null if the address cannot be resolved. + */ + public static Function1 ipToInt = new SerializableAbstractFunction1<>() { + @Override + public BigInteger apply(String ipAddress) { + try { + InetAddress inetAddress = InetAddress.getByName(ipAddress); + byte[] addressBytes = inetAddress.getAddress(); + return new BigInteger(1, addressBytes); + } catch (UnknownHostException e) { + System.err.println("Invalid IP address: " + e.getMessage()); + } + return null; + } + }; + } + + abstract class SerializableAbstractFunction1 extends AbstractFunction1 + implements Serializable { + } + /** - * get the function reference according to its name + * Get the function reference according to its name * - * @param funcName - * @return + * @param funcName string representing function to retrieve. + * @return relevant ScalaUDF for given function name. 
*/ static ScalaUDF visit(String funcName, List expressions) { switch (funcName) { @@ -177,6 +236,24 @@ static ScalaUDF visit(String funcName, List expressions) { Option.apply("json_append"), false, true); + case "is_ipv4": + return new ScalaUDF(geoIpUtils.isIpv4, + DataTypes.BooleanType, + seq(expressions), + seq(), + Option.empty(), + Option.apply("is_ipv4"), + false, + true); + case "ip_to_int": + return new ScalaUDF(geoIpUtils.ipToInt, + DataTypes.createDecimalType(38,0), + seq(expressions), + seq(), + Option.empty(), + Option.apply("ip_to_int"), + false, + true); default: return null; } diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java index d7f59bae3..0a6e869ba 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java @@ -56,6 +56,7 @@ import org.opensearch.sql.ast.tree.FillNull; import org.opensearch.sql.ast.tree.Filter; import org.opensearch.sql.ast.tree.Flatten; +import org.opensearch.sql.ast.tree.GeoIp; import org.opensearch.sql.ast.tree.Head; import org.opensearch.sql.ast.tree.Join; import org.opensearch.sql.ast.tree.Kmeans; @@ -69,9 +70,11 @@ import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.Trendline; +import org.opensearch.sql.ast.tree.UnresolvedPlan; import org.opensearch.sql.ast.tree.Window; import org.opensearch.sql.common.antlr.SyntaxCheckException; import org.opensearch.sql.ppl.utils.FieldSummaryTransformer; +import org.opensearch.sql.ppl.utils.GeoIpCatalystLogicalPlanTranslator; import org.opensearch.sql.ppl.utils.ParseTransformer; import org.opensearch.sql.ppl.utils.SortUtils; import org.opensearch.sql.ppl.utils.TrendlineCatalystUtils; @@ -562,19 +565,63 @@ public LogicalPlan visitRename(Rename 
node, CatalystPlanContext context) { public LogicalPlan visitEval(Eval node, CatalystPlanContext context) { visitFirstChild(node, context); List aliases = new ArrayList<>(); - List letExpressions = node.getExpressionList(); - for (Let let : letExpressions) { - Alias alias = new Alias(let.getVar().getField().toString(), let.getExpression()); - aliases.add(alias); + List expressions = node.getExpressionList(); + + // Geoip function modifies logical plan and is treated as QueryPlanVisitor instead of ExpressionVisitor + for (Node expr : expressions) { + if (expr instanceof Let) { + Let let = (Let) expr; + Alias alias = new Alias(let.getVar().getField().toString(), let.getExpression()); + aliases.add(alias); + } else if (expr instanceof UnresolvedPlan) { + expr.accept(this, context); + } else { + throw new SyntaxCheckException("Unexpected node type when visiting EVAL"); + } } - if (context.getNamedParseExpressions().isEmpty()) { - // Create an UnresolvedStar for all-fields projection - context.getNamedParseExpressions().push(UnresolvedStar$.MODULE$.apply(Option.>empty())); + + if (!aliases.isEmpty()) { + if (context.getNamedParseExpressions().isEmpty()) { + // Create an UnresolvedStar for all-fields projection + context.getNamedParseExpressions().push(UnresolvedStar$.MODULE$.apply(Option.>empty())); + } + + visitExpressionList(aliases, context); + Seq projectExpressions = context.retainAllNamedParseExpressions(p -> (NamedExpression) p); + // build the plan with the projection step + return context.apply(p -> new org.apache.spark.sql.catalyst.plans.logical.Project(projectExpressions, p)); + } else { + return context.getPlan(); } - List expressionList = visitExpressionList(aliases, context); - Seq projectExpressions = context.retainAllNamedParseExpressions(p -> (NamedExpression) p); - // build the plan with the projection step - return context.apply(p -> new org.apache.spark.sql.catalyst.plans.logical.Project(projectExpressions, p)); + } + + @Override + public LogicalPlan 
visitGeoIp(GeoIp node, CatalystPlanContext context) { + visitExpression(node.getProperties(), context); + List attributeList = new ArrayList<>(); + + while (!context.getNamedParseExpressions().isEmpty()) { + Expression nextExpression = context.getNamedParseExpressions().pop(); + String attributeName = nextExpression.toString(); + + if (attributeList.contains(attributeName)) { + throw new IllegalStateException("Duplicate attribute in GEOIP attribute list"); + } + + attributeList.add(0, attributeName); + } + + String fieldExpression = node.getField().getField().toString(); + Expression ipAddressExpression = visitExpression(node.getIpAddress(), context); + + return GeoIpCatalystLogicalPlanTranslator.getGeoipLogicalPlan( + new GeoIpCatalystLogicalPlanTranslator.GeoIpParameters( + fieldExpression, + ipAddressExpression, + attributeList + ), + context + ); } @Override diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index d4f9ece87..2ea23babf 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -336,10 +336,18 @@ public UnresolvedPlan visitSortCommand(OpenSearchPPLParser.SortCommandContext ct public UnresolvedPlan visitEvalCommand(OpenSearchPPLParser.EvalCommandContext ctx) { return new Eval( ctx.evalClause().stream() - .map(ct -> (Let) internalVisitExpression(ct)) + .map(ct -> (ct.geoipCommand() != null) ? visit(ct.geoipCommand()) : (Let) internalVisitExpression(ct)) .collect(Collectors.toList())); } + @Override + public UnresolvedPlan visitGeoipCommand(OpenSearchPPLParser.GeoipCommandContext ctx) { + Field field = (Field) internalVisitExpression(ctx.fieldExpression()); + UnresolvedExpression ipAddress = internalVisitExpression(ctx.ipAddress); + AttributeList properties = ctx.properties == null ? 
new AttributeList(Collections.emptyList()) : (AttributeList) internalVisitExpression(ctx.properties); + return new GeoIp(field, ipAddress, properties); + } + private List getGroupByList(OpenSearchPPLParser.ByClauseContext ctx) { return ctx.fieldList().fieldExpression().stream() .map(this::internalVisitExpression) diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java index 1fe57d13e..a73c593fe 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java @@ -49,6 +49,7 @@ import org.opensearch.sql.common.antlr.SyntaxCheckException; import org.opensearch.sql.common.utils.StringUtils; import org.opensearch.sql.ppl.utils.ArgumentFactory; +import org.opensearch.sql.ppl.utils.GeoIpCatalystLogicalPlanTranslator; import java.util.Arrays; import java.util.Collections; @@ -450,6 +451,20 @@ public UnresolvedExpression visitLambda(OpenSearchPPLParser.LambdaContext ctx) { return new LambdaFunction(function, arguments); } + @Override + public UnresolvedExpression visitGeoIpPropertyList(OpenSearchPPLParser.GeoIpPropertyListContext ctx) { + ImmutableList.Builder properties = ImmutableList.builder(); + if (ctx != null) { + for (OpenSearchPPLParser.GeoIpPropertyContext property : ctx.geoIpProperty()) { + String propertyName = property.getText().toUpperCase(); + GeoIpCatalystLogicalPlanTranslator.validateGeoIpProperty(propertyName); + properties.add(new Literal(propertyName, DataType.STRING)); + } + } + + return new AttributeList(properties.build()); + } + private List timestampFunctionArguments( OpenSearchPPLParser.TimestampFunctionCallContext ctx) { List args = diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoIpCatalystLogicalPlanTranslator.java 
b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoIpCatalystLogicalPlanTranslator.java new file mode 100644 index 000000000..cedc00846 --- /dev/null +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoIpCatalystLogicalPlanTranslator.java @@ -0,0 +1,222 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ppl.utils; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import org.apache.spark.SparkEnv; +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute$; +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation; +import org.apache.spark.sql.catalyst.analysis.UnresolvedStar$; +import org.apache.spark.sql.catalyst.expressions.Alias$; +import org.apache.spark.sql.catalyst.expressions.And; +import org.apache.spark.sql.catalyst.expressions.CreateStruct; +import org.apache.spark.sql.catalyst.expressions.EqualTo; +import org.apache.spark.sql.catalyst.expressions.Expression; +import org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual; +import org.apache.spark.sql.catalyst.expressions.LessThan; +import org.apache.spark.sql.catalyst.expressions.NamedExpression; +import org.apache.spark.sql.catalyst.plans.logical.DataFrameDropColumns; +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; +import org.apache.spark.sql.catalyst.plans.logical.Project; +import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias$; +import org.apache.spark.sql.util.CaseInsensitiveStringMap; +import org.opensearch.sql.ast.tree.Join; +import org.opensearch.sql.expression.function.SerializableUdf; +import org.opensearch.sql.ppl.CatalystPlanContext; +import scala.Option; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Optional; +import java.util.stream.Collectors; + +import static java.util.List.of; + +import static 
org.opensearch.sql.ppl.utils.DataTypeTransformer.seq; +import static org.opensearch.sql.ppl.utils.JoinSpecTransformer.join; + +public interface GeoIpCatalystLogicalPlanTranslator { + String SPARK_CONF_KEY = "spark.geoip.tablename"; + String DEFAULT_GEOIP_TABLE_NAME = "geoip"; + String GEOIP_CIDR_COLUMN_NAME = "cidr"; + String GEOIP_IP_RANGE_START_COLUMN_NAME = "ip_range_start"; + String GEOIP_IP_RANGE_END_COLUMN_NAME = "ip_range_end"; + String GEOIP_IPV4_COLUMN_NAME = "ipv4"; + String SOURCE_TABLE_ALIAS = "t1"; + String GEOIP_TABLE_ALIAS = "t2"; + List GEOIP_TABLE_COLUMNS = Arrays.stream(GeoIpProperty.values()) + .map(Enum::name) + .collect(Collectors.toList()); + + /** + * Responsible to produce a Spark Logical Plan with given GeoIp command arguments, below is the sample logical plan + * with configuration [source=users, field=a, ipAddress=ip, properties=[country_name, city_name]] + * +- 'DataFrameDropColumns ['t2.country_iso_code, 't2.country_name, 't2.continent_name, 't2.region_iso_code, 't2.region_name, 't2.city_name, 't2.time_zone, 't2.location, 't2.cidr, 't2.ip_range_start, 't2.ip_range_end, 't2.ipv4] + * -- +- 'Project [*, named_struct(country_name, 't2.country_name, city_name, 't2.city_name) AS a#0] + * -- -- +- 'Join LeftOuter, (((ip_to_int('ip) >= 't2.ip_range_start) AND (ip_to_int('ip) < 't2.ip_range_end)) AND (is_ipv4('ip) = 't2.ipv4)) + * -- -- -- :- 'SubqueryAlias t1 + * -- -- -- -- : +- 'UnresolvedRelation [users], [], false + * -- -- -- +- 'SubqueryAlias t2 + * -- -- -- -- -- +- 'UnresolvedRelation [geoip], [], false + * . + * And the corresponding SQL query: + * . + * SELECT users.*, struct(geoip.country_name, geoip.city_name) AS a + * FROM users LEFT JOIN geoip + * ON geoip.ip_range_start <= ip_to_int(users.ip) + * AND geoip.ip_range_end > ip_to_int(users.ip) + * AND geoip.ipv4 = is_ipv4(users.ip); + * + * @param parameters GeoIp function parameters. + * @param context Context instance used to retrieve Expression in resolved form. 
+ * @return a LogicalPlan which will project new col with geoip location based on given ipAddresses. + */ + static LogicalPlan getGeoipLogicalPlan(GeoIpParameters parameters, CatalystPlanContext context) { + applyJoin(parameters.getIpAddress(), context); + return applyProjection(parameters.getField(), parameters.getProperties(), context); + } + + /** + * Responsible to produce join plan for GeoIp command, below is the sample logical plan + * with configuration [source=users, ipAddress=ip] + * +- 'Join LeftOuter, (((ip_to_int('ip) >= 't2.ip_range_start) AND (ip_to_int('ip) < 't2.ip_range_end)) AND (is_ipv4('ip) = 't2.ipv4)) + * -- :- 'SubqueryAlias t1 + * -- -- : +- 'UnresolvedRelation [users], [], false + * -- +- 'SubqueryAlias t2 + * -- -- -- +- 'UnresolvedRelation [geoip], [], false + * + * @param ipAddress Expression representing ip addresses to be queried. + * @param context Context instance used to retrieve Expression in resolved form. + * @return a LogicalPlan which will perform join based on ip within cidr range in geoip table. 
+ */ + static private LogicalPlan applyJoin(Expression ipAddress, CatalystPlanContext context) { + return context.apply(left -> { + LogicalPlan right = new UnresolvedRelation(seq(getGeoipTableName()), CaseInsensitiveStringMap.empty(), false); + LogicalPlan leftAlias = SubqueryAlias$.MODULE$.apply(SOURCE_TABLE_ALIAS, left); + LogicalPlan rightAlias = SubqueryAlias$.MODULE$.apply(GEOIP_TABLE_ALIAS, right); + Optional joinCondition = Optional.of(new And( + new And( + new GreaterThanOrEqual( + SerializableUdf.visit("ip_to_int", of(ipAddress)), + UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS,GEOIP_IP_RANGE_START_COLUMN_NAME)) + ), + new LessThan( + SerializableUdf.visit("ip_to_int", of(ipAddress)), + UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS,GEOIP_IP_RANGE_END_COLUMN_NAME)) + ) + ), + new EqualTo( + SerializableUdf.visit("is_ipv4", of(ipAddress)), + UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS,GEOIP_IPV4_COLUMN_NAME)) + ) + )); + context.retainAllNamedParseExpressions(p -> p); + context.retainAllPlans(p -> p); + return join(leftAlias, + rightAlias, + Join.JoinType.LEFT, + joinCondition, + new Join.JoinHint()); + }); + } + + /** + * Responsible to produce a Spark Logical Plan with given GeoIp command arguments, below is the sample logical plan + * with configuration [source=users, field=a, properties=[country_name, city_name]] + * +- 'DataFrameDropColumns ['t2.country_iso_code, 't2.country_name, 't2.continent_name, 't2.region_iso_code, 't2.region_name, 't2.city_name, 't2.time_zone, 't2.location, 't2.cidr, 't2.ip_range_start, 't2.ip_range_end, 't2.ipv4] + * -- +- 'Project [*, named_struct(country_name, 't2.country_name, city_name, 't2.city_name) AS a#0] + * + * @param field Name of new eval geoip column. + * @param properties List of geo properties to be returned. + * @param context Context instance used to retrieve Expression in resolved form. + * @return a LogicalPlan which will return source table and new eval geoip column. 
+ */ + static private LogicalPlan applyProjection(String field, List properties, CatalystPlanContext context) { + List projectExpressions = new ArrayList<>(); + projectExpressions.add(UnresolvedStar$.MODULE$.apply(Option.empty())); + + List geoIpStructFields = createGeoIpStructFields(properties); + Expression columnValue = (geoIpStructFields.size() == 1)? + geoIpStructFields.get(0) : CreateStruct.apply(seq(geoIpStructFields)); + + NamedExpression geoCol = Alias$.MODULE$.apply( + columnValue, + field, + NamedExpression.newExprId(), + seq(new ArrayList<>()), + Option.empty(), + seq(new ArrayList<>())); + + projectExpressions.add(geoCol); + + List dropList = createGeoIpStructFields(new ArrayList<>()); + dropList.addAll(List.of( + UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_CIDR_COLUMN_NAME)), + UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_IP_RANGE_START_COLUMN_NAME)), + UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_IP_RANGE_END_COLUMN_NAME)), + UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_IPV4_COLUMN_NAME)) + )); + + context.apply(p -> new Project(seq(projectExpressions), p)); + return context.apply(p -> new DataFrameDropColumns(seq(dropList), p)); + } + + static private List createGeoIpStructFields(List attributeList) { + List attributeListToUse; + if (attributeList == null || attributeList.isEmpty()) { + attributeListToUse = GEOIP_TABLE_COLUMNS; + } else { + attributeListToUse = attributeList; + } + + return attributeListToUse.stream() + .map(a -> UnresolvedAttribute$.MODULE$.apply(seq( + GEOIP_TABLE_ALIAS, + a.toLowerCase(Locale.ROOT) + ))) + .collect(Collectors.toList()); + } + + static private String getGeoipTableName() { + String tableName = DEFAULT_GEOIP_TABLE_NAME; + + if (SparkEnv.get() != null && SparkEnv.get().conf() != null) { + tableName = SparkEnv.get().conf().get(SPARK_CONF_KEY, DEFAULT_GEOIP_TABLE_NAME); + } + + return tableName; + } + + @Getter + @AllArgsConstructor + class 
GeoIpParameters { + private final String field; + private final Expression ipAddress; + private final List properties; + } + + enum GeoIpProperty { + COUNTRY_ISO_CODE, + COUNTRY_NAME, + CONTINENT_NAME, + REGION_ISO_CODE, + REGION_NAME, + CITY_NAME, + TIME_ZONE, + LOCATION + } + + public static void validateGeoIpProperty(String propertyName) { + try { + GeoIpProperty.valueOf(propertyName); + } catch (NullPointerException | IllegalArgumentException e) { + throw new IllegalArgumentException("Invalid properties used."); + } + } +} diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGeoipFunctionTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGeoipFunctionTranslatorTestSuite.scala new file mode 100644 index 000000000..460b9769c --- /dev/null +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGeoipFunctionTranslatorTestSuite.scala @@ -0,0 +1,332 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.ppl + +import java.util + +import org.opensearch.flint.spark.ppl.PlaneUtils.plan +import org.opensearch.sql.expression.function.SerializableUdf.visit +import org.opensearch.sql.ppl.{CatalystPlanContext, CatalystQueryPlanVisitor} +import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq +import org.scalatest.matchers.should.Matchers + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} +import org.apache.spark.sql.catalyst.expressions.{Alias, And, CreateNamedStruct, Descending, EqualTo, Expression, ExprId, GreaterThanOrEqual, In, LessThan, Literal, NamedExpression, ScalaUDF, SortOrder} +import org.apache.spark.sql.catalyst.plans.{LeftOuter, PlanTest} +import org.apache.spark.sql.catalyst.plans.logical.{DataFrameDropColumns, Join, JoinHint, LogicalPlan, 
Project, Sort, SubqueryAlias} +import org.apache.spark.sql.types.DataTypes + +class PPLLogicalPlanGeoipFunctionTranslatorTestSuite + extends SparkFunSuite + with PlanTest + with LogicalPlanTestUtils + with Matchers { + + private val planTransformer = new CatalystQueryPlanVisitor() + private val pplParser = new PPLSyntaxParser() + + private def getGeoIpQueryPlan( + ipAddress: UnresolvedAttribute, + left: LogicalPlan, + right: LogicalPlan, + projectionProperties: Alias): LogicalPlan = { + val joinPlan = getJoinPlan(ipAddress, left, right) + getProjection(joinPlan, projectionProperties) + } + + private def getJoinPlan( + ipAddress: UnresolvedAttribute, + left: LogicalPlan, + right: LogicalPlan): LogicalPlan = { + val is_ipv4 = visit("is_ipv4", util.List.of[Expression](ipAddress)) + val ip_to_int = visit("ip_to_int", util.List.of[Expression](ipAddress)) + + val t1 = SubqueryAlias("t1", left) + val t2 = SubqueryAlias("t2", right) + + val joinCondition = And( + And( + GreaterThanOrEqual(ip_to_int, UnresolvedAttribute("t2.ip_range_start")), + LessThan(ip_to_int, UnresolvedAttribute("t2.ip_range_end"))), + EqualTo(is_ipv4, UnresolvedAttribute("t2.ipv4"))) + Join(t1, t2, LeftOuter, Some(joinCondition), JoinHint.NONE) + } + + private def getProjection(joinPlan: LogicalPlan, projectionProperties: Alias): LogicalPlan = { + val projection = Project(Seq(UnresolvedStar(None), projectionProperties), joinPlan) + val dropList = Seq( + "t2.country_iso_code", + "t2.country_name", + "t2.continent_name", + "t2.region_iso_code", + "t2.region_name", + "t2.city_name", + "t2.time_zone", + "t2.location", + "t2.cidr", + "t2.ip_range_start", + "t2.ip_range_end", + "t2.ipv4").map(UnresolvedAttribute(_)) + DataFrameDropColumns(dropList, projection) + } + + test("test geoip function - only ip_address provided") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source = users | eval a = geoip(ip_address)"), + context) + + val ipAddress = 
UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("users")) + val geoTable = UnresolvedRelation(seq("geoip")) + + val projectionStruct = CreateNamedStruct( + Seq( + Literal("country_iso_code"), + UnresolvedAttribute("t2.country_iso_code"), + Literal("country_name"), + UnresolvedAttribute("t2.country_name"), + Literal("continent_name"), + UnresolvedAttribute("t2.continent_name"), + Literal("region_iso_code"), + UnresolvedAttribute("t2.region_iso_code"), + Literal("region_name"), + UnresolvedAttribute("t2.region_name"), + Literal("city_name"), + UnresolvedAttribute("t2.city_name"), + Literal("time_zone"), + UnresolvedAttribute("t2.time_zone"), + Literal("location"), + UnresolvedAttribute("t2.location"))) + val structProjection = Alias(projectionStruct, "a")() + + val geoIpPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjection) + val expectedPlan = Project(Seq(UnresolvedStar(None)), geoIpPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - source has same name as join alias") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source=t1 | eval a = geoip(ip_address, country_name)"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("t1")) + val geoTable = UnresolvedRelation(seq("geoip")) + val structProjection = Alias(UnresolvedAttribute("t2.country_name"), "a")() + + val geoIpPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjection) + val expectedPlan = Project(Seq(UnresolvedStar(None)), geoIpPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - ipAddress col exist in geoip table") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source=t1 | eval a = geoip(cidr, country_name)"), + context) + + val ipAddress = UnresolvedAttribute("cidr") + 
val sourceTable = UnresolvedRelation(seq("t1"))
+    val geoTable = UnresolvedRelation(seq("geoip"))
+    val structProjection = Alias(UnresolvedAttribute("t2.country_name"), "a")()
+
+    val geoIpPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjection)
+    val expectedPlan = Project(Seq(UnresolvedStar(None)), geoIpPlan)
+
+    comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+  }
+
+  // Parsing and planning a geoip() call that lists country_name twice is expected to fail
+  // with an IllegalStateException whose message mentions the duplicate attribute.
+  test("test geoip function - duplicate parameters") {
+    val planContext = new CatalystPlanContext
+    val query = "source=t1 | eval a = geoip(cidr, country_name, country_name)"
+
+    val thrown = intercept[IllegalStateException] {
+      planTransformer.visit(plan(pplParser, query), planContext)
+    }
+
+    assert(thrown.getMessage.contains("Duplicate attribute in GEOIP attribute list"))
+  }
+
+  // One requested property: the expected projection is a plain alias of t2.country_name
+  // named "a" (no struct wrapper), under a top-level star projection.
+  test("test geoip function - one property provided") {
+    val planContext = new CatalystPlanContext
+    val query = "source=users | eval a = geoip(ip_address, country_name)"
+    val actual = planTransformer.visit(plan(pplParser, query), planContext)
+
+    val lookup = getGeoIpQueryPlan(
+      UnresolvedAttribute("ip_address"),
+      UnresolvedRelation(seq("users")),
+      UnresolvedRelation(seq("geoip")),
+      Alias(UnresolvedAttribute("t2.country_name"), "a")())
+    val expected = Project(Seq(UnresolvedStar(None)), lookup)
+
+    comparePlans(expected, actual, checkAnalysis = false)
+  }
+
+  // Two requested properties: the expected projection is a named struct aliased as "a",
+  // holding (name literal, t2.<name>) pairs for country_name and location.
+  test("test geoip function - multiple properties provided") {
+    val planContext = new CatalystPlanContext
+    val query = "source=users | eval a = geoip(ip_address,country_name,location)"
+    val actual = planTransformer.visit(plan(pplParser, query), planContext)
+
+    val geoStruct = CreateNamedStruct(
+      Seq(
+        Literal("country_name"),
+        UnresolvedAttribute("t2.country_name"),
+        Literal("location"),
+        UnresolvedAttribute("t2.location")))
+    val lookup = getGeoIpQueryPlan(
+      UnresolvedAttribute("ip_address"),
+      UnresolvedRelation(seq("users")),
+      UnresolvedRelation(seq("geoip")),
+      Alias(geoStruct, "a")())
+    val expected = Project(Seq(UnresolvedStar(None)), lookup)
+
+    comparePlans(expected, actual, checkAnalysis = false)
+  }
+
+  // Two geoip() evals in a single eval command.
+  test("test geoip function - multiple geoip calls") {
+    val planContext = new CatalystPlanContext
+    val query =
+      "source=t | eval a = geoip(ip_address, country_iso_code), b = geoip(ip_address, region_iso_code)"
+    val actual = planTransformer.visit(plan(pplParser, query), planContext)
+
+    val ip = UnresolvedAttribute("ip_address")
+    val geoRelation = UnresolvedRelation(seq("geoip"))
+
+    // The plan built for "a" is passed to the helper in the position where the first
+    // lookup received the source relation, so the lookup for "b" is built over it.
+    val lookupA = getGeoIpQueryPlan(
+      ip,
+      UnresolvedRelation(seq("t")),
+      geoRelation,
+      Alias(UnresolvedAttribute("t2.country_iso_code"), "a")())
+    val lookupB = getGeoIpQueryPlan(
+      ip,
+      lookupA,
+      geoRelation,
+      Alias(UnresolvedAttribute("t2.region_iso_code"), "b")())
+
+    comparePlans(Project(Seq(UnresolvedStar(None)), lookupB), actual, checkAnalysis = false)
+  }
+
+  // A rand() eval written between two geoip() evals.
+  test("test geoip function - other eval function used between geoip") {
+    val planContext = new CatalystPlanContext
+    val query =
+      "source=t | eval a = geoip(ip_address, time_zone), b = rand(), c = geoip(ip_address, region_name)"
+    val actual = planTransformer.visit(plan(pplParser, query), planContext)
+
+    val ip = UnresolvedAttribute("ip_address")
+    val geoRelation = UnresolvedRelation(seq("geoip"))
+
+    val lookupA = getGeoIpQueryPlan(
+      ip,
+      UnresolvedRelation(seq("t")),
+      geoRelation,
+      Alias(UnresolvedAttribute("t2.time_zone"), "a")())
+    val lookupC = getGeoIpQueryPlan(
+      ip,
+      lookupA,
+      geoRelation,
+      Alias(UnresolvedAttribute("t2.region_name"), "c")())
+
+    // Although "b" appears between "a" and "c" in the query, the expected plan applies
+    // the rand() projection on top of both geoip lookups.
+    val withRand = Project(
+      Seq[NamedExpression](
+        UnresolvedStar(None),
+        Alias(UnresolvedFunction("rand", Seq.empty, isDistinct = false), "b")()),
+      lookupC)
+    val expected = Project(Seq(UnresolvedStar(None)), withRand)
+
+    comparePlans(expected, actual, checkAnalysis = false)
+  }
+
+  // A rand() eval written before a geoip() eval: the expected plan still has the geoip
+  // lookup underneath and the rand() projection for "a" above it.
+  test("test geoip function - other eval function used before geoip") {
+    val planContext = new CatalystPlanContext
+    val query = "source=t | eval a = rand(), b = geoip(ip_address, city_name)"
+    val actual = planTransformer.visit(plan(pplParser, query), planContext)
+
+    val lookupB = getGeoIpQueryPlan(
+      UnresolvedAttribute("ip_address"),
+      UnresolvedRelation(seq("t")),
+      UnresolvedRelation(seq("geoip")),
+      Alias(UnresolvedAttribute("t2.city_name"), "b")())
+    val withRand = Project(
+      Seq[NamedExpression](
+        UnresolvedStar(None),
+        Alias(UnresolvedFunction("rand", Seq.empty, isDistinct = false), "a")()),
+      lookupB)
+    val expected = Project(Seq(UnresolvedStar(None)), withRand)
+
+    comparePlans(expected, actual, checkAnalysis = false)
+  }
+
+  // geoip() eval followed by `fields a`: the top-level projection selects only "a"
+  // instead of a star.
+  test("test geoip function - projection on evaluated field") {
+    val planContext = new CatalystPlanContext
+    val query = "source=users | eval a = geoip(ip_address, country_name) | fields a"
+    val actual = planTransformer.visit(plan(pplParser, query), planContext)
+
+    val lookup = getGeoIpQueryPlan(
+      UnresolvedAttribute("ip_address"),
+      UnresolvedRelation(seq("users")),
+      UnresolvedRelation(seq("geoip")),
+      Alias(UnresolvedAttribute("t2.country_name"), "a")())
+    val expected = Project(Seq(UnresolvedAttribute("a")), lookup)
+
+    comparePlans(expected, actual, checkAnalysis = false)
+  }
+
+  
// Two geoip() evals followed by `fields b`: the expected plan keeps both lookups and
+  // only the top-level projection is narrowed to "b".
+  test("test geoip with partial projection on evaluated fields") {
+    val planContext = new CatalystPlanContext
+    val query =
+      "source=t | eval a = geoip(ip_address, country_iso_code), b = geoip(ip_address, region_iso_code) | fields b"
+    val actual = planTransformer.visit(plan(pplParser, query), planContext)
+
+    val ip = UnresolvedAttribute("ip_address")
+    val geoRelation = UnresolvedRelation(seq("geoip"))
+
+    // The plan built for "a" is handed to the helper as the input of the lookup for "b".
+    val lookupA = getGeoIpQueryPlan(
+      ip,
+      UnresolvedRelation(seq("t")),
+      geoRelation,
+      Alias(UnresolvedAttribute("t2.country_iso_code"), "a")())
+    val lookupB = getGeoIpQueryPlan(
+      ip,
+      lookupA,
+      geoRelation,
+      Alias(UnresolvedAttribute("t2.region_iso_code"), "b")())
+
+    comparePlans(Project(Seq(UnresolvedAttribute("b")), lookupB), actual, checkAnalysis = false)
+  }
+}