diff --git a/README.md b/README.md index db3790e64..6732db3af 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,8 @@ bin/spark-shell --packages "org.opensearch:opensearch-spark-ppl_2.12:0.7.0-SNAPS ### PPL Run queries on a local spark cluster See ppl usage sample on local spark cluster [PPL on local spark ](docs/ppl-lang/local-spark-ppl-test-instruction.md) +### Running integration tests on a local spark cluster +See integration test documentation [Docker Integration Tests](integ-test/script/README.md) ## Code of Conduct diff --git a/docker/integ-test/.env b/docker/integ-test/.env new file mode 100644 index 000000000..cf73bdc89 --- /dev/null +++ b/docker/integ-test/.env @@ -0,0 +1,13 @@ +SPARK_VERSION=3.5.3 +OPENSEARCH_VERSION=latest +DASHBOARDS_VERSION=latest +MASTER_UI_PORT=8080 +MASTER_PORT=7077 +UI_PORT=4040 +SPARK_CONNECT_PORT=15002 +PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar +FLINT_JAR=../../flint-spark-integration/target/scala-2.12/flint-spark-integration-assembly-0.7.0-SNAPSHOT.jar +OPENSEARCH_NODE_MEMORY=512m +OPENSEARCH_ADMIN_PASSWORD=C0rrecthorsebatterystaple. +OPENSEARCH_PORT=9200 +OPENSEARCH_DASHBOARDS_PORT=5601 diff --git a/docker/integ-test/docker-compose.yml b/docker/integ-test/docker-compose.yml new file mode 100644 index 000000000..c5ee53d7d --- /dev/null +++ b/docker/integ-test/docker-compose.yml @@ -0,0 +1,143 @@ +services: + spark: + image: bitnami/spark:${SPARK_VERSION:-3.5.3} + container_name: spark + ports: + - "${MASTER_UI_PORT:-8080}:8080" + - "${MASTER_PORT:-7077}:7077" + - "${UI_PORT:-4040}:4040" + - "${SPARK_CONNECT_PORT}:15002" + entrypoint: /opt/bitnami/scripts/spark/master-entrypoint.sh + environment: + - SPARK_MODE=master + - SPARK_RPC_AUTHENTICATION_ENABLED=no + - SPARK_RPC_ENCRYPTION_ENABLED=no + - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no + - SPARK_SSL_ENABLED=no + - SPARK_PUBLIC_DNS=localhost + volumes: + - type: bind + source: ./spark-master-entrypoint.sh + target: /opt/bitnami/scripts/spark/master-entrypoint.sh + - type: bind + source: ./spark-defaults.conf + target: /opt/bitnami/spark/conf/spark-defaults.conf + - type: bind + source: ./log4j2.properties + target: /opt/bitnami/spark/conf/log4j2.properties + - type: bind + source: $PPL_JAR + target: /opt/bitnami/spark/jars/ppl-spark-integration.jar + - type: bind + source: $FLINT_JAR + target: /opt/bitnami/spark/jars/flint-spark-integration.jar + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8080/"] + interval: 1m + timeout: 5s + retries: 3 + start_period: 30s + start_interval: 5s + networks: + - opensearch-net + + spark-worker: + image: bitnami/spark:${SPARK_VERSION:-3.5.3} + container_name: spark-worker + environment: + - SPARK_MODE=worker + - SPARK_MASTER_URL=spark://spark:7077 + - SPARK_WORKER_MEMORY=${WORKER_MEMORY:-1G} + - SPARK_WORKER_CORES=${WORKER_CORES:-1} + - SPARK_RPC_AUTHENTICATION_ENABLED=no + - SPARK_RPC_ENCRYPTION_ENABLED=no + - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no + - SPARK_SSL_ENABLED=no + - SPARK_PUBLIC_DNS=localhost + volumes: + - type: bind + source: ./spark-defaults.conf + target: /opt/bitnami/spark/conf/spark-defaults.conf + - type: bind + source: ./log4j2.properties + target: /opt/bitnami/spark/conf/log4j2.properties + - type: bind + source: $PPL_JAR + target: /opt/bitnami/spark/jars/ppl-spark-integration.jar + - type: bind + source: $FLINT_JAR + target: /opt/bitnami/spark/jars/flint-spark-integration.jar + networks: + - opensearch-net + depends_on: + - spark + + opensearch: + image: 
opensearchproject/opensearch:${OPENSEARCH_VERSION:-latest} + container_name: opensearch + environment: + - cluster.name=opensearch-cluster + - node.name=opensearch + - discovery.seed_hosts=opensearch + - cluster.initial_cluster_manager_nodes=opensearch + - bootstrap.memory_lock=true + - plugins.security.ssl.http.enabled=false + - OPENSEARCH_JAVA_OPTS=-Xms${OPENSEARCH_NODE_MEMORY:-512m} -Xmx${OPENSEARCH_NODE_MEMORY:-512m} + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_ADMIN_PASSWORD} + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 + hard: 65536 + volumes: + - opensearch-data:/usr/share/opensearch/data + ports: + - ${OPENSEARCH_PORT:-9200}:9200 + - 9600:9600 + expose: + - "${OPENSEARCH_PORT:-9200}" + healthcheck: + test: ["CMD", "curl", "-f", "-u", "admin:${OPENSEARCH_ADMIN_PASSWORD}", "http://localhost:9200/_cluster/health"] + interval: 1m + timeout: 5s + retries: 3 + start_period: 30s + start_interval: 5s + networks: + - opensearch-net + + opensearch-dashboards: + image: opensearchproject/opensearch-dashboards:${DASHBOARDS_VERSION} + container_name: opensearch-dashboards + ports: + - ${OPENSEARCH_DASHBOARDS_PORT:-5601}:5601 + expose: + - "${OPENSEARCH_DASHBOARDS_PORT:-5601}" + environment: + OPENSEARCH_HOSTS: '["http://opensearch:9200"]' + networks: + - opensearch-net + depends_on: + - opensearch + + minio: + image: minio/minio + container_name: minio-S3 + # See original entrypoint/command under https://github.com/minio/minio/blob/master/Dockerfile + entrypoint: sh -c 'mkdir -p /data/test && minio server /data --console-address ":9001"' + ports: + - "9000:9000" + - "9001:9001" + volumes: + - minio-data:/data + networks: + - opensearch-net + +volumes: + opensearch-data: + minio-data: + +networks: + opensearch-net: diff --git a/docker/integ-test/log4j2.properties b/docker/integ-test/log4j2.properties new file mode 100644 index 000000000..ab96e03ba --- /dev/null +++ b/docker/integ-test/log4j2.properties @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +rootLogger.level = info +rootLogger.appenderRef.stdout.ref = console + +# In the pattern layout configuration below, we specify an explicit `%ex` conversion +# pattern for logging Throwables. If this was omitted, then (by default) Log4J would +# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional +# class packaging information. That extra information can sometimes add a substantial +# performance overhead, so we disable it in our default logging config. +# For more information, see SPARK-39361. 
+appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex + +# Set the default spark-shell/spark-sql log level to WARN. When running the +# spark-shell/spark-sql, the log level for these classes is used to overwrite +# the root logger's log level, so that the user can have different defaults +# for the shell and regular Spark apps. +logger.repl.name = org.apache.spark.repl.Main +logger.repl.level = warn + +logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver +logger.thriftserver.level = warn + +# Settings to quiet third party logs that are too verbose +logger.jetty1.name = org.sparkproject.jetty +logger.jetty1.level = warn +logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle +logger.jetty2.level = error +logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper +logger.replexprTyper.level = info +logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter +logger.replSparkILoopInterpreter.level = info +logger.parquet1.name = org.apache.parquet +logger.parquet1.level = error +logger.parquet2.name = parquet +logger.parquet2.level = error + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler +logger.RetryingHMSHandler.level = fatal +logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry +logger.FunctionRegistry.level = error + +# For deploying Spark ThriftServer +# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 +appender.console.filter.1.type = RegexFilter +appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* +appender.console.filter.1.onMatch = deny +appender.console.filter.1.onMismatch = neutral diff --git a/docker/integ-test/prepare_scala_queries.py b/docker/integ-test/prepare_scala_queries.py new file mode 100755 index 000000000..dec62593b --- /dev/null +++ b/docker/integ-test/prepare_scala_queries.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import csv + +queries = None +with open('../../integ-test/script/test_cases.csv', 'r') as f: + reader = csv.DictReader(f) + queries = [(row['query'], i, row.get('expected_status', None)) for i, row in enumerate(reader, start=1) if row['query'].strip()] + +print('try {') +for query in queries: + query_str = query[0].replace('\n', '').replace('"', '\\"') + if 'FAILED' == query[2]: + print(' try {') + print(f' spark.sql("{query_str}")') + print(' throw new Error') + print(' } catch {') + print(' case e: Exception => null') + print(' }\n') + else: + print(f' spark.sql("{query_str}")\n') +print('}') + diff --git a/docker/integ-test/queries.scala b/docker/integ-test/queries.scala new file mode 100644 index 000000000..7d6ee78c1 --- /dev/null +++ b/docker/integ-test/queries.scala @@ -0,0 +1,619 @@ +{ + try { + spark.sql("describe myglue_test.default.http_logs") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("describe `myglue_test`.`default`.`http_logs`") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | dedup 1 status | fields @timestamp, clientip, status, size | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup status, 
size | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup 1 status keepempty=true | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup status, size keepempty=true | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup 2 status | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup 2 status, size | head 10") + + spark.sql("source = myglue_test.default.http_logs | dedup 2 status, size keepempty=true | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | dedup status CONSECUTIVE=true | fields status") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | dedup 2 status, size CONSECUTIVE=true | fields status") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | sort status | fields @timestamp, clientip, status | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | fields @timestamp, notexisted | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.nested | fields int_col, struct_col.field1, struct_col2.field1 | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.nested | where struct_col2.field1.subfield > 'valueA' | sort int_col | fields int_col, struct_col.field1.subfield, struct_col2.field1.subfield") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | fields - @timestamp, clientip, status | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval new_time = @timestamp, new_clientip = clientip | fields - new_time, new_clientip, status | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval new_clientip = lower(clientip) | fields - new_clientip | head 10") + + spark.sql("source = myglue_test.default.http_logs | fields + @timestamp, clientip, status | fields - clientip, status | head 10") + + spark.sql("source = myglue_test.default.http_logs | fields - clientip, status | fields + @timestamp, clientip, status| head 10") + + spark.sql("source = myglue_test.default.http_logs | where status = 200 | head 10") + + spark.sql("source = myglue_test.default.http_logs | where status != 200 | head 10") + + spark.sql("source = myglue_test.default.http_logs | where size > 0 | head 10") + + spark.sql("source = myglue_test.default.http_logs | where size <= 0 | head 10") + + spark.sql("source = myglue_test.default.http_logs | where clientip = '236.14.2.0' | head 10") + + spark.sql("source = myglue_test.default.http_logs | where size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100") + + spark.sql("source = myglue_test.default.http_logs | where size <= 0 AND like(request, 'GET%') | head 10") + + spark.sql("source = myglue_test.default.http_logs status = 200 | head 10") + + spark.sql("source = myglue_test.default.http_logs size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100") + + spark.sql("source = myglue_test.default.http_logs size <= 0 AND like(request, 'GET%') | head 10") + + spark.sql("source = myglue_test.default.http_logs substring(clientip, 5, 2) = \"12\" | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | where isempty(size)") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | where
ispresent(size)") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | where isnull(size) | head 10") + + spark.sql("source = myglue_test.default.http_logs | where isnotnull(size) | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | where isnotnull(coalesce(size, status)) | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | where like(request, 'GET%') | head 10") + + spark.sql("source = myglue_test.default.http_logs | where like(request, '%bordeaux%') | head 10") + + spark.sql("source = myglue_test.default.http_logs | where substring(clientip, 5, 2) = \"12\" | head 10") + + spark.sql("source = myglue_test.default.http_logs | where lower(request) = \"get /images/backnews.gif http/1.0\" | head 10") + + spark.sql("source = myglue_test.default.http_logs | where length(request) = 38 | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | where case(status = 200, 'success' else 'failed') = 'success' | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | eval h = \"Hello\", w = \"World\" | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval @h = \"Hello\" | eval @w = \"World\" | fields @timestamp, @h, @w") + + spark.sql("source = myglue_test.default.http_logs | eval newF = clientip | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval newF = clientip | fields clientip, newF | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval f = size | where f > 1 | sort f | fields size, clientip, status | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval f = status * 2 | eval h = f * 2 | fields status, f, h | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval f = size * 2, h = status | stats sum(f) by h") + + spark.sql("source = myglue_test.default.http_logs | eval f = UPPER(request) | eval h = 40 | fields f, h | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | eval request = \"test\" | fields request | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | eval size = abs(size) | where size < 500") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | eval status_string = case(status = 200, 'success' else 'failed') | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | eval n = now() | eval t = unix_timestamp(@timestamp) | fields n, t | head 10") + + try { + spark.sql("source = myglue_test.default.http_logs | eval e = isempty(size) | eval p = ispresent(size) | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | eval c = coalesce(size, status) | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | eval c = coalesce(request) | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | eval col1 = ln(size) | eval col2 = unix_timestamp(@timestamp) | sort - col1 | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval col1 = 1 | sort col1 | head 4 | eval col2 = 2 | 
sort - col2 | sort - size | head 2 | fields @timestamp, clientip, col2") + + spark.sql("source = myglue_test.default.mini_http_logs | eval stat = status | where stat > 300 | sort stat | fields @timestamp,clientip,status | head 5") + + spark.sql("source = myglue_test.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size) by clientip") + + spark.sql("source = myglue_test.default.http_logs | eval new_request = upper(request) | eval compound_field = concat('Hello ', if(like(new_request, '%bordeaux%'), 'World', clientip)) | fields new_request, compound_field | head 10") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size)") + + spark.sql("source = myglue_test.default.nested | stats max(int_col) by struct_col.field2") + + spark.sql("source = myglue_test.default.nested | stats distinct_count(int_col)") + + spark.sql("source = myglue_test.default.nested | stats stddev_samp(int_col)") + + spark.sql("source = myglue_test.default.nested | stats stddev_pop(int_col)") + + spark.sql("source = myglue_test.default.nested | stats percentile(int_col)") + + spark.sql("source = myglue_test.default.nested | stats percentile_approx(int_col)") + + spark.sql("source = myglue_test.default.mini_http_logs | stats stddev_samp(status)") + + spark.sql("source = myglue_test.default.mini_http_logs | where status > 200 | stats percentile_approx(status, 99)") + + spark.sql("source = myglue_test.default.nested | stats count(int_col) by span(struct_col.field2, 10) as a_span") + + spark.sql("source = myglue_test.default.nested | stats avg(int_col) by span(struct_col.field2, 10) as a_span, struct_col2.field2") + + spark.sql("source = myglue_test.default.http_logs | stats sum(size) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10") + + spark.sql("source = myglue_test.default.http_logs | stats distinct_count(clientip) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size) as avg_size by status, year | stats avg(avg_size) as avg_avg_size by year") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size) as avg_size by status, year, month | stats avg(avg_size) as avg_avg_size by year, month | stats avg(avg_avg_size) as avg_avg_avg_size by year") + + try { + spark.sql("source = myglue_test.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | stats avg(avg_int) as avg_avg_int by struct_col2.field2") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | eval new_col = avg_int | stats avg(avg_int) as avg_avg_int by new_col") + + spark.sql("source = myglue_test.default.nested | rare int_col") + + spark.sql("source = myglue_test.default.nested | rare int_col by struct_col.field2") + + spark.sql("source = myglue_test.default.http_logs | rare request") + + spark.sql("source = myglue_test.default.http_logs | where status > 300 | rare request by status") + + spark.sql("source = myglue_test.default.http_logs | rare clientip") + + spark.sql("source = myglue_test.default.http_logs | where status > 300 | rare clientip") + + spark.sql("source = myglue_test.default.http_logs | where status > 300 | rare clientip by day") + + spark.sql("source = myglue_test.default.nested | top int_col by struct_col.field2") + +
spark.sql("source = myglue_test.default.nested | top 1 int_col by struct_col.field2") + + spark.sql("source = myglue_test.default.nested | top 2 int_col by struct_col.field2") + + spark.sql("source = myglue_test.default.nested | top int_col") + + try { + spark.sql("source = myglue_test.default.http_logs | inner join left=l right=r on l.status = r.int_col myglue_test.default.nested | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | fields request, domain | head 10") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | top 1 domain") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | stats count() by domain") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | eval a = 1 | fields a, domain | head 10") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | where size > 0 | sort - size | fields size, domain | head 10") + + spark.sql("source = myglue_test.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/(?<picName>[a-zA-Z]+)/.*' | where domain = 'english' | sort - picName | fields domain, picName | head 10") + + spark.sql("source = myglue_test.default.http_logs | patterns request | fields patterns_field | head 10") + + spark.sql("source = myglue_test.default.http_logs | patterns request | where size > 0 | fields patterns_field | head 10") + + spark.sql("source = myglue_test.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | fields request, no_letter | head 10") + + spark.sql("source = myglue_test.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | stats count() by no_letter") + + try { + spark.sql("source = myglue_test.default.http_logs | patterns new_field='status' pattern='[a-zA-Z]' request | fields request, status | head 10") + throw new Error + } catch { + case e: Exception => null + } + + try { + spark.sql("source = myglue_test.default.http_logs | rename @timestamp as timestamp | head 10") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.default.http_logs | sort size | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort + size | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort - size | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort + size, + @timestamp | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort - size, - @timestamp | head 10") + + spark.sql("source = myglue_test.default.http_logs | sort - size, @timestamp | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval c1 = upper(request) | eval c2 = concat('Hello ', if(like(c1, '%bordeaux%'), 'World', clientip)) | eval c3 = length(request) | eval c4 = ltrim(request) | eval c5 = rtrim(request) | eval c6 = substring(clientip, 5, 2) | eval c7 = trim(request) | eval c8 = upper(request) | eval c9 = position('bordeaux' IN request) | eval c10 = replace(request, 'GET', 'GGG') | fields c1, c2, c3, c4, c5, c6, c7, c8, c9, c10 | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval c1 = unix_timestamp(@timestamp) | eval c2 = now() | eval c3 =DAY_OF_WEEK(@timestamp) | eval c4 =DAY_OF_MONTH(@timestamp) | eval c5 =DAY_OF_YEAR(@timestamp) | eval c6 =WEEK_OF_YEAR(@timestamp) | eval c7 =WEEK(@timestamp) | eval c8 =MONTH_OF_YEAR(@timestamp) | eval c9
=HOUR_OF_DAY(@timestamp) | eval c10 =MINUTE_OF_HOUR(@timestamp) | eval c11 =SECOND_OF_MINUTE(@timestamp) | eval c12 =LOCALTIME() | fields c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12 | head 10") + + spark.sql("source=myglue_test.default.people | eval c1 = adddate(@timestamp, 1) | fields c1 | head 10") + + spark.sql("source=myglue_test.default.people | eval c2 = subdate(@timestamp, 1) | fields c2 | head 10") + + spark.sql("source=myglue_test.default.people | eval c1 = date_add(@timestamp, INTERVAL 1 DAY) | fields c1 | head 10") + + spark.sql("source=myglue_test.default.people | eval c1 = date_sub(@timestamp, INTERVAL 1 DAY) | fields c1 | head 10") + + spark.sql("source=myglue_test.default.people | eval `CURDATE()` = CURDATE() | fields `CURDATE()`") + + spark.sql("source=myglue_test.default.people | eval `CURRENT_DATE()` = CURRENT_DATE() | fields `CURRENT_DATE()`") + + spark.sql("source=myglue_test.default.people | eval `CURRENT_TIMESTAMP()` = CURRENT_TIMESTAMP() | fields `CURRENT_TIMESTAMP()`") + + spark.sql("source=myglue_test.default.people | eval `DATE('2020-08-26')` = DATE('2020-08-26') | fields `DATE('2020-08-26')`") + + spark.sql("source=myglue_test.default.people | eval `DATE(TIMESTAMP('2020-08-26 13:49:00'))` = DATE(TIMESTAMP('2020-08-26 13:49:00')) | fields `DATE(TIMESTAMP('2020-08-26 13:49:00'))`") + + spark.sql("source=myglue_test.default.people | eval `DATE('2020-08-26 13:49')` = DATE('2020-08-26 13:49') | fields `DATE('2020-08-26 13:49')`") + + spark.sql("source=myglue_test.default.people | eval `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')` = DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS'), `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')` = DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a') | fields `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')`, `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')`") + + spark.sql("source=myglue_test.default.people | eval `'2000-01-02' - '2000-01-01'` = DATEDIFF(TIMESTAMP('2000-01-02 00:00:00'), TIMESTAMP('2000-01-01 23:59:59')), `'2001-02-01' - '2004-01-01'` = DATEDIFF(DATE('2001-02-01'), TIMESTAMP('2004-01-01 00:00:00')) | fields `'2000-01-02' - '2000-01-01'`, `'2001-02-01' - '2004-01-01'`") + + spark.sql("source=myglue_test.default.people | eval `DAY(DATE('2020-08-26'))` = DAY(DATE('2020-08-26')) | fields `DAY(DATE('2020-08-26'))`") + + try { + spark.sql("source=myglue_test.default.people | eval `DAYNAME(DATE('2020-08-26'))` = DAYNAME(DATE('2020-08-26')) | fields `DAYNAME(DATE('2020-08-26'))`") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source=myglue_test.default.people | eval `CURRENT_TIMEZONE()` = CURRENT_TIMEZONE() | fields `CURRENT_TIMEZONE()`") + + spark.sql("source=myglue_test.default.people | eval `UTC_TIMESTAMP()` = UTC_TIMESTAMP() | fields `UTC_TIMESTAMP()`") + + spark.sql("source=myglue_test.default.people | eval `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')` = TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | eval `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))` = TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00')) | fields `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')`, `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))`") + + spark.sql("source=myglue_test.default.people | eval
`TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')` = TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | eval `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` = TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') | fields `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')`, `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')`") + + spark.sql(" source = myglue_test.default.http_logs | stats count()") + + spark.sql("source = myglue_test.default.http_logs | stats avg(size) as c1, max(size) as c2, min(size) as c3, sum(size) as c4, percentile(size, 50) as c5, stddev_pop(size) as c6, stddev_samp(size) as c7, distinct_count(size) as c8") + + spark.sql("source = myglue_test.default.http_logs | eval c1 = abs(size) | eval c2 = ceil(size) | eval c3 = floor(size) | eval c4 = sqrt(size) | eval c5 = ln(size) | eval c6 = pow(size, 2) | eval c7 = mod(size, 2) | fields c1, c2, c3, c4, c5, c6, c7 | head 10") + + spark.sql("source = myglue_test.default.http_logs | eval c1 = isnull(request) | eval c2 = isnotnull(request) | eval c3 = ifnull(request,\"Unknown\") | eval c4 = nullif(request,\"Unknown\") | eval c5 = isnull(size) | eval c6 = if(like(request, '%bordeaux%'), 'hello', 'world') | fields c1, c2, c3, c4, c5, c6 | head 10") + + spark.sql("/* this is block comment */ source = myglue_test.tpch_csv.orders | head 1 // this is line comment") + + spark.sql("/* test in tpch q16, q18, q20 */ source = myglue_test.tpch_csv.orders | head 1 // add source=xx to avoid failure in automation") + + spark.sql("/* test in tpch q4, q21, q22 */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("/* test in tpch q2, q11, q15, q17, q20, q22 */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("/* test in tpch q7, q8, q9, q13, q15, q22 */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("/* lots of inner join tests in tpch */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("/* left join test in tpch q13 */ source = myglue_test.tpch_csv.orders | head 1") + + spark.sql("source = myglue_test.tpch_csv.orders | right outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') myglue_test.tpch_csv.customer| stats count(o_orderkey) as c_count by c_custkey| sort - c_count") + + spark.sql("source = myglue_test.tpch_csv.orders | full outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') myglue_test.tpch_csv.customer| stats count(o_orderkey) as c_count by c_custkey| sort - c_count") + + spark.sql("source = myglue_test.tpch_csv.customer| semi join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| where c_mktsegment = 'BUILDING' | sort - c_custkey| head 10") + + spark.sql("source = myglue_test.tpch_csv.customer| anti join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| where c_mktsegment = 'BUILDING' | sort - c_custkey| head 10") + + spark.sql("source = myglue_test.tpch_csv.supplier| where like(s_comment, '%Customer%Complaints%')| join ON s_nationkey > n_nationkey [ source = myglue_test.tpch_csv.nation | where n_name = 'SAUDI ARABIA' ]| sort - s_name| head 10") + + spark.sql("source = myglue_test.tpch_csv.supplier| where like(s_comment, '%Customer%Complaints%')| join [ source = myglue_test.tpch_csv.nation | where n_name = 'SAUDI ARABIA' ]| sort - s_name| head 10") + + spark.sql("source=myglue_test.default.people | LOOKUP myglue_test.default.work_info uid AS id REPLACE department | stats distinct_count(department)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id APPEND department | stats 
distinct_count(department)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id REPLACE department AS country | stats distinct_count(country)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id APPEND department AS country | stats distinct_count(country)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uID AS id, name REPLACE department | stats distinct_count(department)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS ID, name APPEND department | stats distinct_count(department)") + + spark.sql("source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uID AS id, name | head 10") + + spark.sql("source = myglue_test.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP myglue_test.default.work_info name REPLACE occupation AS major | stats distinct_count(major)") + + spark.sql("source = myglue_test.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP myglue_test.default.work_info name APPEND occupation AS major | stats distinct_count(major)") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('{\"account_number\":1,\"balance\":39225,\"age\":32,\"gender\":\"M\"}') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('{\"f1\":\"abc\",\"f2\":{\"f3\":\"a\",\"f4\":\"b\"}}') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('{\"invalid\": \"json\"') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[1,2,3]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[1,2') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('[invalid json]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json('invalid json') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json(null) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array('this', 'is', 'a', 'string', 'array') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array() | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array(1, 2, 0, -1, 1.1, -0.11) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array('this', 'is', 1.1, -0.11, true, false) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = array_length(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = array_length(json_array()) | head 1 | fields
res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array_length('[]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array_length('[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array_length('{\"key\": 1}') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_array_length('[1,2') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', 'string_value')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', 123.45)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', true)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object(\"a\", 1, \"b\", 2, \"c\", 3)) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', array())) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', array(1, 2, 3))) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object('outer', json_object('inner', 123.45))) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = to_json_string(json_object(\"array\", json_array(1,2,0,-1,1.1,-0.11))) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | where json_valid('{\"account_number\":1,\"balance\":39225,\"age\":32,\"gender\":\"M\"}') | head 1") + + spark.sql("source = myglue_test.default.http_logs | where not json_valid('{\"account_number\":1,\"balance\":39225,\"age\":32,\"gender\":\"M\"}') | head 1") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('{\"account_number\":1,\"balance\":39225,\"age\":32,\"gender\":\"M\"}')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('{\"f1\":\"abc\",\"f2\":{\"f3\":\"a\",\"f4\":\"b\"}}')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[]')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('{\"invalid\": \"json\"')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2,3]')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('[invalid json]')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json('invalid json')) | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_keys(json(null)) | head 1 | fields res") + + spark.sql("source =
myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.teacher') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[*]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[0]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[*].name') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[1].name') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[0].not_exist_key') | head 1 | fields res") + + spark.sql("source = myglue_test.default.http_logs | eval res = json_extract('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}', '$.student[10]') | head 1 | fields res") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > -10) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(json_object(\"a\",1,\"b\",-1),json_object(\"a\",-1,\"b\",-1)), result = forall(array, x -> x.a > 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(json_object(\"a\",1,\"b\",-1),json_object(\"a\",-1,\"b\",-1)), result = exists(array, x -> x.b < 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 10) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 0) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 10) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,3), result = transform(array, x -> x + 1) | head 1 | fields result") + + spark.sql("source = 
myglue_test.default.people | eval array = json_array(1,2,3), result = transform(array, (x, y) -> x + y) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x) | head 1 | fields result") + + spark.sql("source = myglue_test.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x, acc -> acc * 10) | head 1 | fields result") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eventstats avg(salary) by country | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by country | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) as age_span, country | sort id | head 10") + + spark.sql("source=myglue_test.default.people | where country != 'USA' | eventstats stddev_samp(salary), stddev_pop(salary), percentile_approx(salary, 60) by span(salary, 1000) as salary_span | sort id | head 10") + + spark.sql("source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age by occupation, country | eventstats avg(avg_age) as avg_state_age by country | sort id | head 10") + + try { + spark.sql("source=myglue_test.default.people | eventstats distinct_count(salary) by span(salary, 1000) as age_span") + throw new Error + } catch { + case e: Exception => null + } + + spark.sql("source = myglue_test.tpch_csv.lineitem| where l_shipdate <= subdate(date('1998-12-01'), 90)| stats sum(l_quantity) as sum_qty, sum(l_extendedprice) as sum_base_price, sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, avg(l_quantity) as avg_qty, avg(l_extendedprice) as avg_price, avg(l_discount) as avg_disc, count() as count_order by l_returnflag, l_linestatus| sort l_returnflag, l_linestatus") + + spark.sql("source = myglue_test.tpch_csv.part| join ON p_partkey = ps_partkey myglue_test.tpch_csv.partsupp| join ON s_suppkey = ps_suppkey myglue_test.tpch_csv.supplier| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region| where p_size = 15 AND like(p_type, '%BRASS') AND r_name = 'EUROPE' AND ps_supplycost = [ source = myglue_test.tpch_csv.partsupp | join ON s_suppkey = ps_suppkey myglue_test.tpch_csv.supplier | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation | join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region | where r_name = 'EUROPE' | stats MIN(ps_supplycost) ]| sort - s_acctbal, n_name, s_name, p_partkey| head 100") + + spark.sql("source = myglue_test.tpch_csv.customer| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem| where c_mktsegment = 'BUILDING' AND o_orderdate
< date('1995-03-15') AND l_shipdate > date('1995-03-15')| stats sum(l_extendedprice * (1 - l_discount)) as revenue by l_orderkey, o_orderdate, o_shippriority | sort - revenue, o_orderdate| head 10") + + spark.sql("source = myglue_test.tpch_csv.orders| where o_orderdate >= date('1993-07-01') and o_orderdate < date_add(date('1993-07-01'), interval 3 month) and exists [ source = myglue_test.tpch_csv.lineitem | where l_orderkey = o_orderkey and l_commitdate < l_receiptdate ]| stats count() as order_count by o_orderpriority| sort o_orderpriority") + + spark.sql("source = myglue_test.tpch_csv.customer| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem| join ON l_suppkey = s_suppkey AND c_nationkey = s_nationkey myglue_test.tpch_csv.supplier| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region| where r_name = 'ASIA' AND o_orderdate >= date('1994-01-01') AND o_orderdate < date_add(date('1994-01-01'), interval 1 year)| stats sum(l_extendedprice * (1 - l_discount)) as revenue by n_name| sort - revenue") + + spark.sql("source = myglue_test.tpch_csv.lineitem| where l_shipdate >= date('1994-01-01') and l_shipdate < adddate(date('1994-01-01'), 365) and l_discount between .06 - 0.01 and .06 + 0.01 and l_quantity < 24| stats sum(l_extendedprice * l_discount) as revenue") + + spark.sql("source = [ source = myglue_test.tpch_csv.supplier | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.lineitem | join ON o_orderkey = l_orderkey myglue_test.tpch_csv.orders | join ON c_custkey = o_custkey myglue_test.tpch_csv.customer | join ON s_nationkey = n1.n_nationkey myglue_test.tpch_csv.nation as n1 | join ON c_nationkey = n2.n_nationkey myglue_test.tpch_csv.nation as n2 | where l_shipdate between date('1995-01-01') and date('1996-12-31') and n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY' or n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE' | eval supp_nation = n1.n_name, cust_nation = n2.n_name, l_year = year(l_shipdate), volume = l_extendedprice * (1 - l_discount) | fields supp_nation, cust_nation, l_year, volume ] as shipping| stats sum(volume) as revenue by supp_nation, cust_nation, l_year| sort supp_nation, cust_nation, l_year") + + spark.sql("source = [ source = myglue_test.tpch_csv.part | join ON p_partkey = l_partkey myglue_test.tpch_csv.lineitem | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.supplier | join ON l_orderkey = o_orderkey myglue_test.tpch_csv.orders | join ON o_custkey = c_custkey myglue_test.tpch_csv.customer | join ON c_nationkey = n1.n_nationkey myglue_test.tpch_csv.nation as n1 | join ON s_nationkey = n2.n_nationkey myglue_test.tpch_csv.nation as n2 | join ON n1.n_regionkey = r_regionkey myglue_test.tpch_csv.region | where r_name = 'AMERICA' AND p_type = 'ECONOMY ANODIZED STEEL' and o_orderdate between date('1995-01-01') and date('1996-12-31') | eval o_year = year(o_orderdate) | eval volume = l_extendedprice * (1 - l_discount) | eval nation = n2.n_name | fields o_year, volume, nation ] as all_nations| stats sum(case(nation = 'BRAZIL', volume else 0)) as sum_case, sum(volume) as sum_volume by o_year| eval mkt_share = sum_case / sum_volume| fields mkt_share, o_year| sort o_year") + + spark.sql("source = [ source = myglue_test.tpch_csv.part | join ON p_partkey = l_partkey myglue_test.tpch_csv.lineitem | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.supplier | join ON ps_partkey = l_partkey and ps_suppkey = l_suppkey myglue_test.tpch_csv.partsupp 
| join ON o_orderkey = l_orderkey myglue_test.tpch_csv.orders | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation | where like(p_name, '%green%') | eval nation = n_name | eval o_year = year(o_orderdate) | eval amount = l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity | fields nation, o_year, amount ] as profit| stats sum(amount) as sum_profit by nation, o_year| sort nation, - o_year") + + spark.sql("source = myglue_test.tpch_csv.customer| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem| join ON c_nationkey = n_nationkey myglue_test.tpch_csv.nation| where o_orderdate >= date('1993-10-01') AND o_orderdate < date_add(date('1993-10-01'), interval 3 month) AND l_returnflag = 'R'| stats sum(l_extendedprice * (1 - l_discount)) as revenue by c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment| sort - revenue| head 20") + + spark.sql("source = myglue_test.tpch_csv.partsupp| join ON ps_suppkey = s_suppkey myglue_test.tpch_csv.supplier| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| where n_name = 'GERMANY'| stats sum(ps_supplycost * ps_availqty) as value by ps_partkey| where value > [ source = myglue_test.tpch_csv.partsupp | join ON ps_suppkey = s_suppkey myglue_test.tpch_csv.supplier | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation | where n_name = 'GERMANY' | stats sum(ps_supplycost * ps_availqty) as check | eval threshold = check * 0.0001000000 | fields threshold ]| sort - value") + + spark.sql("source = myglue_test.tpch_csv.orders| join ON o_orderkey = l_orderkey myglue_test.tpch_csv.lineitem| where l_commitdate < l_receiptdate and l_shipdate < l_commitdate and l_shipmode in ('MAIL', 'SHIP') and l_receiptdate >= date('1994-01-01') and l_receiptdate < date_add(date('1994-01-01'), interval 1 year)| stats sum(case(o_orderpriority = '1-URGENT' or o_orderpriority = '2-HIGH', 1 else 0)) as high_line_count, sum(case(o_orderpriority != '1-URGENT' and o_orderpriority != '2-HIGH', 1 else 0)) as low_line_count by l_shipmode| sort l_shipmode") + + spark.sql("source = [ source = myglue_test.tpch_csv.customer | left outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') myglue_test.tpch_csv.orders | stats count(o_orderkey) as c_count by c_custkey ] as c_orders| stats count() as custdist by c_count| sort - custdist, - c_count") + + spark.sql("source = myglue_test.tpch_csv.lineitem| join ON l_partkey = p_partkey AND l_shipdate >= date('1995-09-01') AND l_shipdate < date_add(date('1995-09-01'), interval 1 month) myglue_test.tpch_csv.part| stats sum(case(like(p_type, 'PROMO%'), l_extendedprice * (1 - l_discount) else 0)) as sum1, sum(l_extendedprice * (1 - l_discount)) as sum2| eval promo_revenue = 100.00 * sum1 / sum2 // Stats and Eval commands can combine when issues/819 resolved| fields promo_revenue") + + spark.sql("source = myglue_test.tpch_csv.supplier| join right = revenue0 ON s_suppkey = supplier_no [ source = myglue_test.tpch_csv.lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 month) | eval supplier_no = l_suppkey | stats sum(l_extendedprice * (1 - l_discount)) as total_revenue by supplier_no ]| where total_revenue = [ source = [ source = myglue_test.tpch_csv.lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 month) | eval supplier_no = l_suppkey | stats sum(l_extendedprice * (1 - l_discount)) as total_revenue by
supplier_no ] | stats max(total_revenue) ]| sort s_suppkey| fields s_suppkey, s_name, s_address, s_phone, total_revenue") + + spark.sql("source = myglue_test.tpch_csv.partsupp| join ON p_partkey = ps_partkey myglue_test.tpch_csv.part| where p_brand != 'Brand#45' and not like(p_type, 'MEDIUM POLISHED%') and p_size in (49, 14, 23, 45, 19, 3, 36, 9) and ps_suppkey not in [ source = myglue_test.tpch_csv.supplier | where like(s_comment, '%Customer%Complaints%') | fields s_suppkey ]| stats distinct_count(ps_suppkey) as supplier_cnt by p_brand, p_type, p_size| sort - supplier_cnt, p_brand, p_type, p_size") + + spark.sql("source = myglue_test.tpch_csv.lineitem| join ON p_partkey = l_partkey myglue_test.tpch_csv.part| where p_brand = 'Brand#23' and p_container = 'MED BOX' and l_quantity < [ source = myglue_test.tpch_csv.lineitem | where l_partkey = p_partkey | stats avg(l_quantity) as avg | eval `0.2 * avg` = 0.2 * avg | fields `0.2 * avg` ]| stats sum(l_extendedprice) as sum| eval avg_yearly = sum / 7.0| fields avg_yearly") + + spark.sql("source = myglue_test.tpch_csv.customer| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders| join ON o_orderkey = l_orderkey myglue_test.tpch_csv.lineitem| where o_orderkey in [ source = myglue_test.tpch_csv.lineitem | stats sum(l_quantity) as sum by l_orderkey | where sum > 300 | fields l_orderkey ]| stats sum(l_quantity) by c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice| sort - o_totalprice, o_orderdate| head 100") + + spark.sql("source = myglue_test.tpch_csv.lineitem| join ON p_partkey = l_partkey and p_brand = 'Brand#12' and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') and l_quantity >= 1 and l_quantity <= 1 + 10 and p_size between 1 and 5 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' OR p_partkey = l_partkey and p_brand = 'Brand#23' and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') and l_quantity >= 10 and l_quantity <= 10 + 10 and p_size between 1 and 10 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' OR p_partkey = l_partkey and p_brand = 'Brand#34' and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') and l_quantity >= 20 and l_quantity <= 20 + 10 and p_size between 1 and 15 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' myglue_test.tpch_csv.part") + + spark.sql("source = myglue_test.tpch_csv.supplier| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| where n_name = 'CANADA' and s_suppkey in [ source = myglue_test.tpch_csv.partsupp | where ps_partkey in [ source = myglue_test.tpch_csv.part | where like(p_name, 'forest%') | fields p_partkey ] and ps_availqty > [ source = myglue_test.tpch_csv.lineitem | where l_partkey = ps_partkey and l_suppkey = ps_suppkey and l_shipdate >= date('1994-01-01') and l_shipdate < date_add(date('1994-01-01'), interval 1 year) | stats sum(l_quantity) as sum_l_quantity | eval half_sum_l_quantity = 0.5 * sum_l_quantity | fields half_sum_l_quantity ] | fields ps_suppkey ]") + + spark.sql("source = myglue_test.tpch_csv.supplier| join ON s_suppkey = l1.l_suppkey myglue_test.tpch_csv.lineitem as l1| join ON o_orderkey = l1.l_orderkey myglue_test.tpch_csv.orders| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation| where o_orderstatus = 'F' and l1.l_receiptdate > l1.l_commitdate and exists [ source = myglue_test.tpch_csv.lineitem as l2 | where l2.l_orderkey = l1.l_orderkey and l2.l_suppkey != l1.l_suppkey ] and not exists [ source = myglue_test.tpch_csv.lineitem as l3 | 
where l3.l_orderkey = l1.l_orderkey and l3.l_suppkey != l1.l_suppkey and l3.l_receiptdate > l3.l_commitdate ] and n_name = 'SAUDI ARABIA'| stats count() as numwait by s_name| sort - numwait, s_name| head 100") + + spark.sql("source = [ source = myglue_test.tpch_csv.customer | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') and c_acctbal > [ source = myglue_test.tpch_csv.customer | where c_acctbal > 0.00 and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') | stats avg(c_acctbal) ] and not exists [ source = myglue_test.tpch_csv.orders | where o_custkey = c_custkey ] | eval cntrycode = substring(c_phone, 1, 2) | fields cntrycode, c_acctbal ] as custsale| stats count() as numcust, sum(c_acctbal) as totacctbal by cntrycode| sort cntrycode") + +} diff --git a/docker/integ-test/spark-defaults.conf b/docker/integ-test/spark-defaults.conf new file mode 100644 index 000000000..19b9e4ec1 --- /dev/null +++ b/docker/integ-test/spark-defaults.conf @@ -0,0 +1,35 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" +spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions,org.opensearch.flint.spark.FlintSparkExtensions +spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog +spark.datasource.flint.host opensearch +spark.datasource.flint.port 9200 +spark.datasource.flint.scheme http +spark.datasource.flint.auth basic +spark.datasource.flint.auth.username admin +spark.datasource.flint.auth.password C0rrecthorsebatterystaple. 
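With these defaults in place, any Spark shell started inside the cluster loads the PPL/Flint extensions and the OpenSearch connection settings automatically. A minimal smoke test of the configuration (a sketch; the index name `test` is hypothetical and must already exist in OpenSearch):

```shell
# Start the cluster defined in docker-compose.yml, then open a shell on the master
docker compose up -d
docker exec -it spark /opt/bitnami/spark/bin/spark-shell --master spark://spark:7077
# Inside the shell, the `dev` catalog configured above maps to OpenSearch, e.g.:
#   spark.sql("source = dev.default.test | head 5").show()
```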
diff --git a/docker/integ-test/spark-master-entrypoint.sh b/docker/integ-test/spark-master-entrypoint.sh new file mode 100755 index 000000000..a21c20643 --- /dev/null +++ b/docker/integ-test/spark-master-entrypoint.sh @@ -0,0 +1,19 @@
+#!/bin/bash
+
+function start_spark_connect() {
+    # Derive the Scala/Spark version pair (e.g. "2.12:3.5.3") from the spark-core jar name
+    sc_version=$(ls -1 /opt/bitnami/spark/jars/spark-core_*.jar | sed -e 's/^.*\/spark-core_//' -e 's/\.jar$//' -e 's/-/:/')
+
+    attempt=1
+    # Attempt the launch up to 10 times (once per second) while the master sentinel file exists
+    while [ -e "/tmp/spark_master_running" -a "$attempt" -le 10 ]; do
+        sleep 1
+        /opt/bitnami/spark/sbin/start-connect-server.sh --master spark://spark:7077 --packages org.apache.spark:spark-connect_${sc_version}
+        attempt=$(($attempt+1))
+    done
+}
+
+touch /tmp/spark_master_running
+start_spark_connect &
+/opt/bitnami/scripts/spark/entrypoint.sh /opt/bitnami/scripts/spark/run.sh
+rm /tmp/spark_master_running
diff --git a/docs/ppl-lang/functions/ppl-ip.md b/docs/ppl-lang/functions/ppl-ip.md index fb0b468ba..65cc9dac9 100644 --- a/docs/ppl-lang/functions/ppl-ip.md +++ b/docs/ppl-lang/functions/ppl-ip.md @@ -32,4 +32,67 @@ Note:
 - `ip` can be an IPv4 or an IPv6 address
 - `cidr` can be an IPv4 or an IPv6 block
 - `ip` and `cidr` must be either both IPv4 or both IPv6
-- `ip` and `cidr` must both be valid and non-empty/non-null
\ No newline at end of file
+- `ip` and `cidr` must both be valid and non-empty/non-null
+
+### `GEOIP`
+
+**Description**
+
+`GEOIP(ip[, property]...)` retrieves geospatial data corresponding to the provided `ip`.
+
+**Argument type:**
+- `ip` is a **STRING** representing an IPv4 or an IPv6 address.
+- `property` is a **STRING** and must be one of the following:
+  - `COUNTRY_ISO_CODE`
+  - `COUNTRY_NAME`
+  - `CONTINENT_NAME`
+  - `REGION_ISO_CODE`
+  - `REGION_NAME`
+  - `CITY_NAME`
+  - `TIME_ZONE`
+  - `LOCATION`
+- Return type:
+  - **STRING** if one property is given
+  - **STRUCT_TYPE** if more than one or no property is given
+
+Example:
+
+_Without properties:_
+
+    os> source=ips | eval a = geoip(ip) | fields ip, a
+    fetched rows / total rows = 2/2
+    +---------------------+-------------------------------------------------------------------------------------------------------+
+    |ip                   |a                                                                                                      |
+    +---------------------+-------------------------------------------------------------------------------------------------------+
+    |66.249.157.90        |{JM, Jamaica, North America, 14, Saint Catherine Parish, Portmore, America/Jamaica, 17.9686,-76.8827}  |
+    |2a09:bac2:19f8:2ac3::|{CA, Canada, North America, PE, Prince Edward Island, Charlottetown, America/Halifax, 46.2396,-63.1355}|
+    +---------------------+-------------------------------------------------------------------------------------------------------+
+
+_With one property:_
+
+    os> source=users | eval a = geoip(ip, COUNTRY_NAME) | fields ip, a
+    fetched rows / total rows = 2/2
+    +---------------------+-------+
+    |ip                   |a      |
+    +---------------------+-------+
+    |66.249.157.90        |Jamaica|
+    |2a09:bac2:19f8:2ac3::|Canada |
+    +---------------------+-------+
+
+_With multiple properties:_
+
+    os> source=users | eval a = geoip(ip, COUNTRY_NAME, REGION_NAME, CITY_NAME) | fields ip, a
+    fetched rows / total rows = 2/2
+    +---------------------+---------------------------------------------+
+    |ip                   |a                                            |
+    +---------------------+---------------------------------------------+
+    |66.249.157.90        |{Jamaica, Saint Catherine Parish, Portmore}  |
+    |2a09:bac2:19f8:2ac3::|{Canada, Prince Edward Island, Charlottetown}|
+    +---------------------+---------------------------------------------+
+
+Note:
+- To use `geoip`, the user must create a Spark table containing geo IP location data. Instructions for creating the table can be found [here](../../opensearch-geoip.md).
+  - By default, `geoip` expects the created table to be named `geoip_ip_data`.
+  - If a different table name is desired, set the `spark.geoip.tablename` Spark config to the new table name.
+- `ip` can be an IPv4 or an IPv6 address.
+- `geoip` is always evaluated first when used with other eval functions.
diff --git a/docs/ppl-lang/planning/ppl-geoip-command.md b/docs/ppl-lang/planning/ppl-geoip-command.md new file mode 100644 index 000000000..aaed6c156 --- /dev/null +++ b/docs/ppl-lang/planning/ppl-geoip-command.md @@ -0,0 +1,59 @@
+## geoip syntax proposal
+
+The geoip function adds information about the geographical location of an IPv4 or IPv6 address.
+
+**Implementation syntax**
+- `... | eval geoinfo = geoip(ipAddress *[,properties])`
+- generic syntax
+- `... | eval geoinfo = geoip(ipAddress)`
+- retrieves all geo data
+- `... | eval geoinfo = geoip(ipAddress, city, location)`
+- retrieves only city and location
+
+**Implementation details**
+- The current implementation requires the user to have created a geoip table. The geoip table has the following schema:
+
+  ```SQL
+  CREATE TABLE geoip (
+      cidr STRING,
+      country_iso_code STRING,
+      country_name STRING,
+      continent_name STRING,
+      region_iso_code STRING,
+      region_name STRING,
+      city_name STRING,
+      time_zone STRING,
+      location STRING,
+      ip_range_start BIGINT,
+      ip_range_end BIGINT,
+      ipv4 BOOLEAN
+  )
+  ```
+
+- `geoip` is resolved by performing a join on said table and projecting the resulting geoip data as a struct.
+- using `geoip` is equivalent to running the following SQL query (note the `ipv4` column from the schema above):
+
+  ```SQL
+  SELECT source.*, struct(geoip.country_name, geoip.city_name) AS a
+  FROM source, geoip
+  WHERE geoip.ip_range_start <= ip_to_int(source.ip)
+    AND geoip.ip_range_end > ip_to_int(source.ip)
+    AND geoip.ipv4 = is_ipv4(source.ip);
+  ```
+- if only one property is provided in the function call, `geoip` returns a string of the specified property instead:
+
+  ```SQL
+  SELECT source.*, geoip.country_name AS a
+  FROM source, geoip
+  WHERE geoip.ip_range_start <= ip_to_int(source.ip)
+    AND geoip.ip_range_end > ip_to_int(source.ip)
+    AND geoip.ipv4 = is_ipv4(source.ip);
+  ```
+
+**Future plan for additional data-sources**
+
+- Currently, only a pre-existing geoip table defined within Spark can be used.
+- There are plans to allow users to specify other data sources:
+  - API data sources: if users have their own geoip provider, we will add the ability to configure and call those endpoints
+  - OpenSearch geospatial client: once the geospatial client is published, we can leverage it to use the OpenSearch geo2ip functionality.
+- Additional data-source connection params will be provided through Spark config options.
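The join conditions in this proposal rely on addresses being stored in integer form. A minimal illustration of that convention (hypothetical helpers mirroring the `ip_to_int`/`is_ipv4` names above; not part of this change):

```python
# Illustrative sketch only: how ip_to_int/is_ipv4-style helpers map an address and
# a CIDR block onto the ip_range_start/ip_range_end columns used in the join above.
import ipaddress

def ip_to_int(ip: str) -> int:
    # IPv4 and IPv6 addresses both reduce to integers, so a CIDR block
    # becomes one contiguous [ip_range_start, ip_range_end) interval.
    return int(ipaddress.ip_address(ip))

def is_ipv4(ip: str) -> bool:
    return ipaddress.ip_address(ip).version == 4

net = ipaddress.ip_network("66.249.157.0/24")
ip_range_start, ip_range_end = int(net[0]), int(net[-1]) + 1
assert ip_range_start <= ip_to_int("66.249.157.90") < ip_range_end
assert is_ipv4("66.249.157.90")
```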
diff --git a/integ-test/script/README.md b/integ-test/script/README.md index 7ce0c6886..f9e9a8e93 100644 --- a/integ-test/script/README.md +++ b/integ-test/script/README.md @@ -17,41 +17,55 @@ Apart from the basic feature, it also has some advanced functionality includes:
 ### Usage
 To use this script, you need to have Python **3.6** or higher installed. It also requires the following Python libraries:
 ```shell
-pip install requests pandas openpyxl
+pip install requests pandas openpyxl pyspark setuptools pyarrow grpcio grpcio-status protobuf
+```
+
+Build the Flint and PPL extensions for Spark:
+```shell
+sbt clean
+sbt sparkSqlApplicationCosmetic/assembly sparkPPLCosmetic/assembly
+```
+
+Next, start the Docker containers that will be used for the tests, from the directory `docker/integ-test`:
+```shell
+docker compose up -d
+```
+
+After the tests are finished, the Docker containers can be stopped from the directory `docker/integ-test` with:
+```shell
+docker compose down
 ```
 
 After getting the requisite libraries, you can run the script with the following command line parameters in your shell:
 ```shell
-python SanityTest.py --base-url ${URL_ADDRESS} --username *** --password *** --datasource ${DATASOURCE_NAME} --input-csv test_cases.csv --output-file test_report --max-workers 2 --check-interval 10 --timeout 600
+python SanityTest.py --spark-url ${SPARK_URL} --username *** --password *** --opensearch-url ${OPENSEARCH_URL} --input-csv test_cases.csv --output-file test_report
 ```
-You need to replace the placeholders with your actual values of URL_ADDRESS, DATASOURCE_NAME and USERNAME, PASSWORD for authentication to your endpoint.
+You need to replace the placeholders with your actual values of SPARK_URL, OPENSEARCH_URL and USERNAME, PASSWORD for authentication to your endpoint.
+
+Running against the docker cluster, `SPARK_URL` should be set to `sc://localhost:15002` and `OPENSEARCH_URL` should be set
+to `http://localhost:9200`.
 
 For more details of the command line parameters, you can see the help manual via command:
 ```shell
 python SanityTest.py --help
 
-usage: SanityTest.py [-h] --base-url BASE_URL --username USERNAME --password PASSWORD --datasource DATASOURCE --input-csv INPUT_CSV
-                     --output-file OUTPUT_FILE [--max-workers MAX_WORKERS] [--check-interval CHECK_INTERVAL] [--timeout TIMEOUT]
+usage: SanityTest.py [-h] --spark-url SPARK_URL --username USERNAME --password PASSWORD --opensearch-url OPENSEARCH_URL --input-csv INPUT_CSV
+                     --output-file OUTPUT_FILE
                      [--start-row START_ROW] [--end-row END_ROW]
 
 Run tests from a CSV file and generate a report.
 
 options:
   -h, --help            show this help message and exit
-  --base-url BASE_URL   Base URL of the service
+  --spark-url SPARK_URL
+                        Spark Connect URL of the service
   --username USERNAME   Username for authentication
   --password PASSWORD   Password for authentication
-  --datasource DATASOURCE
-                        Datasource name
+  --opensearch-url OPENSEARCH_URL
+                        URL of the OpenSearch service
   --input-csv INPUT_CSV
                         Path to the CSV file containing test queries
   --output-file OUTPUT_FILE
                         Path to the output report file
-  --max-workers MAX_WORKERS
-                        optional, Maximum number of worker threads (default: 2)
-  --check-interval CHECK_INTERVAL
-                        optional, Check interval in seconds (default: 10)
-  --timeout TIMEOUT     optional, Timeout in seconds (default: 600)
   --start-row START_ROW
                         optional, The start row of the query to run, start from 1
   --end-row END_ROW     optional, The end row of the query to run, not included
@@ -64,7 +78,20 @@ As claimed in the description, the input CSV file should at least have the colum
 
 We also provide a sample input CSV file `test_cases.csv` for reference. It includes all sanity test cases we have currently in the Flint.
 
-**TODO**: the prerequisite data of the test cases and ingesting process
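Roughly, the script consumes that CSV as follows (an illustrative sketch, not the script's exact code; the 1-based row number serves as the query's sequence id in the report):

```python
# Illustrative sketch of the input format: every row needs a `query` column;
# `expected_status` is optional.
import csv

with open("test_cases.csv") as f:
    rows = list(csv.DictReader(f))

# The 1-based row number doubles as the query's sequence id.
queries = [(row["query"], seq_id, row.get("expected_status"))
           for seq_id, row in enumerate(rows, start=1)]
print(queries[0])
```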
+### Indices and Data for Testing
+After the docker containers have started, the test script will try to create the indices needed for testing. It looks in the directory `data`, starting with
+all files whose names end with `.mapping.json`. The start of the filename is the name of the index to create, and the contents of the file are the field mappings.
+
+[Supported field types](https://opensearch.org/docs/latest/field-types/supported-field-types/index/)
+
+[Example mapping](https://opensearch.org/docs/latest/field-types/supported-field-types/index/#example)
+
+After the indices have been created, the script looks for all other files ending with `.json`. These are the files for bulk inserting data into the indices. The start
+of the filename is the index to insert data into, and the contents of the file are used as the body of the bulk insert request (a condensed sketch of both steps follows the links below).
+
+[Bulk Insert](https://opensearch.org/docs/latest/api-reference/document-apis/bulk/)
+
+[Example Body](https://opensearch.org/docs/latest/api-reference/document-apis/bulk/)
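Condensed, the two setup steps above amount to one `PUT` per mapping file and one `_bulk` `POST` per data file. A sketch assuming the compose defaults (OpenSearch on `localhost:9200`, admin password from `docker/integ-test/.env`):

```python
# Sketch of the index setup described above, for one index.
import requests
from requests.auth import HTTPBasicAuth

auth = HTTPBasicAuth("admin", "C0rrecthorsebatterystaple.")
base = "http://localhost:9200"

# The index name is the part of the filename before the first '.'
with open("data/http_logs.mapping.json", "rb") as f:
    requests.put(f"{base}/http_logs", auth=auth,
                 headers={"Content-Type": "application/json"},
                 data=f.read()).raise_for_status()

# The data file is already in ndjson bulk format, so it is sent as-is.
with open("data/http_logs.json", "rb") as f:
    requests.post(f"{base}/http_logs/_bulk", auth=auth,
                  headers={"Content-Type": "application/x-ndjson"},
                  data=f.read()).raise_for_status()
```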
 ### Report Explanation
 The generated report contains two files:
@@ -78,12 +105,12 @@ It also provides the query_id, session_id and start/end time for each query, whi
 
 An example of Excel report:
 
-| query_name | query | expected_status | status | check_status | error | result | Duration (s) | query_id | session_id | Start Time | End Time |
-|------------|-------|-----------------|--------|--------------|-------|--------|--------------|----------|------------|------------|----------|
-| 1 | describe myglue_test.default.http_logs | SUCCESS | SUCCESS | TRUE | | {'status': 'SUCCESS', 'schema': [{...}, ...], 'datarows': [[...], ...], 'total': 31, 'size': 31} | 37.51 | SHFEVWxDNnZjem15Z2x1ZV90ZXN0 | RkgzZm0xNlA5MG15Z2x1ZV90ZXN0 | 2024-11-07 13:34:10 | 2024-11-07 13:34:47 |
-| 2 | source = myglue_test.default.http_logs \| dedup status CONSECUTIVE=true | SUCCESS | FAILED | FALSE | {"Message":"Fail to run query. Cause: Consecutive deduplication is not supported"} | | 39.53 | dVNlaVVxOFZrZW15Z2x1ZV90ZXN0 | ZGU2MllVYmI4dG15Z2x1ZV90ZXN0 | 2024-11-07 13:34:10 | 2024-11-07 13:34:49 |
-| 3 | source = myglue_test.default.http_logs \| eval res = json_keys(json('{"account_number":1,"balance":39225,"age":32,"gender":"M"}')) \| head 1 \| fields res | SUCCESS | SUCCESS | TRUE | | {'status': 'SUCCESS', 'schema': [{'name': 'res', 'type': 'array'}], 'datarows': [[['account_number', 'balance', 'age', 'gender']]], 'total': 1, 'size': 1} | 12.77 | WHQxaXlVSGtGUm15Z2x1ZV90ZXN0 | RkgzZm0xNlA5MG15Z2x1ZV90ZXN0 | 2024-11-07 13:34:47 | 2024-11-07 13:38:45 |
-| ... | ... | ... | ... | ... | | | ... | ... | ... | ... | ... |
+| query_name | query | expected_status | status | check_status | error | result | duration (s) | Start Time | End Time |
+|------------|-------|-----------------|--------|--------------|-------|--------|--------------|------------|----------|
+| 1 | describe myglue_test.default.http_logs | SUCCESS | SUCCESS | TRUE | | {'status': 'SUCCESS', 'schema': [{...}, ...], 'datarows': [[...], ...], 'total': 31, 'size': 31} | 37.51 | 2024-11-07 13:34:10 | 2024-11-07 13:34:47 |
+| 2 | source = myglue_test.default.http_logs \| dedup status CONSECUTIVE=true | SUCCESS | FAILED | FALSE | {"Message":"Fail to run query. Cause: Consecutive deduplication is not supported"} | | 39.53 | 2024-11-07 13:34:10 | 2024-11-07 13:34:49 |
+| 3 | source = myglue_test.default.http_logs \| eval res = json_keys(json('{"account_number":1,"balance":39225,"age":32,"gender":"M"}')) \| head 1 \| fields res | SUCCESS | SUCCESS | TRUE | | {'status': 'SUCCESS', 'schema': [{'name': 'res', 'type': 'array'}], 'datarows': [[['account_number', 'balance', 'age', 'gender']]], 'total': 1, 'size': 1} | 12.77 | 2024-11-07 13:34:47 | 2024-11-07 13:38:45 |
+| ... | ... | ... | ... | ... | | | ... | ... | ... |
 
 #### JSON Report
@@ -103,7 +130,7 @@ An example of JSON report:
   "detailed_results": [
     {
       "query_name": 1,
-      "query": "source = myglue_test.default.http_logs | stats avg(size)",
+      "query": "source = dev.default.http_logs | stats avg(size)",
       "query_id": "eFZmTlpTa3EyTW15Z2x1ZV90ZXN0",
       "session_id": "bFJDMWxzb2NVUm15Z2x1ZV90ZXN0",
       "status": "SUCCESS",
@@ -130,7 +157,7 @@ An example of JSON report:
     },
     {
       "query_name": 2,
-      "query": "source = myglue_test.default.http_logs | eval res = json_keys(json('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}')) | head 1 | fields res",
+      "query": "source = dev.default.http_logs | eval res = json_keys(json('{\"teacher\":\"Alice\",\"student\":[{\"name\":\"Bob\",\"rank\":1},{\"name\":\"Charlie\",\"rank\":2}]}')) | head 1 | fields res",
       "query_id": "bjF4Y1VnbXdFYm15Z2x1ZV90ZXN0",
       "session_id": "c3pvU1V6OW8xM215Z2x1ZV90ZXN0",
       "status": "FAILED",
@@ -142,7 +169,7 @@
     },
     {
       "query_name": 2,
-      "query": "source = myglue_test.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2",
+      "query": "source = dev.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2",
       "query_id": "azVyMFFORnBFRW15Z2x1ZV90ZXN0",
      "session_id": "VWF0SEtrNWM3bm15Z2x1ZV90ZXN0",
      "status": "TIMEOUT",
diff --git a/integ-test/script/SanityTest.py b/integ-test/script/SanityTest.py index eb97752b4..b4e6210bb 100644 --- a/integ-test/script/SanityTest.py +++ b/integ-test/script/SanityTest.py @@ -3,6 +3,7 @@
 SPDX-License-Identifier: Apache-2.0
 """
 
+import glob
 import signal
 import sys
 import requests
@@ -11,18 +12,18 @@ import time
 import logging
 from datetime import datetime
-import pandas as pd
 import argparse
 from requests.auth import HTTPBasicAuth
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from pyspark.sql import SparkSession
 import threading
+import pandas as pd
 
 """
 Environment: python3
 
 Example to use this script:
 
-python SanityTest.py --base-url ${URL_ADDRESS} --username *** --password *** --datasource ${DATASOURCE_NAME} --input-csv test_queries.csv --output-file test_report --max-workers 2 --check-interval 10 --timeout 600
+python SanityTest.py --spark-url ${SPARK_URL} --username *** --password *** --opensearch-url ${OPENSEARCH_URL} --input-csv test_queries.csv --output-file test_report
 
 The input file test_queries.csv should contain column: `query`
 
@@ -33,24 +34,19 @@
 """
 
 class FlintTester:
-    def __init__(self, base_url, username, password, datasource, max_workers, check_interval, timeout, output_file, start_row, end_row, log_level):
-        self.base_url = base_url
+    def __init__(self, spark_url, username, password, opensearch_url, output_file, start_row, end_row, log_level):
+        self.spark_url = spark_url
         self.auth = HTTPBasicAuth(username, password)
-        self.datasource = datasource
-        self.headers = { 'Content-Type': 'application/json' }
-        self.max_workers = max_workers
-        self.check_interval = check_interval
-        self.timeout = timeout
+        self.opensearch_url = opensearch_url
         self.output_file = output_file
         self.start = start_row - 1 if start_row else None
         self.end = end_row - 1 if end_row else None
         self.log_level = log_level
-        self.max_attempts = (int)(timeout / check_interval)
         self.logger = self._setup_logger()
-        self.executor = ThreadPoolExecutor(max_workers=self.max_workers)
-        self.thread_local = threading.local()
         self.test_results = []
+        self.spark_client = SparkSession.builder.remote(spark_url).appName("integ-test").getOrCreate()
+
     def _setup_logger(self):
         logger = logging.getLogger('FlintTester')
         logger.setLevel(self.log_level)
@@ -72,126 +68,80 @@ def _setup_logger(self):
 
         return logger
 
+    # Create the indices needed for the tests
+    def create_indices(self):
+        self.logger.info("Creating indices")
 
-    def get_session_id(self):
-        if not hasattr(self.thread_local, 'session_id'):
-            self.thread_local.session_id = "empty_session_id"
-        self.logger.debug(f"get session id {self.thread_local.session_id}")
-        return self.thread_local.session_id
+        json_files = glob.glob('data/*.json')
+        mapping_files = [f for f in json_files if f.endswith('.mapping.json')]
+        data_files = [f for f in json_files if not f.endswith('.mapping.json')]
+        existing_indices = set()
 
-    def set_session_id(self, session_id):
-        """Reuse the session id for the same thread"""
-        self.logger.debug(f"set session id {session_id}")
-        self.thread_local.session_id = session_id
+        for mapping_file in mapping_files:
+            index_name = mapping_file[5 : mapping_file.index('.')]
 
-    # Call submit API to submit the query
-    def submit_query(self, query, session_id="Empty"):
-        url = f"{self.base_url}/_plugins/_async_query"
-        payload = {
-            "datasource": self.datasource,
-            "lang": "ppl",
-            "query": query,
-            "sessionId": session_id
-        }
-        self.logger.debug(f"Submit query with payload: {payload}")
-        response_json = None
-        try:
-            response = requests.post(url, auth=self.auth, json=payload, headers=self.headers)
-            response_json = response.json()
-            response.raise_for_status()
-            return response_json
-        except Exception as e:
-            return {"error": f"{str(e)}, got response {response_json}"}
+            self.logger.info(f"Checking if index exists: {index_name}")
+            response = requests.get(f'{self.opensearch_url}/{index_name}', auth=self.auth)
+            if response.status_code == 200:
+                existing_indices.add(index_name)
+                continue
 
-    # Call get API to check the query status
-    def get_query_result(self, query_id):
-        url = f"{self.base_url}/_plugins/_async_query/{query_id}"
-        response_json =
None - try: - response = requests.get(url, auth=self.auth) - response_json = response.json() - response.raise_for_status() - return response_json - except Exception as e: - return {"status": "FAILED", "error": f"{str(e)}, got response {response_json}"} + self.logger.info(f"Creating index: {index_name}") - # Call delete API to cancel the query - def cancel_query(self, query_id): - url = f"{self.base_url}/_plugins/_async_query/{query_id}" - response_json = None - try: - response = requests.delete(url, auth=self.auth) - response_json = response.json() - response.raise_for_status() - self.logger.info(f"Cancelled query [{query_id}] with info {response.json()}") - return response_json - except Exception as e: - self.logger.warning(f"Cancel query [{query_id}] error: {str(e)}, got response {response_json}") + file_data = open(mapping_file, 'rb').read() + headers = {'Content-Type': 'application/json'} + + response = requests.put(f'{self.opensearch_url}/{index_name}', auth=self.auth, headers=headers, data=file_data) + if response.status_code != 200: + self.logger.error(f'Failed to create index: {index_name}') + response.raise_for_status() + + for data_file in data_files: + index_name = data_file[5 : data_file.index('.')] + if index_name in existing_indices: + continue + + self.logger.info(f"Populating index: {index_name}") + + file_data = open(data_file, 'rb').read() + headers = {'Content-Type': 'application/x-ndjson'} + + response = requests.post(f'{self.opensearch_url}/{index_name}/_bulk', auth=self.auth, headers=headers, data=file_data) + if response.status_code != 200: + response.raise_for_status() # Run the test and return the result def run_test(self, query, seq_id, expected_status): self.logger.info(f"Starting test: {seq_id}, {query}") start_time = datetime.now() - pre_session_id = self.get_session_id() - submit_result = self.submit_query(query, pre_session_id) - if "error" in submit_result: - self.logger.warning(f"Submit error: {submit_result}") - return { - "query_name": seq_id, - "query": query, - "expected_status": expected_status, - "status": "SUBMIT_FAILED", - "check_status": "SUBMIT_FAILED" == expected_status if expected_status else None, - "error": submit_result["error"], - "duration": 0, - "start_time": start_time, - "end_time": datetime.now() - } - - query_id = submit_result["queryId"] - session_id = submit_result["sessionId"] - self.logger.info(f"Submit return: {submit_result}") - if (session_id != pre_session_id): - self.logger.info(f"Update session id from {pre_session_id} to {session_id}") - self.set_session_id(session_id) - - test_result = self.check_query_status(query_id) + + query_str = query.replace('\n', ' ') + status = None + result = None + error_str = None + try: + result = self.spark_client.sql(query_str) + status = 'SUCCESS' + except Exception as e: + status = 'FAILED' + error_str = str(e) + end_time = datetime.now() duration = (end_time - start_time).total_seconds() return { "query_name": seq_id, "query": query, - "query_id": query_id, - "session_id": session_id, "expected_status": expected_status, - "status": test_result["status"], - "check_status": test_result["status"] == expected_status if expected_status else None, - "error": test_result.get("error", ""), - "result": test_result if test_result["status"] == "SUCCESS" else None, + "status": status, + "check_status": status == expected_status if expected_status else None, + "error": error_str if error_str else None, + "result": result, "duration": duration, "start_time": start_time, "end_time": end_time } - # Check 
the status of the query periodically until it is completed or failed or exceeded the timeout - def check_query_status(self, query_id): - query_id = query_id - - for attempt in range(self.max_attempts): - time.sleep(self.check_interval) - result = self.get_query_result(query_id) - - if result["status"] == "FAILED" or result["status"] == "SUCCESS": - return result - - # Cancel the query if it exceeds the timeout - self.cancel_query(query_id) - return { - "status": "TIMEOUT", - "error": "Query execution exceeded " + str(self.timeout) + " seconds with last status: " + result["status"], - } - def run_tests_from_csv(self, csv_file): with open(csv_file, 'r') as f: reader = csv.DictReader(f) @@ -200,20 +150,15 @@ def run_tests_from_csv(self, csv_file): # Filtering queries based on start and end queries = queries[self.start:self.end] - # Parallel execution - futures = [self.executor.submit(self.run_test, query, seq_id, expected_status) for query, seq_id, expected_status in queries] - for future in as_completed(futures): - result = future.result() - self.logger.info(f"Completed test: {result["query_name"]}, {result["query"]}, got result status: {result["status"]}") - self.test_results.append(result) + self.test_results = [] + for query in queries: + self.test_results.append(self.run_test(query[0], query[1], query[2])) def generate_report(self): self.logger.info("Generating report...") total_queries = len(self.test_results) successful_queries = sum(1 for r in self.test_results if r['status'] == 'SUCCESS') failed_queries = sum(1 for r in self.test_results if r['status'] == 'FAILED') - submit_failed_queries = sum(1 for r in self.test_results if r['status'] == 'SUBMIT_FAILED') - timeout_queries = sum(1 for r in self.test_results if r['status'] == 'TIMEOUT') # Create report report = { @@ -221,8 +166,6 @@ def generate_report(self): "total_queries": total_queries, "successful_queries": successful_queries, "failed_queries": failed_queries, - "submit_failed_queries": submit_failed_queries, - "timeout_queries": timeout_queries, "execution_time": sum(r['duration'] for r in self.test_results) }, "detailed_results": self.test_results @@ -249,15 +192,12 @@ def signal_handler(sig, frame, tester): def main(): # Parse command line arguments parser = argparse.ArgumentParser(description="Run tests from a CSV file and generate a report.") - parser.add_argument("--base-url", required=True, help="Base URL of the service") + parser.add_argument("--spark-url", required=True, help="URL of the Spark service") parser.add_argument("--username", required=True, help="Username for authentication") parser.add_argument("--password", required=True, help="Password for authentication") - parser.add_argument("--datasource", required=True, help="Datasource name") + parser.add_argument("--opensearch-url", required=True, help="URL of the OpenSearch service") parser.add_argument("--input-csv", required=True, help="Path to the CSV file containing test queries") parser.add_argument("--output-file", required=True, help="Path to the output report file") - parser.add_argument("--max-workers", type=int, default=2, help="optional, Maximum number of worker threads (default: 2)") - parser.add_argument("--check-interval", type=int, default=5, help="optional, Check interval in seconds (default: 5)") - parser.add_argument("--timeout", type=int, default=600, help="optional, Timeout in seconds (default: 600)") parser.add_argument("--start-row", type=int, default=None, help="optional, The start row of the query to run, start from 1") 
parser.add_argument("--end-row", type=int, default=None, help="optional, The end row of the query to run, not included") parser.add_argument("--log-level", default="INFO", help="optional, Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL, default: INFO)") @@ -265,13 +205,10 @@ def main(): args = parser.parse_args() tester = FlintTester( - base_url=args.base_url, + spark_url=args.spark_url, username=args.username, password=args.password, - datasource=args.datasource, - max_workers=args.max_workers, - check_interval=args.check_interval, - timeout=args.timeout, + opensearch_url=args.opensearch_url, output_file=args.output_file, start_row=args.start_row, end_row=args.end_row, @@ -282,6 +219,9 @@ def main(): signal.signal(signal.SIGINT, lambda sig, frame: signal_handler(sig, frame, tester)) signal.signal(signal.SIGTERM, lambda sig, frame: signal_handler(sig, frame, tester)) + # Create indices + tester.create_indices() + # Running tests tester.run_tests_from_csv(args.input_csv) diff --git a/integ-test/script/data/customer.mapping.json b/integ-test/script/data/customer.mapping.json new file mode 100644 index 000000000..a98d473a2 --- /dev/null +++ b/integ-test/script/data/customer.mapping.json @@ -0,0 +1,30 @@ +{ + "mappings": { + "properties": { + "c_custkey": { + "type": "integer" + }, + "c_name": { + "type": "text" + }, + "c_address": { + "type": "text" + }, + "c_nationkey": { + "type": "integer" + }, + "c_phone": { + "type": "text" + }, + "c_acctbal": { + "type": "double" + }, + "c_mktsegment": { + "type": "text" + }, + "c_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/http_logs.json b/integ-test/script/data/http_logs.json new file mode 100644 index 000000000..ff2aa2fca --- /dev/null +++ b/integ-test/script/data/http_logs.json @@ -0,0 +1,12 @@ +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696154400000, "year": 2023, "month": 10, "day": 1, "clientip": "40.135.0.0", "request": "GET /images/hm_bg.jpg HTTP/1.0", "status": 200, "size": 24736} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696154700000, "year": 2023, "month": 10, "day": 1, "clientip": "232.0.0.0", "request": "GET /images/hm_bg.jpg HTTP/1.0", "status": 200, "size": 24736} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696155000000, "year": 2023, "month": 10, "day": 1, "clientip": "26.1.0.0", "request": "GET /images/hm_bg.jpg HTTP/1.0", "status": 200, "size": 24736} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696155300000, "year": 2023, "month": 10, "day": 1, "clientip": "247.37.0.0", "request": "GET /french/splash_inet.html HTTP/1.0", "status": 200, "size": 3781} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696155600000, "year": 2023, "month": 10, "day": 1, "clientip": "247.37.0.0", "request": "GET /images/hm_nbg.jpg HTTP/1.0", "status": 304, "size": 0} +{"index": {"_index": "http_logs"}} +{"@timestamp": 1696155900000, "year": 2023, "month": 10, "day": 1, "clientip": "252.0.0.0", "request": "GET /images/hm_bg.jpg HTTP/1.0", "status": 200, "size": 24736} diff --git a/integ-test/script/data/http_logs.mapping.json b/integ-test/script/data/http_logs.mapping.json new file mode 100644 index 000000000..b944fbd4b --- /dev/null +++ b/integ-test/script/data/http_logs.mapping.json @@ -0,0 +1,30 @@ +{ + "mappings": { + "properties": { + "@timestamp": { + "type": "date" + }, + "year": { + "type": "integer" + }, + "month": { + "type": "integer" + }, + "day": { + "type": "integer" + }, + "clientip": { + "type": "keyword" + }, + "request": { + "type": 
"text" + }, + "status": { + "type": "integer" + }, + "size": { + "type": "integer" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/lineitem.mapping.json b/integ-test/script/data/lineitem.mapping.json new file mode 100644 index 000000000..2fb1cdb40 --- /dev/null +++ b/integ-test/script/data/lineitem.mapping.json @@ -0,0 +1,54 @@ +{ + "mappings": { + "properties": { + "l_orderkey": { + "type": "integer" + }, + "l_partkey": { + "type": "text" + }, + "l_suppkey": { + "type": "integer" + }, + "l_linenumber": { + "type": "integer" + }, + "l_quantity": { + "type": "double" + }, + "l_extendedprice": { + "type": "double" + }, + "l_discount": { + "type": "double" + }, + "l_tax": { + "type": "double" + }, + "l_returnflag": { + "type": "text" + }, + "l_linestatus": { + "type": "text" + }, + "l_shipdate": { + "type": "date" + }, + "l_commitdate": { + "type": "date" + }, + "l_receiptdate": { + "type": "date" + }, + "l_shipinstruct": { + "type": "text" + }, + "l_shipmode": { + "type": "text" + }, + "l_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/nation.mapping.json b/integ-test/script/data/nation.mapping.json new file mode 100644 index 000000000..d0e82e559 --- /dev/null +++ b/integ-test/script/data/nation.mapping.json @@ -0,0 +1,18 @@ +{ + "mappings": { + "properties": { + "n_nationkey": { + "type": "integer" + }, + "n_name": { + "type": "text" + }, + "n_regionkey": { + "type": "integer" + }, + "n_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/nested.json b/integ-test/script/data/nested.json new file mode 100644 index 000000000..eb8af683b --- /dev/null +++ b/integ-test/script/data/nested.json @@ -0,0 +1,10 @@ +{"index": {"_index": "nested"}} +{"int_col": 30, "struct_col": {"field1": {"subfield": "value1"}, "field2": 123}, "struct_col2": {"field1": {"subfield": "valueA"}, "field2": 23}} +{"index": {"_index": "nested"}} +{"int_col": 40, "struct_col": {"field1": {"subfield": "value5"}, "field2": 123}, "struct_col2": {"field1": {"subfield": "valueB"}, "field2": 33}} +{"index": {"_index": "nested"}} +{"int_col": 30, "struct_col": {"field1": {"subfield": "value4"}, "field2": 823}, "struct_col2": {"field1": {"subfield": "valueC"}, "field2": 83}} +{"index": {"_index": "nested"}} +{"int_col": 40, "struct_col": {"field1": {"subfield": "value2"}, "field2": 456}, "struct_col2": {"field1": {"subfield": "valueD"}, "field2": 46}} +{"index": {"_index": "nested"}} +{"int_col": 50, "struct_col": {"field1": {"subfield": "value3"}, "field2": 789}, "struct_col2": {"field1": {"subfield": "valueE"}, "field2": 89}} diff --git a/integ-test/script/data/nested.mapping.json b/integ-test/script/data/nested.mapping.json new file mode 100644 index 000000000..1aa189415 --- /dev/null +++ b/integ-test/script/data/nested.mapping.json @@ -0,0 +1,37 @@ +{ + "mappings": { + "properties": { + "int_col": { + "type": "integer" + }, + "struct_col": { + "properties": { + "field1": { + "properties": { + "subfield": { + "type": "text" + } + } + }, + "field2": { + "type": "integer" + } + } + }, + "struct_col2": { + "properties": { + "field1": { + "properties": { + "subfield": { + "type": "text" + } + } + }, + "field2": { + "type": "integer" + } + } + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/orders.mapping.json b/integ-test/script/data/orders.mapping.json new file mode 100644 index 000000000..59b3cecdd --- /dev/null +++ 
b/integ-test/script/data/orders.mapping.json @@ -0,0 +1,33 @@ +{ + "mappings": { + "properties": { + "o_orderkey": { + "type": "integer" + }, + "o_custkey": { + "type": "integer" + }, + "o_orderstatus": { + "type": "text" + }, + "o_totalprice": { + "type": "double" + }, + "o_orderdate": { + "type": "date" + }, + "o_orderpriority": { + "type": "text" + }, + "o_clerk": { + "type": "text" + }, + "o_shippriority": { + "type": "integer" + }, + "o_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/part.mapping.json b/integ-test/script/data/part.mapping.json new file mode 100644 index 000000000..8be7e9aa0 --- /dev/null +++ b/integ-test/script/data/part.mapping.json @@ -0,0 +1,33 @@ +{ + "mappings": { + "properties": { + "p_partkey": { + "type": "integer" + }, + "p_name": { + "type": "text" + }, + "p_mfgr": { + "type": "text" + }, + "p_brand": { + "type": "text" + }, + "p_type": { + "type": "text" + }, + "p_size": { + "type": "integer" + }, + "p_container": { + "type": "text" + }, + "p_retailprice": { + "type": "double" + }, + "p_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/partsupp.mapping.json b/integ-test/script/data/partsupp.mapping.json new file mode 100644 index 000000000..13509ad46 --- /dev/null +++ b/integ-test/script/data/partsupp.mapping.json @@ -0,0 +1,21 @@ +{ + "mappings": { + "properties": { + "ps_partkey": { + "type": "integer" + }, + "ps_suppkey": { + "type": "integer" + }, + "ps_availqty": { + "type": "integer" + }, + "ps_supplycost": { + "type": "double" + }, + "ps_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/people.json b/integ-test/script/data/people.json new file mode 100644 index 000000000..4563a2c4b --- /dev/null +++ b/integ-test/script/data/people.json @@ -0,0 +1,12 @@ +{"index": {"_index": "people"}} +{"@timestamp": 1718458823000, "id": 1000, "name": "Jake", "occupation": "Engineer", "country": "England", "salary": 100000} +{"index": {"_index": "people"}} +{"@timestamp": 1718458833000, "id": 1001, "name": "Hello", "occupation": "Artist", "country": "USA", "salary": 70000} +{"index": {"_index": "people"}} +{"@timestamp": 1718458843000, "id": 1002, "name": "John", "occupation": "Doctor", "country": "Canada", "salary": 120000} +{"index": {"_index": "people"}} +{"@timestamp": 1718458853000, "id": 1003, "name": "David", "occupation": "Doctor", "country": null, "salary": 120000} +{"index": {"_index": "people"}} +{"@timestamp": 1718458863000, "id": 1004, "name": "David", "occupation": null, "country": "Canada", "salary": 0} +{"index": {"_index": "people"}} +{"@timestamp": 1718458873000, "id": 1005, "name": "Jane", "occupation": "Scientist", "country": "Canada", "salary": 90000} diff --git a/integ-test/script/data/people.mapping.json b/integ-test/script/data/people.mapping.json new file mode 100644 index 000000000..b5dde8ff6 --- /dev/null +++ b/integ-test/script/data/people.mapping.json @@ -0,0 +1,24 @@ +{ + "mappings": { + "properties": { + "@timestamp": { + "type": "date" + }, + "id": { + "type": "integer" + }, + "name": { + "type": "text" + }, + "occupation": { + "type": "text" + }, + "country": { + "type": "text" + }, + "salary": { + "type": "integer" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/region.mapping.json b/integ-test/script/data/region.mapping.json new file mode 100644 index 000000000..3dddbc580 --- /dev/null +++ 
b/integ-test/script/data/region.mapping.json @@ -0,0 +1,15 @@ +{ + "mappings": { + "properties": { + "r_regionkey": { + "type": "integer" + }, + "r_name": { + "type": "text" + }, + "r_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/supplier.mapping.json b/integ-test/script/data/supplier.mapping.json new file mode 100644 index 000000000..bdcb933b6 --- /dev/null +++ b/integ-test/script/data/supplier.mapping.json @@ -0,0 +1,27 @@ +{ + "mappings": { + "properties": { + "s_suppkey": { + "type": "integer" + }, + "s_name": { + "type": "text" + }, + "s_address": { + "type": "text" + }, + "s_nationkey": { + "type": "integer" + }, + "s_phone": { + "type": "text" + }, + "s_acctbal": { + "type": "double" + }, + "s_comment": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/data/work_info.json b/integ-test/script/data/work_info.json new file mode 100644 index 000000000..64802bdad --- /dev/null +++ b/integ-test/script/data/work_info.json @@ -0,0 +1,10 @@ +{"index": {"_index": "work_info"}} +{"uid": 1000, "name": "Jake", "department": "IT", "occupation": "Engineer"} +{"index": {"_index": "work_info"}} +{"uid": 1002, "name": "John", "department": "DATA", "occupation": "Scientist"} +{"index": {"_index": "work_info"}} +{"uid": 1003, "name": "David", "department": "HR", "occupation": "Doctor"} +{"index": {"_index": "work_info"}} +{"uid": 1005, "name": "Jane", "department": "DATA", "occupation": "Engineer"} +{"index": {"_index": "work_info"}} +{"uid": 1006, "name": "Tom", "department": "SALES", "occupation": "Artist"} diff --git a/integ-test/script/data/work_info.mapping.json b/integ-test/script/data/work_info.mapping.json new file mode 100644 index 000000000..3fb5e2c28 --- /dev/null +++ b/integ-test/script/data/work_info.mapping.json @@ -0,0 +1,18 @@ +{ + "mappings": { + "properties": { + "uid": { + "type": "integer" + }, + "name": { + "type": "text" + }, + "department": { + "type": "text" + }, + "occupation": { + "type": "text" + } + } + } +} \ No newline at end of file diff --git a/integ-test/script/test_cases.csv b/integ-test/script/test_cases.csv index 7df05f5a3..91500efea 100644 --- a/integ-test/script/test_cases.csv +++ b/integ-test/script/test_cases.csv @@ -1,116 +1,116 @@ query,expected_status -describe myglue_test.default.http_logs,FAILED -describe `myglue_test`.`default`.`http_logs`,FAILED -"source = myglue_test.default.http_logs | dedup 1 status | fields @timestamp, clientip, status, size | head 10",SUCCESS -"source = myglue_test.default.http_logs | dedup status, size | head 10",SUCCESS -source = myglue_test.default.http_logs | dedup 1 status keepempty=true | head 10,SUCCESS -"source = myglue_test.default.http_logs | dedup status, size keepempty=true | head 10",SUCCESS -source = myglue_test.default.http_logs | dedup 2 status | head 10,SUCCESS -"source = myglue_test.default.http_logs | dedup 2 status, size | head 10",SUCCESS -"source = myglue_test.default.http_logs | dedup 2 status, size keepempty=true | head 10",SUCCESS -source = myglue_test.default.http_logs | dedup status CONSECUTIVE=true | fields status,FAILED -"source = myglue_test.default.http_logs | dedup 2 status, size CONSECUTIVE=true | fields status",FAILED -"source = myglue_test.default.http_logs | sort stat | fields @timestamp, clientip, status | head 10",SUCCESS -"source = myglue_test.default.http_logs | fields @timestamp, notexisted | head 10",FAILED -"source = myglue_test.default.nested | fields int_col, struct_col.field1, 
struct_col2.field1 | head 10",FAILED -"source = myglue_test.default.nested | where struct_col2.field1.subfield > 'valueA' | sort int_col | fields int_col, struct_col.field1.subfield, struct_col2.field1.subfield",FAILED -"source = myglue_test.default.http_logs | fields - @timestamp, clientip, status | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval new_time = @timestamp, new_clientip = clientip | fields - new_time, new_clientip, status | head 10",SUCCESS -source = myglue_test.default.http_logs | eval new_clientip = lower(clientip) | fields - new_clientip | head 10,SUCCESS -"source = myglue_test.default.http_logs | fields + @timestamp, clientip, status | fields - clientip, status | head 10",SUCCESS -"source = myglue_test.default.http_logs | fields - clientip, status | fields + @timestamp, clientip, status| head 10",SUCCESS -source = myglue_test.default.http_logs | where status = 200 | head 10,SUCCESS -source = myglue_test.default.http_logs | where status != 200 | head 10,SUCCESS -source = myglue_test.default.http_logs | where size > 0 | head 10,SUCCESS -source = myglue_test.default.http_logs | where size <= 0 | head 10,SUCCESS -source = myglue_test.default.http_logs | where clientip = '236.14.2.0' | head 10,SUCCESS -source = myglue_test.default.http_logs | where size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100,SUCCESS -"source = myglue_test.default.http_logs | where size <= 0 AND like(request, 'GET%') | head 10",SUCCESS -source = myglue_test.default.http_logs status = 200 | head 10,SUCCESS -source = myglue_test.default.http_logs size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100,SUCCESS -"source = myglue_test.default.http_logs size <= 0 AND like(request, 'GET%') | head 10",SUCCESS -"source = myglue_test.default.http_logs substring(clientip, 5, 2) = ""12"" | head 10",SUCCESS -source = myglue_test.default.http_logs | where isempty(size),FAILED -source = myglue_test.default.http_logs | where ispresent(size),FAILED -source = myglue_test.default.http_logs | where isnull(size) | head 10,SUCCESS -source = myglue_test.default.http_logs | where isnotnull(size) | head 10,SUCCESS -"source = myglue_test.default.http_logs | where isnotnull(coalesce(size, status)) | head 10",FAILED -"source = myglue_test.default.http_logs | where like(request, 'GET%') | head 10",SUCCESS -"source = myglue_test.default.http_logs | where like(request, '%bordeaux%') | head 10",SUCCESS -"source = myglue_test.default.http_logs | where substring(clientip, 5, 2) = ""12"" | head 10",SUCCESS -"source = myglue_test.default.http_logs | where lower(request) = ""get /images/backnews.gif http/1.0"" | head 10",SUCCESS -source = myglue_test.default.http_logs | where length(request) = 38 | head 10,SUCCESS -"source = myglue_test.default.http_logs | where case(status = 200, 'success' else 'failed') = 'success' | head 10",FAILED -"source = myglue_test.default.http_logs | eval h = ""Hello"", w = ""World"" | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval @h = ""Hello"" | eval @w = ""World"" | fields @timestamp, @h, @w",SUCCESS -source = myglue_test.default.http_logs | eval newF = clientip | head 10,SUCCESS -"source = myglue_test.default.http_logs | eval newF = clientip | fields clientip, newF | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval f = size | where f > 1 | sort f | fields size, clientip, status | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval f = status * 2 | eval h = f * 2 | fields status, f, h | head 10",SUCCESS -"source = 
myglue_test.default.http_logs | eval f = size * 2, h = status | stats sum(f) by h",SUCCESS -"source = myglue_test.default.http_logs | eval f = UPPER(request) | eval h = 40 | fields f, h | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval request = ""test"" | fields request | head 10",FAILED -source = myglue_test.default.http_logs | eval size = abs(size) | where size < 500,FAILED -"source = myglue_test.default.http_logs | eval status_string = case(status = 200, 'success' else 'failed') | head 10",FAILED -"source = myglue_test.default.http_logs | eval n = now() | eval t = unix_timestamp(@timestamp) | fields n, t | head 10",SUCCESS -source = myglue_test.default.http_logs | eval e = isempty(size) | eval p = ispresent(size) | head 10,FAILED -"source = myglue_test.default.http_logs | eval c = coalesce(size, status) | head 10",FAILED -source = myglue_test.default.http_logs | eval c = coalesce(request) | head 10,FAILED -source = myglue_test.default.http_logs | eval col1 = ln(size) | eval col2 = unix_timestamp(@timestamp) | sort - col1 | head 10,SUCCESS -"source = myglue_test.default.http_logs | eval col1 = 1 | sort col1 | head 4 | eval col2 = 2 | sort - col2 | sort - size | head 2 | fields @timestamp, clientip, col2",SUCCESS -"source = myglue_test.default.mini_http_logs | eval stat = status | where stat > 300 | sort stat | fields @timestamp,clientip,status | head 5",SUCCESS -"source = myglue_test.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2",SUCCESS -source = myglue_test.default.http_logs | stats avg(size) by clientip,SUCCESS -"source = myglue_test.default.http_logs | eval new_request = upper(request) | eval compound_field = concat('Hello ', if(like(new_request, '%bordeaux%'), 'World', clientip)) | fields new_request, compound_field | head 10",SUCCESS -source = myglue_test.default.http_logs | stats avg(size),SUCCESS -source = myglue_test.default.nested | stats max(int_col) by struct_col.field2,SUCCESS -source = myglue_test.default.nested | stats distinct_count(int_col),SUCCESS -source = myglue_test.default.nested | stats stddev_samp(int_col),SUCCESS -source = myglue_test.default.nested | stats stddev_pop(int_col),SUCCESS -source = myglue_test.default.nested | stats percentile(int_col),SUCCESS -source = myglue_test.default.nested | stats percentile_approx(int_col),SUCCESS -source = myglue_test.default.mini_http_logs | stats stddev_samp(status),SUCCESS -"source = myglue_test.default.mini_http_logs | where stats > 200 | stats percentile_approx(status, 99)",SUCCESS -"source = myglue_test.default.nested | stats count(int_col) by span(struct_col.field2, 10) as a_span",SUCCESS -"source = myglue_test.default.nested | stats avg(int_col) by span(struct_col.field2, 10) as a_span, struct_col2.field2",SUCCESS -"source = myglue_test.default.http_logs | stats sum(size) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10",SUCCESS -"source = myglue_test.default.http_logs | stats distinct_count(clientip) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10",SUCCESS -"source = myglue_test.default.http_logs | stats avg(size) as avg_size by status, year | stats avg(avg_size) as avg_avg_size by year",SUCCESS -"source = myglue_test.default.http_logs | stats avg(size) as avg_size by status, year, month | stats avg(avg_size) as avg_avg_size by year, month | stats avg(avg_avg_size) as avg_avg_avg_size by year",SUCCESS -"source = myglue_test.default.nested | stats avg(int_col) as avg_int by struct_col.field2, 
struct_col2.field2 | stats avg(avg_int) as avg_avg_int by struct_col2.field2",FAILED -"source = myglue_test.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | eval new_col = avg_int | stats avg(avg_int) as avg_avg_int by new_col",SUCCESS -source = myglue_test.default.nested | rare int_col,SUCCESS -source = myglue_test.default.nested | rare int_col by struct_col.field2,SUCCESS -source = myglue_test.default.http_logs | rare request,SUCCESS -source = myglue_test.default.http_logs | where status > 300 | rare request by status,SUCCESS -source = myglue_test.default.http_logs | rare clientip,SUCCESS -source = myglue_test.default.http_logs | where status > 300 | rare clientip,SUCCESS -source = myglue_test.default.http_logs | where status > 300 | rare clientip by day,SUCCESS -source = myglue_test.default.nested | top int_col by struct_col.field2,SUCCESS -source = myglue_test.default.nested | top 1 int_col by struct_col.field2,SUCCESS -source = myglue_test.default.nested | top 2 int_col by struct_col.field2,SUCCESS -source = myglue_test.default.nested | top int_col,SUCCESS -source = myglue_test.default.http_logs | inner join left=l right=r on l.status = r.int_col myglue_test.default.nested | head 10,FAILED -"source = myglue_test.default.http_logs | parse request 'GET /(?[a-zA-Z]+)/.*' | fields request, domain | head 10",SUCCESS -source = myglue_test.default.http_logs | parse request 'GET /(?[a-zA-Z]+)/.*' | top 1 domain,SUCCESS -source = myglue_test.default.http_logs | parse request 'GET /(?[a-zA-Z]+)/.*' | stats count() by domain,SUCCESS -"source = myglue_test.default.http_logs | parse request 'GET /(?[a-zA-Z]+)/.*' | eval a = 1 | fields a, domain | head 10",SUCCESS -"source = myglue_test.default.http_logs | parse request 'GET /(?[a-zA-Z]+)/.*' | where size > 0 | sort - size | fields size, domain | head 10",SUCCESS -"source = myglue_test.default.http_logs | parse request 'GET /(?[a-zA-Z]+)/(?[a-zA-Z]+)/.*' | where domain = 'english' | sort - picName | fields domain, picName | head 10",SUCCESS -source = myglue_test.default.http_logs | patterns request | fields patterns_field | head 10,SUCCESS -source = myglue_test.default.http_logs | patterns request | where size > 0 | fields patterns_field | head 10,SUCCESS -"source = myglue_test.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | fields request, no_letter | head 10",SUCCESS -source = myglue_test.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | stats count() by no_letter,SUCCESS -"source = myglue_test.default.http_logs | patterns new_field='status' pattern='[a-zA-Z]' request | fields request, status | head 10",FAILED -source = myglue_test.default.http_logs | rename @timestamp as timestamp | head 10,FAILED -source = myglue_test.default.http_logs | sort size | head 10,SUCCESS -source = myglue_test.default.http_logs | sort + size | head 10,SUCCESS -source = myglue_test.default.http_logs | sort - size | head 10,SUCCESS -"source = myglue_test.default.http_logs | sort + size, + @timestamp | head 10",SUCCESS -"source = myglue_test.default.http_logs | sort - size, - @timestamp | head 10",SUCCESS -"source = myglue_test.default.http_logs | sort - size, @timestamp | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval c1 = upper(request) | eval c2 = concat('Hello ', if(like(c1, '%bordeaux%'), 'World', clientip)) | eval c3 = length(request) | eval c4 = ltrim(request) | eval c5 = rtrim(request) | eval c6 = substring(clientip, 5, 2) | eval c7 = 
trim(request) | eval c8 = upper(request) | eval c9 = position('bordeaux' IN request) | eval c10 = replace(request, 'GET', 'GGG') | fields c1, c2, c3, c4, c5, c6, c7, c8, c9, c10 | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval c1 = unix_timestamp(@timestamp) | eval c2 = now() | eval c3 = +describe dev.default.http_logs,FAILED +describe `dev`.`default`.`http_logs`,FAILED +"source = dev.default.http_logs | dedup 1 status | fields @timestamp, clientip, status, size | head 10",SUCCESS +"source = dev.default.http_logs | dedup status, size | head 10",SUCCESS +source = dev.default.http_logs | dedup 1 status keepempty=true | head 10,SUCCESS +"source = dev.default.http_logs | dedup status, size keepempty=true | head 10",SUCCESS +source = dev.default.http_logs | dedup 2 status | head 10,SUCCESS +"source = dev.default.http_logs | dedup 2 status, size | head 10",SUCCESS +"source = dev.default.http_logs | dedup 2 status, size keepempty=true | head 10",SUCCESS +source = dev.default.http_logs | dedup status CONSECUTIVE=true | fields status,FAILED +"source = dev.default.http_logs | dedup 2 status, size CONSECUTIVE=true | fields status",FAILED +"source = dev.default.http_logs | sort status | fields @timestamp, clientip, status | head 10",SUCCESS +"source = dev.default.http_logs | fields @timestamp, notexisted | head 10",FAILED +"source = dev.default.nested | fields int_col, struct_col.field1, struct_col2.field1 | head 10",SUCCESS +"source = dev.default.nested | where struct_col2.field1.subfield > 'valueA' | sort int_col | fields int_col, struct_col.field1.subfield, struct_col2.field1.subfield",SUCCESS +"source = dev.default.http_logs | fields - @timestamp, clientip, status | head 10",SUCCESS +"source = dev.default.http_logs | eval new_time = @timestamp, new_clientip = clientip | fields - new_time, new_clientip, status | head 10",SUCCESS +source = dev.default.http_logs | eval new_clientip = lower(clientip) | fields - new_clientip | head 10,SUCCESS +"source = dev.default.http_logs | fields + @timestamp, clientip, status | fields - clientip, status | head 10",SUCCESS +"source = dev.default.http_logs | fields - clientip, status | fields + @timestamp, clientip, status| head 10",SUCCESS +source = dev.default.http_logs | where status = 200 | head 10,SUCCESS +source = dev.default.http_logs | where status != 200 | head 10,SUCCESS +source = dev.default.http_logs | where size > 0 | head 10,SUCCESS +source = dev.default.http_logs | where size <= 0 | head 10,SUCCESS +source = dev.default.http_logs | where clientip = '236.14.2.0' | head 10,SUCCESS +source = dev.default.http_logs | where size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100,SUCCESS +"source = dev.default.http_logs | where size <= 0 AND like(request, 'GET%') | head 10",SUCCESS +source = dev.default.http_logs status = 200 | head 10,SUCCESS +source = dev.default.http_logs size > 0 AND status = 200 OR clientip = '236.14.2.0' | head 100,SUCCESS +"source = dev.default.http_logs size <= 0 AND like(request, 'GET%') | head 10",SUCCESS +"source = dev.default.http_logs substring(clientip, 5, 2) = ""12"" | head 10",SUCCESS +source = dev.default.http_logs | where isempty(size),SUCCESS +source = dev.default.http_logs | where ispresent(size),SUCCESS +source = dev.default.http_logs | where isnull(size) | head 10,SUCCESS +source = dev.default.http_logs | where isnotnull(size) | head 10,SUCCESS +"source = dev.default.http_logs | where isnotnull(coalesce(size, status)) | head 10",SUCCESS +"source = dev.default.http_logs | where like(request, 
'GET%') | head 10",SUCCESS +"source = dev.default.http_logs | where like(request, '%bordeaux%') | head 10",SUCCESS +"source = dev.default.http_logs | where substring(clientip, 5, 2) = ""12"" | head 10",SUCCESS +"source = dev.default.http_logs | where lower(request) = ""get /images/backnews.gif http/1.0"" | head 10",SUCCESS +source = dev.default.http_logs | where length(request) = 38 | head 10,SUCCESS +"source = dev.default.http_logs | where case(status = 200, 'success' else 'failed') = 'success' | head 10",SUCCESS +"source = dev.default.http_logs | eval h = ""Hello"", w = ""World"" | head 10",SUCCESS +"source = dev.default.http_logs | eval @h = ""Hello"" | eval @w = ""World"" | fields @timestamp, @h, @w",SUCCESS +source = dev.default.http_logs | eval newF = clientip | head 10,SUCCESS +"source = dev.default.http_logs | eval newF = clientip | fields clientip, newF | head 10",SUCCESS +"source = dev.default.http_logs | eval f = size | where f > 1 | sort f | fields size, clientip, status | head 10",SUCCESS +"source = dev.default.http_logs | eval f = status * 2 | eval h = f * 2 | fields status, f, h | head 10",SUCCESS +"source = dev.default.http_logs | eval f = size * 2, h = status | stats sum(f) by h",SUCCESS +"source = dev.default.http_logs | eval f = UPPER(request) | eval h = 40 | fields f, h | head 10",SUCCESS +"source = dev.default.http_logs | eval request = ""test"" | fields request | head 10",FAILED +source = dev.default.http_logs | eval size = abs(size) | where size < 500,FAILED +"source = dev.default.http_logs | eval status_string = case(status = 200, 'success' else 'failed') | head 10",SUCCESS +"source = dev.default.http_logs | eval n = now() | eval t = unix_timestamp(@timestamp) | fields n, t | head 10",SUCCESS +source = dev.default.http_logs | eval e = isempty(size) | eval p = ispresent(size) | head 10,SUCCESS +"source = dev.default.http_logs | eval c = coalesce(size, status) | head 10",SUCCESS +source = dev.default.http_logs | eval c = coalesce(request) | head 10,SUCCESS +source = dev.default.http_logs | eval col1 = ln(size) | eval col2 = unix_timestamp(@timestamp) | sort - col1 | head 10,SUCCESS +"source = dev.default.http_logs | eval col1 = 1 | sort col1 | head 4 | eval col2 = 2 | sort - col2 | sort - size | head 2 | fields @timestamp, clientip, col2",SUCCESS +"source = dev.default.http_logs | eval stat = status | where stat > 300 | sort stat | fields @timestamp,clientip,status | head 5",SUCCESS +"source = dev.default.http_logs | eval col1 = size, col2 = clientip | stats avg(col1) by col2",SUCCESS +source = dev.default.http_logs | stats avg(size) by clientip,SUCCESS +"source = dev.default.http_logs | eval new_request = upper(request) | eval compound_field = concat('Hello ', if(like(new_request, '%bordeaux%'), 'World', clientip)) | fields new_request, compound_field | head 10",SUCCESS +source = dev.default.http_logs | stats avg(size),SUCCESS +source = dev.default.nested | stats max(int_col) by struct_col.field2,SUCCESS +source = dev.default.nested | stats distinct_count(int_col),SUCCESS +source = dev.default.nested | stats stddev_samp(int_col),SUCCESS +source = dev.default.nested | stats stddev_pop(int_col),SUCCESS +"source = dev.default.nested | stats percentile(int_col, 90)",SUCCESS +"source = dev.default.nested | stats percentile_approx(int_col, 99)",SUCCESS +source = dev.default.http_logs | stats stddev_samp(status),SUCCESS +"source = dev.default.http_logs | where status > 200 | stats percentile_approx(status, 99)",SUCCESS +"source = dev.default.nested | stats count(int_col) by 
span(struct_col.field2, 10) as a_span",SUCCESS
+"source = dev.default.nested | stats avg(int_col) by span(struct_col.field2, 10) as a_span, struct_col2.field2",SUCCESS
+"source = dev.default.http_logs | stats sum(size) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10",SUCCESS
+"source = dev.default.http_logs | stats distinct_count(clientip) by span(@timestamp, 1d) as age_size_per_day | sort - age_size_per_day | head 10",SUCCESS
+"source = dev.default.http_logs | stats avg(size) as avg_size by status, year | stats avg(avg_size) as avg_avg_size by year",SUCCESS
+"source = dev.default.http_logs | stats avg(size) as avg_size by status, year, month | stats avg(avg_size) as avg_avg_size by year, month | stats avg(avg_avg_size) as avg_avg_avg_size by year",SUCCESS
+"source = dev.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | stats avg(avg_int) as avg_avg_int by struct_col2.field2",FAILED
+"source = dev.default.nested | stats avg(int_col) as avg_int by struct_col.field2, struct_col2.field2 | eval new_col = avg_int | stats avg(avg_int) as avg_avg_int by new_col",SUCCESS
+source = dev.default.nested | rare int_col,SUCCESS
+source = dev.default.nested | rare int_col by struct_col.field2,SUCCESS
+source = dev.default.http_logs | rare request,SUCCESS
+source = dev.default.http_logs | where status > 300 | rare request by status,SUCCESS
+source = dev.default.http_logs | rare clientip,SUCCESS
+source = dev.default.http_logs | where status > 300 | rare clientip,SUCCESS
+source = dev.default.http_logs | where status > 300 | rare clientip by day,SUCCESS
+source = dev.default.nested | top int_col by struct_col.field2,SUCCESS
+source = dev.default.nested | top 1 int_col by struct_col.field2,SUCCESS
+source = dev.default.nested | top 2 int_col by struct_col.field2,SUCCESS
+source = dev.default.nested | top int_col,SUCCESS
+source = dev.default.http_logs | inner join left=l right=r on l.status = r.int_col dev.default.nested | head 10,SUCCESS
+"source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | fields request, domain | head 10",SUCCESS
+source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | top 1 domain,SUCCESS
+source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | stats count() by domain,SUCCESS
+"source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | eval a = 1 | fields a, domain | head 10",SUCCESS
+"source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/.*' | where size > 0 | sort - size | fields size, domain | head 10",SUCCESS
+"source = dev.default.http_logs | parse request 'GET /(?<domain>[a-zA-Z]+)/(?<picName>[a-zA-Z]+)/.*' | where domain = 'english' | sort - picName | fields domain, picName | head 10",SUCCESS
+source = dev.default.http_logs | patterns request | fields patterns_field | head 10,SUCCESS
+source = dev.default.http_logs | patterns request | where size > 0 | fields patterns_field | head 10,SUCCESS
+"source = dev.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | fields request, no_letter | head 10",SUCCESS
+source = dev.default.http_logs | patterns new_field='no_letter' pattern='[a-zA-Z]' request | stats count() by no_letter,SUCCESS
+"source = dev.default.http_logs | patterns new_field='status' pattern='[a-zA-Z]' request | fields request, status | head 10",FAILED
+source = dev.default.http_logs | rename @timestamp as timestamp | head 10,SUCCESS
+source = dev.default.http_logs | sort size | head 10,SUCCESS
+source = 
dev.default.http_logs | sort + size | head 10,SUCCESS +source = dev.default.http_logs | sort - size | head 10,SUCCESS +"source = dev.default.http_logs | sort + size, + @timestamp | head 10",SUCCESS +"source = dev.default.http_logs | sort - size, - @timestamp | head 10",SUCCESS +"source = dev.default.http_logs | sort - size, @timestamp | head 10",SUCCESS +"source = dev.default.http_logs | eval c1 = upper(request) | eval c2 = concat('Hello ', if(like(c1, '%bordeaux%'), 'World', clientip)) | eval c3 = length(request) | eval c4 = ltrim(request) | eval c5 = rtrim(request) | eval c6 = substring(clientip, 5, 2) | eval c7 = trim(request) | eval c8 = upper(request) | eval c9 = position('bordeaux' IN request) | eval c10 = replace(request, 'GET', 'GGG') | fields c1, c2, c3, c4, c5, c6, c7, c8, c9, c10 | head 10",SUCCESS +"source = dev.default.http_logs | eval c1 = unix_timestamp(@timestamp) | eval c2 = now() | eval c3 = DAY_OF_WEEK(@timestamp) | eval c4 = DAY_OF_MONTH(@timestamp) | eval c5 = DAY_OF_YEAR(@timestamp) | eval c6 = @@ -121,151 +121,151 @@ HOUR_OF_DAY(@timestamp) | eval c10 = MINUTE_OF_HOUR(@timestamp) | eval c11 = SECOND_OF_MINUTE(@timestamp) | eval c12 = LOCALTIME() | fields c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12 | head 10",SUCCESS -"source=myglue_test.default.people | eval c1 = adddate(@timestamp, 1) | fields c1 | head 10",SUCCESS -"source=myglue_test.default.people | eval c2 = subdate(@timestamp, 1) | fields c2 | head 10",SUCCESS -source=myglue_test.default.people | eval c1 = date_add(@timestamp INTERVAL 1 DAY) | fields c1 | head 10,SUCCESS -source=myglue_test.default.people | eval c1 = date_sub(@timestamp INTERVAL 1 DAY) | fields c1 | head 10,SUCCESS -source=myglue_test.default.people | eval `CURDATE()` = CURDATE() | fields `CURDATE()`,SUCCESS -source=myglue_test.default.people | eval `CURRENT_DATE()` = CURRENT_DATE() | fields `CURRENT_DATE()`,SUCCESS -source=myglue_test.default.people | eval `CURRENT_TIMESTAMP()` = CURRENT_TIMESTAMP() | fields `CURRENT_TIMESTAMP()`,SUCCESS -source=myglue_test.default.people | eval `DATE('2020-08-26')` = DATE('2020-08-26') | fields `DATE('2020-08-26')`,SUCCESS -source=myglue_test.default.people | eval `DATE(TIMESTAMP('2020-08-26 13:49:00'))` = DATE(TIMESTAMP('2020-08-26 13:49:00')) | fields `DATE(TIMESTAMP('2020-08-26 13:49:00'))`,SUCCESS -source=myglue_test.default.people | eval `DATE('2020-08-26 13:49')` = DATE('2020-08-26 13:49') | fields `DATE('2020-08-26 13:49')`,SUCCESS -"source=myglue_test.default.people | eval `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')` = DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS'), `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')` = DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a') | fields `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')`, `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')`",SUCCESS -"source=myglue_test.default.people | eval `'2000-01-02' - '2000-01-01'` = DATEDIFF(TIMESTAMP('2000-01-02 00:00:00'), TIMESTAMP('2000-01-01 23:59:59')), `'2001-02-01' - '2004-01-01'` = DATEDIFF(DATE('2001-02-01'), TIMESTAMP('2004-01-01 00:00:00')) | fields `'2000-01-02' - '2000-01-01'`, `'2001-02-01' - '2004-01-01'`", -source=myglue_test.default.people | eval `DAY(DATE('2020-08-26'))` = DAY(DATE('2020-08-26')) | fields `DAY(DATE('2020-08-26'))`, -source=myglue_test.default.people | eval `DAYNAME(DATE('2020-08-26'))` = DAYNAME(DATE('2020-08-26')) | fields 
`DAYNAME(DATE('2020-08-26'))`,FAILED -source=myglue_test.default.people | eval `CURRENT_TIMEZONE()` = CURRENT_TIMEZONE() | fields `CURRENT_TIMEZONE()`,SUCCESS -source=myglue_test.default.people | eval `UTC_TIMESTAMP()` = UTC_TIMESTAMP() | fields `UTC_TIMESTAMP()`,SUCCESS -"source=myglue_test.default.people | eval `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')` = TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | eval `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))` = TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00')) | fields `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')`, `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))`",SUCCESS -"source=myglue_test.default.people | eval `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')` = TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | eval `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` = TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') | fields `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')`, `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')`",SUCCESS - source = myglue_test.default.http_logs | stats count(),SUCCESS -"source = myglue_test.default.http_logs | stats avg(size) as c1, max(size) as c2, min(size) as c3, sum(size) as c4, percentile(size, 50) as c5, stddev_pop(size) as c6, stddev_samp(size) as c7, distinct_count(size) as c8",SUCCESS -"source = myglue_test.default.http_logs | eval c1 = abs(size) | eval c2 = ceil(size) | eval c3 = floor(size) | eval c4 = sqrt(size) | eval c5 = ln(size) | eval c6 = pow(size, 2) | eval c7 = mod(size, 2) | fields c1, c2, c3, c4, c5, c6, c7 | head 10",SUCCESS -"source = myglue_test.default.http_logs | eval c1 = isnull(request) | eval c2 = isnotnull(request) | eval c3 = ifnull(request, +"source=dev.default.people | eval c1 = adddate(@timestamp, 1) | fields c1 | head 10",SUCCESS +"source=dev.default.people | eval c2 = subdate(@timestamp, 1) | fields c2 | head 10",SUCCESS +source=dev.default.people | eval c1 = date_add(@timestamp INTERVAL 1 DAY) | fields c1 | head 10,SUCCESS +source=dev.default.people | eval c1 = date_sub(@timestamp INTERVAL 1 DAY) | fields c1 | head 10,SUCCESS +source=dev.default.people | eval `CURDATE()` = CURDATE() | fields `CURDATE()`,SUCCESS +source=dev.default.people | eval `CURRENT_DATE()` = CURRENT_DATE() | fields `CURRENT_DATE()`,SUCCESS +source=dev.default.people | eval `CURRENT_TIMESTAMP()` = CURRENT_TIMESTAMP() | fields `CURRENT_TIMESTAMP()`,SUCCESS +source=dev.default.people | eval `DATE('2020-08-26')` = DATE('2020-08-26') | fields `DATE('2020-08-26')`,SUCCESS +source=dev.default.people | eval `DATE(TIMESTAMP('2020-08-26 13:49:00'))` = DATE(TIMESTAMP('2020-08-26 13:49:00')) | fields `DATE(TIMESTAMP('2020-08-26 13:49:00'))`,SUCCESS +source=dev.default.people | eval `DATE('2020-08-26 13:49')` = DATE('2020-08-26 13:49') | fields `DATE('2020-08-26 13:49')`,SUCCESS +"source=dev.default.people | eval `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')` = DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS'), `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')` = DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a') | fields `DATE_FORMAT('1998-01-31 13:14:15.012345', 'HH:mm:ss.SSSSSS')`, `DATE_FORMAT(TIMESTAMP('1998-01-31 13:14:15.012345'), 'yyyy-MMM-dd hh:mm:ss a')`",SUCCESS +"source=dev.default.people | eval `'2000-01-02' - '2000-01-01'` = 
DATEDIFF(TIMESTAMP('2000-01-02 00:00:00'), TIMESTAMP('2000-01-01 23:59:59')), `'2001-02-01' - '2004-01-01'` = DATEDIFF(DATE('2001-02-01'), TIMESTAMP('2004-01-01 00:00:00')) | fields `'2000-01-02' - '2000-01-01'`, `'2001-02-01' - '2004-01-01'`",SUCCESS +source=dev.default.people | eval `DAY(DATE('2020-08-26'))` = DAY(DATE('2020-08-26')) | fields `DAY(DATE('2020-08-26'))`,SUCCESS +source=dev.default.people | eval `DAYNAME(DATE('2020-08-26'))` = DAYNAME(DATE('2020-08-26')) | fields `DAYNAME(DATE('2020-08-26'))`,FAILED +source=dev.default.people | eval `CURRENT_TIMEZONE()` = CURRENT_TIMEZONE() | fields `CURRENT_TIMEZONE()`,SUCCESS +source=dev.default.people | eval `UTC_TIMESTAMP()` = UTC_TIMESTAMP() | fields `UTC_TIMESTAMP()`,SUCCESS +"source=dev.default.people | eval `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')` = TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00') | eval `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))` = TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00')) | fields `TIMESTAMPDIFF(YEAR, '1997-01-01 00:00:00', '2001-03-06 00:00:00')`, `TIMESTAMPDIFF(SECOND, timestamp('1997-01-01 00:00:23'), timestamp('1997-01-01 00:00:00'))`",SUCCESS +"source=dev.default.people | eval `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')` = TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00') | eval `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')` = TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00') | fields `TIMESTAMPADD(DAY, 17, '2000-01-01 00:00:00')`, `TIMESTAMPADD(QUARTER, -1, '2000-01-01 00:00:00')`",SUCCESS + source = dev.default.http_logs | stats count(),SUCCESS +"source = dev.default.http_logs | stats avg(size) as c1, max(size) as c2, min(size) as c3, sum(size) as c4, percentile(size, 50) as c5, stddev_pop(size) as c6, stddev_samp(size) as c7, distinct_count(size) as c8",SUCCESS +"source = dev.default.http_logs | eval c1 = abs(size) | eval c2 = ceil(size) | eval c3 = floor(size) | eval c4 = sqrt(size) | eval c5 = ln(size) | eval c6 = pow(size, 2) | eval c7 = mod(size, 2) | fields c1, c2, c3, c4, c5, c6, c7 | head 10",SUCCESS +"source = dev.default.http_logs | eval c1 = isnull(request) | eval c2 = isnotnull(request) | eval c3 = ifnull(request, ""Unknown"") | eval c4 = nullif(request, ""Unknown"") | eval c5 = isnull(size) | eval c6 = if(like(request, '%bordeaux%'), 'hello', 'world') | fields c1, c2, c3, c4, c5, c6 | head 10",SUCCESS -/* this is block comment */ source = myglue_test.tpch_csv.orders | head 1 // this is line comment,SUCCESS -"/* test in tpch q16, q18, q20 */ source = myglue_test.tpch_csv.orders | head 1 // add source=xx to avoid failure in automation",SUCCESS -"/* test in tpch q4, q21, q22 */ source = myglue_test.tpch_csv.orders | head 1",SUCCESS -"/* test in tpch q2, q11, q15, q17, q20, q22 */ source = myglue_test.tpch_csv.orders | head 1",SUCCESS -"/* test in tpch q7, q8, q9, q13, q15, q22 */ source = myglue_test.tpch_csv.orders | head 1",SUCCESS -/* lots of inner join tests in tpch */ source = myglue_test.tpch_csv.orders | head 1,SUCCESS -/* left join test in tpch q13 */ source = myglue_test.tpch_csv.orders | head 1,SUCCESS -"source = myglue_test.tpch_csv.orders +/* this is block comment */ source = dev.default.orders | head 1 // this is line comment,SUCCESS +"/* test in tpch q16, q18, q20 */ source = dev.default.orders | head 1 // add source=xx to avoid failure in automation",SUCCESS +"/* test in tpch q4, q21, q22 */ source = dev.default.orders | head 1",SUCCESS +"/* 
test in tpch q2, q11, q15, q17, q20, q22 */ source = dev.default.orders | head 1",SUCCESS +"/* test in tpch q7, q8, q9, q13, q15, q22 */ source = dev.default.orders | head 1",SUCCESS +/* lots of inner join tests in tpch */ source = dev.default.orders | head 1,SUCCESS +/* left join test in tpch q13 */ source = dev.default.orders | head 1,SUCCESS +"source = dev.default.orders | right outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') - myglue_test.tpch_csv.customer + dev.default.customer | stats count(o_orderkey) as c_count by c_custkey | sort - c_count",SUCCESS -"source = myglue_test.tpch_csv.orders +"source = dev.default.orders | full outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') - myglue_test.tpch_csv.customer + dev.default.customer | stats count(o_orderkey) as c_count by c_custkey | sort - c_count",SUCCESS -"source = myglue_test.tpch_csv.customer -| semi join ON c_custkey = o_custkey myglue_test.tpch_csv.orders +"source = dev.default.customer +| semi join ON c_custkey = o_custkey dev.default.orders | where c_mktsegment = 'BUILDING' | sort - c_custkey | head 10",SUCCESS -"source = myglue_test.tpch_csv.customer -| anti join ON c_custkey = o_custkey myglue_test.tpch_csv.orders +"source = dev.default.customer +| anti join ON c_custkey = o_custkey dev.default.orders | where c_mktsegment = 'BUILDING' | sort - c_custkey | head 10",SUCCESS -"source = myglue_test.tpch_csv.supplier +"source = dev.default.supplier | where like(s_comment, '%Customer%Complaints%') -| join ON s_nationkey > n_nationkey [ source = myglue_test.tpch_csv.nation | where n_name = 'SAUDI ARABIA' ] +| join ON s_nationkey > n_nationkey [ source = dev.default.nation | where n_name = 'SAUDI ARABIA' ] | sort - s_name | head 10",SUCCESS -"source = myglue_test.tpch_csv.supplier +"source = dev.default.supplier | where like(s_comment, '%Customer%Complaints%') -| join [ source = myglue_test.tpch_csv.nation | where n_name = 'SAUDI ARABIA' ] +| join [ source = dev.default.nation | where n_name = 'SAUDI ARABIA' ] | sort - s_name | head 10",SUCCESS -source=myglue_test.default.people | LOOKUP myglue_test.default.work_info uid AS id REPLACE department | stats distinct_count(department),SUCCESS -source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id APPEND department | stats distinct_count(department),SUCCESS -source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id REPLACE department AS country | stats distinct_count(country),SUCCESS -source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS id APPEND department AS country | stats distinct_count(country),SUCCESS -"source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uID AS id, name REPLACE department | stats distinct_count(department)",SUCCESS -"source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uid AS ID, name APPEND department | stats distinct_count(department)",SUCCESS -"source = myglue_test.default.people| LOOKUP myglue_test.default.work_info uID AS id, name | head 10",SUCCESS -"source = myglue_test.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP myglue_test.default.work_info name REPLACE occupation AS major | stats distinct_count(major)",SUCCESS -"source = myglue_test.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP myglue_test.default.work_info name APPEND occupation AS major | stats distinct_count(major)",SUCCESS 
-"source = myglue_test.default.http_logs | eval res = json('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json('{""f1"":""abc"",""f2"":{""f3"":""a"",""f4"":""b""}}') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json('[1,2,3,{""f1"":1,""f2"":[5,6]},4]') | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json('[]') | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json(‘{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json('{""invalid"": ""json""') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json('[1,2,3]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json(‘[1,2') | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json('[invalid json]') | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json('invalid json') | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json(null) | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array('this', 'is', 'a', 'string', 'array') | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json_array() | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array(1, 2, 0, -1, 1.1, -0.11) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array('this', 'is', 1.1, -0.11, true, false) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = array_length(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = array_length(json_array()) | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json_array_length('[]') | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array_length('[1,2,3,{""f1"":1,""f2"":[5,6]},4]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array_length('{\""key\"": 1}') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_array_length('[1,2') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', 'string_value')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', 123.45)) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', true)) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object(""a"", 1, ""b"", 2, ""c"", 3)) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', array())) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object('key', array(1, 2, 3))) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = 
to_json_string(json_object('outer', json_object('inner', 123.45))) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = to_json_string(json_object(""array"", json_array(1,2,0,-1,1.1,-0.11))) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | where json_valid(('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1",SUCCESS -"source = myglue_test.default.http_logs | where not json_valid(('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('{""f1"":""abc"",""f2"":{""f3"":""a"",""f4"":""b""}}')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2,3,{""f1"":1,""f2"":[5,6]},4]')) | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json_keys(json('[]')) | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json(‘{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('{""invalid"": ""json""')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2,3]')) | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_keys(json('[1,2')) | head 1 | fields res",SUCCESS -source = myglue_test.default.http_logs | eval res = json_keys(json('[invalid json]')) | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json_keys(json('invalid json')) | head 1 | fields res,SUCCESS -source = myglue_test.default.http_logs | eval res = json_keys(json(null)) | head 1 | fields res,SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.teacher') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[*]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[0]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[*].name') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[1].name') | head 1 | fields res",SUCCESS 
-"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[0].not_exist_key') | head 1 | fields res",SUCCESS -"source = myglue_test.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[10]') | head 1 | fields res",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > -10) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(json_object(""a"",1,""b"",-1),json_object(""a"",-1,""b"",-1)), result = forall(array, x -> x.a > 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(json_object(""a"",1,""b"",-1),json_object(""a"",-1,""b"",-1)), result = exists(array, x -> x.b < 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 10) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 0) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 10) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,3), result = transform(array, x -> x + 1) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,3), result = transform(array, (x, y) -> x + y) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x) | head 1 | fields result",SUCCESS -"source = myglue_test.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x, acc -> acc * 10) | head 1 | fields result",SUCCESS -source=myglue_test.default.people | eval age = salary | eventstats avg(age) | sort id | head 10,SUCCESS -"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count | sort id | head 10",SUCCESS -source=myglue_test.default.people | eventstats avg(salary) by country | sort id | head 10,SUCCESS -"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by country | sort id | head 10",SUCCESS -"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count +source=dev.default.people | LOOKUP dev.default.work_info uid AS id REPLACE department | stats distinct_count(department),SUCCESS +source = dev.default.people| LOOKUP dev.default.work_info uid AS id APPEND department | stats distinct_count(department),SUCCESS +source = dev.default.people| LOOKUP dev.default.work_info uid AS id REPLACE department AS country | stats distinct_count(country),SUCCESS +source = dev.default.people| 
LOOKUP dev.default.work_info uid AS id APPEND department AS country | stats distinct_count(country),SUCCESS +"source = dev.default.people| LOOKUP dev.default.work_info uID AS id, name REPLACE department | stats distinct_count(department)",SUCCESS +"source = dev.default.people| LOOKUP dev.default.work_info uid AS ID, name APPEND department | stats distinct_count(department)",SUCCESS +"source = dev.default.people| LOOKUP dev.default.work_info uID AS id, name | head 10",SUCCESS +"source = dev.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP dev.default.work_info name REPLACE occupation AS major | stats distinct_count(major)",SUCCESS +"source = dev.default.people | eval major = occupation | fields id, name, major, country, salary | LOOKUP dev.default.work_info name APPEND occupation AS major | stats distinct_count(major)",SUCCESS +"source = dev.default.http_logs | eval res = json('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('{""f1"":""abc"",""f2"":{""f3"":""a"",""f4"":""b""}}') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('[1,2,3,{""f1"":1,""f2"":[5,6]},4]') | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json('[]') | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('{""invalid"": ""json""') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('[1,2,3]') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json('[1,2') | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json('[invalid json]') | head 1 | fields res,SUCCESS +source = dev.default.http_logs | eval res = json('invalid json') | head 1 | fields res,SUCCESS +source = dev.default.http_logs | eval res = json(null) | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_array('this', 'is', 'a', 'string', 'array') | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json_array() | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_array(1, 2, 0, -1, 1.1, -0.11) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_array('this', 'is', 1.1, -0.11, true, false) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = array_length(json_array(1,2,0,-1,1.1,-0.11)) | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = array_length(json_array()) | head 1 | fields res,SUCCESS +source = dev.default.http_logs | eval res = json_array_length('[]') | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_array_length('[1,2,3,{""f1"":1,""f2"":[5,6]},4]') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_array_length('{\""key\"": 1}') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_array_length('[1,2') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('key', 'string_value')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval 
res = to_json_string(json_object('key', 123.45)) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('key', true)) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object(""a"", 1, ""b"", 2, ""c"", 3)) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('key', array())) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('key', array(1, 2, 3))) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object('outer', json_object('inner', 123.45))) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = to_json_string(json_object(""array"", json_array(1,2,0,-1,1.1,-0.11))) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | where json_valid('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1",SUCCESS +"source = dev.default.http_logs | where not json_valid('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}') | head 1",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('{""account_number"":1,""balance"":39225,""age"":32,""gender"":""M""}')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('{""f1"":""abc"",""f2"":{""f3"":""a"",""f4"":""b""}}')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('[1,2,3,{""f1"":1,""f2"":[5,6]},4]')) | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json_keys(json('[]')) | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('{""invalid"": ""json""')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('[1,2,3]')) | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_keys(json('[1,2')) | head 1 | fields res",SUCCESS +source = dev.default.http_logs | eval res = json_keys(json('[invalid json]')) | head 1 | fields res,SUCCESS +source = dev.default.http_logs | eval res = json_keys(json('invalid json')) | head 1 | fields res,SUCCESS +source = dev.default.http_logs | eval res = json_keys(json(null)) | head 1 | fields res,SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.teacher') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[*]') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[0]') | head 1 | 
fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[*].name') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[1].name') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[0].not_exist_key') | head 1 | fields res",SUCCESS +"source = dev.default.http_logs | eval res = json_extract('{""teacher"":""Alice"",""student"":[{""name"":""Bob"",""rank"":1},{""name"":""Charlie"",""rank"":2}]}', '$.student[10]') | head 1 | fields res",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = forall(array, x -> x > -10) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(json_object(""a"",1,""b"",-1),json_object(""a"",-1,""b"",-1)), result = forall(array, x -> x.a > 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(json_object(""a"",1,""b"",-1),json_object(""a"",-1,""b"",-1)), result = exists(array, x -> x.b < 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = exists(array, x -> x > 10) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 0) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,0,-1,1.1,-0.11), result = filter(array, x -> x > 10) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,3), result = transform(array, x -> x + 1) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,3), result = transform(array, (x, y) -> x + y) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x) | head 1 | fields result",SUCCESS +"source = dev.default.people | eval array = json_array(1,2,3), result = reduce(array, 0, (acc, x) -> acc + x, acc -> acc * 10) | head 1 | fields result",SUCCESS +source=dev.default.people | eval age = salary | eventstats avg(age) | sort id | head 10,SUCCESS +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count | sort id | head 10",SUCCESS +source=dev.default.people | eventstats avg(salary) by country | sort id | head 10,SUCCESS +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by country | sort id | head 10",SUCCESS +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) | sort id | head 10",SUCCESS -"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as 
avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) as age_span, country | sort id | head 10",SUCCESS -"source=myglue_test.default.people | where country != 'USA' | eventstats stddev_samp(salary), stddev_pop(salary), percentile_approx(salary, 60) by span(salary, 1000) as salary_span | sort id | head 10",SUCCESS -"source=myglue_test.default.people | eval age = salary | eventstats avg(age) as avg_age by occupation, country | eventstats avg(avg_age) as avg_state_age by country | sort id | head 10",SUCCESS -"source=myglue_test.default.people | eventstats distinct_count(salary) by span(salary, 1000) as age_span",FAILED -"source = myglue_test.tpch_csv.lineitem +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age, max(age) as max_age, min(age) as min_age, count(age) as count by span(age, 10) as age_span, country | sort id | head 10",SUCCESS +"source=dev.default.people | where country != 'USA' | eventstats stddev_samp(salary), stddev_pop(salary), percentile_approx(salary, 60) by span(salary, 1000) as salary_span | sort id | head 10",SUCCESS +"source=dev.default.people | eval age = salary | eventstats avg(age) as avg_age by occupation, country | eventstats avg(avg_age) as avg_state_age by country | sort id | head 10",SUCCESS +"source=dev.default.people | eventstats distinct_count(salary) by span(salary, 1000) as age_span",FAILED +"source = dev.default.lineitem | where l_shipdate <= subdate(date('1998-12-01'), 90) | stats sum(l_quantity) as sum_qty, sum(l_extendedprice) as sum_base_price, @@ -277,59 +277,59 @@ by span(age, 10) | sort id | head 10",SUCCESS count() as count_order by l_returnflag, l_linestatus | sort l_returnflag, l_linestatus",SUCCESS -"source = myglue_test.tpch_csv.part -| join ON p_partkey = ps_partkey myglue_test.tpch_csv.partsupp -| join ON s_suppkey = ps_suppkey myglue_test.tpch_csv.supplier -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation -| join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region +"source = dev.default.part +| join ON p_partkey = ps_partkey dev.default.partsupp +| join ON s_suppkey = ps_suppkey dev.default.supplier +| join ON s_nationkey = n_nationkey dev.default.nation +| join ON n_regionkey = r_regionkey dev.default.region | where p_size = 15 AND like(p_type, '%BRASS') AND r_name = 'EUROPE' AND ps_supplycost = [ - source = myglue_test.tpch_csv.partsupp - | join ON s_suppkey = ps_suppkey myglue_test.tpch_csv.supplier - | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation - | join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region + source = dev.default.partsupp + | join ON s_suppkey = ps_suppkey dev.default.supplier + | join ON s_nationkey = n_nationkey dev.default.nation + | join ON n_regionkey = r_regionkey dev.default.region | where r_name = 'EUROPE' | stats MIN(ps_supplycost) ] | sort - s_acctbal, n_name, s_name, p_partkey | head 100",SUCCESS -"source = myglue_test.tpch_csv.customer -| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders -| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem +"source = dev.default.customer +| join ON c_custkey = o_custkey dev.default.orders +| join ON l_orderkey = o_orderkey dev.default.lineitem | where c_mktsegment = 'BUILDING' AND o_orderdate < date('1995-03-15') AND l_shipdate > date('1995-03-15') | stats sum(l_extendedprice * (1 - l_discount)) as revenue by l_orderkey, o_orderdate, o_shippriority | sort - revenue, o_orderdate | head 10",SUCCESS -"source = myglue_test.tpch_csv.orders +"source = 
dev.default.orders | where o_orderdate >= date('1993-07-01') and o_orderdate < date_add(date('1993-07-01'), interval 3 month) and exists [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_orderkey = o_orderkey and l_commitdate < l_receiptdate ] | stats count() as order_count by o_orderpriority | sort o_orderpriority",SUCCESS -"source = myglue_test.tpch_csv.customer -| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders -| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem -| join ON l_suppkey = s_suppkey AND c_nationkey = s_nationkey myglue_test.tpch_csv.supplier -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation -| join ON n_regionkey = r_regionkey myglue_test.tpch_csv.region +"source = dev.default.customer +| join ON c_custkey = o_custkey dev.default.orders +| join ON l_orderkey = o_orderkey dev.default.lineitem +| join ON l_suppkey = s_suppkey AND c_nationkey = s_nationkey dev.default.supplier +| join ON s_nationkey = n_nationkey dev.default.nation +| join ON n_regionkey = r_regionkey dev.default.region | where r_name = 'ASIA' AND o_orderdate >= date('1994-01-01') AND o_orderdate < date_add(date('1994-01-01'), interval 1 year) | stats sum(l_extendedprice * (1 - l_discount)) as revenue by n_name | sort - revenue",SUCCESS -"source = myglue_test.tpch_csv.lineitem +"source = dev.default.lineitem | where l_shipdate >= date('1994-01-01') and l_shipdate < adddate(date('1994-01-01'), 365) and l_discount between .06 - 0.01 and .06 + 0.01 and l_quantity < 24 | stats sum(l_extendedprice * l_discount) as revenue",SUCCESS "source = [ - source = myglue_test.tpch_csv.supplier - | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.lineitem - | join ON o_orderkey = l_orderkey myglue_test.tpch_csv.orders - | join ON c_custkey = o_custkey myglue_test.tpch_csv.customer - | join ON s_nationkey = n1.n_nationkey myglue_test.tpch_csv.nation as n1 - | join ON c_nationkey = n2.n_nationkey myglue_test.tpch_csv.nation as n2 + source = dev.default.supplier + | join ON s_suppkey = l_suppkey dev.default.lineitem + | join ON o_orderkey = l_orderkey dev.default.orders + | join ON c_custkey = o_custkey dev.default.customer + | join ON s_nationkey = n1.n_nationkey dev.default.nation as n1 + | join ON c_nationkey = n2.n_nationkey dev.default.nation as n2 | where l_shipdate between date('1995-01-01') and date('1996-12-31') and n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY' or n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE' | eval supp_nation = n1.n_name, cust_nation = n2.n_name, l_year = year(l_shipdate), volume = l_extendedprice * (1 - l_discount) @@ -338,14 +338,14 @@ by span(age, 10) | sort id | head 10",SUCCESS | stats sum(volume) as revenue by supp_nation, cust_nation, l_year | sort supp_nation, cust_nation, l_year",SUCCESS "source = [ - source = myglue_test.tpch_csv.part - | join ON p_partkey = l_partkey myglue_test.tpch_csv.lineitem - | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.supplier - | join ON l_orderkey = o_orderkey myglue_test.tpch_csv.orders - | join ON o_custkey = c_custkey myglue_test.tpch_csv.customer - | join ON c_nationkey = n1.n_nationkey myglue_test.tpch_csv.nation as n1 - | join ON s_nationkey = n2.n_nationkey myglue_test.tpch_csv.nation as n2 - | join ON n1.n_regionkey = r_regionkey myglue_test.tpch_csv.region + source = dev.default.part + | join ON p_partkey = l_partkey dev.default.lineitem + | join ON s_suppkey = l_suppkey dev.default.supplier + | join ON l_orderkey = o_orderkey dev.default.orders + | join ON o_custkey = 
c_custkey dev.default.customer + | join ON c_nationkey = n1.n_nationkey dev.default.nation as n1 + | join ON s_nationkey = n2.n_nationkey dev.default.nation as n2 + | join ON n1.n_regionkey = r_regionkey dev.default.region | where r_name = 'AMERICA' AND p_type = 'ECONOMY ANODIZED STEEL' and o_orderdate between date('1995-01-01') and date('1996-12-31') | eval o_year = year(o_orderdate) @@ -358,12 +358,12 @@ by span(age, 10) | sort id | head 10",SUCCESS | fields mkt_share, o_year | sort o_year",SUCCESS "source = [ - source = myglue_test.tpch_csv.part - | join ON p_partkey = l_partkey myglue_test.tpch_csv.lineitem - | join ON s_suppkey = l_suppkey myglue_test.tpch_csv.supplier - | join ON ps_partkey = l_partkey and ps_suppkey = l_suppkey myglue_test.tpch_csv.partsupp - | join ON o_orderkey = l_orderkey myglue_test.tpch_csv.orders - | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation + source = dev.default.part + | join ON p_partkey = l_partkey dev.default.lineitem + | join ON s_suppkey = l_suppkey dev.default.supplier + | join ON ps_partkey = l_partkey and ps_suppkey = l_suppkey dev.default.partsupp + | join ON o_orderkey = l_orderkey dev.default.orders + | join ON s_nationkey = n_nationkey dev.default.nation | where like(p_name, '%green%') | eval nation = n_name | eval o_year = year(o_orderdate) @@ -372,33 +372,33 @@ by span(age, 10) | sort id | head 10",SUCCESS ] as profit | stats sum(amount) as sum_profit by nation, o_year | sort nation, - o_year",SUCCESS -"source = myglue_test.tpch_csv.customer -| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders -| join ON l_orderkey = o_orderkey myglue_test.tpch_csv.lineitem -| join ON c_nationkey = n_nationkey myglue_test.tpch_csv.nation +"source = dev.default.customer +| join ON c_custkey = o_custkey dev.default.orders +| join ON l_orderkey = o_orderkey dev.default.lineitem +| join ON c_nationkey = n_nationkey dev.default.nation | where o_orderdate >= date('1993-10-01') AND o_orderdate < date_add(date('1993-10-01'), interval 3 month) AND l_returnflag = 'R' | stats sum(l_extendedprice * (1 - l_discount)) as revenue by c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment | sort - revenue | head 20",SUCCESS -"source = myglue_test.tpch_csv.partsupp -| join ON ps_suppkey = s_suppkey myglue_test.tpch_csv.supplier -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation +"source = dev.default.partsupp +| join ON ps_suppkey = s_suppkey dev.default.supplier +| join ON s_nationkey = n_nationkey dev.default.nation | where n_name = 'GERMANY' | stats sum(ps_supplycost * ps_availqty) as value by ps_partkey | where value > [ - source = myglue_test.tpch_csv.partsupp - | join ON ps_suppkey = s_suppkey myglue_test.tpch_csv.supplier - | join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation + source = dev.default.partsupp + | join ON ps_suppkey = s_suppkey dev.default.supplier + | join ON s_nationkey = n_nationkey dev.default.nation | where n_name = 'GERMANY' | stats sum(ps_supplycost * ps_availqty) as check | eval threshold = check * 0.0001000000 | fields threshold ] | sort - value",SUCCESS -"source = myglue_test.tpch_csv.orders -| join ON o_orderkey = l_orderkey myglue_test.tpch_csv.lineitem +"source = dev.default.orders +| join ON o_orderkey = l_orderkey dev.default.lineitem | where l_commitdate < l_receiptdate and l_shipdate < l_commitdate and l_shipmode in ('MAIL', 'SHIP') @@ -409,32 +409,32 @@ by span(age, 10) | sort id | head 10",SUCCESS by l_shipmode | sort l_shipmode",SUCCESS "source = [ - source = 
myglue_test.tpch_csv.customer + source = dev.default.customer | left outer join ON c_custkey = o_custkey AND not like(o_comment, '%special%requests%') - myglue_test.tpch_csv.orders + dev.default.orders | stats count(o_orderkey) as c_count by c_custkey ] as c_orders | stats count() as custdist by c_count | sort - custdist, - c_count",SUCCESS -"source = myglue_test.tpch_csv.lineitem +"source = dev.default.lineitem | join ON l_partkey = p_partkey AND l_shipdate >= date('1995-09-01') AND l_shipdate < date_add(date('1995-09-01'), interval 1 month) - myglue_test.tpch_csv.part + dev.default.part | stats sum(case(like(p_type, 'PROMO%'), l_extendedprice * (1 - l_discount) else 0)) as sum1, sum(l_extendedprice * (1 - l_discount)) as sum2 | eval promo_revenue = 100.00 * sum1 / sum2 // Stats and Eval commands can combine when issues/819 resolved | fields promo_revenue",SUCCESS -"source = myglue_test.tpch_csv.supplier +"source = dev.default.supplier | join right = revenue0 ON s_suppkey = supplier_no [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 month) | eval supplier_no = l_suppkey | stats sum(l_extendedprice * (1 - l_discount)) as total_revenue by supplier_no ] | where total_revenue = [ source = [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 month) | eval supplier_no = l_suppkey | stats sum(l_extendedprice * (1 - l_discount)) as total_revenue by supplier_no @@ -443,24 +443,24 @@ by span(age, 10) | sort id | head 10",SUCCESS ] | sort s_suppkey | fields s_suppkey, s_name, s_address, s_phone, total_revenue",SUCCESS -"source = myglue_test.tpch_csv.partsupp -| join ON p_partkey = ps_partkey myglue_test.tpch_csv.part +"source = dev.default.partsupp +| join ON p_partkey = ps_partkey dev.default.part | where p_brand != 'Brand#45' and not like(p_type, 'MEDIUM POLISHED%') and p_size in (49, 14, 23, 45, 19, 3, 36, 9) and ps_suppkey not in [ - source = myglue_test.tpch_csv.supplier + source = dev.default.supplier | where like(s_comment, '%Customer%Complaints%') | fields s_suppkey ] | stats distinct_count(ps_suppkey) as supplier_cnt by p_brand, p_type, p_size | sort - supplier_cnt, p_brand, p_type, p_size",SUCCESS -"source = myglue_test.tpch_csv.lineitem -| join ON p_partkey = l_partkey myglue_test.tpch_csv.part +"source = dev.default.lineitem +| join ON p_partkey = l_partkey dev.default.part | where p_brand = 'Brand#23' and p_container = 'MED BOX' and l_quantity < [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_partkey = p_partkey | stats avg(l_quantity) as avg | eval `0.2 * avg` = 0.2 * avg @@ -469,11 +469,11 @@ by span(age, 10) | sort id | head 10",SUCCESS | stats sum(l_extendedprice) as sum | eval avg_yearly = sum / 7.0 | fields avg_yearly",SUCCESS -"source = myglue_test.tpch_csv.customer -| join ON c_custkey = o_custkey myglue_test.tpch_csv.orders -| join ON o_orderkey = l_orderkey myglue_test.tpch_csv.lineitem +"source = dev.default.customer +| join ON c_custkey = o_custkey dev.default.orders +| join ON o_orderkey = l_orderkey dev.default.lineitem | where o_orderkey in [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | stats sum(l_quantity) as sum by l_orderkey | where sum > 300 | fields l_orderkey @@ -481,7 +481,7 @@ by span(age, 10) | sort id | head 10",SUCCESS | stats sum(l_quantity) by c_name, 
c_custkey, o_orderkey, o_orderdate, o_totalprice | sort - o_totalprice, o_orderdate | head 100",SUCCESS -"source = myglue_test.tpch_csv.lineitem +"source = dev.default.lineitem | join ON p_partkey = l_partkey and p_brand = 'Brand#12' and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') @@ -503,19 +503,19 @@ by span(age, 10) | sort id | head 10",SUCCESS and p_size between 1 and 15 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' - myglue_test.tpch_csv.part",SUCCESS -"source = myglue_test.tpch_csv.supplier -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation + dev.default.part",SUCCESS +"source = dev.default.supplier +| join ON s_nationkey = n_nationkey dev.default.nation | where n_name = 'CANADA' and s_suppkey in [ - source = myglue_test.tpch_csv.partsupp + source = dev.default.partsupp | where ps_partkey in [ - source = myglue_test.tpch_csv.part + source = dev.default.part | where like(p_name, 'forest%') | fields p_partkey ] and ps_availqty > [ - source = myglue_test.tpch_csv.lineitem + source = dev.default.lineitem | where l_partkey = ps_partkey and l_suppkey = ps_suppkey and l_shipdate >= date('1994-01-01') @@ -526,19 +526,19 @@ by span(age, 10) | sort id | head 10",SUCCESS ] | fields ps_suppkey ]",SUCCESS -"source = myglue_test.tpch_csv.supplier -| join ON s_suppkey = l1.l_suppkey myglue_test.tpch_csv.lineitem as l1 -| join ON o_orderkey = l1.l_orderkey myglue_test.tpch_csv.orders -| join ON s_nationkey = n_nationkey myglue_test.tpch_csv.nation +"source = dev.default.supplier +| join ON s_suppkey = l1.l_suppkey dev.default.lineitem as l1 +| join ON o_orderkey = l1.l_orderkey dev.default.orders +| join ON s_nationkey = n_nationkey dev.default.nation | where o_orderstatus = 'F' and l1.l_receiptdate > l1.l_commitdate and exists [ - source = myglue_test.tpch_csv.lineitem as l2 + source = dev.default.lineitem as l2 | where l2.l_orderkey = l1.l_orderkey and l2.l_suppkey != l1.l_suppkey ] and not exists [ - source = myglue_test.tpch_csv.lineitem as l3 + source = dev.default.lineitem as l3 | where l3.l_orderkey = l1.l_orderkey and l3.l_suppkey != l1.l_suppkey and l3.l_receiptdate > l3.l_commitdate @@ -548,16 +548,16 @@ by span(age, 10) | sort id | head 10",SUCCESS | sort - numwait, s_name | head 100",SUCCESS "source = [ - source = myglue_test.tpch_csv.customer + source = dev.default.customer | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') and c_acctbal > [ - source = myglue_test.tpch_csv.customer + source = dev.default.customer | where c_acctbal > 0.00 and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') | stats avg(c_acctbal) ] and not exists [ - source = myglue_test.tpch_csv.orders + source = dev.default.orders | where o_custkey = c_custkey ] | eval cntrycode = substring(c_phone, 1, 2) diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala index 7c19cab12..5ea123c9d 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala @@ -771,6 +771,79 @@ trait FlintSparkSuite extends QueryTest with FlintSuite with OpenSearchSuite wit | """.stripMargin) } + protected def createGeoIpTestTable(testTable: String): Unit = { + sql(s""" + | CREATE TABLE $testTable + | ( + | ip STRING, + | ipv4 STRING, + | isValid BOOLEAN + | ) + | USING $tableType $tableOptions + 
|""".stripMargin) + + sql(s""" + | INSERT INTO $testTable + | VALUES ('66.249.157.90', '66.249.157.90', true), + | ('2a09:bac2:19f8:2ac3::', 'Given IPv6 is not mapped to IPv4', true), + | ('192.168.2.', '192.168.2.', false), + | ('2001:db8::ff00:12:', 'Given IPv6 is not mapped to IPv4', false) + | """.stripMargin) + } + + protected def createGeoIpTable(): Unit = { + sql(s""" + | CREATE TABLE geoip + | ( + | cidr STRING, + | country_iso_code STRING, + | country_name STRING, + | continent_name STRING, + | region_iso_code STRING, + | region_name STRING, + | city_name STRING, + | time_zone STRING, + | location STRING, + | ip_range_start DECIMAL(38,0), + | ip_range_end DECIMAL(38,0), + | ipv4 BOOLEAN + | ) + | USING $tableType $tableOptions + |""".stripMargin) + + sql(s""" + | INSERT INTO geoip + | VALUES ( + | '66.249.157.0/24', + | 'JM', + | 'Jamaica', + | 'North America', + | '14', + | 'Saint Catherine Parish', + | 'Portmore', + | 'America/Jamaica', + | '17.9686,-76.8827', + | 1123654912, + | 1123655167, + | true + | ), + | ( + | '2a09:bac2:19f8::/45', + | 'CA', + | 'Canada', + | 'North America', + | 'PE', + | 'Prince Edward Island', + | 'Charlottetown', + | 'America/Halifax', + | '46.2396,-63.1355', + | 55878094401180025937395073088449675264, + | 55878094401189697343951990121847324671, + | false + | ) + | """.stripMargin) + } + protected def createNestedJsonContentTable(tempFile: Path, testTable: String): Unit = { val json = """ diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGeoipITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGeoipITSuite.scala new file mode 100644 index 000000000..7031ab067 --- /dev/null +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLGeoipITSuite.scala @@ -0,0 +1,314 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.ppl + +import java.util + +import org.opensearch.sql.expression.function.SerializableUdf.visit +import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq + +import org.apache.spark.SparkException +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar} +import org.apache.spark.sql.catalyst.expressions.{Alias, And, CreateNamedStruct, EqualTo, Expression, GreaterThanOrEqual, LessThan, Literal} +import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.catalyst.plans.LeftOuter +import org.apache.spark.sql.catalyst.plans.logical.{DataFrameDropColumns, Filter, Join, JoinHint, LogicalPlan, Project, SubqueryAlias} +import org.apache.spark.sql.streaming.StreamTest + +class FlintSparkPPLGeoipITSuite + extends QueryTest + with LogicalPlanTestUtils + with FlintPPLSuite + with StreamTest { + + /** Test table and index name */ + private val testTable = "spark_catalog.default.flint_ppl_test" + override def beforeAll(): Unit = { + super.beforeAll() + + // Create test table + createGeoIpTestTable(testTable) + createGeoIpTable() + } + + protected override def afterEach(): Unit = { + super.afterEach() + // Stop all streaming jobs if any + spark.streams.active.foreach { job => + job.stop() + job.awaitTermination() + } + } + + private def getGeoIpQueryPlan( + ipAddress: UnresolvedAttribute, + left: LogicalPlan, + right: LogicalPlan, + projectionProperties: Alias): LogicalPlan = { + val joinPlan = getJoinPlan(ipAddress, left, right) + getProjection(joinPlan, 
projectionProperties) + } + + private def getJoinPlan( + ipAddress: UnresolvedAttribute, + left: LogicalPlan, + right: LogicalPlan): LogicalPlan = { + val is_ipv4 = visit("is_ipv4", util.List.of[Expression](ipAddress)) + val ip_to_int = visit("ip_to_int", util.List.of[Expression](ipAddress)) + + val t1 = SubqueryAlias("t1", left) + val t2 = SubqueryAlias("t2", right) + + val joinCondition = And( + And( + GreaterThanOrEqual(ip_to_int, UnresolvedAttribute("t2.ip_range_start")), + LessThan(ip_to_int, UnresolvedAttribute("t2.ip_range_end"))), + EqualTo(is_ipv4, UnresolvedAttribute("t2.ipv4"))) + Join(t1, t2, LeftOuter, Some(joinCondition), JoinHint.NONE) + } + + private def getProjection(joinPlan: LogicalPlan, projectionProperties: Alias): LogicalPlan = { + val projection = Project(Seq(UnresolvedStar(None), projectionProperties), joinPlan) + val dropList = Seq( + "t2.country_iso_code", + "t2.country_name", + "t2.continent_name", + "t2.region_iso_code", + "t2.region_name", + "t2.city_name", + "t2.time_zone", + "t2.location", + "t2.cidr", + "t2.ip_range_start", + "t2.ip_range_end", + "t2.ipv4").map(UnresolvedAttribute(_)) + DataFrameDropColumns(dropList, projection) + } + + test("test geoip with no parameters") { + val frame = sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip) | fields ip, a + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + + // Define the expected results + val expectedResults: Array[Row] = Array( + Row( + "66.249.157.90", + Row( + "JM", + "Jamaica", + "North America", + "14", + "Saint Catherine Parish", + "Portmore", + "America/Jamaica", + "17.9686,-76.8827")), + Row( + "2a09:bac2:19f8:2ac3::", + Row( + "CA", + "Canada", + "North America", + "PE", + "Prince Edward Island", + "Charlottetown", + "America/Halifax", + "46.2396,-63.1355"))) + + // Compare the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + val projectionStruct = CreateNamedStruct( + Seq( + Literal("country_iso_code"), + UnresolvedAttribute("t2.country_iso_code"), + Literal("country_name"), + UnresolvedAttribute("t2.country_name"), + Literal("continent_name"), + UnresolvedAttribute("t2.continent_name"), + Literal("region_iso_code"), + UnresolvedAttribute("t2.region_iso_code"), + Literal("region_name"), + UnresolvedAttribute("t2.region_name"), + Literal("city_name"), + UnresolvedAttribute("t2.city_name"), + Literal("time_zone"), + UnresolvedAttribute("t2.time_zone"), + Literal("location"), + UnresolvedAttribute("t2.location"))) + val structProjection = Alias(projectionStruct, "a")() + val geoIpPlan = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjection) + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ip"), UnresolvedAttribute("a")), geoIpPlan) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with one parameters") { + val frame = sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip, country_name) | fields ip, a + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + // Define 
the expected results + val expectedResults: Array[Row] = + Array(Row("66.249.157.90", "Jamaica"), Row("2a09:bac2:19f8:2ac3::", "Canada")) + + // Compare the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + val structProjection = Alias(UnresolvedAttribute("t2.country_name"), "a")() + val geoIpPlan = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjection) + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ip"), UnresolvedAttribute("a")), geoIpPlan) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with multiple parameters") { + val frame = sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip, country_name, city_name) | fields ip, a + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + // Define the expected results + val expectedResults: Array[Row] = Array( + Row("66.249.157.90", Row("Jamaica", "Portmore")), + Row("2a09:bac2:19f8:2ac3::", Row("Canada", "Charlottetown"))) + + // Compare the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + val projectionStruct = CreateNamedStruct( + Seq( + Literal("country_name"), + UnresolvedAttribute("t2.country_name"), + Literal("city_name"), + UnresolvedAttribute("t2.city_name"))) + val structProjection = Alias(projectionStruct, "a")() + val geoIpPlan = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjection) + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ip"), UnresolvedAttribute("a")), geoIpPlan) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with partial projection on evaluated fields") { + val frame = sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip, city_name), b = geoip(ip, country_name) | fields ip, b + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + // Define the expected results + val expectedResults: Array[Row] = + Array(Row("66.249.157.90", "Jamaica"), Row("2a09:bac2:19f8:2ac3::", "Canada")) + + // Compare the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + + val structProjectionA = Alias(UnresolvedAttribute("t2.city_name"), "a")() + val geoIpPlanA = + 
getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjectionA) + + val structProjectionB = Alias(UnresolvedAttribute("t2.country_name"), "b")() + val geoIpPlanB = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), geoIpPlanA, geoTable, structProjectionB) + + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ip"), UnresolvedAttribute("b")), geoIpPlanB) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with projection on field that exists in both source and geoip table") { + val frame = sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip, country_name) | fields ipv4, a + | """.stripMargin) + + // Retrieve the results + val results: Array[Row] = frame.collect() + // Define the expected results + val expectedResults: Array[Row] = + Array(Row("66.249.157.90", "Jamaica"), Row("Given IPv6 is not mapped to IPv4", "Canada")) + + // Compare the results + implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) + assert(results.sorted.sameElements(expectedResults.sorted)) + + // Compare the logical plans + val logicalPlan: LogicalPlan = frame.queryExecution.logical + + val sourceTable: LogicalPlan = Filter( + EqualTo(UnresolvedAttribute("isValid"), Literal(true)), + UnresolvedRelation(testTable.split("\\.").toSeq)) + val geoTable: LogicalPlan = UnresolvedRelation(seq("geoip")) + val structProjection = Alias(UnresolvedAttribute("t2.country_name"), "a")() + val geoIpPlan = + getGeoIpQueryPlan(UnresolvedAttribute("ip"), sourceTable, geoTable, structProjection) + val expectedPlan: LogicalPlan = + Project(Seq(UnresolvedAttribute("ipv4"), UnresolvedAttribute("a")), geoIpPlan) + + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test geoip with invalid parameter") { + assertThrows[ParseException](sql(s""" + | source = $testTable | where isValid = true | eval a = geoip(ip, invalid_param) | fields ip, a + | """.stripMargin)) + } + + test("test geoip with invalid ip address provided") { + val frame = sql(s""" + | source = $testTable | eval a = geoip(ip) | fields ip, a + | """.stripMargin) + + // Retrieve the results + assertThrows[SparkException](frame.collect()) + } +} diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 index ae0c4c73e..8b762dffa 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 @@ -425,9 +425,6 @@ ISPRESENT: 'ISPRESENT'; BETWEEN: 'BETWEEN'; CIDRMATCH: 'CIDRMATCH'; -// Geo Loction -GEOIP: 'GEOIP'; - // FLOWCONTROL FUNCTIONS IFNULL: 'IFNULL'; NULLIF: 'NULLIF'; @@ -437,6 +434,18 @@ TYPEOF: 'TYPEOF'; //OTHER CONDITIONAL EXPRESSIONS COALESCE: 'COALESCE'; +//GEOLOCATION FUNCTIONS +GEOIP: 'GEOIP'; + +//GEOLOCATION PROPERTIES +COUNTRY_ISO_CODE: 'COUNTRY_ISO_CODE'; +COUNTRY_NAME: 'COUNTRY_NAME'; +CONTINENT_NAME: 'CONTINENT_NAME'; +REGION_ISO_CODE: 'REGION_ISO_CODE'; +REGION_NAME: 'REGION_NAME'; +CITY_NAME: 'CITY_NAME'; +LOCATION: 'LOCATION'; + // RELEVANCE FUNCTIONS AND PARAMETERS MATCH: 'MATCH'; MATCH_PHRASE: 'MATCH_PHRASE'; diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 index e461b1c15..c4e30f0d3 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 @@ -415,6 +415,11 @@ sortbyClause evalClause : 
fieldExpression EQUAL expression
+    | geoipCommand
     ;
+
+geoipCommand
+   : fieldExpression EQUAL GEOIP LT_PRTHS ipAddress = functionArg (COMMA properties = geoIpPropertyList)? RT_PRTHS
+   ;
 
 // aggregation terms
@@ -474,7 +479,6 @@ valueExpression
     | positionFunction # positionFunctionCall
     | caseFunction # caseExpr
     | timestampFunction # timestampFunctionCall
-    | geoipFunction # geoFunctionCall
     | LT_PRTHS valueExpression RT_PRTHS # parentheticValueExpr
     | LT_SQR_PRTHS subSearch RT_SQR_PRTHS # scalarSubqueryExpr
     | ident ARROW expression # lambda
@@ -572,11 +576,6 @@ dataTypeFunctionCall
     : CAST LT_PRTHS expression AS convertedDataType RT_PRTHS
     ;
 
-// geoip function
-geoipFunction
-    : GEOIP LT_PRTHS (datasource = functionArg COMMA)? ipAddress = functionArg (COMMA properties = stringLiteral)? RT_PRTHS
-    ;
-
 // boolean functions
 booleanFunctionCall
     : conditionFunctionBase LT_PRTHS functionArgs RT_PRTHS
@@ -610,7 +609,6 @@ evalFunctionName
     | cryptographicFunctionName
     | jsonFunctionName
     | collectionFunctionName
-    | geoipFunctionName
     | lambdaFunctionName
     ;
@@ -928,10 +926,6 @@ lambdaFunctionName
     | TRANSFORM
     | REDUCE
     ;
-
-geoipFunctionName
-    : GEOIP
-    ;
 
 positionFunctionName
     : POSITION
@@ -941,6 +935,21 @@ coalesceFunctionName
     : COALESCE
     ;
 
+geoIpPropertyList
+    : geoIpProperty (COMMA geoIpProperty)*
+    ;
+
+geoIpProperty
+    : COUNTRY_ISO_CODE
+    | COUNTRY_NAME
+    | CONTINENT_NAME
+    | REGION_ISO_CODE
+    | REGION_NAME
+    | CITY_NAME
+    | TIME_ZONE
+    | LOCATION
+    ;
+
 // operators
 comparisonOperator
     : EQUAL
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
index db191a86c..31841430c 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
@@ -346,10 +346,15 @@ public T visitExistsSubquery(ExistsSubquery node, C context) {
   public T visitWindow(Window node, C context) {
     return visitChildren(node, context);
   }
+
   public T visitCidr(Cidr node, C context) {
     return visitChildren(node, context);
   }
+
+  public T visitGeoIp(GeoIp node, C context) {
+    return visitChildren(node, context);
+  }
+
   public T visitFlatten(Flatten flatten, C context) {
     return visitChildren(flatten, context);
   }
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Eval.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Eval.java
index 0cc27b6a9..c8482a4ff 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Eval.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Eval.java
@@ -12,7 +12,7 @@
 import lombok.Setter;
 import lombok.ToString;
 import org.opensearch.sql.ast.AbstractNodeVisitor;
-import org.opensearch.sql.ast.expression.Let;
+import org.opensearch.sql.ast.Node;
 
 import java.util.List;
@@ -23,7 +23,7 @@
 @EqualsAndHashCode(callSuper = false)
 @RequiredArgsConstructor
 public class Eval extends UnresolvedPlan {
-  private final List<Let> expressionList;
+  private final List<Node> expressionList;
   private UnresolvedPlan child;
 
   @Override
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java
new file mode 100644
index 000000000..feefa6929
--- /dev/null
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.sql.ast.tree;
+
+import com.google.common.collect.ImmutableList;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.RequiredArgsConstructor;
+import lombok.ToString;
+import org.opensearch.sql.ast.AbstractNodeVisitor;
+import org.opensearch.sql.ast.Node;
+import org.opensearch.sql.ast.expression.AttributeList;
+import org.opensearch.sql.ast.expression.Field;
+import org.opensearch.sql.ast.expression.UnresolvedExpression;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;
+
+@Getter
+@RequiredArgsConstructor
+@EqualsAndHashCode(callSuper = false)
+public class GeoIp extends UnresolvedPlan {
+    private UnresolvedPlan child;
+    private final Field field;
+    private final UnresolvedExpression ipAddress;
+    private final AttributeList properties;
+
+    @Override
+    public List<Node> getChild() {
+        return ImmutableList.of(child);
+    }
+
+    @Override
+    public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) {
+        return nodeVisitor.visitGeoIp(this, context);
+    }
+
+    @Override
+    public UnresolvedPlan attach(UnresolvedPlan child) {
+        this.child = child;
+        return this;
+    }
+}
\ No newline at end of file
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java
index e80a26bc4..e931175ff 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java
@@ -11,13 +11,18 @@
 import org.apache.spark.sql.catalyst.expressions.Expression;
 import org.apache.spark.sql.catalyst.expressions.ScalaUDF;
 import org.apache.spark.sql.types.DataTypes;
+import scala.Function1;
 import scala.Function2;
 import scala.Option;
 import scala.Serializable;
+import scala.runtime.AbstractFunction1;
+import scala.runtime.AbstractFunction2;
 import scala.collection.JavaConverters;
 import scala.collection.mutable.WrappedArray;
-import scala.runtime.AbstractFunction2;
+import java.math.BigInteger;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
@@ -28,7 +33,6 @@
 import static org.opensearch.sql.expression.function.JsonUtils.removeNestedKey;
 import static org.opensearch.sql.ppl.utils.DataTypeTransformer.seq;
-
 public interface SerializableUdf {
@@ -142,11 +146,66 @@ public Boolean apply(String ipAddress, String cidrBlock) {
         }
     };
 
+    class geoIpUtils {
+        /**
+         * Checks whether the provided IP string is IPv4 or IPv6.
+         *
+         * @param ipAddress The input IP string.
+         * @return true if the address is IPv4, false if it is IPv6; throws a RuntimeException for an invalid address.
+         */
+        public static Function1<String, Boolean> isIpv4 = new SerializableAbstractFunction1<>() {
+
+            IPAddressStringParameters valOptions = new IPAddressStringParameters.Builder()
+                    .allowEmpty(false)
+                    .setEmptyAsLoopback(false)
+                    .allow_inet_aton(false)
+                    .allowSingleSegment(false)
+                    .toParams();
+
+            @Override
+            public Boolean apply(String ipAddress) {
+                IPAddressString parsedIpAddress = new IPAddressString(ipAddress, valOptions);
+
+                try {
+                    parsedIpAddress.validate();
+                } catch (AddressStringException e) {
+                    throw new RuntimeException("The given ipAddress '" + ipAddress + "' is invalid. It must be a valid IPv4 or IPv6 address. Error details: " + e.getMessage());
+                }
+
+                return parsedIpAddress.isIPv4();
+            }
+        };
+
+        /**
+         * Converts an IP address string to its integer representation.
+         *
+         * @param ipAddress The input IP string.
+         * @return the address as a BigInteger, or null if it cannot be resolved.
+         */
+        public static Function1<String, BigInteger> ipToInt = new SerializableAbstractFunction1<>() {
+            @Override
+            public BigInteger apply(String ipAddress) {
+                try {
+                    InetAddress inetAddress = InetAddress.getByName(ipAddress);
+                    byte[] addressBytes = inetAddress.getAddress();
+                    return new BigInteger(1, addressBytes);
+                } catch (UnknownHostException e) {
+                    System.err.println("Invalid IP address: " + e.getMessage());
+                }
+                return null;
+            }
+        };
+    }
+
+    abstract class SerializableAbstractFunction1<T1, R> extends AbstractFunction1<T1, R>
+            implements Serializable {
+    }
+
     /**
-     * get the function reference according to its name
+     * Get the function reference according to its name.
      *
-     * @param funcName
-     * @return
+     * @param funcName name of the function to retrieve.
+     * @return the ScalaUDF for the given function name.
      */
     static ScalaUDF visit(String funcName, List<Expression> expressions) {
         switch (funcName) {
@@ -177,6 +236,24 @@
                     Option.apply("json_append"),
                     false,
                     true);
+            case "is_ipv4":
+                return new ScalaUDF(geoIpUtils.isIpv4,
+                        DataTypes.BooleanType,
+                        seq(expressions),
+                        seq(),
+                        Option.empty(),
+                        Option.apply("is_ipv4"),
+                        false,
+                        true);
+            case "ip_to_int":
+                return new ScalaUDF(geoIpUtils.ipToInt,
+                        DataTypes.createDecimalType(38, 0),
+                        seq(expressions),
+                        seq(),
+                        Option.empty(),
+                        Option.apply("ip_to_int"),
+                        false,
+                        true);
             default:
                 return null;
         }
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java
index 6d50e4298..22beab605 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java
@@ -57,6 +57,7 @@
 import org.opensearch.sql.ast.tree.FillNull;
 import org.opensearch.sql.ast.tree.Filter;
 import org.opensearch.sql.ast.tree.Flatten;
+import org.opensearch.sql.ast.tree.GeoIp;
 import org.opensearch.sql.ast.tree.Head;
 import org.opensearch.sql.ast.tree.Join;
 import org.opensearch.sql.ast.tree.Kmeans;
@@ -70,9 +71,11 @@
 import org.opensearch.sql.ast.tree.Sort;
 import org.opensearch.sql.ast.tree.SubqueryAlias;
 import org.opensearch.sql.ast.tree.Trendline;
+import org.opensearch.sql.ast.tree.UnresolvedPlan;
 import org.opensearch.sql.ast.tree.Window;
 import org.opensearch.sql.common.antlr.SyntaxCheckException;
 import org.opensearch.sql.ppl.utils.FieldSummaryTransformer;
+import org.opensearch.sql.ppl.utils.GeoIpCatalystLogicalPlanTranslator;
 import org.opensearch.sql.ppl.utils.ParseTransformer;
 import org.opensearch.sql.ppl.utils.ViewUtils;
 import org.opensearch.sql.ppl.utils.SortUtils;
@@ -570,19 +573,63 @@ public LogicalPlan visitRename(Rename node, CatalystPlanContext context) {
     public LogicalPlan visitEval(Eval node, CatalystPlanContext context) {
         visitFirstChild(node, context);
         List<UnresolvedExpression> aliases = new ArrayList<>();
-        List<Let> letExpressions = node.getExpressionList();
-        for (Let let : letExpressions) {
-            Alias alias = new Alias(let.getVar().getField().toString(), let.getExpression());
-            aliases.add(alias);
+        List<Node> expressions = node.getExpressionList();
+
+        // The geoip function rewrites the logical plan itself, so it is handled by the plan visitor rather than the
+        // expression visitor
+        for (Node expr : expressions) {
+            if (expr instanceof Let) {
+                Let let = (Let) expr;
+                Alias alias = new Alias(let.getVar().getField().toString(), let.getExpression());
+                aliases.add(alias);
+            } else if (expr instanceof UnresolvedPlan) {
+                expr.accept(this, context);
+            } else {
+                throw new SyntaxCheckException("Unexpected node type when visiting EVAL");
+            }
         }
-        if (context.getNamedParseExpressions().isEmpty()) {
-            // Create an UnresolvedStar for all-fields projection
-            context.getNamedParseExpressions().push(UnresolvedStar$.MODULE$.apply(Option.<Seq<UnresolvedExpression>>empty()));
+
+        if (!aliases.isEmpty()) {
+            if (context.getNamedParseExpressions().isEmpty()) {
+                // Create an UnresolvedStar for all-fields projection
+                context.getNamedParseExpressions().push(UnresolvedStar$.MODULE$.apply(Option.<Seq<UnresolvedExpression>>empty()));
+            }
+
+            visitExpressionList(aliases, context);
+            Seq<NamedExpression> projectExpressions = context.retainAllNamedParseExpressions(p -> (NamedExpression) p);
+            // build the plan with the projection step
+            return context.apply(p -> new org.apache.spark.sql.catalyst.plans.logical.Project(projectExpressions, p));
+        } else {
+            return context.getPlan();
         }
-        List<Expression> expressionList = visitExpressionList(aliases, context);
-        Seq<NamedExpression> projectExpressions = context.retainAllNamedParseExpressions(p -> (NamedExpression) p);
-        // build the plan with the projection step
-        return context.apply(p -> new org.apache.spark.sql.catalyst.plans.logical.Project(projectExpressions, p));
+    }
+
+    @Override
+    public LogicalPlan visitGeoIp(GeoIp node, CatalystPlanContext context) {
+        visitExpression(node.getProperties(), context);
+        List<String> attributeList = new ArrayList<>();
+
+        while (!context.getNamedParseExpressions().isEmpty()) {
+            Expression nextExpression = context.getNamedParseExpressions().pop();
+            String attributeName = nextExpression.toString();
+
+            if (attributeList.contains(attributeName)) {
+                throw new IllegalStateException("Duplicate attribute in GEOIP attribute list");
+            }
+
+            attributeList.add(0, attributeName);
+        }
+
+        String fieldExpression = node.getField().getField().toString();
+        Expression ipAddressExpression = visitExpression(node.getIpAddress(), context);
+
+        return GeoIpCatalystLogicalPlanTranslator.getGeoipLogicalPlan(
+                new GeoIpCatalystLogicalPlanTranslator.GeoIpParameters(
+                        fieldExpression,
+                        ipAddressExpression,
+                        attributeList),
+                context);
+    }
 
     @Override
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
index fd9240622..bfc45f50e 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
@@ -339,10 +339,18 @@ public UnresolvedPlan visitSortCommand(OpenSearchPPLParser.SortCommandContext ctx) {
     public UnresolvedPlan visitEvalCommand(OpenSearchPPLParser.EvalCommandContext ctx) {
         return new Eval(
             ctx.evalClause().stream()
-                .map(ct -> (Let) internalVisitExpression(ct))
+                .map(ct -> (ct.geoipCommand() != null) ? visit(ct.geoipCommand()) : (Let) internalVisitExpression(ct))
                 .collect(Collectors.toList()));
     }
 
+    @Override
+    public UnresolvedPlan visitGeoipCommand(OpenSearchPPLParser.GeoipCommandContext ctx) {
+        Field field = (Field) internalVisitExpression(ctx.fieldExpression());
+        UnresolvedExpression ipAddress = internalVisitExpression(ctx.ipAddress);
+        AttributeList properties = ctx.properties == null ?
new AttributeList(Collections.emptyList()) : (AttributeList) internalVisitExpression(ctx.properties); + return new GeoIp(field, ipAddress, properties); + } + private List getGroupByList(OpenSearchPPLParser.ByClauseContext ctx) { return ctx.fieldList().fieldExpression().stream() .map(this::internalVisitExpression) diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java index 19f7002b6..da1fa40aa 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java @@ -51,6 +51,7 @@ import org.opensearch.sql.common.antlr.SyntaxCheckException; import org.opensearch.sql.common.utils.StringUtils; import org.opensearch.sql.ppl.utils.ArgumentFactory; +import org.opensearch.sql.ppl.utils.GeoIpCatalystLogicalPlanTranslator; import java.util.Arrays; import java.util.Collections; @@ -454,6 +455,20 @@ public UnresolvedExpression visitLambda(OpenSearchPPLParser.LambdaContext ctx) { return new LambdaFunction(function, arguments); } + @Override + public UnresolvedExpression visitGeoIpPropertyList(OpenSearchPPLParser.GeoIpPropertyListContext ctx) { + ImmutableList.Builder properties = ImmutableList.builder(); + if (ctx != null) { + for (OpenSearchPPLParser.GeoIpPropertyContext property : ctx.geoIpProperty()) { + String propertyName = property.getText().toUpperCase(); + GeoIpCatalystLogicalPlanTranslator.validateGeoIpProperty(propertyName); + properties.add(new Literal(propertyName, DataType.STRING)); + } + } + + return new AttributeList(properties.build()); + } + private List timestampFunctionArguments( OpenSearchPPLParser.TimestampFunctionCallContext ctx) { List args = diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoIpCatalystLogicalPlanTranslator.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoIpCatalystLogicalPlanTranslator.java new file mode 100644 index 000000000..cedc00846 --- /dev/null +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoIpCatalystLogicalPlanTranslator.java @@ -0,0 +1,222 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ppl.utils; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import org.apache.spark.SparkEnv; +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute$; +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation; +import org.apache.spark.sql.catalyst.analysis.UnresolvedStar$; +import org.apache.spark.sql.catalyst.expressions.Alias$; +import org.apache.spark.sql.catalyst.expressions.And; +import org.apache.spark.sql.catalyst.expressions.CreateStruct; +import org.apache.spark.sql.catalyst.expressions.EqualTo; +import org.apache.spark.sql.catalyst.expressions.Expression; +import org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual; +import org.apache.spark.sql.catalyst.expressions.LessThan; +import org.apache.spark.sql.catalyst.expressions.NamedExpression; +import org.apache.spark.sql.catalyst.plans.logical.DataFrameDropColumns; +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; +import org.apache.spark.sql.catalyst.plans.logical.Project; +import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias$; +import org.apache.spark.sql.util.CaseInsensitiveStringMap; +import 
org.opensearch.sql.ast.tree.Join;
+import org.opensearch.sql.expression.function.SerializableUdf;
+import org.opensearch.sql.ppl.CatalystPlanContext;
+import scala.Option;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+import static java.util.List.of;
+
+import static org.opensearch.sql.ppl.utils.DataTypeTransformer.seq;
+import static org.opensearch.sql.ppl.utils.JoinSpecTransformer.join;
+
+public interface GeoIpCatalystLogicalPlanTranslator {
+    String SPARK_CONF_KEY = "spark.geoip.tablename";
+    String DEFAULT_GEOIP_TABLE_NAME = "geoip";
+    String GEOIP_CIDR_COLUMN_NAME = "cidr";
+    String GEOIP_IP_RANGE_START_COLUMN_NAME = "ip_range_start";
+    String GEOIP_IP_RANGE_END_COLUMN_NAME = "ip_range_end";
+    String GEOIP_IPV4_COLUMN_NAME = "ipv4";
+    String SOURCE_TABLE_ALIAS = "t1";
+    String GEOIP_TABLE_ALIAS = "t2";
+    List<String> GEOIP_TABLE_COLUMNS = Arrays.stream(GeoIpProperty.values())
+            .map(Enum::name)
+            .collect(Collectors.toList());
+
+    /**
+     * Produces a Spark logical plan for the given GeoIp command arguments. Below is a sample logical plan
+     * for the configuration [source=users, field=a, ipAddress=ip, properties=[country_name, city_name]]:
+     * +- 'DataFrameDropColumns ['t2.country_iso_code, 't2.country_name, 't2.continent_name, 't2.region_iso_code, 't2.region_name, 't2.city_name, 't2.time_zone, 't2.location, 't2.cidr, 't2.ip_range_start, 't2.ip_range_end, 't2.ipv4]
+     * -- +- 'Project [*, named_struct(country_name, 't2.country_name, city_name, 't2.city_name) AS a#0]
+     * -- -- +- 'Join LeftOuter, (((ip_to_int('ip) >= 't2.ip_range_start) AND (ip_to_int('ip) < 't2.ip_range_end)) AND (is_ipv4('ip) = 't2.ipv4))
+     * -- -- -- :- 'SubqueryAlias t1
+     * -- -- -- -- : +- 'UnresolvedRelation [users], [], false
+     * -- -- -- +- 'SubqueryAlias t2
+     * -- -- -- -- -- +- 'UnresolvedRelation [geoip], [], false
+     * .
+     * And the corresponding SQL query:
+     * .
+     * SELECT users.*, struct(geoip.country_name, geoip.city_name) AS a
+     * FROM users LEFT OUTER JOIN geoip
+     * ON geoip.ip_range_start <= ip_to_int(users.ip)
+     * AND geoip.ip_range_end > ip_to_int(users.ip)
+     * AND geoip.ipv4 = is_ipv4(users.ip);
+     *
+     * @param parameters GeoIp function parameters.
+     * @param context    Context instance used to resolve expressions.
+     * @return a LogicalPlan that projects a new column holding the geo location data for the given IP addresses.
+     */
+    static LogicalPlan getGeoipLogicalPlan(GeoIpParameters parameters, CatalystPlanContext context) {
+        applyJoin(parameters.getIpAddress(), context);
+        return applyProjection(parameters.getField(), parameters.getProperties(), context);
+    }
+
+    /**
+     * Produces the join part of the GeoIp plan. Below is a sample logical plan
+     * for the configuration [source=users, ipAddress=ip]:
+     * +- 'Join LeftOuter, (((ip_to_int('ip) >= 't2.ip_range_start) AND (ip_to_int('ip) < 't2.ip_range_end)) AND (is_ipv4('ip) = 't2.ipv4))
+     * -- :- 'SubqueryAlias t1
+     * -- -- : +- 'UnresolvedRelation [users], [], false
+     * -- +- 'SubqueryAlias t2
+     * -- -- -- +- 'UnresolvedRelation [geoip], [], false
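+     * .
+     * For example (using the integ-test fixture in this change): packing 66.249.157.0
+     * big-endian gives 66*2^24 + 249*2^16 + 157*2^8 + 0 = 1123654912, so the block
+     * '66.249.157.0/24' covers the integer values 1123654912 through 1123655167, and
+     * 66.249.157.90 encodes to 1123654912 + 90 = 1123655002, which satisfies the join
+     * condition above.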
+     *
+     * @param ipAddress Expression representing the IP addresses to look up.
+     * @param context   Context instance used to resolve expressions.
+     * @return a LogicalPlan that left-joins each source row to the geoip table row whose IP range contains the address.
+     */
+    private static LogicalPlan applyJoin(Expression ipAddress, CatalystPlanContext context) {
+        return context.apply(left -> {
+            LogicalPlan right = new UnresolvedRelation(seq(getGeoipTableName()), CaseInsensitiveStringMap.empty(), false);
+            LogicalPlan leftAlias = SubqueryAlias$.MODULE$.apply(SOURCE_TABLE_ALIAS, left);
+            LogicalPlan rightAlias = SubqueryAlias$.MODULE$.apply(GEOIP_TABLE_ALIAS, right);
+            Optional<Expression> joinCondition = Optional.of(new And(
+                    new And(
+                            new GreaterThanOrEqual(
+                                    SerializableUdf.visit("ip_to_int", of(ipAddress)),
+                                    UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_IP_RANGE_START_COLUMN_NAME))),
+                            new LessThan(
+                                    SerializableUdf.visit("ip_to_int", of(ipAddress)),
+                                    UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_IP_RANGE_END_COLUMN_NAME)))),
+                    new EqualTo(
+                            SerializableUdf.visit("is_ipv4", of(ipAddress)),
+                            UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_IPV4_COLUMN_NAME)))));
+            context.retainAllNamedParseExpressions(p -> p);
+            context.retainAllPlans(p -> p);
+            return join(leftAlias,
+                    rightAlias,
+                    Join.JoinType.LEFT,
+                    joinCondition,
+                    new Join.JoinHint());
+        });
+    }
+
+    /**
+     * Produces the projection part of the GeoIp plan. Below is a sample logical plan
+     * for the configuration [source=users, field=a, properties=[country_name, city_name]]:
+     * +- 'DataFrameDropColumns ['t2.country_iso_code, 't2.country_name, 't2.continent_name, 't2.region_iso_code, 't2.region_name, 't2.city_name, 't2.time_zone, 't2.location, 't2.cidr, 't2.ip_range_start, 't2.ip_range_end, 't2.ipv4]
+     * -- +- 'Project [*, named_struct(country_name, 't2.country_name, city_name, 't2.city_name) AS a#0]
+     *
+     * @param field      Name of the new eval geoip column.
+     * @param properties List of geo properties to be returned.
+     * @param context    Context instance used to resolve expressions.
+     * @return a LogicalPlan that returns the source table plus the new geoip column.
+     */
+    private static LogicalPlan applyProjection(String field, List<String> properties, CatalystPlanContext context) {
+        List<NamedExpression> projectExpressions = new ArrayList<>();
+        projectExpressions.add(UnresolvedStar$.MODULE$.apply(Option.empty()));
+
+        List<Expression> geoIpStructFields = createGeoIpStructFields(properties);
+        Expression columnValue = (geoIpStructFields.size() == 1)
+                ? geoIpStructFields.get(0) : CreateStruct.apply(seq(geoIpStructFields));
+
+        NamedExpression geoCol = Alias$.MODULE$.apply(
+                columnValue,
+                field,
+                NamedExpression.newExprId(),
+                seq(new ArrayList<>()),
+                Option.empty(),
+                seq(new ArrayList<>()));
+
+        projectExpressions.add(geoCol);
+
+        List<Expression> dropList = createGeoIpStructFields(new ArrayList<>());
+        dropList.addAll(List.of(
+                UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_CIDR_COLUMN_NAME)),
+                UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_IP_RANGE_START_COLUMN_NAME)),
+                UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_IP_RANGE_END_COLUMN_NAME)),
+                UnresolvedAttribute$.MODULE$.apply(seq(GEOIP_TABLE_ALIAS, GEOIP_IPV4_COLUMN_NAME))));
+
+        context.apply(p -> new Project(seq(projectExpressions), p));
+        return context.apply(p -> new DataFrameDropColumns(seq(dropList), p));
+    }
+
+    private static List<Expression> createGeoIpStructFields(List<String> attributeList) {
+        List<String> attributeListToUse;
+        if (attributeList == null || attributeList.isEmpty()) {
+            attributeListToUse = GEOIP_TABLE_COLUMNS;
+        } else {
+            attributeListToUse = attributeList;
+        }
+
+        return attributeListToUse.stream()
+                .map(a -> UnresolvedAttribute$.MODULE$.apply(seq(
+                        GEOIP_TABLE_ALIAS,
+                        a.toLowerCase(Locale.ROOT))))
+                .collect(Collectors.toList());
+    }
+
+    private static String getGeoipTableName() {
+        String tableName = DEFAULT_GEOIP_TABLE_NAME;
+
+        if (SparkEnv.get() != null && SparkEnv.get().conf() != null) {
+            tableName = SparkEnv.get().conf().get(SPARK_CONF_KEY, DEFAULT_GEOIP_TABLE_NAME);
+        }
+
+        return tableName;
+    }
+
+    @Getter
+    @AllArgsConstructor
+    class GeoIpParameters {
+        private final String field;
+        private final Expression ipAddress;
+        private final List<String> properties;
+    }
+
+    enum GeoIpProperty {
+        COUNTRY_ISO_CODE,
+        COUNTRY_NAME,
+        CONTINENT_NAME,
+        REGION_ISO_CODE,
+        REGION_NAME,
+        CITY_NAME,
+        TIME_ZONE,
+        LOCATION
+    }
+
+    public static void validateGeoIpProperty(String propertyName) {
+        try {
+            GeoIpProperty.valueOf(propertyName);
+        } catch (NullPointerException | IllegalArgumentException e) {
+            throw new IllegalArgumentException("Invalid GeoIp property: " + propertyName);
+        }
+    }
+}
diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGeoipFunctionTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGeoipFunctionTranslatorTestSuite.scala
new file mode 100644
index 000000000..460b9769c
--- /dev/null
+++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanGeoipFunctionTranslatorTestSuite.scala
@@ -0,0 +1,332 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.flint.spark.ppl
+
+import java.util
+
+import org.opensearch.flint.spark.ppl.PlaneUtils.plan
+import org.opensearch.sql.expression.function.SerializableUdf.visit
+import org.opensearch.sql.ppl.{CatalystPlanContext, CatalystQueryPlanVisitor}
+import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq
+import org.scalatest.matchers.should.Matchers
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar}
+import org.apache.spark.sql.catalyst.expressions.{Alias, And, CreateNamedStruct, Descending, EqualTo, Expression, ExprId, GreaterThanOrEqual, In, LessThan, Literal, NamedExpression, ScalaUDF, SortOrder}
+import org.apache.spark.sql.catalyst.plans.{LeftOuter, PlanTest}
+import
org.apache.spark.sql.catalyst.plans.logical.{DataFrameDropColumns, Join, JoinHint, LogicalPlan, Project, Sort, SubqueryAlias} +import org.apache.spark.sql.types.DataTypes + +class PPLLogicalPlanGeoipFunctionTranslatorTestSuite + extends SparkFunSuite + with PlanTest + with LogicalPlanTestUtils + with Matchers { + + private val planTransformer = new CatalystQueryPlanVisitor() + private val pplParser = new PPLSyntaxParser() + + private def getGeoIpQueryPlan( + ipAddress: UnresolvedAttribute, + left: LogicalPlan, + right: LogicalPlan, + projectionProperties: Alias): LogicalPlan = { + val joinPlan = getJoinPlan(ipAddress, left, right) + getProjection(joinPlan, projectionProperties) + } + + private def getJoinPlan( + ipAddress: UnresolvedAttribute, + left: LogicalPlan, + right: LogicalPlan): LogicalPlan = { + val is_ipv4 = visit("is_ipv4", util.List.of[Expression](ipAddress)) + val ip_to_int = visit("ip_to_int", util.List.of[Expression](ipAddress)) + + val t1 = SubqueryAlias("t1", left) + val t2 = SubqueryAlias("t2", right) + + val joinCondition = And( + And( + GreaterThanOrEqual(ip_to_int, UnresolvedAttribute("t2.ip_range_start")), + LessThan(ip_to_int, UnresolvedAttribute("t2.ip_range_end"))), + EqualTo(is_ipv4, UnresolvedAttribute("t2.ipv4"))) + Join(t1, t2, LeftOuter, Some(joinCondition), JoinHint.NONE) + } + + private def getProjection(joinPlan: LogicalPlan, projectionProperties: Alias): LogicalPlan = { + val projection = Project(Seq(UnresolvedStar(None), projectionProperties), joinPlan) + val dropList = Seq( + "t2.country_iso_code", + "t2.country_name", + "t2.continent_name", + "t2.region_iso_code", + "t2.region_name", + "t2.city_name", + "t2.time_zone", + "t2.location", + "t2.cidr", + "t2.ip_range_start", + "t2.ip_range_end", + "t2.ipv4").map(UnresolvedAttribute(_)) + DataFrameDropColumns(dropList, projection) + } + + test("test geoip function - only ip_address provided") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source = users | eval a = geoip(ip_address)"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("users")) + val geoTable = UnresolvedRelation(seq("geoip")) + + val projectionStruct = CreateNamedStruct( + Seq( + Literal("country_iso_code"), + UnresolvedAttribute("t2.country_iso_code"), + Literal("country_name"), + UnresolvedAttribute("t2.country_name"), + Literal("continent_name"), + UnresolvedAttribute("t2.continent_name"), + Literal("region_iso_code"), + UnresolvedAttribute("t2.region_iso_code"), + Literal("region_name"), + UnresolvedAttribute("t2.region_name"), + Literal("city_name"), + UnresolvedAttribute("t2.city_name"), + Literal("time_zone"), + UnresolvedAttribute("t2.time_zone"), + Literal("location"), + UnresolvedAttribute("t2.location"))) + val structProjection = Alias(projectionStruct, "a")() + + val geoIpPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjection) + val expectedPlan = Project(Seq(UnresolvedStar(None)), geoIpPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - source has same name as join alias") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source=t1 | eval a = geoip(ip_address, country_name)"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("t1")) + val geoTable = UnresolvedRelation(seq("geoip")) + val structProjection = 
Alias(UnresolvedAttribute("t2.country_name"), "a")() + + val geoIpPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjection) + val expectedPlan = Project(Seq(UnresolvedStar(None)), geoIpPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - ipAddress col exist in geoip table") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source=t1 | eval a = geoip(cidr, country_name)"), + context) + + val ipAddress = UnresolvedAttribute("cidr") + val sourceTable = UnresolvedRelation(seq("t1")) + val geoTable = UnresolvedRelation(seq("geoip")) + val structProjection = Alias(UnresolvedAttribute("t2.country_name"), "a")() + + val geoIpPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjection) + val expectedPlan = Project(Seq(UnresolvedStar(None)), geoIpPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - duplicate parameters") { + val context = new CatalystPlanContext + + val exception = intercept[IllegalStateException] { + planTransformer.visit( + plan(pplParser, "source=t1 | eval a = geoip(cidr, country_name, country_name)"), + context) + } + + assert(exception.getMessage.contains("Duplicate attribute in GEOIP attribute list")) + } + + test("test geoip function - one property provided") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source=users | eval a = geoip(ip_address, country_name)"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("users")) + val geoTable = UnresolvedRelation(seq("geoip")) + val structProjection = Alias(UnresolvedAttribute("t2.country_name"), "a")() + + val geoIpPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjection) + val expectedPlan = Project(Seq(UnresolvedStar(None)), geoIpPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - multiple properties provided") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source=users | eval a = geoip(ip_address,country_name,location)"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("users")) + val geoTable = UnresolvedRelation(seq("geoip")) + val projectionStruct = CreateNamedStruct( + Seq( + Literal("country_name"), + UnresolvedAttribute("t2.country_name"), + Literal("location"), + UnresolvedAttribute("t2.location"))) + val structProjection = Alias(projectionStruct, "a")() + + val geoIpPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjection) + val expectedPlan = Project(Seq(UnresolvedStar(None)), geoIpPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - multiple geoip calls") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan( + pplParser, + "source=t | eval a = geoip(ip_address, country_iso_code), b = geoip(ip_address, region_iso_code)"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("t")) + val geoTable = UnresolvedRelation(seq("geoip")) + + val structProjectionA = Alias(UnresolvedAttribute("t2.country_iso_code"), "a")() + val colAPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjectionA) + + val structProjectionB = 
Alias(UnresolvedAttribute("t2.region_iso_code"), "b")() + val colBPlan = getGeoIpQueryPlan(ipAddress, colAPlan, geoTable, structProjectionB) + + val expectedPlan = Project(Seq(UnresolvedStar(None)), colBPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - other eval function used between geoip") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan( + pplParser, + "source=t | eval a = geoip(ip_address, time_zone), b = rand(), c = geoip(ip_address, region_name)"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("t")) + val geoTable = UnresolvedRelation(seq("geoip")) + + val structProjectionA = Alias(UnresolvedAttribute("t2.time_zone"), "a")() + val colAPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjectionA) + + val structProjectionC = Alias(UnresolvedAttribute("t2.region_name"), "c")() + val colCPlan = getGeoIpQueryPlan(ipAddress, colAPlan, geoTable, structProjectionC) + + val randProjectList: Seq[NamedExpression] = Seq( + UnresolvedStar(None), + Alias(UnresolvedFunction("rand", Seq.empty, isDistinct = false), "b")()) + val colBPlan = Project(randProjectList, colCPlan) + + val expectedPlan = Project(Seq(UnresolvedStar(None)), colBPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - other eval function used before geoip") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source=t | eval a = rand(), b = geoip(ip_address, city_name)"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("t")) + val geoTable = UnresolvedRelation(seq("geoip")) + + val structProjectionB = Alias(UnresolvedAttribute("t2.city_name"), "b")() + val colBPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjectionB) + + val randProjectList: Seq[NamedExpression] = Seq( + UnresolvedStar(None), + Alias(UnresolvedFunction("rand", Seq.empty, isDistinct = false), "a")()) + val colAPlan = Project(randProjectList, colBPlan) + + val expectedPlan = Project(Seq(UnresolvedStar(None)), colAPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip function - projection on evaluated field") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan(pplParser, "source=users | eval a = geoip(ip_address, country_name) | fields a"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("users")) + val geoTable = UnresolvedRelation(seq("geoip")) + val structProjection = Alias(UnresolvedAttribute("t2.country_name"), "a")() + + val geoIpPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjection) + val expectedPlan = Project(Seq(UnresolvedAttribute("a")), geoIpPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } + + test("test geoip with partial projection on evaluated fields") { + val context = new CatalystPlanContext + + val logPlan = + planTransformer.visit( + plan( + pplParser, + "source=t | eval a = geoip(ip_address, country_iso_code), b = geoip(ip_address, region_iso_code) | fields b"), + context) + + val ipAddress = UnresolvedAttribute("ip_address") + val sourceTable = UnresolvedRelation(seq("t")) + val geoTable = UnresolvedRelation(seq("geoip")) + + val structProjectionA = Alias(UnresolvedAttribute("t2.country_iso_code"), "a")() + val 
colAPlan = getGeoIpQueryPlan(ipAddress, sourceTable, geoTable, structProjectionA) + + val structProjectionB = Alias(UnresolvedAttribute("t2.region_iso_code"), "b")() + val colBPlan = getGeoIpQueryPlan(ipAddress, colAPlan, geoTable, structProjectionB) + + val expectedPlan = Project(Seq(UnresolvedAttribute("b")), colBPlan) + + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } +}
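
A note on the integer encoding used throughout this change: `ip_to_int` packs the raw address bytes into an unsigned big-endian BigInteger, which is why the `geoip` table stores its range bounds as DECIMAL(38,0). The following self-contained Scala sketch (not part of the PR) reproduces the computation and checks it against the fixture values above:

    import java.math.BigInteger
    import java.net.InetAddress

    // Same computation as geoIpUtils.ipToInt: the raw address bytes
    // (4 for IPv4, 16 for IPv6) read as an unsigned big-endian integer.
    def ipToInt(ip: String): BigInteger =
      new BigInteger(1, InetAddress.getByName(ip).getAddress)

    // '66.249.157.0/24' from the geoip fixture spans 1123654912..1123655167,
    // and the test address 66.249.157.90 lands inside that range.
    assert(ipToInt("66.249.157.0") == BigInteger.valueOf(1123654912L))
    assert(ipToInt("66.249.157.90") == BigInteger.valueOf(1123655002L))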
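For intuition about what the translator emits, the rewritten query has roughly the shape of the SQL below. This is an illustrative sketch only: in the real plan, `ip_to_int` and `is_ipv4` are injected as ScalaUDF expressions by `SerializableUdf.visit` rather than resolved as registered SQL functions, so running this verbatim would require registering equivalent UDFs first.

    // Approximate SQL shape of `source = users | eval a = geoip(ip, country_name)`,
    // assuming hypothetical SQL-registered equivalents of ip_to_int / is_ipv4
    // and a spark-shell `spark` session.
    val enriched = spark.sql("""
      SELECT t1.*, t2.country_name AS a
      FROM users t1
      LEFT OUTER JOIN geoip t2
        ON ip_to_int(t1.ip) >= t2.ip_range_start
       AND ip_to_int(t1.ip) <  t2.ip_range_end
       AND is_ipv4(t1.ip) = t2.ipv4
    """)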
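Finally, because `getGeoipTableName()` reads `spark.geoip.tablename` from `SparkEnv.get().conf()` (the driver's SparkConf, not the runtime SQL conf), an override has to be supplied when the session is created, for example via `--conf` on spark-submit or at builder time. A minimal sketch, with a hypothetical table name:

    import org.apache.spark.sql.SparkSession

    // The translator falls back to the table name "geoip" when the key is unset.
    val spark = SparkSession.builder()
      .appName("geoip-lookup")
      .config("spark.geoip.tablename", "spark_catalog.default.my_geoip") // hypothetical
      .getOrCreate()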