From 785eaa587ea7e85fa985a893f82d55f0fb528782 Mon Sep 17 00:00:00 2001
From: Gera Shegalov
Date: Fri, 8 Mar 2024 15:25:30 -0800
Subject: [PATCH] Enable specifying specific integration test methods via
 TESTS environment (#10564)

* WIP

Signed-off-by: Gera Shegalov

* WIP

Signed-off-by: Gera Shegalov

* Enable specifying the pytest using file_or_dir args

```bash
TEST_PARALLEL=0 \
  SPARK_HOME=~/dist/spark-3.1.1-bin-hadoop3.2 \
  TEST_FILE_OR_DIR=~/gits/NVIDIA/spark-rapids/integration_tests/src/main/python/arithmetic_ops_test.py::test_addition \
  ./integration_tests/run_pyspark_from_build.sh --collect-only
```

Signed-off-by: Gera Shegalov

Co-authored-by: Raza Jafri

* Changing to TESTS=module::method

Signed-off-by: Gera Shegalov

---------

Signed-off-by: Gera Shegalov
Co-authored-by: Raza Jafri
---
 integration_tests/README.md                 | 33 +++++++++++++--------
 integration_tests/run_pyspark_from_build.sh | 16 +++++++---
 2 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/integration_tests/README.md b/integration_tests/README.md
index 225b938ac47..7237720a114 100644
--- a/integration_tests/README.md
+++ b/integration_tests/README.md
@@ -161,9 +161,13 @@ at `$SPARK_HOME`.
 It will be very useful to read the contents of the
 [run_pyspark_from_build.sh](run_pyspark_from_build.sh) to get a better insight
 into what is needed as we constantly keep working on to improve and expand the plugin-support.
-The python tests run with pytest and the script honors pytest parameters. Some handy flags are:
-- `-k` <pytest-file-name>. This will run all the tests in that test file.
-- `-k` <test-file-name::test-method-name>. This will also run an individual test.
+The python tests run with pytest and the script honors pytest parameters:
+
+- Specific test modules, methods, and their parametrizations can be selected
+  by setting the `TESTS` environment variable instead of passing positional
+  arguments on the pytest CLI.
+- `-k` <keyword expression>. This will run all the tests satisfying the keyword
+  expression.
 - `-s` Doesn't capture the output and instead prints to the screen.
 - `-v` Increase the verbosity of the tests
 - `-r fExXs` Show extra test summary info as specified by chars: (f)ailed, (E)rror, (x)failed, (X)passed, (s)kipped
@@ -175,7 +179,12 @@ Examples:
 ## running all integration tests for Map
 ./integration_tests/run_pyspark_from_build.sh -k map_test.py
 ## Running a single integration test in map_test
-./integration_tests/run_pyspark_from_build.sh -k test_map_integration_1
+./integration_tests/run_pyspark_from_build.sh -k 'map_test.py and test_map_integration_1'
+## Running tests matching the keyword "exist" from any module
+./integration_tests/run_pyspark_from_build.sh -k exist
+## Running all parametrizations of the method arithmetic_ops_test.py::test_addition
+## and a specific parametrization of array_test.py::test_array_exists[3VL:off-data_gen0]
+TESTS="arithmetic_ops_test.py::test_addition array_test.py::test_array_exists[3VL:off-data_gen0]" ./integration_tests/run_pyspark_from_build.sh
 ```

 ### Spark execution mode
@@ -343,14 +352,14 @@ integration tests. For example:
 $ DATAGEN_SEED=1702166057 SPARK_HOME=~/spark-3.4.0-bin-hadoop3 integration_tests/run_pyspark_from_build.sh
 ```

-Tests can override the seed used using the test marker: 
+Tests can override the seed used using the test marker:

 ```
-@datagen_overrides(seed=<new seed>, [condition=True|False], [permanent=True|False])`. 
+@datagen_overrides(seed=<new seed>, [condition=True|False], [permanent=True|False])`.
 ```

-This marker has the following arguments: 
-- `seed`: a hard coded datagen seed to use. 
+This marker has the following arguments:
+- `seed`: a hard coded datagen seed to use.
 - `condition`: is used to gate when the override is appropriate, usually used to say that specific shims
   need the special override.
 - `permanent`: forces a test to ignore `DATAGEN_SEED` if True. If False, or if absent, the `DATAGEN_SEED` value always wins.
@@ -507,10 +516,10 @@ The marks you care about are all in marks.py
 For the most part you can ignore this file. It provides the underlying Spark session to
 operations that need it, but most tests should interact with it through `asserts.py`.

-All data generation and Spark function calls should occur within a Spark session. Typically 
-this is done by passing a lambda to functions in `asserts.py` such as 
-`assert_gpu_and_cpu_are_equal_collect`. However, for scalar generation like `gen_scalars`, you 
-may need to put it in a `with_cpu_session`. It is because negative scale decimals can have 
+All data generation and Spark function calls should occur within a Spark session. Typically
+this is done by passing a lambda to functions in `asserts.py` such as
+`assert_gpu_and_cpu_are_equal_collect`. However, for scalar generation like `gen_scalars`, you
+may need to put it in a `with_cpu_session`. It is because negative scale decimals can have
 problems when calling `f.lit` from outside of `with_spark_session`.

 ## Guidelines for Testing

diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh
index 2fcb791d43b..713c06c31e3 100755
--- a/integration_tests/run_pyspark_from_build.sh
+++ b/integration_tests/run_pyspark_from_build.sh
@@ -191,10 +191,18 @@ else
     ## Under cloud environment, overwrite the '--std_input_path' param to point to the distributed file path
     INPUT_PATH=${INPUT_PATH:-"$SCRIPTPATH"}

-    RUN_TESTS_COMMAND=("$SCRIPTPATH"/runtests.py
-    --rootdir
-    "$LOCAL_ROOTDIR"
-    "$LOCAL_ROOTDIR"/src/main/python)
+    RUN_TESTS_COMMAND=(
+      "$SCRIPTPATH"/runtests.py
+      --rootdir "$LOCAL_ROOTDIR"
+    )
+    if [[ "${TESTS}" == "" ]]; then
+      RUN_TESTS_COMMAND+=("${LOCAL_ROOTDIR}/src/main/python")
+    else
+      read -a RAW_TESTS <<< "${TESTS}"
+      for raw_test in ${RAW_TESTS[@]}; do
+        RUN_TESTS_COMMAND+=("${LOCAL_ROOTDIR}/src/main/python/${raw_test}")
+      done
+    fi

     REPORT_CHARS=${REPORT_CHARS:="fE"}  # default as (f)ailed, (E)rror
     TEST_COMMON_OPTS=(-v
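A combined invocation of the new `TESTS` selection with the flags already shown in the commit message might look like the sketch below; the `SPARK_HOME` path is only an assumed example, and `--collect-only` lists the selected tests without executing them:

```bash
# Sketch only: point SPARK_HOME at a real local Spark install.
TEST_PARALLEL=0 \
  SPARK_HOME=~/spark-3.4.0-bin-hadoop3 \
  TESTS="arithmetic_ops_test.py::test_addition array_test.py::test_array_exists[3VL:off-data_gen0]" \
  ./integration_tests/run_pyspark_from_build.sh --collect-only
```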