Commit 3049247

Merge commit 'ac50f231cae3fa61795259eef0204a613fd68f85' into jerco/wip-faster-caching-option2

TalkWIthKeyboard committed Apr 29, 2022
2 parents ecc1847 + ac50f23
Showing 28 changed files with 728 additions and 170 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.0.0
current_version = 1.1.0
parse = (?P<major>\d+)
\.(?P<minor>\d+)
\.(?P<patch>\d+)
24 changes: 23 additions & 1 deletion .circleci/config.yml
@@ -10,6 +10,24 @@ jobs:
- checkout
- run: tox -e flake8,unit

integration-spark-session:
environment:
DBT_INVOCATION_ENV: circle
docker:
- image: godatadriven/pyspark:3.1
steps:
- checkout
- run: apt-get update
- run: python3 -m pip install --upgrade pip
- run: apt-get install -y git gcc g++ unixodbc-dev libsasl2-dev
- run: python3 -m pip install tox
- run:
name: Run integration tests
command: tox -e integration-spark-session
no_output_timeout: 1h
- store_artifacts:
path: ./logs

integration-spark-thrift:
environment:
DBT_INVOCATION_ENV: circle
@@ -61,6 +79,7 @@ jobs:
integration-spark-databricks-http:
environment:
DBT_INVOCATION_ENV: circle
DBT_DATABRICKS_RETRY_ALL: True
docker:
- image: fishtownanalytics/test-container:10
steps:
@@ -90,7 +109,7 @@ jobs:
no_output_timeout: 1h
- store_artifacts:
path: ./logs

integration-spark-databricks-odbc-endpoint:
<<: *databricks-odbc
steps:
@@ -107,6 +126,9 @@ workflows:
test-everything:
jobs:
- unit
- integration-spark-session:
requires:
- unit
- integration-spark-thrift:
requires:
- unit
13 changes: 13 additions & 0 deletions .github/workflows/main.yml
@@ -122,6 +122,9 @@ jobs:

runs-on: ubuntu-latest

outputs:
is_alpha: ${{ steps.check-is-alpha.outputs.is_alpha }}

steps:
- name: Check out the repository
uses: actions/checkout@v2
@@ -150,6 +153,14 @@ jobs:
- name: Check wheel contents
run: |
check-wheel-contents dist/*.whl --ignore W007,W008
- name: Check if this is an alpha version
id: check-is-alpha
run: |
export is_alpha=0
if [[ "$(ls -lh dist/)" == *"a1"* ]]; then export is_alpha=1; fi
echo "::set-output name=is_alpha::$is_alpha"
- uses: actions/upload-artifact@v2
with:
name: dist
@@ -158,6 +169,8 @@ jobs:
test-build:
name: verify packages / python ${{ matrix.python-version }} / ${{ matrix.os }}

if: needs.build.outputs.is_alpha == 0

needs: build

runs-on: ${{ matrix.os }}
1 change: 1 addition & 0 deletions .github/workflows/version-bump.yml
@@ -55,6 +55,7 @@ jobs:

- name: Install python dependencies
run: |
sudo apt-get install libsasl2-dev
python3 -m venv env
source env/bin/activate
pip install --upgrade pip
2 changes: 2 additions & 0 deletions .gitignore
@@ -5,12 +5,14 @@ env/
*.pyc
__pycache__
.tox/
.env
.idea/
build/
dist/
dbt-integration-tests
test/integration/.user.yml
.DS_Store
test.env
.vscode
*.log
logs/
27 changes: 26 additions & 1 deletion CHANGELOG.md
@@ -1,10 +1,35 @@
## dbt-spark 1.1.0 (Release TBD)
## dbt-spark 1.1.0 (April 28, 2022)

### Features
- Add session connection method ([#272](https://github.com/dbt-labs/dbt-spark/issues/272), [#279](https://github.com/dbt-labs/dbt-spark/pull/279))

### Under the hood
- Use dbt.tests.adapter.basic in test suite ([#298](https://github.com/dbt-labs/dbt-spark/issues/298), [#299](https://github.com/dbt-labs/dbt-spark/pull/299))
- Make internal macros use macro dispatch to be overridable in child adapters ([#319](https://github.com/dbt-labs/dbt-spark/issues/319), [#320](https://github.com/dbt-labs/dbt-spark/pull/320))
- Override adapter method 'run_sql_for_tests' ([#323](https://github.com/dbt-labs/dbt-spark/issues/323), [#324](https://github.com/dbt-labs/dbt-spark/pull/324))
- When a table or view doesn't exist, 'adapter.get_columns_in_relation' will return an empty list instead of failing ([#328](https://github.com/dbt-labs/dbt-spark/pull/328))

### Contributors
- [@JCZuurmond](https://github.com/JCZuurmond) ([#279](https://github.com/dbt-labs/dbt-spark/pull/279))
- [@ueshin](https://github.com/ueshin) ([#320](https://github.com/dbt-labs/dbt-spark/pull/320))

## dbt-spark 1.1.0b1 (March 23, 2022)

### Features
- Adds a new integration test to check against the new ability to allow unique_key to be a list ([#282](https://github.com/dbt-labs/dbt-spark/issues/282), [#291](https://github.com/dbt-labs/dbt-spark/pull/291))

### Fixes
- Closes the connection properly ([#280](https://github.com/dbt-labs/dbt-spark/issues/280), [#285](https://github.com/dbt-labs/dbt-spark/pull/285))

### Under the hood
- get_response -> AdapterResponse ([#265](https://github.com/dbt-labs/dbt-spark/pull/265))
- Adding stale Actions workflow ([#275](https://github.com/dbt-labs/dbt-spark/pull/275))
- Update plugin author name (`fishtown-analytics` &rarr; `dbt-labs`) in ODBC user agent ([#288](https://github.com/dbt-labs/dbt-spark/pull/288))
- Configure insert_overwrite models to use parquet ([#301](https://github.com/dbt-labs/dbt-spark/pull/301))

### Contributors
- [@amychen1776](https://github.com/amychen1776) ([#288](https://github.com/dbt-labs/dbt-spark/pull/288))
- [@ueshin](https://github.com/ueshin) ([#285](https://github.com/dbt-labs/dbt-spark/pull/285))

## dbt-spark 1.0.1rc0 (Release TBD)

50 changes: 50 additions & 0 deletions README.md
@@ -24,6 +24,56 @@ more information, consult [the docs](https://docs.getdbt.com/docs/profile-spark)
- [Install dbt](https://docs.getdbt.com/docs/installation)
- Read the [introduction](https://docs.getdbt.com/docs/introduction/) and [viewpoint](https://docs.getdbt.com/docs/about/viewpoint/)

## Running locally
A `docker-compose` environment starts a Spark Thrift server and a Postgres database as a Hive Metastore backend.
Note that this is Spark 2, not Spark 3, so some functionality might not be available.

The following command starts the two Docker containers:
```
docker-compose up -d
```
It will take a bit of time for the instance to start; you can check the logs of the two containers.
If the instance doesn't start correctly, try the complete reset command listed below, then start it again.
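For example, to follow the startup logs of both containers (the service names are defined in the repo's `docker-compose.yml`, so the all-services form is shown here):

```
docker-compose logs -f
```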

Create a profile like this one:

```
spark-testing:
target: local
outputs:
local:
type: spark
method: thrift
host: 127.0.0.1
port: 10000
user: dbt
schema: analytics
connect_retries: 5
connect_timeout: 60
retry_all: true
```

Connecting to the local Spark instance:

* The Spark UI should be available at [http://localhost:4040/sqlserver/](http://localhost:4040/sqlserver/)
* The endpoint for SQL-based testing is at `http://localhost:10000` and can be referenced with the Hive or Spark JDBC drivers using the connection string `jdbc:hive2://localhost:10000` and default credentials `dbt`:`dbt`.
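
As a quick sanity check of that endpoint (assuming a `beeline` client from a Hive or Spark distribution is installed locally, which this repo does not provide), you can open an interactive session against it:

```
beeline -u jdbc:hive2://localhost:10000 -n dbt -p dbt
```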

Note that the Hive metastore data is persisted under `./.hive-metastore/`, and the Spark-produced data under `./.spark-warehouse/`. To completely reset your environment, run the following:

```
docker-compose down
rm -rf ./.hive-metastore/
rm -rf ./.spark-warehouse/
```

### Reporting bugs and contributing code

- Want to report a bug or request a feature? Let us know on [Slack](http://slack.getdbt.com/), or open [an issue](https://github.com/fishtown-analytics/dbt-spark/issues/new).

## Code of Conduct

Everyone interacting in the dbt project's codebases, issue trackers, chat rooms, and mailing lists is expected to follow the [PyPA Code of Conduct](https://www.pypa.io/en/latest/code-of-conduct/).

## Join the dbt Community

- Be part of the conversation in the [dbt Community Slack](http://community.getdbt.com/)
2 changes: 1 addition & 1 deletion dbt/adapters/spark/__version__.py
@@ -1 +1 @@
version = "1.0.0"
version = "1.1.0"
19 changes: 19 additions & 0 deletions dbt/adapters/spark/connections.py
@@ -55,6 +55,7 @@ class SparkConnectionMethod(StrEnum):
THRIFT = 'thrift'
HTTP = 'http'
ODBC = 'odbc'
SESSION = 'session'


@dataclass
@@ -133,6 +134,18 @@ def __post_init__(self):
"`pip install dbt-spark[PyHive]`"
)

if self.method == SparkConnectionMethod.SESSION:
try:
import pyspark # noqa: F401
except ImportError as e:
raise dbt.exceptions.RuntimeException(
f"{self.method} connection method requires "
"additional dependencies. \n"
"Install the additional required dependencies with "
"`pip install dbt-spark[session]`\n\n"
f"ImportError({e.msg})"
) from e

@property
def type(self):
return 'spark'
@@ -443,6 +456,12 @@ def open(cls, connection):

conn = pyodbc.connect(connection_str, autocommit=True)
handle = PyodbcConnectionWrapper(conn)
elif creds.method == SparkConnectionMethod.SESSION:
from .session import ( # noqa: F401
Connection,
SessionConnectionWrapper,
)
handle = SessionConnectionWrapper(Connection())
else:
raise dbt.exceptions.DbtProfileError(
f"invalid credential method: {creds.method}"
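For reference, a profile using this new `session` method might look like the sketch below. This is a minimal illustration rather than something taken from this commit: the field names mirror the thrift profile in the README above, and a local in-process `session` connection needs little beyond the schema.

```
spark-session-example:
  target: local
  outputs:
    local:
      type: spark
      method: session
      schema: analytics
      host: NA  # not used by the session method; profile validation may still expect it
```

If `pyspark` isn't importable, the `__post_init__` check above raises a `RuntimeException` pointing at `pip install dbt-spark[session]`.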
40 changes: 38 additions & 2 deletions dbt/adapters/spark/impl.py
@@ -247,8 +247,20 @@ def get_columns_in_relation(self, relation: Relation) -> List[SparkColumn]:
# return relation's schema. if columns are empty from cache,
# use get_columns_in_relation spark macro
# which would execute 'describe extended tablename' query
rows: List[agate.Row] = super().get_columns_in_relation(relation)
columns = self.parse_describe_extended(relation, rows)
try:
rows: List[agate.Row] = super().get_columns_in_relation(relation)
columns = self.parse_describe_extended(relation, rows)
except dbt.exceptions.RuntimeException as e:
# Spark throws an error when the table doesn't exist, whereas other
# CDWs just return an empty list; normalize the behavior here
errmsg = getattr(e, "msg", "")
if (
"Table or view not found" in errmsg or
"NoSuchTableException" in errmsg
):
pass
else:
raise e

# strip hudi metadata columns.
columns = [x for x in columns
@@ -380,6 +392,30 @@ def get_rows_different_sql(

return sql

# This is for use in the test suite
# Spark doesn't have 'commit' and 'rollback', so this override
# doesn't include those commands.
def run_sql_for_tests(self, sql, fetch, conn):
cursor = conn.handle.cursor()
try:
cursor.execute(sql)
if fetch == "one":
if hasattr(cursor, 'fetchone'):
return cursor.fetchone()
else:
# AttributeError: 'PyhiveConnectionWrapper' object has no attribute 'fetchone'
return cursor.fetchall()[0]
elif fetch == "all":
return cursor.fetchall()
else:
return
except BaseException as e:
print(sql)
print(e)
raise
finally:
conn.transaction_open = False


# spark does something interesting with joins when both tables have the same
# static values for the join condition and complains that the join condition is
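
Stepping back from the diff: with the `get_columns_in_relation` change above, a missing table or view now yields an empty list instead of an exception, so dbt macros can probe relations without extra error handling. A hypothetical macro sketch (the macro name and log message are illustrative, not part of this commit):

```
{% macro columns_or_empty(relation) %}
    {# A missing relation now returns [] rather than raising #}
    {% set columns = adapter.get_columns_in_relation(relation) %}
    {% if columns | length == 0 %}
        {{ log("No columns found for " ~ relation, info=True) }}
    {% endif %}
    {{ return(columns) }}
{% endmacro %}
```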