From 873381f145af08922fda049c1928e33fd9ecb930 Mon Sep 17 00:00:00 2001
From: Manrique Vargas
Date: Fri, 13 Oct 2023 20:16:16 -0600
Subject: [PATCH] add seed (#3)

* insert overwrite instead of insert into for new seed runs

* Mv1742 load csv table seed (#1)

* changie commit

* Update Fixes-20231013-120628.yaml

* Mv1742 rebase truncate (#2)

* merge remote

* add docs-issue workflow to dbt-spark (#913)

* Update seed.sql

* add truncate table function

* rm changelog

* merge remote

---
 .changes/unreleased/Fixes-20231013-120628.yaml            | 7 +++++++
 dbt/include/spark/macros/adapters.sql                     | 7 +++++++
 .../macros/materializations/incremental/incremental.sql   | 6 ++++++
 dbt/include/spark/macros/materializations/seed.sql        | 3 ++-
 4 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 .changes/unreleased/Fixes-20231013-120628.yaml

diff --git a/.changes/unreleased/Fixes-20231013-120628.yaml b/.changes/unreleased/Fixes-20231013-120628.yaml
new file mode 100644
index 000000000..a448944ad
--- /dev/null
+++ b/.changes/unreleased/Fixes-20231013-120628.yaml
@@ -0,0 +1,7 @@
+kind: Fixes
+body: Overwrite existing rows on existing seed tables. For unmanaged databases (no location specified), the current seed command in
+  dbt-spark appends to existing seeded tables instead of overwriting.
+time: 2023-10-13T12:06:28.078483-06:00
+custom:
+  Author: mv1742
+  Issue: "112"
diff --git a/dbt/include/spark/macros/adapters.sql b/dbt/include/spark/macros/adapters.sql
index bfc1f198d..461e8f14f 100644
--- a/dbt/include/spark/macros/adapters.sql
+++ b/dbt/include/spark/macros/adapters.sql
@@ -342,6 +342,13 @@
   {%- endcall %}
 {% endmacro %}
 
+
+{% macro spark__truncate_relation(relation) -%}
+  {% call statement('truncate_relation', auto_begin=False) -%}
+    truncate table {{ relation }}
+  {%- endcall %}
+{% endmacro %}
+
 {% macro spark__drop_relation(relation) -%}
   {% call statement('drop_relation', auto_begin=False) -%}
     drop {{ relation.type }} if exists {{ relation }}
diff --git a/dbt/include/spark/macros/materializations/incremental/incremental.sql b/dbt/include/spark/macros/materializations/incremental/incremental.sql
index 10d4f3ed8..1e0531c58 100644
--- a/dbt/include/spark/macros/materializations/incremental/incremental.sql
+++ b/dbt/include/spark/macros/materializations/incremental/incremental.sql
@@ -66,6 +66,12 @@
       re: python models and temporary views.
 
       Also, why do neither drop_relation or adapter.drop_relation work here?!
+      'Unmanaged' tables in Spark need their underlying data deleted manually;
+      a drop statement alone removes only the table metadata.
+      TODO: add a warning that this feature does not work for unmanaged
+      tables. Managed tables are fine.
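+      (For example, dropping an external table in Spark removes only its
+      metastore entry; the data files at its location remain on disk.)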
   --#}
   {% call statement('drop_relation') -%}
     drop table if exists {{ tmp_relation }}
diff --git a/dbt/include/spark/macros/materializations/seed.sql b/dbt/include/spark/macros/materializations/seed.sql
index 196479cb0..b2b1d6a44 100644
--- a/dbt/include/spark/macros/materializations/seed.sql
+++ b/dbt/include/spark/macros/materializations/seed.sql
@@ -5,7 +5,8 @@
 
 {% macro spark__reset_csv_table(model, full_refresh, old_relation, agate_table) %}
     {% if old_relation %}
+        {{ adapter.truncate_relation(old_relation) }}
         {{ adapter.drop_relation(old_relation) }}
     {% endif %}
     {% set sql = create_csv_table(model, agate_table) %}
     {{ return(sql) }}
@@ -27,7 +28,7 @@
     {% endfor %}
 
     {% set sql %}
-        insert into {{ this.render() }} values
+        insert {% if loop.index0 == 0 -%} overwrite {% else -%} into {% endif -%} {{ this.render() }} values
         {% for row in chunk -%}
             ({%- for col_name in agate_table.column_names -%}
                 {%- set inferred_type = adapter.convert_type(agate_table, loop.index0) -%}
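
A minimal sketch of the SQL this change renders, assuming a hypothetical seed
analytics.country_codes with one int and one string column and more rows than
one batch holds (the %s placeholders are bound to the CSV values by dbt):

    -- first chunk (loop.index0 == 0): replaces all existing rows
    insert overwrite analytics.country_codes values
      (cast(%s as int), cast(%s as string)),
      (cast(%s as int), cast(%s as string))

    -- subsequent chunks: append to the rows written by the first chunk
    insert into analytics.country_codes values
      (cast(%s as int), cast(%s as string))

Because only the first chunk uses insert overwrite, re-running a seed replaces
the previous rows instead of appending to them, even when dropping the old
table did not remove its underlying data.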