From 6109056799641abf4c2815027b32d5e26ba81f44 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 15:30:38 -0400 Subject: [PATCH 01/21] refactor Term component to pull from single file --- website/docs/docs/test-terms.md | 20 +++++++++ website/docs/terms/terms.md | 23 ++++++++++ website/src/components/term/index.js | 45 +++++++++---------- website/src/components/term/styles.module.css | 1 - 4 files changed, 65 insertions(+), 24 deletions(-) create mode 100644 website/docs/docs/test-terms.md create mode 100644 website/docs/terms/terms.md diff --git a/website/docs/docs/test-terms.md b/website/docs/docs/test-terms.md new file mode 100644 index 00000000000..a8cf640482a --- /dev/null +++ b/website/docs/docs/test-terms.md @@ -0,0 +1,20 @@ +--- +title: "Testing Terms component" +id: "test-terms" +pagination_next: null +pagination_prev: null +--- + +This should show a term: Deploying + +This should show a term: Data wrangling + +This should show a term: Model + +This should show demoTerm: + +This should show noId term without children or displayText: + +This has term but no hoverSnippet set: + +This should NOT show a term (but should fail gracefully): Demo diff --git a/website/docs/terms/terms.md b/website/docs/terms/terms.md new file mode 100644 index 00000000000..f334cf7c57d --- /dev/null +++ b/website/docs/terms/terms.md @@ -0,0 +1,23 @@ +--- +cte: + hoverSnippet: A Common Table Expression (CTE) is a temporary result set that can be used in a SQL query. You can use CTEs to break up complex queries into simpler blocks of code that can connect and build on each other. + displayText: CTE +deploying: + hoverSnippet: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. + displayText: Deploying +data-wrangling: + hoverSnippet: Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data. + displayText: data wrangling +model: + hoverSnippet: A model is an essential building block of the DAG + displayText: model + +# Test terms (DELETE BEFORE MERGE) +demoTerm: + hoverSnippet: This is a demo term hover snippet + displayText: Yay demo term +noId: + hoverSnippet: No ID hover snip +noHoverSnippet: + displayText: No Hover Snip +--- diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 760acb736cb..b5b209e1c7e 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -20,48 +20,47 @@ export default function Term({ id, children = undefined }) { setPageReady(true) }) - const file = require('../../../docs/terms/' + id + '.md') - if(!file) - return null + const file = require('../../../docs/terms/terms.md') + const term = file?.frontMatter?.[id] - const fm = file.frontMatter - if(!fm) + if(!term) return null - - const { displayText, hoverSnippet } = fm + + const { displayText, hoverSnippet } = term; + + const displayValue = children ? children : displayText ? displayText : id return ( <> {pageReady ? ( <> - {/* If component has children, show children text, - Else, default to displayText frontmatter field, - Or filename if displayText not set - */} - {children ? children : displayText ? 
displayText : id} - + Else, default to displayText frontmatter field, + Or filename if displayText not set + */} + {displayValue} + {hoverSnippet && ( - {hoverSnippet} - )} + )} ) : ( - {children ? children : displayText ? displayText : id} + {displayValue} )} - ) + ); } diff --git a/website/src/components/term/styles.module.css b/website/src/components/term/styles.module.css index 482e76bda52..22603d6c058 100644 --- a/website/src/components/term/styles.module.css +++ b/website/src/components/term/styles.module.css @@ -1,5 +1,4 @@ .term { - cursor: pointer; position: relative; text-decoration: underline dotted var(--ifm-font-color-base); color: var(--ifm-font-color-base); From 5cf4e1d781ddda8b07a36a961172fc670c29fc10 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 15:43:28 -0400 Subject: [PATCH 02/21] migrate terms to single terms.md file --- website/docs/docs/test-terms.md | 4 +- website/docs/terms/terms.md | 127 +++++++++++++++++++++++++-- website/src/components/term/index.js | 14 +-- 3 files changed, 131 insertions(+), 14 deletions(-) diff --git a/website/docs/docs/test-terms.md b/website/docs/docs/test-terms.md index a8cf640482a..d33b6aa261d 100644 --- a/website/docs/docs/test-terms.md +++ b/website/docs/docs/test-terms.md @@ -13,8 +13,8 @@ This should show a term: Model This should show demoTerm: -This should show noId term without children or displayText: +This should show noDisplayText term id as children and displayText not set: -This has term but no hoverSnippet set: +This has displayText set but no hoverSnippet set: This should NOT show a term (but should fail gracefully): Demo diff --git a/website/docs/terms/terms.md b/website/docs/terms/terms.md index f334cf7c57d..f1e1d118d16 100644 --- a/website/docs/terms/terms.md +++ b/website/docs/terms/terms.md @@ -1,23 +1,136 @@ --- cte: - hoverSnippet: A Common Table Expression (CTE) is a temporary result set that can be used in a SQL query. You can use CTEs to break up complex queries into simpler blocks of code that can connect and build on each other. displayText: CTE -deploying: - hoverSnippet: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. - displayText: Deploying + hoverSnippet: A Common Table Expression (CTE) is a temporary result set that can be used in a SQL query. You can use CTEs to break up complex queries into simpler blocks of code that can connect and build on each other. + +dag: + displayText: DAG + hoverSnippet: A DAG is a Directed Acyclic Graph, a type of graph whose nodes are directionally related to each other and don’t form a directional closed loop. + +data-extraction: + displayText: data extraction + hoverSnippet: Data extraction is the process by which data is retrieved from multiple sources, often varying in volume and structure. + +data-lake: + displayText: data lake + hoverSnippet: A data lake is a data management system used for storing large amounts of data in in its raw, native form as files. Data lakes can store any type of data—structured, semi-structured, unstructured—in one centralized place. + +data-lineage: + displayText: data lineage + hoverSnippet: Data lineage provides a holistic view of how data moves through an organization, where it’s transformed and consumed. 
+ +data-warehouse: + displayText: data warehouse + hoverSnippet: A data warehouse is a data management system used for data storage and computing that allows for analytics activities such as transforming and sharing data. + +data-catalog: + displayText: data catalog + hoverSnippet: A data catalog is an inventory of data assets from different parts of the data stack within an organization. This catalog can display metadata, lineage, and business definitions from your different data sources. + data-wrangling: - hoverSnippet: Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data. displayText: data wrangling + hoverSnippet: Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data. + +dataframe: + displayText: dataframe + hoverSnippet: A DataFrame is a two-dimensional data structure (rows and columns). It's the most common way of representing and interacting with large datasets in Python. + +ddl: + displayText: DDL + hoverSnippet: Data Definition Language (DDL) is a group of SQL statements that you can execute to manage database objects, including tables, views, and more. + +deploying: + displayText: Deploying + hoverSnippet: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. + +dimensional-modeling: + displayText: dimensional modeling + hoverSnippet: Dimensional modeling is a data modeling technique where you break data up into “facts” and “dimensions” to organize and describe entities within your data warehouse. + +dml: + displayText: DML + hoverSnippet: Data Manipulation Language (DML) is a class of SQL statements that are used to query, edit, add and delete row-level data from database tables or views. The main DML statements are SELECT, INSERT, DELETE, and UPDATE. + +dry: + displayText: DRY + hoverSnippet: DRY is a software development principle that stands for “Don’t Repeat Yourself.” Living by this principle means that your aim is to reduce repetitive patterns and duplicate code and logic in favor of modular and referenceable code. + +edw: + displayText: EDW + hoverSnippet: An Enterprise Data Warehouse (EDW), like any other data warehouse, is a collection of databases that centralize a business's information from multiple sources and applications. + +elt: + displayText: ELT + hoverSnippet: Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, loading it into a target data warehouse, and finally transforming it. + +etl: + displayText: ETL + hoverSnippet: Extract, Transform, Load (ETL) is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. + +grain: + displayText: grain + hoverSnippet: Your data's grain is the combination of columns at which records in a table are unique. Ideally, this is captured in a single column and a unique primary key. + +idempotent: + displayText: idempotent + hoverSnippet: Idempotent describes a process that gives you the same result no matter how many times you run it. 
+ +json: + displayText: JSON + hoverSnippet: JSON (JavaScript Object Notation) is a minimal format for semi-structured data used to capture relationships between fields and values. + +materialization: + displayText: materialization + hoverSnippet: The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a data warehouse. + model: hoverSnippet: A model is an essential building block of the DAG displayText: model +monotonically-increasing: + displayText: monotonically increasing + hoverSnippet: A monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. + +predicate-pushdown: + displayText: Predicate pushdown + hoverSnippet: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query + +primary-key: + displayText: primary key + hoverSnippet: A primary key is a non-null column in a database object that uniquely identifies each row. + +relational-database: + displayText: relational database + hoverSnippet: A relational database provides a structured way to store data into tables consisting of rows and columns. Different tables in a relational database can be joined together using common columns from each table, forming relationships. + +reverse-etl: + displayText: reverse ETL + hoverSnippet: Reverse ETL is the process of getting your transformed data stored in your data warehouse to end business platforms, such as sales CRMs and ad platforms. + +subquery: + displayText: subquery + hoverSnippet: A subquery is a query within another query. Subqueries are often used when you need to process data in multiple steps. + +surrogate-key: + displayText: surrogate key + hoverSnippet: A surrogate key is a unique identifier derived from the data itself. It often takes the form of a hashed value of multiple columns that will create a uniqueness constraint for each row. + +table: + displayText: table + hoverSnippet: In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. + +view: + displayText: view + hoverSnippet: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or data warehouse). + # Test terms (DELETE BEFORE MERGE) demoTerm: hoverSnippet: This is a demo term hover snippet displayText: Yay demo term -noId: - hoverSnippet: No ID hover snip + +noDisplayText: + hoverSnippet: No Display Text hover snip + noHoverSnippet: displayText: No Hover Snip --- diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index b5b209e1c7e..e3728db8273 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -20,14 +20,22 @@ export default function Term({ id, children = undefined }) { setPageReady(true) }) + // Get terms file const file = require('../../../docs/terms/terms.md') + + // Get term by id const term = file?.frontMatter?.[id] + // If term not found in file, return children if available or null if(!term) - return null + return children || null + // Get properties from front matter const { displayText, hoverSnippet } = term; + // If component has children, show children text, + // Else, default to displayText frontmatter field, + // Or filename if displayText not set const displayValue = children ? children : displayText ? 
displayText : id return ( @@ -40,10 +48,6 @@ export default function Term({ id, children = undefined }) { data-tip data-for={uniqueID} > - {/* If component has children, show children text, - Else, default to displayText frontmatter field, - Or filename if displayText not set - */} {displayValue} {hoverSnippet && ( From 60d5799e0fe979f653709c1144fa1f2936a694d4 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 15:52:46 -0400 Subject: [PATCH 03/21] exclude docs in terms directory from building pages --- website/docusaurus.config.js | 3 +++ website/src/components/term/index.js | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index dd1aafc2cd5..d84ac6351ae 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -246,6 +246,9 @@ var siteSettings = { //showLastUpdateAuthor: false, sidebarCollapsible: true, + exclude: [ + 'terms/**' + ] }, blog: { blogTitle: "Developer Blog | dbt Developer Hub", diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index e3728db8273..41953c754e8 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -5,7 +5,7 @@ import styles from './styles.module.css'; {/* Props: - id: filename of term + id: maps to term in website/docs/terms/terms.md children (optional): to display different text other than displayText property for term */} From 9259ae88c6645892fdf8ae9a8d371e67f9a9784d Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 15:53:29 -0400 Subject: [PATCH 04/21] remove glossary sidebar section --- website/sidebars.js | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/website/sidebars.js b/website/sidebars.js index 3ecff4567ce..d7c41097588 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -1334,25 +1334,6 @@ const sidebarSettings = { ], }, ], - Glossary: [ - { - type: "category", - label: "Analytics Engineering Glossary", - link: { - type: "generated-index", - title: "Analytics Engineering Glossary", - description: - "The Analytics Engineering Glossary is a living collection of terms & concepts commonly used in the data industry. 
You can use and contribute to this resource to educate yourself, your team, and your stakeholders.", - slug: "/glossary", - }, - items: [ - { - type: "autogenerated", - dirName: "terms", - }, - ], - }, - ], SQLReference: [ { type: "category", From 3cc57a23f6d58d012350620c0f7f2a220ad73ae0 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 15:53:47 -0400 Subject: [PATCH 05/21] remove glossary nav link --- website/docusaurus.config.js | 4 ---- 1 file changed, 4 deletions(-) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index d84ac6351ae..f820da39e8e 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -160,10 +160,6 @@ var siteSettings = { label: "Developer blog", to: "/blog", }, - { - label: "Glossary", - to: "/glossary", - }, ], }, { From 992a9330032694ee72c6e6eca287e83491f0c264 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 15:55:19 -0400 Subject: [PATCH 06/21] remove term pages and directory and relocate terms.md file --- website/docs/{terms => }/terms.md | 0 website/docs/terms/cte.md | 192 --------------- website/docs/terms/dag.md | 111 --------- website/docs/terms/data-catalog.md | 87 ------- website/docs/terms/data-extraction.md | 78 ------ website/docs/terms/data-lake.md | 112 --------- website/docs/terms/data-lineage.md | 116 --------- website/docs/terms/data-warehouse.md | 89 ------- website/docs/terms/data-wrangling.md | 166 ------------- website/docs/terms/dataframe.md | 107 --------- website/docs/terms/ddl.md | 128 ---------- website/docs/terms/deploying.md | 12 - website/docs/terms/dimensional-modeling.md | 159 ------------- website/docs/terms/dml.md | 107 --------- website/docs/terms/dry.md | 97 -------- website/docs/terms/edw.md | 63 ----- website/docs/terms/elt.md | 139 ----------- website/docs/terms/etl.md | 130 ---------- website/docs/terms/grain.md | 36 --- website/docs/terms/idempotent.md | 23 -- website/docs/terms/json.md | 103 -------- website/docs/terms/materialization.md | 20 -- website/docs/terms/model.md | 16 -- .../docs/terms/monotonically-increasing.md | 11 - website/docs/terms/predicate-pushdown.md | 10 - website/docs/terms/primary-key.md | 148 ------------ website/docs/terms/relational-database.md | 88 ------- website/docs/terms/reverse-etl.md | 94 -------- website/docs/terms/subquery.md | 224 ------------------ website/docs/terms/surrogate-key.md | 196 --------------- website/docs/terms/table.md | 25 -- website/docs/terms/view.md | 36 --- website/src/components/term/index.js | 2 +- 33 files changed, 1 insertion(+), 2924 deletions(-) rename website/docs/{terms => }/terms.md (100%) delete mode 100644 website/docs/terms/cte.md delete mode 100644 website/docs/terms/dag.md delete mode 100644 website/docs/terms/data-catalog.md delete mode 100644 website/docs/terms/data-extraction.md delete mode 100644 website/docs/terms/data-lake.md delete mode 100644 website/docs/terms/data-lineage.md delete mode 100644 website/docs/terms/data-warehouse.md delete mode 100644 website/docs/terms/data-wrangling.md delete mode 100644 website/docs/terms/dataframe.md delete mode 100644 website/docs/terms/ddl.md delete mode 100644 website/docs/terms/deploying.md delete mode 100644 website/docs/terms/dimensional-modeling.md delete mode 100644 website/docs/terms/dml.md delete mode 100644 website/docs/terms/dry.md delete mode 100644 website/docs/terms/edw.md delete mode 100644 website/docs/terms/elt.md delete mode 100644 website/docs/terms/etl.md delete mode 100644 website/docs/terms/grain.md 
delete mode 100644 website/docs/terms/idempotent.md delete mode 100644 website/docs/terms/json.md delete mode 100644 website/docs/terms/materialization.md delete mode 100644 website/docs/terms/model.md delete mode 100644 website/docs/terms/monotonically-increasing.md delete mode 100644 website/docs/terms/predicate-pushdown.md delete mode 100644 website/docs/terms/primary-key.md delete mode 100644 website/docs/terms/relational-database.md delete mode 100644 website/docs/terms/reverse-etl.md delete mode 100644 website/docs/terms/subquery.md delete mode 100644 website/docs/terms/surrogate-key.md delete mode 100644 website/docs/terms/table.md delete mode 100644 website/docs/terms/view.md diff --git a/website/docs/terms/terms.md b/website/docs/terms.md similarity index 100% rename from website/docs/terms/terms.md rename to website/docs/terms.md diff --git a/website/docs/terms/cte.md b/website/docs/terms/cte.md deleted file mode 100644 index 87ef31abc8e..00000000000 --- a/website/docs/terms/cte.md +++ /dev/null @@ -1,192 +0,0 @@ ---- -id: cte -title: CTE in SQL -description: A CTE is a temporary result set that can be used in a SQL query. You can think of a CTE as a separate, smaller query within the larger query you’re building up. -displayText: CTE -hoverSnippet: A Common Table Expression (CTE) is a temporary result set that can be used in a SQL query. You can use CTEs to break up complex queries into simpler blocks of code that can connect and build on each other. ---- - - - CTE in SQL: Quite possibly the best thing to happen to SQL - - -In a formal sense, a Common Table Expression (CTE), is a temporary result set that can be used in a SQL query. You can use CTEs to break up complex queries into simpler blocks of code that can connect and build on each other. In a less formal, more human-sense, you can think of a CTE as a separate, smaller query within the larger query you’re building up. Creating a CTE is essentially like making a temporary that you can access throughout the rest of the query you are writing. - -There are two-types of CTEs: recursive and non-recursive. This glossary focuses on non-recursive CTEs. - -## Why you should care about CTEs - -Have you ever read through a query and thought: - -- “What does this part of the query do?” -- “What are all the sources referenced in this query? Why did I reference this dependency?” -- “My query is not producing the results I expect and I’m not sure which part of the query is causing that.” - -These thoughts often arise when we’ve written SQL queries and models that utilize complex business logic, references and joins multiple upstream dependencies, and are not outputting expected results. In a nutshell, these thoughts can occur often when you’re trying to write data models! - -How can you make these complexities in your code more digestible and usable? CTEs to the rescue! - -## CTE Syntax: How it works - -To use CTEs, you begin by defining your first CTE using the `WITH` statement followed by a `SELECT` statement. - -Let’s break down this example involving a `rename_columns` CTE below: - -```sql -with rename_columns as ( - - select - - id as customer_id, - lower(first_name) as customer_first_name, - lower(last_name) as customer_last_initial - - from {{ ref('raw_customers') }} - -) - -select * from rename_columns -``` - -In this query above, you first create a CTE called `rename_columns` where you conduct a -simple `SELECT` statement that renames and lower cases some columns from a `raw_customers` /model. 
The final `select * from rename_columns` selects all results from the `rename_columns` CTE. - -While you shouldn't always think of CTEs as having classical arguments like SQL functions, you’ve got to call the necessary inputs for CTEs something. - -- CTE_EXPRESSION_NAME: This is the name of the CTE you can reference in other CTEs or SELECT statements. In our example, `rename_columns` is the CTE_EXPRESSION_NAME. **If you are using multiple CTEs in one query, it’s important to note that each CTE_EXPRESSION_NAME must be unique.** -- CTE_QUERY: This is the `SELECT` statement whose result set is produced by the CTE. In our example above, the `select … from {{ ref('raw_customers') }}` is the CTE_QUERY. The CTE_QUERY is framed by parenthesis. - -## When to use CTEs - -The primary motivation to implement CTEs in your code is to simplify the complexity of your queries and increase your code’s readability. There are other great benefits to using CTEs in your queries which we’ll outline below. - -### Simplification - -When people talk about how CTEs can simplify your queries, they specifically mean how CTEs can help simplify the structure, readability, and debugging process of your code. - -#### Establish Structure - -In leveraging CTEs, you can break complex code into smaller segments, ultimately helping provide structure to your code. At dbt Labs, we often like to use the [import, logical, and final structure](/guides/refactoring-legacy-sql?step=5#implement-cte-groupings) for CTEs which creates a predictable and organized structure to your dbt models. - -#### Easily identify dependencies - -When you import all of your dependencies as CTEs in the beginning of your query/model, you can automatically see which models, tables, or views your model relies on. - -#### Clearly label code blocks - -Utilizing the CTE_EXPRESSION_NAME, you can title what your CTE is accomplishing. This provides greater insight into what each block of code is performing and can help contextualize why that code is needed. This is incredibly helpful for both the developer who writes the query and the future developer who may inherit it. - -#### Test and debug more easily - -When queries are long, involve multiple joins, and/or complex business logic, it can be hard to understand why your query is not outputting the result you expect. By breaking your query into CTEs, you can separately test that each CTE is working properly. Using the process of elimination of your CTEs, you can more easily identify the root cause. - -### Substitution for a view - -Oftentimes you want to reference data in a query that could, or may have existed at one point, as a view. Instead of worrying about the view actually existing, you can leverage CTEs to create the temporary result you would want from the view. - -### Support reusability - -Using CTEs, you can reference the same resulting set multiple times in one query without having to duplicate your work by referencing the CTE_EXPRESSION_NAME in your from statement. - -## CTE example - -Time to dive into an example using CTEs! For this example, you'll be using the data from our [jaffle_shop demo dbt](https://github.com/dbt-labs/jaffle_shop) project. In the `jaffle_shop`, you have three tables: one for customers, orders, and payments. - -In this query, you're creating three CTEs to ultimately allow you to segment buyers by how many times they’ve purchased. 
- -```sql -with import_orders as ( - - select * from {{ ref('orders') }} - -), -aggregate_orders as ( - - select - - customer_id, - count(order_id) as count_orders - - from import_orders - where status not in ('returned', 'return pending') - group by 1 - -), -segment_users as ( - - select - - *, - case - when count_orders >= 3 then 'super_buyer' - when count_orders <3 and count_orders >= 2 then - 'regular_buyer' - else 'single_buyer' - end as buyer_type - - from aggregate_orders - -) -select * from segment_users -``` - -Let’s break this query down a bit: - -1. In the first `import_orders` CTE, you are simply importing the `orders` table which holds the data I’m interested in creating the customer segment off of. Note that this first CTE starts with a `WITH` statement and no following CTEs begin with a `WITH` statement. -2. The second `aggregate_orders` CTE utilizes the `import_orders` CTE to get a count of orders per user with a filter applied. -3. The last `segment_users` CTE builds off of the `aggregate_orders` by selecting the `customer_id`, `count_orders`, and creating your `buyer_type` segment. Note that the final `segment_users` CTE does not have a comma after its closing parenthesis. -4. The final `select * from segment_users` statement simply selects all results from the `segment_users` CTE. - -Your results from running this query look a little like this: - -| USER_ID | COUNT_ORDERS | BUYER_TYPE | -|---|---|---| -| 3 | 3 | super_buyer | -| 64 | 1 | single_buyer | -| 94 | 2 | regular_buyer | - -:::tip Tip -If you are finding yourself using the same code for a certain CTE across multiple -queries or models, that’s probably a good sign that CTE should be its own [model](https://docs.getdbt.com/docs/build/models) or view. -::: - -## CTE vs Subquery - -A is a nested query that can oftentimes be used in place of a CTE. Subqueries have different syntax than CTEs, but often have similar use cases. This content won’t go too deep into subqueries here, but it'll highlight some of the main differences between CTEs and subqueries below. - -| CTE | Subquery | -|---|---| -| Typically more readable since CTEs can be used to give structure to your query | Typically less readable, especially if there are many nested queries | -| Allows for recursiveness | Does not allow for recursiveness | -| CTEs must have unique CTE_EXPRESSION_NAMES when used in a query | Subqueries don’t always have to be explicitly named | -| CTEs cannot be used in a `WHERE` clause | Subqueries can be used in a `WHERE` clause | - -## Data warehouse support for CTEs - -CTEs are likely to be supported across most, if not all, [modern data warehouses](https://blog.getdbt.com/future-of-the-modern-data-stack/). Please use this table to see more information about using CTEs in your specific . - -| Data Warehouse | Support CTEs? | -|---|---| -|[Snowflake](https://docs.snowflake.com/en/user-guide/queries-cte.html) | :white_check_mark: | -|[Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_WITH_clause.html) | :white_check_mark: | -|[Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax) | :white_check_mark: | -|[Databricks](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-qry-select-cte.html) | :white_check_mark: | -|[Postgres](https://www.postgresqltutorial.com/postgresql-cte/) | :white_check_mark: | - -## Conclusion - -CTEs are essentially temporary views that can be used throughout a query. 
They are a great way to give your SQL more structure and readability, and offer simplified ways to debug your code. You can leverage appropriately named CTEs to easily identify upstream dependencies and code functionality. CTEs also support recursiveness and reusability in the same query. Overall, CTEs can be an effective way to level-up your SQL to be more organized and understandable. - -## Further Reading - -If you’re interested in reading more about CTE best practices, check out some of our favorite content around model refactoring and style: - -- [Refactoring Legacy SQL to dbt](/guides/refactoring-legacy-sql?step=5#implement-cte-groupings) -- [dbt Labs Style Guide](https://docs.getdbt.com/best-practices/how-we-style/0-how-we-style-our-dbt-projects) -- [Modular Data Modeling Technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) - -Want to know why dbt Labs loves CTEs? Check out the following pieces: - -- [Why we use so many CTEs](https://discourse.getdbt.com/t/why-the-fishtown-sql-style-guide-uses-so-many-ctes/1091) -- [CTEs are Passthroughs](https://discourse.getdbt.com/t/ctes-are-passthroughs-some-research/155) - diff --git a/website/docs/terms/dag.md b/website/docs/terms/dag.md deleted file mode 100644 index 93e2956ebb3..00000000000 --- a/website/docs/terms/dag.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -id: dag -title: DAG -description: A DAG is a Directed Acyclic Graph, a type of graph whose nodes are directionally related to each other and don’t form a directional closed loop. -displayText: DAG -hoverSnippet: A DAG is a Directed Acyclic Graph, a type of graph whose nodes are directionally related to each other and don’t form a directional closed loop. ---- - - - What is a DAG and why is it important? - dbt Labs - - -A DAG is a **D**irected **A**cyclic **G**raph, a type of graph whose nodes are directionally related to each other and don’t form a directional closed loop. In the practice of analytics engineering, DAGs are often used to visually represent the relationships between your data models. - -While the concept of a DAG originated in mathematics and gained popularity in computational work, DAGs have found a home in the modern data world. They offer a great way to visualize data pipelines and lineage, and they offer an easy way to understand dependencies between data models. - -## DAG use cases and best practices - -DAGs are an effective tool to help you understand relationships between your data models and areas of improvement for your overall [data transformations](https://www.getdbt.com/analytics-engineering/transformation/). - -### Unpacking relationships and data lineage - -Can you look at one of your data models today and quickly identify all the upstream and downstream models? If you can’t, that’s probably a good sign to start building or looking at your existing DAG. - -:::tip Upstream or downstream? - -How do you know if a model is upstream or downstream from the model you’re currently looking at? Upstream models are models that must be performed prior to the current model. In simple terms, the current model depends on upstream models in order to exist. Downstream relationships are the outputs from your current model. In a visual DAG, such as the dbt Lineage Graph, upstream models are to the left of your selected model and downstream models are to the right of your selected model. Ever confused? Use the arrows that create the directedness of a DAG to understand the direction of movement. 
- -::: - -One of the great things about DAGs is that they are *visual*. You can clearly identify the nodes that connect to each other and follow the lines of directions. When looking at a DAG, you should be able to identify where your data sources are going and where that data is potentially being referenced. - -Take this mini-DAG for an example: - - - -What can you learn from this DAG? Immediately, you may notice a handful of things: - -- `stg_users`and `stg_user_groups` models are the parent models for `int_users` -- A join is happening between `stg_users` and `stg_user_groups` to form the `int_users` model -- `stg_orgs` and `int_users` are the parent models for `dim_users` -- `dim_users` is at the end of the DAG and is therefore downstream from a total of four different models - -Within 10 seconds of looking at this DAG, you can quickly unpack some of the most important elements about a project: dependencies and data lineage. Obviously, this is a simplified version of DAGs you may see in real life, but the practice of identifying relationships and data flows remains very much the same, regardless of the size of the DAG. - -What happens if `stg_user_groups` just up and disappears one day? How would you know which models are potentially impacted by this change? Look at your DAG and understand model dependencies to mitigate downstream impacts. - -### Auditing projects - -A potentially bold statement, but there is no such thing as a perfect DAG. DAGs are special in-part because they are unique to your business, data, and data models. There’s usually always room for improvement, whether that means making a CTE into its own view or performing a join earlier upstream, and your DAG can be an effective way to diagnose inefficient data models and relationships. - -You can additionally use your DAG to help identify bottlenecks, long-running data models that severely impact the performance of your data pipeline. Bottlenecks can happen for multiple reasons: -- Expensive joins -- Extensive filtering or [use of window functions](https://docs.getdbt.com/blog/how-we-shaved-90-minutes-off-model) -- Complex logic stored in views -- Good old large volumes of data - -...to name just a few. Understanding the factors impacting model performance can help you decide on [refactoring approaches](https://learn.getdbt.com/courses/refactoring-sql-for-modularity), [changing model materialization](https://docs.getdbt.com/blog/how-we-shaved-90-minutes-off-model#attempt-2-moving-to-an-incremental-model)s, replacing multiple joins with surrogate keys, or other methods. - - - -### Modular data modeling best practices - -See the DAG above? It follows a more traditional approach to data modeling where new data models are often built from raw sources instead of relying on intermediary and reusable data models. This type of project does not scale with team or data growth. As a result, analytics engineers tend to aim to have their DAGs not look like this. 
- -Instead, there are some key elements that can help you create a more streamlined DAG and [modular data models](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/): - -- Leveraging [staging, intermediate, and mart layers](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) to create layers of distinction between sources and transformed data -- Abstracting code that’s used across multiple models to its own model -- Joining on surrogate keys versus on multiple values - -These are only a few examples of some best practices to help you organize your data models, business logic, and DAG. - -:::tip Is your DAG keeping up with best practices? - -Instead of manually auditing your DAG for best practices, the [dbt project evaluator package](https://github.com/dbt-labs/dbt-project-evaluator) can help audit your project and find areas of improvement. - -::: - -## dbt and DAGs - -The marketing team at dbt Labs would be upset with us if we told you we think dbt actually stood for “dag build tool,” but one of the key elements of dbt is its ability to generate documentation and infer relationships between models. And one of the hallmark features of [dbt Docs](https://docs.getdbt.com/docs/build/documentation) is the Lineage Graph (DAG) of your dbt project. - -Whether you’re using dbt Core or Cloud, dbt docs and the Lineage Graph are available to all dbt developers. The Lineage Graph in dbt Docs can show a model or source’s entire lineage, all within a visual frame. Clicking within a model, you can view the Lineage Graph and adjust selectors to only show certain models within the DAG. Analyzing the DAG here is a great way to diagnose potential inefficiencies or lack of modularity in your dbt project. - - - -The DAG is also [available in the dbt Cloud IDE](https://www.getdbt.com/blog/on-dags-hierarchies-and-ides/), so you and your team can refer to your lineage while you build your models. - -:::tip Leverage exposures - -One of the newer features of dbt is [exposures](https://docs.getdbt.com/docs/build/exposures), which allow you to define downstream use of your data models outside of your dbt project *within your dbt project*. What does this mean? This means you can add key dashboards, machine learning or data science pipelines, reverse ETL syncs, or other downstream use cases to your dbt project’s DAG. - -This level of interconnectivity and transparency can help boost data governance (who has access to and who [owns](https://docs.getdbt.com/reference/resource-configs/meta#designate-a-model-owner) this data) and transparency (what are the data sources and models affecting your key reports). - -::: - -## Conclusion - -A Directed acyclic graph (DAG) is a visual representation of your data models and their connection to each other. The key components of a DAG are that nodes (sources/models/exposures) are directionally linked and don’t form acyclic loops. Overall, DAGs are an effective tool for understanding data lineage, dependencies, and areas of improvement in your data models. - -> *Get started with [dbt today](https://www.getdbt.com/signup/) to start building your own DAG!* - -## Further reading - -Ready to restructure (or create your first) DAG? 
Check out some of the resources below to better understand data modularity, data lineage, and how dbt helps bring it all together: - -- [Data modeling techniques for more modularity](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) -- [How we structure our dbt projects](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) -- [How to audit your DAG](https://www.youtube.com/watch?v=5W6VrnHVkCA) -- [Refactoring legacy SQL to dbt](/guides/refactoring-legacy-sql) diff --git a/website/docs/terms/data-catalog.md b/website/docs/terms/data-catalog.md deleted file mode 100644 index 59bb983767a..00000000000 --- a/website/docs/terms/data-catalog.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -id: data-catalog -title: Data catalog -description: A data catalog is an inventory of data assets from different parts of the data stack within an organization. This catalog can display metadata, lineage, and business definitions from your different data sources. -displayText: data catalog -hoverSnippet: A data catalog is an inventory of data assets from different parts of the data stack within an organization. This catalog can display metadata, lineage, and business definitions from your different data sources. ---- - - - Data catalog: a centralized place for data about your data - - -A data catalog is an inventory of data assets from different parts of the data stack within an organization. It helps to connect different parts of your stack by showing how your data relates to one another, all within one central location. A catalog can display metadata, lineage, and business definitions from your different data sources and allows for easy collaboration and governance. Data catalogs allow analytics engineers to properly document all parts of their data stack, making for easy ownership. - -## Features of a data catalog - -Data catalogs are known for 6 features that make them so powerful for both data teams and business users: - -- Access to metadata -- Business glossary functionality -- Built-in data lineage -- Support collaboration -- Enhanced data governance -- Varied integrations - -### Metadata - -Metadata is the data that describes data; this refers to characteristics like who created the dataset, its size, the databases and schemas it contains, and when it was last updated. It tells you where you can find a particular data source in terms of where it lives within your . Those properties help an analytics engineer fully understand the data and its intricacies before working with it. - -### Business glossary - -Business glossaries within a data catalog are helpful for understanding how a dataset and its columns relate back to their specific organization. They often contain information about the business description and purpose of a dataset or model, and they display the business definitions of columns and logic within a dataset. Business glossaries are particularly helpful for knowing which dataset and column to use in your logic when writing models or defining metrics. - -### Data lineage - -Data lineage provides a holistic view of how data moves through an organization, where it is transformed, protected, and consumed. Lineage shows the relationships between datasets and models across different platforms within your data stack. Lineage is particularly helpful for understanding dependencies between datasets. Depending on the tool you use, catalogs can show it on a dataset or even column level. 
This way, when you are updating any process within your stack, you can do so with these dependencies in mind. - -### Collaboration - -Data catalogs make collaboration between data and business teams easy. It allows you to see who owns what datasets from both a technical and organizational perspective. Business teams know who to talk to when data is incorrect or not fresh, while data teams know who to turn to for questions on business context. You can also know things like when a data was last updated, how it's used, or to get more context on your warehouse. - -### Data governance - -Data governance allows you to control who can access which domains within a catalog or specific datasets. Most data catalogs allow you to organize your data based on a specific area of the business and then limit user access to only their area of expertise. Catalogs also help with governing which datasets meet the data quality standards required for utilization. Many tools display a quality score and let you know when a dataset hasn’t been run in a while or is deprecated, preventing users from using unreliable data sources. - -Data catalogs can also be used to identify specific datasets and columns that contain PII data. This way, teams can have a better understanding of who should and should not have access to sensitive data. Having better clarity over sensitive data will help your business stay compliant and ensure its properly protecting customer information. - -### Integrations - -Data catalogs are compatible with many other tools in your modern data stack. They typically allow the documentation of your data warehouse, transformation tool, and business intelligence tool to all sit in one central location. This helps to build transparency across the stack and creates a single source of truth for the organization to depend on. - -## Types of data catalogs - -Like most data tools, there are two different types: an open-source data catalog and an enterprise data catalog. Let’s talk about the differences between these and the pros and cons of each. - -### Open source - -Open source data catalogs are free for you to use and often provide a great level of flexibility. You can build a custom solution that meets your exact needs and security requirements. But because they are free, you will be expected to manage the entire platform and set it up. Oftentimes, it’s not as simple as plugging in your various credentials to each tool in your modern data stack. It requires careful reading through the provided documentation and setting up each tool on its own, which often requires a certain threshold of technical skill. This makes for a typically more intense and potentially lengthy set-up process because there may not be experienced people to help you along the way and walk you through the steps. - -Open source tools also require maintenance. Oftentimes, settings will change in the connected platforms and it's up to your team to ensure compatibility and fix any breaking changes. - -In addition, with open source tools, you often need to host them yourself on a cloud provider of choice if your catalog will see broad use across the team. Depending on what you prefer, you may have to deploy multiple microservices on a platform like AWS or Azure. - -Lastly, you want to keep in mind your end user: Is this data catalog meant to be utilized by the data team or the larger business teams? Business users may have a harder time navigating an open source tool because it’s usually not as easy as logging in with an account. 
It requires more technical expertise to use effectively. If a business user has trouble with the catalog, it could cause a potential lag in important processes. - -### Enterprise data catalog software - -Enterprise data catalogs are different from open source in that they are completely managed by the company that creates them. You pay a fee to use them and are paying for the ongoing support, quick set-up process, and the minimal maintenance that comes with it. You are typically walked through it with a dedicated resource, and the integrations with external tools can be smoother because the vendor has teams dedicated to maintaining those relationships. - -The biggest things to keep in mind with enterprise data catalogs is your budget, use cases, and greater data culture. Can your organization afford to pay for a data catalog tool? While they require fewer engineering resources to maintain, they do come with a price tag. When considering if it is worth spending your money on an enterprise tool, make sure you consider where your business and data teams stand. Is your business at a place where it respects the initiatives put in place by the data team? Are the initiatives big enough where having one would make sense? - -Does the data team fully understand the data and its lineage? If they don’t, it's probably too early to put this in front of business users. You want to make sure they are set up for success when being trained to use a new tool. - -Do you have sophisticated data models and sources that the business knows how to use? If not, it may be worth focusing on building out the right metrics and models to be used first. - -Is the culture data-driven? If business users are caught up in their own processes like Google spreadsheets, they may not even utilize a catalog. You don’t want to pay for a tool that is too sophisticated for where the business and data teams currently stand. Don’t rush the process. -### Data catalog tools -Data teams may choose to use third-party tools with data cataloging capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), [Metaphor](https://support.metaphor.io/hc/en-us/articles/9302185081627), [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud), [Stemma](https://docs.stemma.ai/docs/stemma/getting-started/what-we-need-from-you/dbt-integration/), [Castor](https://docs.castordoc.com/integrations/dbt), and others. These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data cataloging and lineage capabilities. - -## Conclusion - -Data catalogs are a valuable asset to any data team and business as a whole. They allow people within an organization to find the data that they need when they need it and understand its quality or sensitivity. This makes communication across teams more seamless, preventing problems that impact the business in the long run. Weigh your options in terms of whether to go with open source or enterprise, trusting that the decision you land on will be best for your organization. 
- -## Additional reading - -- [Why both data cataloging?](https://www.getdbt.com/analytics-engineering/transformation/data-catalog/) -- [Glossary: Data warehouse](/terms/data-warehouse) diff --git a/website/docs/terms/data-extraction.md b/website/docs/terms/data-extraction.md deleted file mode 100644 index 52148a35421..00000000000 --- a/website/docs/terms/data-extraction.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -id: data-extraction -title: Data extraction -description: Data extraction is the process by which data is retrieved from multiple sources, often varying in volume and structure. -displayText: data extraction -hoverSnippet: Data extraction is the process by which data is retrieved from multiple sources, often varying in volume and structure. ---- - - - Data extraction: The first step of the ELT process - - -There is no data work without data. So how do data teams get the data they need? Data extraction is the process by which data is retrieved from multiple sources, often varying in volume and structure. Most data extraction processes are followed by a loading process, where that extracted data is loaded into a central . - -To actually extract said data, teams will often rely on various [ETL tools](https://docs.getdbt.com/terms/elt#elt-tools) or create custom scripts to call API endpoints. Other times data teams may be forced to do some hacky work like manually making and dropping a CSV into an S3 bucket. Definitely a rarity. But not unheard of. - -Come take a deep dive into data extraction, the process that allows us data folk to actually play with, well, said data. - -## Data extraction process: How does it work? - -There are two primary ways modern data teams are using to extract data: tools and custom scripts. - -- **Extraction via ETL tools**: SaaS ETL tools like Fivetran, Airbyte, and more, allow data folks to select connectors or data sources and sync their required data directly to their data warehouses. These platforms reduce the need to write custom API calls to data sources and instead allow data folks to worry more on transforming their data when it hits their data warehouse. -- **Extraction via custom scripts**: It’s probably inevitable, but at one point, you’re likely to find yourself hacking together a Python script to make API calls to a data source that doesn’t have a connector in an ETL tool. But let’s be real: while this is intimidating, it isn’t the end of the world. Writing and maintaining custom scripts for extracting data from data source APIs is not the most fun and there are real concerns (API limits, access tokens, lack of documentation, changing APIs, writing to external storage or directly to your data warehouse) to look out for, but gear up, read up on some basic curl requests and Python, and you got this. - -These two methods above are for automated extraction, processes that you only need to run once (in theory) to get the data you need on a regular basis. For non-automated processes, such as one-time extractions or uploads to your data warehouse, data folks can upload their data to external storage, such as S3 buckets, to load to your data warehouse, or leverage [dbt seeds](/docs/build/seeds). - -## Commonly extracted data - -Obviously, the type of business you work for and the systems your team uses will determine the data you extract. 
However, there are usually some common sources that data teams will extract for business users: -- Ad platforms such as Facebook Ads, Google Ads, or Pinterest Ads -- Accounting softwares like Netsuite -- Sales CRMs such as Salesforce or HubSpot -- Backend application databases -- Customer service SaaS products like Zendesk or Kustomer - -The data that is typically extracted and loaded in your data warehouse is data that business users will need for baseline reporting, OKR measurement, or other analytics. - -:::tip Don’t fix what’s not broken -As we just said, there are usually common data sources that data teams will extract from, regardless of business. Instead of writing transformations for these tables and data sources, leverage [dbt packages](https://hub.getdbt.com/) to save yourself some carpal tunnel and use the work someone else has already done for you. -::: - -## Data extraction tools - -If you’re not writing your own extraction scripts, you’re likely using an [ELT tool](https://docs.getdbt.com/terms/elt#elt-tools) to help you extract and load your various data sources into your data warehouse. Below, you’ll find some commonly used tools to help you do just that. - -| Tool | Description | Open source option? | -|:---:|:---:|:---:| -| Airbyte | Airbyte is an open-source and cloud service that allows teams to create data extraction and load pipelines. | ✅ | -| Stitch by Talend | Stitch (part of Talend) is another SaaS product that has many data connectors to extract data and load it into data warehouses. | ❌ | -| Fivetran/HVR | Fivetran is a SaaS company that helps data teams extract, load, and perform some transformation on their data. Fivetran easily integrates with modern data warehouses and dbt. They also offer transformations that leverage dbt Core. | ❌ | -| Funnel | Funnel is another product that can extract and load data. Funnel’s data connectors are primarily focused around marketing data sources. | ❌ | - -## Data extraction challenges to look out for - -There are definitely some considerable considerations in data extraction, mainly around costs and viability. - -- **Cadence and costs**: How often does your data need to be synced or refreshed? How often will your stakeholders really be looking at the data? There can be considerable costs to hitting API endpoints or retrieving data via ELT tools depending on the cadence you set for your data extractions. Talk to your stakeholders, understand when folks would leverage fresher data, and run some basic cost-benefit analyses to understand the cadence that works for your data extractions. -- **Viability**: Can you even extract the data your stakeholders need? As analytics engineers, your initial reaction is to check if an ETL tool has an existing connector for it. If it doesn’t, you may have to whip up a script to call the API (if there is one). If there is no API available, well, then it’s time to put on your creativity hat and get hacky! -- **PII concerns**: Oftentimes, data teams may be interested in masking PII data before it even hits their data warehouse. This would involve masking or removing the PII data immediately after extraction and immediately prior to loading the data into your data warehouse. For folks that want to mask PII, but are okay with masking it once it’s in their data warehouse, data teams can create masking policies using dbt packages. 
-- **Data accuracy**: This is less of a concern for data extracted via ETL tools or custom scripts, but for internal sources, such as static CSV files manually input by someone on your marketing team, you’re going to want to ensure that data is accurate (ideally before it hits your data warehouse). Not the end of the world if it does, but more of a nuisance than anything and something to look out for. - -:::tip Testing your data sources -Using dbt, data folks can run automated tests on their raw data that is loaded into their data warehouse via [sources](https://docs.getdbt.com/docs/build/sources). -::: - -## Conclusion - -Having no data extraction is the equivalent of a conductor not having an orchestra at their disposal: sad. Overall, data extraction in analytics engineering is the process of extracting data, usually via an automated ETL tool or script, for data sources that will later be loaded into a central data warehouse. There are some considerations to look at prior to the data extraction process, such as cost, viability, and PII concerns. - -## Further reading - -Ready to take a deeper dive into all things data extraction, ELT and dbt? Check out some of our favorite resources below: - -- [Glossary: ELT](https://docs.getdbt.com/terms/elt) -- [Glossary: ETL](https://docs.getdbt.com/terms/etl) -- [Four questions to help accurately scope analytics engineering projects](https://www.getdbt.com/blog/4-questions-to-help-you-more-accurately-scope-analytics-engineering-projects/) -- [Five principles that will keep your data warehouse organized](https://www.getdbt.com/blog/five-principles-that-will-keep-your-data-warehouse-organized/) diff --git a/website/docs/terms/data-lake.md b/website/docs/terms/data-lake.md deleted file mode 100644 index e1b75a616b9..00000000000 --- a/website/docs/terms/data-lake.md +++ /dev/null @@ -1,112 +0,0 @@ ---- -id: data-lake -title: Data lake -description: A data lake is a data management system used for storing large amounts of data in in its raw, native form as files. -displayText: data lake -hoverSnippet: A data lake is a data management system used for storing large amounts of data in in its raw, native form as files. Data lakes can store any type of data—structured, semi-structured, unstructured—in one centralized place. ---- - - - Data lake: an integral addition to the MDS - - -A data lake is a data management system used for storing large amounts of data in in its raw, native form as files. Data lakes can store any type of data—structured, semi-structured, unstructured—in one centralized place. Several common data file formats that are widely being used today include CSV, , XML, Parquet, and Avro. This makes the data lake a cost-effective and flexible storage container in contrast to the , where data must be in a structured and tabular format. The primary use case of a data lake in many organizations is to serve as an initial staging area before data is ready to be transformed and aggregated in a data warehouse. - -## How do data lakes provide value? - -In the past, some organizations couldn’t store all their data in one centralized place because databases and data warehouses could only store structured, relational data. On top of that, data storage was once cost-prohibitive, hence data teams would have to filter and transform data volumes to smaller sizes first to be able to store them. 
These challenges have been addressed by cloud data lakes; they allow for scalability, flexibility, and cost savings—all of which are handled by the cloud platform itself. - -### Scalability - -Data lakes allow you to scale your storage up or down depending on how much data you need to store at a particular point in time. You no longer have to know and calculate upfront how much storage capacity you need because of the advent of cloud data lakes. In the past, setting up a new data lake involved considerable hardware configuration tasks. Now, all of this can be achieved in a few steps by hitting a few buttons on your web browser or by typing a few lines of code on your computer. - -### Flexibility - -At times, a data team might know data from a new source could be useful, but they might not know how it will be used yet. Data lakes offer a place to store this data without needing to build a use case for structuring or shaping it first. This is different from the approach that data warehouses take, which are optimized to store and analyze relational, structured data. In addition to the data lake’s ability to store raw, uncurated data, the advent of data lake query engines (ex. Athena, Dremio, Starburst, etc.) means that data analysts and data scientists can now perform exploratory data analysis (EDA) on top of a data lake using this layer of abstraction, without having to bring it into the data warehouse first. - -### Cost-effectiveness - -The rise of the data lake coincided with the cloud computing revolution. Data teams no longer had to worry about making massive upfront hardware investments for data storage. Instead, you pay a usage-based fee dependent on how much data you store and how many compute queries you run. - -### Modern approaches - -As mentioned earlier, storing data in the past was an expensive endeavor, therefore organizations had to curate and think through what type of data they brought into their data warehouse. This approach is called ETL (Extract-Transform-Load), where only transformed data ultimately gets stored and analyzed in a data warehouse or data lake. - -The ability to store tons of data in a cost-efficient and flexible way in the data lake gave rise to a new approach to processing data, a technique that aligns with the modern practice of analytics engineering—the Extract-Load-Transform (ELT) approach. In this new process, data is immediately loaded to the destination data lake upon extraction from the source. The benefit of this approach is that it allows for flexibility and exploration of new business use cases which may or may not be known initially when data is ingested. - -## What are the drawbacks and challenges when using a data lake? - -For all of the advantages of data lakes listed above such as cost-effectiveness and flexibility, they also come with several drawbacks and challenges. - -### Inability to do fast reporting - -Query performance and speed is one capability area where data warehouses typically trump data lakes. While structuring data first may seem inflexible and rigid at times, it is the right approach to implement when you have analyses and reports that you want to run frequently.
The following are several query performance techniques that can only be applied to data warehouses: - -| Performance technique | Description | Practical scenario | -|:---:|:---:|:---:| -| Columnar storage | Data is physically organized in columns in a data warehouse rather than rows (in a database) or files (in a data lake) | Most analysis and reports require pulling only a subset of columns from an entire . Columnar storage makes your queries run faster by retrieving only the relevant columns for your analysis | -| Query caching | When a query is executed, the result is temporarily stored for a period of time | When someone executes a query on a table, the results of that table will be made available right away to the next user who executes the same query, significantly reducing computation time | -| Data partitioning | Grouping similar data together based on selected table dimensions | Many organizations partition their tables based on a dimension that includes a date field. The reason for this is that most analyses only require pulling data on a rolling two-year period. If you want to calculate year-to-date sales this year and compare it to the same period last year, partitioning your data based on date will make your queries run faster by eliminating the need to scan through the entire table first | - -That being said, storing it in a data lake first and exploring it with an ad hoc query engine would be the recommended approach if you have a massive data set with a still undefined use case you want to explore. - -### Lack of fine-grained access control - -It is difficult to enforce fine-grained access control on your data when it's in its raw form. Fine-grained access control pertains to granting permissions to a particular subset of your data set by restricting access to certain rows and columns. These two concepts are known as column-level security and row-level security: - -- **Column-level security**: A bank may want to anonymize columns that contain personally identifiable information (PII) such as credit card numbers, social security numbers, and so on. To achieve this, analytics engineers use a variety of encryption functions available in their data warehouse. -- **Row-level security**: Imagine a retailer with a massive table containing millions of rows of sales transactions across all 50 states in the US. These companies may want to dynamically enforce limited querying permissions to end-users based on which state they’re in. For example, when an analyst based in California starts querying the table, the data set would pre-filter itself to only show sales data from California, even if the analyst attempts to query the entire table. This type of row-level data governance is typically better suited for data warehouses than data lakes. - -## Data lake use cases - -Organizations use data lakes for many different reasons. Most of these reasons ultimately tie back to the three primary benefits of cost-effectiveness, scalability, and flexibility summarized earlier. Below are common use cases that data lakes are able to achieve: - -### Data archiving and storage - -Data lakes can support cost-effective archiving of historical data that is no longer being actively used. Most organizations have data retention and lifecycle policies that indicate how business data should be stored and analyzed, where it is typically organized into three tiers: Hot, Warm, and Cold storage. 
As an example, a company may state that the past two years’ worth of data belongs in the hot tier, data from three to five years ago are in the warm tier, and anything beyond that in the cold tier. - -| Storage tier | Access pattern | Description | -|:---:|:---:|:---:| -| Hot | Data that is being used often | This is primarily the level in which data warehouses lie. At this level, data is highly structured and optimized for reporting and analytics. Data lakes may also lie at this tier to support machine learning and exploratory data analysis use cases | -| Warm | Data that is infrequently accessed | At this level, data is infrequently accessed and stored at a lower cost than in the hot tier. On some occasions, data may need to be transitioned back to the hot tier which cloud computing companies allow you to do with relative ease | -| Cold | Data stored for archiving purposes | Data in this tier is rarely accessed. Typically, cold data must be retained for regulatory and compliance purposes on a long-term basis, if not indefinitely. | - -### Data science and machine learning - -Because of a data lake’s ability to store any type of data format, it lends itself well to advanced analytics use cases, especially those that require the use of semi-structured and unstructured data that data warehouses traditionally don’t support. Some examples include: - -- **Sentiment analysis**: This is a technique that uses statistics and natural language processing (NLP) algorithms to determine the emotional meaning of communications. Organizations use sentiment analysis to evaluate customer reviews, call center interactions, social media posts, and other related content, all of which require the use of unstructured data sources (e.g. free-form text, audio recordings) -- **Predictive maintenance**: This is a common use case in the field of manufacturing, mining, and other heavy industries. Organizations take advantage of a data lake’s ability to store machine logs, sensor and telemetry data to predict the probability of a piece of equipment failing before it happens. This enables the company to make proactive actions to service the equipment, thus preventing defects and maximizing resource utilization. - -### Exploratory data analysis (EDA) - -Because you don’t need to impose a formal structure for how data is organized in a data lake, you can perform preliminary data exploration on that data, such as calculate summary statistics, discover anomalies and outliers, and plot data visualizations to derive preliminary insights. Commonly referred to as EDA, this is typically conducted as an initial step before formalizing a data science or machine learning use case. - -## Data lake vs. 
data warehouse - -| | Data lake | Data warehouse | -|---|---|---| -| Types of data | Structured, Semi-Structured, Unstructured | Structured | -| Data stored in | Folders and files in raw format | Schemas and tabular data format | -| Schema/schema definition | Store data in its raw format, transform the data later | Must know upfront | -| Intended users | Data engineers, analytics engineers, data analysts, data scientists | Analytics engineers, data analysts, business analysts | -| Common use cases | Data archiving and storage, data science and machine learning, exploratory data analysis | Business intelligence, dashboarding, reporting and analytics | - -## Data platforms that support data lake workloads - -| Data platform | Description | -|:---:|:---:| -| Cloudera | Cloudera Open Data Lakehouse is a platform that provides data lake flexibility and data warehouse performance in a single platform. | -| Databricks | Databricks is a cloud-based collaborative data science, data engineering, and data analytics platform that brings the best of data warehouses and data lakes into a single unified platform. | -| Dremio | Dremio is the data lakehouse platform built for SQL and built on open source technologies that both data engineers and data analysts love. Dremio powers BI dashboards and analytics directly on data lake storage. | -| Snowflake | Snowflake is a fully-managed platform for data warehousing, data lakes, data engineering, data science, and data application development. | - -## Conclusion - -The data lake is the younger data management platform compared to its data warehouse counterpart. Because of its unique ability to hold large amounts of data in its native, raw format, it has allowed organizations to store all their data in a centralized place, even if sometimes they don’t have a definitive use case for the data yet. In addition, it serves as a great buffer and landing zone for data before it is ultimately transformed and aggregated in a data warehouse. Lastly, it has unlocked a world of new possibilities by enabling organizations to build data science and machine learning use cases on top of it. The data lake is an integral pillar in the Modern Data Stack and the practice of analytics engineering. - -## Additional reading -- [Glossary: Data warehouse](/terms/data-warehouse) -- [Glossary: ETL](/terms/etl) -- [Glossary: ELT](/terms/elt) -- [Glossary: EDW](/terms/edw) diff --git a/website/docs/terms/data-lineage.md b/website/docs/terms/data-lineage.md deleted file mode 100644 index 42217db40d8..00000000000 --- a/website/docs/terms/data-lineage.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -id: data-lineage -title: What is data lineage? -description: Data lineage provides a holistic view of how data moves through an organization, where it’s transformed and consumed. -displayText: data lineage -hoverSnippet: Data lineage provides a holistic view of how data moves through an organization, where it’s transformed and consumed. ---- - - - What is data lineage? And how do you get started? - - -Data lineage provides a holistic view of how data moves through an organization, where it’s transformed and consumed. Overall, data lineage is a fundamental concept to understand in the practice of analytics engineering and modern data work.
- -At a high level, a data lineage system typically provides data teams and consumers with one or both of the following resources: - -- A visual graph (DAG) of sequential workflows at the data set or column level -- A data catalog of data asset origins, owners, definitions, and policies - -This holistic view of the data pipeline allows data teams to build, troubleshoot, and analyze workflows more efficiently. It also enables business users to understand the origins of reporting data and provides a means for data discovery. - -We’ll unpack why data lineage is important, how it works in the context of analytics engineering, and where some existing challenges still exist for data lineage. - -## Why is data lineage important? - -As a data landscape grows in size and complexity, the benefits of data lineage become more apparent. For data teams, the three main advantages of data lineage include reducing root-cause analysis headaches, minimizing unexpected downstream headaches when making upstream changes, and empowering business users. - -### Root cause analysis - -It happens: dashboards and reporting fall victim to data pipeline breaks. Data teams quickly need to diagnose what’s wrong, fix where things may be broken, and provide up-to-date numbers to their end business users. But when these breaks happen (and they surely do) how can teams quickly identify the root cause of the problem? - -If data teams have some form of data lineage in place, they can more easily identify the root cause of the broken pipeline or data quality issue. By backing out into the data models, sources, and pipelines powering a dashboard a report, data teams can understand all the upstream elements impacting that work and see where the issues lie. - -Will a data lineage or a DAG solve your breaking pipelines? Definitely not. Will it potentially make your life easier to find problems in your data work? Heck yes. - -### Downstream impacts on upstream changes - -You may have been here—your backend engineering team drops the `customers` table to create a newer, more accurate `users` table. The only bad thing is…[they forgot to tell the data team about the change](https://docs.getdbt.com/blog/when-backend-devs-spark-joy). - -When you have a data lineage system, you can visually see which downstream models, nodes, and exposures are impacted by big upstream changes such as source or model renaming or removals. Referring to your DAG or data lineage system before any significant change to your analytics work is a great way to help prevent accidental downstream issues. - -### Value to business users - -While data lineage makes it easier for data teams to manage pipelines, stakeholders and leaders also benefit from data lineage, primarily around promoting data transparency into the data pipelines. - -**Shared data literacy** - -New hires, existing team members, and internal data practitioners can independently explore a holistic view of the data pipeline with a data lineage system. For data teams using a DAG to encapsulate their data work, business users have a clear visual representation of how data flows from different sources to the dashboards they consume in their BI tool, providing an increased level of transparency in data work. At the end of the day, the added visibility makes it easier for everyone to be on the same page. 
- -**Pipeline cleanliness** - -A visual graph (DAG) of how data flows through various workflows makes it easy to identify redundant loads of source system data or workflows that produce identical reporting insights. - -Spotlighting redundant data models can help trim down on WET (write every time/write everything twice) code, non-performant joins, and ultimately help promote reusability, modularity, and standardization within a data pipeline. - -Overall, data lineage and data-driven business go hand-in-hand. A data lineage system allows data teams to be more organized and efficient, business users to be more confident, and data pipelines to be more modular. - -## How does data lineage work? - -In the greater data world, you may often hear of data lineage systems based on tagging, patterns or parsing-based systems. In analytics engineering however, you’ll often see data lineage implemented in a DAG or through third-party tooling that integrates into your data pipeline. - -### DAGs (directed acyclic graphs) - -If you use a transformation tool such as dbt that automatically infers relationships between data sources and models, a DAG automatically populates to show you the lineage that exists for your [data transformations](https://www.getdbt.com/analytics-engineering/transformation/). - - - -Your is used to visually show upstream dependencies, the nodes that must come before a current model, and downstream relationships, the work that is impacted by the current model. DAGs are also directional—they show a defined flow of movement and form non-cyclical loops. - -Ultimately, DAGs are an effective way to see relationships between data sources, models, and dashboards. DAGs are also a great way to see visual bottlenecks, or inefficiencies in your data work (see image below for a DAG with...many bottlenecks). Data teams can additionally add [meta fields](https://docs.getdbt.com/reference/resource-configs/meta) and documentation to nodes in the DAG to add an additional layer of governance to their dbt project. - - - -:::tip Automatic > Manual - -DAGs shouldn’t be dependent on manual updates. Instead, your DAG should be automatically inferred and created with your data transformation and pipelines. Leverage tools such as dbt to build your own version-controlled DAG as you develop your data models. - -::: - -### Third-party tooling - -Data teams may also choose to use third-party tools with lineage capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), [Datafold](https://www.datafold.com/column-level-lineage), Metaphor, [Monte Carlo](https://docs.getmontecarlo.com/docs/dbt-cloud), [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud), or [Stemma](https://docs.stemma.ai/docs/stemma/getting-started/what-we-need-from-you/dbt-integration/). These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data lineage capabilities such as column-level or business logic-level lineage. - -## Data lineage challenges - -The biggest challenges around data lineage become more apparent as your data, systems, and business questions grow. - -### Data lineage challenge #1: Scaling data pipelines - -As dbt projects scale with data and organization growth, the number of sources, models, macros, seeds, and [exposures](https://docs.getdbt.com/docs/build/exposures) invariably grow. 
And with an increasing number of nodes in your DAG, it can become harder to audit your DAG for WET code or inefficiencies. - -Working with dbt projects with thousands of models and nodes can feel overwhelming, but remember: your DAG and data lineage are meant to help you, not be your enemy. Tackle DAG audits in chunks, document all models, and [leverage strong structure conventions](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview). - -:::tip dbt project evaluator - -Is your DAG keeping up with best practices? Instead of manually auditing your DAG, the [dbt project evaluator package](https://github.com/dbt-labs/dbt-project-evaluator) can help audit your project and find areas of improvement. - -::: - -### Data lineage challenge #2: Column-level lineage - -Complex workflows also add to the difficulties a data lineage system will encounter. For example, consider the challenges in describing a data source's movement through a pipeline as it's filtered, pivoted, and joined with other tables. These challenges increase when the granularity of the data lineage shifts from the table to the column level. - -As data lineage graphs mature and grow, it becomes clear that column- or field-level lineage is often a needed layer of specificity that is not typically built in to data lineage systems. Learn more about the [column-level lineage](/docs/collaborate/column-level-lineage) feature in [dbt Explorer](https://www.getdbt.com/product/dbt-explorer) and how it can help you gain insights. - -## Conclusion - -Data lineage is the holistic overview of how data moves through an organization or system, and is typically represented by a DAG. Analytics engineering practitioners use their DAG and data lineage to unpack root causes in broken pipelines, audit their models for inefficiencies, and promote greater transparency in their data work to business users. Overall, using your data lineage and DAG to know when your data is transformed and where it’s consumed is the foundation for good analytics work. - -## Further reading - -DAGs, data lineage, and root cause analysis…tell me more! Check out some of our favorite resources of writing modular models, DRY code, and data modeling best practices: - -- [Glossary: DRY](https://docs.getdbt.com/terms/dry) -- [Data techniques for modularity](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) -- [How we structure our dbt projects](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) diff --git a/website/docs/terms/data-warehouse.md b/website/docs/terms/data-warehouse.md deleted file mode 100644 index cf6f5de3d20..00000000000 --- a/website/docs/terms/data-warehouse.md +++ /dev/null @@ -1,89 +0,0 @@ ---- -id: data-warehouse -title: Data warehouse -description: How have data warehouses evolved over the last 40 years? Explore the nuanced changes in use case since Inmon first coined the term. -displayText: data warehouse -hoverSnippet: A data warehouse is a data management system used for data storage and computing that allows for analytics activities such as transforming and sharing data. ---- - - - Data warehouses in the modern data stack - dbt Labs - - -A data warehouse is a data management system used for data storage and computing that allows for analytics activities such as transforming and sharing data. It helps businesses to capture and store data from external sources. 
Analytics engineers and data analysts use it to query datasets using SQL, helping to transform them into powerful data models and reports. Data warehouses are the central source of truth for any modern data stack. Data is ingested, transformed, and shared to other tools from the warehouse. - -There are two main types of data warehouses — on-prem warehouses and cloud warehouses. An on-prem data warehouse is a physical location where companies need to maintain hardware and software in order to store data. A cloud data warehouse is available anywhere and doesn’t include a physical location that you need to access. In this arrangement, you pay to use the storage space and compute power that is provided and maintained by another company. - -## History of data warehouses - -While data has been stored throughout history, it wasn’t until the 1980s that technology began to accelerate and the first official data warehouse was created. It was an on-prem warehouse consisting of a lot of computer processing and storage towers, taking up a lot of space. As you can imagine, this caused a lot of problems. It not only took up a lot of physical space, but employees had to maintain the hardware and software of these warehouses. This quickly became expensive and unrealistic for smaller companies without the budget or space. - -When Amazon began scaling their on-prem data warehouses to support their business, they noticed an opportunity to sell compute capacity to other businesses in order to save costs. This is when Redshift, Amazon’s cloud data warehouse product, came to be. Shortly after, other tech giants like Google and Microsoft who were also building data infrastructure followed suit. - -Now, you can be anywhere and access the power of an online warehouse. You no longer need to maintain the infrastructure yourself but can pay a company to do this for you. This is cheaper and allows for faster data capabilities. - - -## Why businesses need data warehouses - -Data warehouses were once unrealistic due to the costs associated with them. Now that cloud warehouses make them available to nearly everyone, they have a plethora of benefits to offer businesses. Cloud warehouses allow for scalability, availability, cost savings, and increased security- all of which are handled by the provider themself. - -### Scalability - -Data warehouses allow you to scale computing up or down depending on how fast you need your transformations to run and how much you are willing to spend. You can turn computing resources on or off as well in order to save on costs. - -### Availability - -Data warehouses are always available. While latency may vary based on source and destination locations, your data can be accessed anywhere, at any time. This is ideal for the remote culture that we are currently living in, where anyone can work from anywhere. - -### Cost savings - -Because you no longer need to maintain all of the infrastructure, you can save on costs related to maintenance. Because the data warehouse companies manage so much data, they are able to unlock cost-savings that you wouldn’t be able to otherwise. - -### Security - -Data warehouses offer advanced security features that ensure your data is always secure. They often directly handle certain compliance strategies needed with healthcare and financial data, eliminating the need for you to do this yourself. They also have features such as roles and users which help you control who has access to your data. But we will get into this more later. 
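To make the roles and permissions idea more concrete, here is a minimal sketch of how access control might be granted in practice. It uses Snowflake-style `GRANT` syntax; the role, database, schema, and user names are hypothetical, and the exact statements vary by data warehouse.

```sql
-- Hypothetical example: give analysts read-only access to a reporting schema
create role analyst_role;

grant usage on database analytics to role analyst_role;
grant usage on schema analytics.reporting to role analyst_role;

-- Analysts can query reporting tables, but not modify or delete them
grant select on all tables in schema analytics.reporting to role analyst_role;

-- Attach the role to an individual user
grant role analyst_role to user jane_analyst;
```

Because a role sees nothing until access is explicitly granted to it, this default-deny behavior is what makes the security features described above possible.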
- -## Potential business use cases - -Businesses can leverage data warehouses for many different reasons. Most of these reasons end up saving time and money for the business, whether directly or indirectly. - -### Consolidating all of your data in one place - -First, a data warehouse acts as a single source of truth for all of your data. Rather than having all of your data spread across different platforms, it is available to you in one place. This allows you to standardize all of your core metrics and data definitions, rather than depending on the metrics calculated by platforms like Google and Facebook. If you find that different metrics don’t align across platforms, a data warehouse acts as a dependable source for the right metric. Rather than relying on external platforms, you now have one that centralizes all of your data. - -Not to mention, you will save your analytics engineer and data analyst a few headaches. They would otherwise have to manually pull needed data from various sources. Not having a single source of truth decreases your data quality, wastes your data team’s precious time, and makes it difficult to combine data from different sources. - -### Ability to control who has access and the type of access they have - -Data warehouses have extensive security features which allow you to control who has access to what. You have the ability to give someone as few or as many permissions as you wish. Warehouses give you the ability to create users and assign them roles. Each role has its own set of permissions to which databases and tables it can see. Then, you can also choose who is allowed to query those tables or even update and delete them. - -When anyone in your organization can easily access your data, bad things can happen. You risk the potential of important data being deleted, incorrectly edited, or inappropriately accessed. Data warehouse users, roles, policies, and security measures can help ensure data is in the right hands of the right people. - -### Fast reporting - -Because all of your data is located in the same place, it allows for faster reporting compared to pulling data from many different sources. A central location allows for you to quickly access and query millions of rows of data, allowing transformations and reporting to be done much faster. - -## Data platforms that support data warehousing workloads - -| **Data platform** | **Description** | -|---|---| -| Snowflake | Snowflake is a fully-managed platform for data warehousing, data lakes, data engineering, data science, and data application development. | -| Databricks | Databricks is a cloud-based collaborative data science, data engineering, and data analytics platform that combines the best of data warehouses and data lakes into a lakehouse architecture. | -| Google BigQuery | Google BigQuery is a serverless, highly scalable data warehouse that comes with a built-in query engine. | -| Amazon Redshift | Amazon Redshift is a fully-managed petabyte-scale cloud-based data warehouse designed for large scale data set storage and analysis. | -| Postgres | PostgreSQL is an advanced, enterprise-class open source relational database that supports both SQL (relational) and JSON (non-relational) querying. | - -## Data warehouse vs data lake - -A data lake is a system where you store, process, and query unstructured, semi-structured, and structured data at almost any scale. The main difference between a data warehouse and a data lake is the type and way data is stored.
Data warehouses contain structured data that is meant to organize data for analytics use. Data lakes can contain pretty much any kind of data—structured or unstructured—and data is usually left in its raw format until it's ready to use. Compare that to data warehouses, whose primary goal is to be a place for data teams to store both raw and transformed, usable data. - -## Conclusion - -Data warehouses have come a long way [in the last 40 years](https://www.getdbt.com/blog/future-of-the-modern-data-stack/). They began as a physical location with huge costs associated with them to a system available to anyone, anywhere at an affordable cost. They have the power to centralize all of your business’s data, allowing for faster analytics operations, standardized KPIs, and a single source of truth. All businesses need a data warehouse in order to operate quickly and efficiently with data that they can rely on. The question isn’t whether or not you need a data warehouse, but which data warehouse you should choose. Make a list of the key features needed for your business and use that to assess the options above. - -## Additional reading - -- [Operational analytics](https://www.getdbt.com/analytics-engineering/use-cases/operational-analytics/) -- [Glossary: ETL](https://docs.getdbt.com/terms/etl/) -- [Glossary: ELT](https://docs.getdbt.com/terms/elt/) - diff --git a/website/docs/terms/data-wrangling.md b/website/docs/terms/data-wrangling.md deleted file mode 100644 index 46a14a25949..00000000000 --- a/website/docs/terms/data-wrangling.md +++ /dev/null @@ -1,166 +0,0 @@ ---- -id: data-wrangling -title: Data wrangling -description: Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data. -displayText: data wrangling -hoverSnippet: Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data. ---- - - - Data wrangling: the workflow that bred analytics engineers - - -Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. For analytics engineers, you may know this better by the name of data cleaning. In data science or machine learning, "wrangling" often refers to prepping the data for model creation. - -The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data in a downstream model without worrying about basic cleaning like renaming, datatype casting, etc. Data wrangling acts as preparation for the development of [intermediate, fct/dim, or mart data models](/best-practices/how-we-structure/1-guide-overview) that form the base layer that other data work can be built off of. Analytics engineers tend to do data wrangling work in the staging layer as a first transformation step after loading the data. This eliminates a foundational step done by an analytics engineer or analyst when building a downstream data model or dashboard. - -## Data wrangling steps - -The data wrangling *structured* process includes data discovery, structuring, cleaning, enriching, validating, and publishing. While this is the general workflow, there isn't one definitive workflow. 
This will vary depending on the transformation tool you’re using and specific use case. - -### Data discovery - -Data discovery involves getting to know the data that you are working with. This involves looking at key statistical measures of your dataset. Some of these include: - -- Row count -- Number of columns -- Column data types -- Distribution of column values -- Number of duplicate rows -- Number of nulls - -Oftentimes, data warehouses have a preview capability so data team members can easily see a table’s makeup (column name, type, row count, etc.), but functions such as `SUM()` and `COUNT()` will come in handy for finding these values. You can use the `GROUP BY` statement with these functions to find the counts of certain rows for different categories of data. In addition, you’ll want to identify primary keys, check for duplicates of primary keys, and ensure every row of data has a column that can act as a primary key! - -### Structuring - -Structuring your data is a type of transformation that involves reformatting and reorganizing your data so that it is stored in a way that makes the values usable. This could mean rearranging how the data is displayed in columns and rows. Chances are you are using an tool to ingest your data, so the data is likely in a tabular format and you won’t need to do that much restructuring. If your data is structured, you really only need to worry about nested data types such as data. When structuring your data, you want to ask yourself these questions: - -- Is your data in the format you need to perform analysis on it? Does your data need to be potentially unnested? *Should you nest or objectize columns together?* -- Do the column names and values look correct for your use case? - -If your data is not in a format that is usable, you can look into different solutions such as pivoting or using different functions to unpack lists and JSON files so that they are in a tabular format. Pivoting is helpful because it allows you to change the way your dataset is structured by rearranging the way columns, rows, and their values are displayed. dbt has a [pre-built macro](https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/pivot.sql) that makes pivoting less of a headache and more of a breeze. - -### Cleaning - -The cleaning stage involves using different functions so that the values in your data tables are usable in your models and reports. The majority of the work done in staging models is this type of cleaning that includes: - -- Datatype casting -- Lower/upper casing string values -- Converting timestamps -- Aliasing/column renaming -- Removing appropriate duplicates or nulls you found in the discovery process -- Eliminating unnecessary characters or spaces from values - -Certain cleaning steps, like removing rows with null values, are helpful to do at the beginning of the process because removing nulls and duplicates from the start can increase the performance of your downstream models. In the cleaning step, it’s important to follow a standard for your transformations here. This means you should be following a consistent naming convention for your columns (especially for your primary keys) and casting to the same timezone and datatypes throughout your models. Examples include making sure all dates are in UTC time rather than source timezone-specific, all strings are in either lower or upper case, etc. - -:::tip dbt to the rescue! 
-If you're struggling to do all the cleaning on your own, remember that dbt packages ([dbt expectations](https://github.com/calogica/dbt-expectations), [dbt_utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/), and [re_data](https://www.getre.io/)) and their macros are also available to help you clean up your data. -::: - -### Enriching - -Enriching your data means enhancing it by supplementing incomplete or missing data. This could involve basic case or coalesce statements that use an already existing column in order to produce a new column. It could also look like joining an already existing date column with a date table that contains more extensive information about a certain date. Keep in mind that you don’t want to go overboard with enriching or joining here—you only want to add what will be repeatedly used in modeling and analysis. - -:::tip Python for enrichment? -With the new capability of [Python in dbt](/docs/build/python-models), will folks start using Python to help enrich their data? Only time will tell, but we’re eager to hear how you want to be using Python in dbt. Please join the [#dbt-core-python-models channel](https://www.getdbt.com/community/join-the-community/) to join in on the discussions happening around them. -::: - -### Validating - -Validating data is the process of ensuring that the changes you just made to a dataset during your transformation are accurate. At this stage, you may be asking yourself: -- Are the primary keys still unique? Are there the same number of primary keys in this transformed table than in my upstream sources? -- Has the relationship with the upstream table(s) changed at all, or is it still 1-1? If not, is that expected behavior? -- Has the distribution of column values changed at all? Are column values even correct? -- Did I select the correct columns I want present at this stage? - -To answer these questions, you'll likely find yourself looking for and counting nulls, rows, duplicates, and primary keys. You'll likely reference upstream models regularly in this phase to ensure your transformation code is accurate and performing what you intended it to do. - -Validation is always a little manual, but [dbt tests, macros, and packages](#validating-1) can help make your data validation a little easier 😉 . - -### Publishing - -The last step of the data wrangling process is publishing. In analytics engineering, we typically refer to this as “pushing to production”. This essentially means that you are making the data models available to use in downstream data models, dashboards, and reports. This additionally means pushing the code changes for these staging models to the main branch in your git repository. For non-ephemeral models, the process of publishing could be as simple as running a query as a , creating a table in your production , or running dbt Cloud in production for table recreation. - -CI/CD jobs are often used as part of the publishing process to test and linter code before it is pushed to production. This helps to ensure changes made are actually reliable and safe to merge. CI/CD is a best practice because it allows data models to be updated quickly and efficiently, ensuring no downstream models are impacted. - -When pushing to production, you want to make sure these data models are accessible by those building the models and reports. This may mean you have to play around with users, roles, and permissions in your data warehouse. Your transformation tool should have read access from these tables. 
Additionally, you could use dbt grants to apply these permissions directly at build time. - -## Data wrangling benefits - -Why should you spend all of that time doing relatively tedious and repetitive work? Well, there are a number of benefits that can make the slog worth it. Those benefits include: - -- Increased data quality -- Increased data usability/modularity -- More standardization -- Deeper understanding of data -- Potential performance improvements on downstream models - -### Increased data quality - -Data wrangling increases the overall quality of your code and the data it produces. Because the cleaning is already done and validated, you don’t have to worry about someone forgetting to clean or standardize a dataset downstream and using messy or inconsistent data. - -### Increased data usability/modularity - -Because data is wrangled once when it is ingested into the data warehouse, analytics engineers don’t need to constantly be recleaning and transforming source data from its origin and can follow DRY practices. Wrangled data allows them to use clean and modular models repeatedly throughout their work. - -### Standardization - -When data is wrangled, it is matched with a standard set that your data team establishes that is then applied to all datasets. It ultimately creates consistent staging layers for analytics engineers to build their intermediate, fct/dim, and mart models. Data team members don’t need to worry about upholding standards in downstream models because this is already done when the data is first ingested. - -### Deeper understanding of data - -By first wrangling or cleaning data, you get to learn about the data’s intricacies in the process. Though manual, this process allows you to find issues in the data and understand them deeply before using them in downstream processes. This minimizes potential problems that can go unnoticed because you’ve already explored and validated the datasets. It also helps you understand how tables can be joined together downstream. - -Additionally, this initial data exploration and transformation helps you collaborate better with [backend application developers](https://docs.getdbt.com/blog/when-backend-devs-spark-joy) or data engineers to work on formatting the raw data in a format that is most appropriate for analytics work. - -### Potential performance improvements on downstream models - -Lastly, data wrangling allows for potential improvements in performance in downstream models. Because you’ve cleaned the data and potentially removed duplicates and null values, models will be quicker to run. - -## Data wrangling in SQL - -SQL is the most common language for data wrangling. While you can wrangle data using other languages, such as Python, SQL is the most common (and straightforward) language used for data wrangling and transformation in relational databases. Let’s look at some of the most common SQL functions for each of the data wrangling steps, along with a combined example after the cleaning list below. - -### SQL cleaning - -- `CAST` is commonly used to cast values in a column to a specified data type. - -- `CONVERT_TZ` can be used to convert values in a column to a specific timezone. - -- `LOWER`/`UPPER` is used to capitalize or lowercase string values. - -- `TRIM` can remove leading or trailing characters in strings, making string functions easier to use downstream or more consistent across tables. - -- `REPLACE` replaces a specified character in column values.
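As a rough sketch of how these cleaning functions fit together, a staging-style query might look like the following. The table and column names are hypothetical, and the exact timezone function name (for example, `CONVERT_TZ` versus `CONVERT_TIMEZONE`) depends on your data warehouse.

```sql
-- Hypothetical staging query applying the cleaning functions above
select
    cast(order_id as integer) as order_id,
    lower(trim(customer_email)) as customer_email,
    replace(phone_number, '-', '') as phone_number,
    upper(order_status) as order_status,
    convert_tz(ordered_at, 'America/New_York', 'UTC') as ordered_at_utc
from raw_jaffle_shop.orders
```

Each column comes out consistently typed, cased, and timezone-aligned, so downstream models can build on it without repeating the cleanup.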
- -You can also use custom built macros, such as those from a dbt package called [re_data](https://hub.getdbt.com/re-data/re_data/latest/), to clean columns using SQL. - -### Enriching - -Enriching data using SQL can often involve the use of functions, such as: - -- CASE statements allow you to replace values using “when-then” statements. They end with an “else” statement to catch the values that don’t fall in any of the “when-then” statements. -- `IFNULL` replaces any null values in a column with whatever value you specify. -- `COALESCE` returns the first non-null value from a list or column that you give it. This function is useful for replacing null values with one that you specify or coalescing multiple column values together. - -### Structuring - -Pivot tables come in handy when restructuring your data. You can use them to make your column names your values and vice versa. Dbt has a [macro](https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/pivot.sql) built out that allows you to completely customize and pivot your tables without having to write crazy complicated code. - -For nested data types such as JSON, you’ll want to check out the JSON parsing and extraction function of your data warehouse to help work with this data. - -### Validating - -dbt offers [generic data tests](/docs/build/data-tests#more-generic-data-tests) in every dbt project that allows you to validate accepted, unique, and null values. They also allow you to validate the relationships between tables and that the primary key is unique. - -If you can’t find what you need with the generic tests, you can download an additional dbt testing package called [dbt_expectations](https://hub.getdbt.com/calogica/dbt_expectations/0.1.2/) that dives even deeper into how you can test the values in your columns. This package has useful data tests like `expect_column_values_to_be_in_type_list`, `expect_column_values_to_be_between`, and `expect_column_value_lengths_to_equal`. - -## Conclusion - -You could argue that data wrangling is one of the most important parts of an analytics engineer's job. It increases data quality, makes your data usable, standardizes it, increases your understanding, and improves performance. None of this would be possible without data discovery, structuring, cleaning, enriching, validating, and publishing steps that make up the wrangling process. - -## Futher reading - -- [Our favorite SQL functions](https://www.getdbt.com/sql-foundations/top-sql-functions/) -- [Glossary: Data warehouse](/terms/data-warehouse) -- [Glossary: Primary key](/terms/primary-key) -- [Glossary: JSON](/terms/json) diff --git a/website/docs/terms/dataframe.md b/website/docs/terms/dataframe.md deleted file mode 100644 index e91b5d59cf6..00000000000 --- a/website/docs/terms/dataframe.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -id: dataframe -title: DataFrame -description: A DataFrame is a way of storing and manipulating tabular data in Python. They gained popularity first as a part of R and then as a part of pandas. -displayText: dataframe -hoverSnippet: A DataFrame is a two-dimensional data structure (rows and columns). It's the most common way of representing and interacting with large datasets in Python. ---- - - What is a DataFrame in Python? - dbt Labs - - -A DataFrame is a way of storing and manipulating tabular data in Python. DataFrames are often likened to tables with columns and rows that you could find in any , Google Sheet, or Excel workbook. - -A DataFrame entry in an analytics engineering glossary…what is happening? 
You’re reading this right. While SQL is the go-to programming language for most analytics engineering work, there are likely inevitable situations where you've found yourself writing some Python and using DataFrames. - -While DataFrames are also used in other languages for data processing, such as R and Scala, the focus of this glossary page will be on Python DataFrames, their use cases, and their relation to analytics engineering work. - -## How DataFrames work - -DataFrames have a long history ([going back to 1990](https://towardsdatascience.com/preventing-the-death-of-the-dataframe-8bca1c0f83c8#:~:text=The%20earliest%20%E2%80%9Cdataframe%E2%80%9D%2C%20originally,Hastie%20in%201992%20%5B1%5D)!), but gained popularity first as a part of R and then as a part of [pandas](https://pandas.pydata.org/), an open source Python library of useful data analysis and manipulation tools. To work with DataFrames in Python, folks typically need to import the pandas library in the beginning of their script, `.py` file, or Python notebook with the conventional `import pandas as pd`. - -One of the strengths of DataFrames lies in its ability to take data in its original form (ex. array, list, , parquet, dictionary) and form a tabular (rows and columns) format out of it. Once this data is in a tabular format, you can apply functions and packages to that data to clean, transform, and enrich it. - -Below is an example creation of a Python DataFrame from a list and some light enrichment on it: - -```python -import pandas as pd - -def is_credit_card_purchase(x): - if x == 'credit_card': - return True - else: - return False - -jaffle_shop_orders = [[1, 1, 'credit_card', 1000], [2, 2, 'credit_card', 2000], [3,3, 'coupon', 100]] -orders_df = pd.DataFrame(jaffle_shop_orders, columns=['unique_id', 'order_id', 'payment_method', 'amount']) -orders_df.set_index(['unique_id'], inplace=True) -orders_df['is_credit_card'] = orders_df['payment_method'].apply(is_credit_card_purchase) - -print(orders_df) -``` - -This script will return an `orders_df` DataFrame that looks like this: - -| unique_id | order_id | payment_method | amount | is_credit_card | -|---|---|---|---|---| -| 1 | 1 | credit_card | 1000 | True -| 2 | 2 | credit_card | 2000 | True -| 3 | 3 | coupon | 100 | False - -:::info A note on Python flavors -If you’re running Python in Snowflake via Snowpark, you would typically be working with [Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/python/working-with-dataframes.html) or pandas DataFrames. For folks running Python from Google BigQuery or Databricks users, they can use both pandas or [PySpark DataFrames](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html). There might be slight syntax differences between the different Python flavors of Snowpark, PySpark, and pandas, but much of the functionality remains the same. -::: - -It's also possible and common practice to string together a number of DataFrame transformations. For example, if `df` represents a DataFrame containing one row per person living in the Eastern United States over the last decade, you can calculate the number of people living in Philadelphia each year: - -```python -df.filter("city == 'Philadelphia'") - .withColumn("population", count("name")) - .group_by("year") -``` - -In most distributed frameworks, these transformations are evaluated "lazily." 
Rather than performing each transformation, calculating its results, and storing those results, the framework develops a *plan* for how it *will* perform those calculations. When you want to *act* on the transformed DataFrame—see the top 10 results, or write it back to a table in the database—then the framework's optimizer calculates the most efficient way to deliver those results, based on all the steps you have defined. - -If you're familiar with SQL, you can think of a DataFrame like a `select` statement, and each new DataFrame operation as a separate CTE. - -You can write a long SQL query containing many complex CTEs. When you run the query with `limit 10` to see a sample of its results, or create that query as a table in the database (what dbt does when it runs your model), the data warehouse optimizes your query and produces the results in the most efficient way possible. - -## DataFrame use cases - -You could probably write hundreds of pages on DataFrame use cases and examples, but at their core, DataFrames, *in the context of analytics engineering*, are often used to manipulate data outside of SQL capabilities, work with data during API extraction, and leverage data science and machine learning. - -### Enrichment and manipulation of data outside of SQL capabilities - -Let’s just say it: there are a lot of things you can do in Python that you could also do in SQL and vice versa, but Python packages typically win out when it comes to data enrichment. A typical use case for Python DataFrames is the ability to apply Python libraries or functions to data in the DataFrame. - -In practice, this could look like applying an [IP parser](https://pypi.org/project/ipparser/) to an IP address column, using a package to determine whether a [date falls on a holiday](/docs/build/python-models#using-pypi-packages), or leveraging [numpy](https://numpy.org/) for performant and complex mathematical computations. - -:::tip dbt x Python DataFrames -dbt supports the use of beta [Python models in dbt](/docs/build/python-models). What does this mean exactly? This means that Python-defined data transformations can be created and used in a dbt project in the same vein as a classic dbt SQL model. These Python models are incredibly new and the team is eagerly looking for feedback on how folks want to use and ritualize them. -::: - -### Manipulation of data during extraction and loading scripts - -It’s not the most pleasant of experiences, but as an analytics engineer, you’re going to find yourself writing a hacky Python script at one point to extract data from a system or API that doesn’t have an innate connector in an [ETL tool](https://docs.getdbt.com/terms/elt#elt-tools). - -As you unpack and unnest the JSON received from these API endpoints, you’ll likely use DataFrames to make your data (and life) a little easier to work with. We won’t go into great depth here since this probably won’t happen too often in your career as an analytics engineer, but it’s beneficial to understand the basics of DataFrames and working with [requests, JSON, and DataFrames](https://stackoverflow.com/questions/42518864/convert-json-data-from-request-into-pandas-dataframe). - -### Data science and machine learning - -If SQL is an analytics engineer’s oven, Python is a data scientist's stovetop. Data scientists and machine learning engineers often use Python and DataFrames to perform exploratory analysis, feature engineering and data preparation, and the application of models and algorithms on datasets.
Understanding and using DataFrames is step 1 (of many steps) to becoming a data person that can create meaningful data science and machine learning models. - -All this data science and machine learning talk…“But, I’m an analytics engineer,” you say adamantly. One of the great, beautiful, and sometimes frustrating qualities about analytics engineers is their jack-of-all-trades-ness. You can transform data in your sleep, talk ROI and CPAs all day with your VP of marketing, and use git like you studied computer science in college—what can’t you do?? You’ve probably experimented with a predictive analytics model, some light forecasting, or sentiment analysis at one point in your data journey. You may not be interested in making the conversion to full-fledged data scientists or machine learning engineer, but enjoy a challenge from time to time. - -There’s a reason data warehouses and platforms like Snowflake, BigQuery, and Databricks are providing support for Python: because folks are asking for it. There are endless use cases for Python and DataFrames that fall outside of data science and machine learning work, but as you start working and feeling more comfortable in Python, you may be tempted to start experimenting with these different forms of data work. And the world’s your oyster, right? - -## Conclusion - -A DataFrame is a tabular data storage format in Python that is widely used across different roles in the data world. Since a DataFrame stores data in rows and columns, similar to how analytics engineers manipulate tables stored in data warehouses, data folks can transform, engineer, and enrich data in DataFrames using Python and Python packages. Analytics engineers may find themselves using DataFrames when they’re extracting data via APIs, enriching data with third-party packages, or experimenting with data science and machine learning models. - -## Further reading - -Are you ready to dip your toes in DataFrames, Python, and dbt? Check out some of the resources below to learn more about how dbt is embracing Python: - -- [Python models in dbt](/docs/build/python-models) -- #beta-feedback-python-models Slack channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/) -- [Best practices for developing Python models in dbt discussion](https://github.com/dbt-labs/docs.getdbt.com/discussions/1811) \ No newline at end of file diff --git a/website/docs/terms/ddl.md b/website/docs/terms/ddl.md deleted file mode 100644 index c4324e75fa9..00000000000 --- a/website/docs/terms/ddl.md +++ /dev/null @@ -1,128 +0,0 @@ ---- -id: ddl -title: DDL -description: Data Definition Language (DDL) is a group of SQL statements that you can execute to manage database objects, including tables, views, and more. -displayText: DDL -hoverSnippet: Data Definition Language (DDL) is a group of SQL statements that you can execute to manage database objects, including tables, views, and more. ---- - - - What is Data Definition Language (DDL) in SQL? - - -Data Definition Language (DDL) is a group of SQL statements that you can execute to manage database objects, including tables, views, and more. Using DDL statements, you can perform powerful commands in your database such as creating, modifying, and dropping objects. DDL commands are usually executed in a SQL browser or stored procedure. - -DDL is contrasted with Data Manipulation Language (DML) which is the SQL that is used to actually access and manipulate data in database objects. 
The majority of data analysts will rarely execute DDL commands and will do the majority of their work creating DML statements to model and analyze data.

:::note Note
Data folks don't typically write DDL [since dbt will do it for them](https://docs.getdbt.com/docs/about/overview#:~:text=dbt%20allows%20analysts%20avoid%20writing,dbt%20takes%20care%20of%20materialization.).
:::

To be honest, DDL is definitely some of the drier content that exists out there in the greater data world. However, because DDL commands are often uncompromising and should be used with caution, it's incredibly important to understand how they work and when they should be used. We hope you can use this page to learn about the basics, strengths, and limitations of DDL statements.

## Types of DDL Statements

DDL statements are used to create, drop, and manipulate objects in your database. They are often, but not always, unforgiving and irreversible. "With great power comes great responsibility," is usually the first thing I think of before I execute a DDL command. We'll highlight some of the primary DDL commands that are used by analytics engineers below.

:::important Important
The syntax for DDL commands can be pretty database-specific. We are trying to make this glossary page as generic as possible, but please use the "Further Reading" section to see the specifics on how the following DDL commands would be implemented in your database of interest!
:::

### ALTER

Using the `ALTER` DDL command, you can change an object in your database that already exists. By "change", we specifically mean you can:

- Add, remove, and rename columns in views and tables
- Rename a view or table
- Modify the structure of a view or table
- And more!

The generic syntax to use the ALTER command is as follows:

```sql
ALTER <object_type> <object_name> <action>;
```

To alter a table's column, you may do something like this:

```sql
ALTER TABLE customers rename column last_name to last_initial;
```

In this example, you renamed the `last_name` column [in jaffle_shop's](https://github.com/dbt-labs/jaffle_shop) `customers` table to be called `last_initial`.

### DROP

The `DROP` command. Probably the most high-stakes DDL statement one can execute. One that should be used with the *utmost* of care. At its core, an executed `DROP` statement will remove that object from the data warehouse. You can drop tables, views, schemas, databases, users, functions, and more.

Some data warehouses such as Snowflake allow you to add restrictions to `DROP` statements to caution you about the impact of dropping a table, view, or schema before it's actually dropped. In practice, we recommend you never drop raw source tables as they are often your baseline of truth. Your database user also usually needs the correct permissions to drop database objects.

The syntax to use the `DROP` command is as follows:

```sql
DROP <object_type> <object_name>;
```

You can drop your `customer` table like this:

```sql
DROP TABLE customers;
```

### CREATE

With the `CREATE` statement, you can create new objects in your data warehouse. The most common objects created with this statement are tables, schemas, views, and functions. Unlike `DROP`, `ALTER`, and `TRUNCATE` commands, there's little risk with running `CREATE` statements since you can always drop what you create.

Creating tables and views with the `CREATE` command requires a strong understanding of how you want the data structured, including column names and data types.
Using the `CREATE` command to establish tables and views can be laborious and repetitive, especially if the schema objects contain many columns, but is an effective way to create new objects in a database. After you create a table, you can use DML `INSERT` statements and/or a transformation tool such as dbt to actually get data in it.

The generic syntax to use the `CREATE` command is as follows:

```sql
CREATE <object_type> <object_name>;
```

Creating a table using the `CREATE` statement may look something like this:

```sql
CREATE TABLE prod.jaffle_shop.jaffles (
    id varchar(255),
    jaffle_name varchar(255),
    created_at timestamp,
    ingredients_list varchar(255),
    is_active boolean
);
```

Note that you had to explicitly define column names and column data types here. *You must have a strong understanding of your data's structure when using the CREATE command for tables and views.*

### TRUNCATE

The `TRUNCATE` command will remove all rows from a table while maintaining the underlying table structure. The `TRUNCATE` command is only applicable for table objects in a database. Unlike `DROP` statements, `TRUNCATE` statements don't remove the actual table from the database, just the data stored in it.

The syntax to use the `TRUNCATE` command is as follows:

```sql
TRUNCATE TABLE <table_name>;
```

You can truncate your jaffle_shop's `payments` table by executing this statement:

```sql
TRUNCATE TABLE payments;
```

Previously, this table was 113 rows. After executing this statement, the table is still in your database, but now has zero rows.

## Conclusion

DDL statements allow you to remove, edit, and add database objects. Some of the most common DDL statements you'll execute include `CREATE`, `DROP`, `COMMENT`, `ALTER`, and more. DDL commands are typically executed in a SQL browser or stored procedure. Ultimately, DDL commands are all-powerful and potentially high-risk and should be used with the greatest of care. In the case of DDL, **do not** throw caution to the wind…

## Further reading

For database-specific DDL resources, check out the following:

- [DDL commands in Snowflake](https://docs.snowflake.com/en/sql-reference/sql-ddl-summary.html)
- [SQL commands in Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/c_SQL_commands.html) (contains DDL)
- [DDL statements in Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language)
- [DDL statements in Databricks](https://docs.databricks.com/sql/language-manual/index.html#ddl-statements)
- [DDL in Amazon Athena](https://docs.aws.amazon.com/athena/latest/ug/language-reference.html)
diff --git a/website/docs/terms/deploying.md b/website/docs/terms/deploying.md
deleted file mode 100644
index 53e59658142..00000000000
--- a/website/docs/terms/deploying.md
+++ /dev/null
@@ -1,12 +0,0 @@
---
id: deploying
title: Deploying
description: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line.
displayText: Deploying
hoverSnippet: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line.
---

Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. For more details, refer to [Deploy dbt jobs](/docs/deploy/deployments).
- - - diff --git a/website/docs/terms/dimensional-modeling.md b/website/docs/terms/dimensional-modeling.md deleted file mode 100644 index de88f7c318d..00000000000 --- a/website/docs/terms/dimensional-modeling.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -id: dimensional-modeling -title: Dimensional modeling -description: Dimensional modeling is a data modeling technique where you break data up into “facts” and “dimensions” to organize and describe entities in your data warehouse -displayText: dimensional modeling -hoverSnippet: Dimensional modeling is a data modeling technique where you break data up into “facts” and “dimensions” to organize and describe entities within your data warehouse. ---- - - - Dimensional modeling: An essential concept in data modeling - - -Dimensional modeling is a data modeling technique where you break data up into “facts” and “dimensions” to organize and describe entities within your data warehouse. The result is a staging layer in the data warehouse that cleans and organizes the data into the business end of the warehouse that is more accessible to data consumers. - -By breaking your data down into clearly defined and organized entities, your consumers can make sense of what that data is, what it’s used for, and how to join it with new or additional data. Ultimately, using dimensional modeling for your data can help create the appropriate layer of models to expose in an end business intelligence (BI) tool. - -There are a few different methodologies for dimensional modeling that have evolved over the years. The big hitters are the Kimball methodology and the Inmon methodology. Ralph Kimball’s work formed much of the foundation for how data teams approached data management and data modeling. Here, we’ll focus on dimensional modeling from Kimball’s perspective—why it exists, where it drives value for teams, and how it’s evolved in recent years. - -## What are we trying to do here? - -Let’s take a step back for a second and ask ourselves: why should you read this glossary page? What are you trying to accomplish with dimensional modeling and data modeling in general? Why have you taken up this rewarding, but challenging career? Why are *you* here? - -This may come as a surprise to you, but we’re not trying to build a top-notch foundation for analytics—we’re actually trying to build a bakery. - -Not the answer you expected? Well, let’s open up our minds a bit and explore this analogy. - -If you run a bakery (and we’d be interested in seeing the data person + baker venn diagram), you may not realize you’re doing a form of dimensional modeling. What’s the final output from a bakery? It’s that glittering, glass display of delicious-looking cupcakes, cakes, cookies, and everything in between. But a cupcake just didn’t magically appear in the display case! Raw ingredients went through a rigorous process of preparation, mixing, melting, and baking before they got there. - -Just as eating raw flour isn’t that appetizing, neither is deriving insights from raw data since it rarely has a nice structure that makes it poised for analytics. There’s some considerable work that’s needed to organize data and make it usable for business users. - -This is where dimensional modeling comes into play; it’s a method that can help data folks create meaningful entities (cupcakes and cookies) to live inside their [data mart](https://docs.getdbt.com/best-practices/how-we-structure/4-marts) (your glass display) and eventually use for business intelligence purposes (eating said cookies). 
- -So I guess we take it back—you’re not just trying to build a bakery, you’re also trying to build a top-notch foundation for meaningful analytics. Dimensional modeling can be a method to get you part of the way there. - -## Facts vs. dimensions - -The ultimate goal of dimensional modeling is to be able to categorize your data into their fact or dimension models, making them the key components to understand. So what are these components? - -### Facts - -A fact is a collection of information that typically refers to an action, event, or result of a business process. As such, people typically liken facts to verbs. In terms of a real business, some facts may look like account creations, payments, or emails sent. - -It’s important to note that fact tables act as a historical record of those actions. You should almost never overwrite that data when it needs updating. Instead, you add new data as additional rows onto that table. - -For many businesses, marketing and finance teams need to understand all the touchpoints leading up to a sale or conversion. A fact table for a scenario like this might look like a `fct_account_touchpoints` table: - -| **unique_id** | **touchpoint_id** | **account_id** | **touchpoint_name** | **touchpoint_created_at_utc** | -|---|---|---|---|---| -| 23534 | 34 | 325611 | fall_convention_2020 | 2022-01-30 00:11:26 | -| 12312 | 29 | 325611 | demo_1 | 2022-05-29 01:42:07 | -| 66782 | 67 | 325611 | demo_2 | 2022-06-25 04:10:32 | -| 85311 | 15 | 105697 | fall_convention_2020 | 2022-05-29 06:13:45 | - -Accounts may have many touch points and this table acts as a true log of events leading up to an account conversion. - -This table is great and all for helping understanding what might have led to a conversion or account creation, but what if business users need additional context on these accounts or touchpoints? That’s where dimensions come into play. - -### Dimensions -A dimension is a collection of data that describe who or what took action or was affected by the action. Dimensions are typically likened to nouns. They add context to the stored events in fact tables. In terms of a business, some dimensions may look like users, accounts, customers, and invoices. - -A noun can take multiple actions or be affected by multiple actions. It’s important to call out: a noun doesn’t become a new thing whenever it does something. As such, when updating dimension tables, you should overwrite that data instead of duplicating them, like you would in a fact table. - -Following the example from above, a dimension table for this business would look like an `dim_accounts` table with some descriptors: - -| account_id | account_created_at_utc | account_name | account_status | billing_address | -|---|---|---|---|---| -| 325611 | 2022-06-29 12:11:43 | Not a Pyramid Scheme | active | 9999 Snake Oil Rd, Los Angeles, CA | -| 234332 | 2019-01-03 07:34:50 | Charlie’s Angels’ Chocolate Factory | inactive | 123 Wonka Way, Indianapolis, IN | -| 105697 | 2020-12-11 11:50:22 | Baggins Thievery | active | The Shire | - -In this table, each account only has one row. If an account’s name or status were to be updated, new values would overwrite existing records versus appending new rows. - -:::tip Snapshots -For fact tables you want to keep track of changes to, folks can leverage [dbt snapshots](/docs/build/snapshots). -::: - -### Facts and dimensions at play with each other -Cool, you think you’ve got some facts and dimensions that can be used to qualify your business. 
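Before stepping back to the bigger design question, here's a minimal, hypothetical sketch of how the two example tables above might be joined on `account_id` so touchpoint events pick up descriptive account context (the table and column names simply follow the examples above):

```sql
-- Hypothetical sketch: enrich touchpoint events (fact) with account attributes (dimension)
select
    fct.touchpoint_name,
    fct.touchpoint_created_at_utc,
    dim.account_name,
    dim.account_status
from fct_account_touchpoints as fct
left join dim_accounts as dim
    on fct.account_id = dim.account_id
```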
There's one big consideration left to think about: how do these facts and dimensions interact with each other?

![Image depicting how facts and dimensions join together to create analytics ready datasets](/img/docs/terms/dimensional-modeling/fact-star.png)

Before cloud data warehouses, there were two dominant design options, star schemas and snowflake schemas, that were used to concretely separate out the lines between fact and dimension tables.

- In a star schema, there's one central fact table that can join to relevant dimension tables.
- A snowflake schema is simply an extension of a star schema; dimension tables link to other dimension tables, forming a snowflake-esque shape.

It sounds really nice to have this clean setup with star or snowflake schemas. Almost as if it's too good to be true (and it very well could be).

The development of cheap cloud storage, BI tools great at handling joins, the evolution of SQL capabilities, and data analysts with growing skill sets have changed the way data folks look at dimensional modeling and star schemas. Wide tables consisting of fact and dimension tables joined together are now a competitive option for data teams.

Below, we'll dig more into the design process of dimensional modeling, wide tables, and the beautiful ambiguity of it all.

## The dimensional modeling design process

According to the Kimball Group, the official(™) four-step design process is (1) selecting a business process to analyze, (2) declaring the grain, (3) identifying the dimensions, and (4) identifying the facts. That makes dimensional modeling sound really easy, but in reality, it's packed full of nuance.

Coming back down to planet Earth, your design process is how you make decisions about:

- Whether something should be a fact or a dimension
- Whether you should keep fact and dimension tables separate or create wide, joined tables

This is something that data philosophers and thinkers could debate long after we're all gone, but let's explore some of the major questions to hold you over in the meantime.

### Should this entity be a fact or dimension?

Time to put on your consultant hat because that dreaded answer is coming: it depends. This is what makes dimensional modeling a challenge!

Kimball would say that a fact must be numeric. The inconvenient truth is: an entity can be viewed as a fact or a dimension depending on the analysis you are trying to run.

:::note Birds of a feather
If you ran a clinic, you would probably have a log of appointments by patient. At first, you could think of appointments as facts—they are, after all, events that happen and patients can have multiple appointments—and patients as dimensions. But what if your business team really cared about the appointment data itself—how well it went, when it happened, the duration of the visit? You could, in this scenario, make the case for treating this appointments table as a dimension table. If you cared more about looking at your data at a patient-level, it probably makes sense to keep appointments as facts and patients as dimensions. All this to say: there's inherent complexity in dimensional modeling, and it's up to you to draw those lines and build those models.
:::

So then, how do you know which is which if there aren't any hard rules!? Life is a gray area, my friend. Get used to it.

A general rule of thumb: go with your gut! If something feels like it should be a fact to meet your stakeholders' needs, then it's a fact.
If it feels like a dimension, it’s a dimension. The world is your oyster. If you find that you made the wrong decision down the road, (it’s usually) no big deal. You can remodel that data. Just remember: you’re not a surgeon. No one will die if you mess up (hopefully). So, just go with what feels right because you’re the expert on your data 👉😎👉 - -Also, this is why we have data teams. Dimensional modeling and data modeling is usually a collaborative effort; working with folks on your team to understand the data and stakeholder wants will ultimately lead to some rad data marts. - -### Should I make a wide table or keep them separate? - -Yet again, it depends. Don’t roll your eyes. Strap in for a quick history lesson because the answer to this harkens back to the very inception of dimensional modeling. - -Back in the day before cloud technology adoption was accessible and prolific, storing data was expensive and joining data was relatively cheap. Dimensional modeling came about as a solution to these issues. Separating collections of data into smaller, individual tables (star schema-esque) made the data cheaper to store and easier to understand. So, individual tables were the thing to do back then. - -Things are different today. Cloud storage costs have gotten really inexpensive. Instead, computing is the primary cost driver. Now, keeping all of your tables separate can be expensive because every time you join those tables, you’re spending usage credits. - -Should you just add everything to one, wide table? No. One table will never rule them all. Knowing whether something should be its own fact table or get added on to an existing table generally comes down to understanding who will be your primary end consumers. - -For end business users who are writing their own SQL, feel comfortable performing joins, or use a tool that joins tables for them, keeping your data as separate fact and dimension tables is pretty on-par. In this setup, these users have the freedom and flexibility to join and explore as they please. - -If your end data consumers are less comfortable with SQL and your BI tool doesn’t handle joins well, you should consider joining several fact and dimension tables into wide tables. Another consideration: these wide, heavily joined tables can tend to wind up pretty specialized and specific to business departments. Would these types of wide tables be helpful for you, your data team, and your business users? Well, that’s for you to unpack. - -## Advantages and disadvantages of dimensional modeling - -The benefits and drawbacks of dimensional modeling are pretty straightforward. Generally, the main advantages can be boiled down to: - -* **More accessibility**: Since the output of good dimensional modeling is a [data mart](https://docs.getdbt.com/best-practices/how-we-structure/4-marts), the tables created are easier to understand and more accessible to end consumers. -* **More flexibility**: Easy to slice, dice, filter, and view your data in whatever way suits your purpose. -* **Performance**: Fact and dimension models are typically materialized as tables or [incremental models](https://docs.getdbt.com/docs/build/incremental-models). Since these often form the core understanding of a business, they are queried often. Materializing them as tables allows them to be more performant in downstream BI platforms. 
- -The disadvantages include: -* **Navigating ambiguity**: You need to rely on your understanding of your data and stakeholder wants to model your data in a comprehensible and useful way. What you know about your data and what people really need out of the data are two of the most fundamental and difficult things to understand and balance as a data person. -* **Utility limited by your BI tool**: Some BI tools don’t handle joins well, which can make queries from separated fact and dimensional tables painful. Other tools have long query times, which can make querying from ultra-wide tables not fun. - -## Conclusion - -Dimensional data modeling is a data modeling technique that allows you to organize your data into distinct entities that can be mixed and matched in many ways. That can give your stakeholders a lot of flexibility. [While the exact methodologies have changed](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/)—and will continue to, the philosophical principle of having tables that are sources of truth and tables that describe them will continue to be important in the work of analytics engineering practitioners. - - -## Additional Reading - -Dimensional modeling is a tough, complex, and opinionated topic in the data world. Below you’ll find some additional resources that may help you identify the data modeling approach that works best for you, your data team, and your end business users: - - - -* [Modular data modeling techniques](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) -* [Stakeholder-friendly model naming conventions](https://docs.getdbt.com/blog/stakeholder-friendly-model-names/) -* [How we structure our dbt projects guide](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) diff --git a/website/docs/terms/dml.md b/website/docs/terms/dml.md deleted file mode 100644 index 54fe69e845e..00000000000 --- a/website/docs/terms/dml.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -id: dml -title: DML -description: Data Manipulation Language (DML) is a class of SQL statements that are used to query, edit, add and delete row-level data from database tables or views. -displayText: DML -hoverSnippet: Data Manipulation Language (DML) is a class of SQL statements that are used to query, edit, add and delete row-level data from database tables or views. The main DML statements are SELECT, INSERT, DELETE, and UPDATE. ---- - - - DML: The SQL statements that make the data world go 'round - - -Data Manipulation Language (DML) is a class of SQL statements that are used to query, edit, add and delete row-level data from database tables or views. The main DML statements are `SELECT`, `INSERT`, `DELETE`, and `UPDATE`. - -DML is contrasted with Data Definition Language (DDL) which is a series of SQL statements that you can use to edit and manipulate the *structure* of databases and the objects in them. - -Similar to DDL, DML can be a *tad* bit boring. However, DML statements are what allows analysts and analytics engineers to do their work. We hope you can use this glossary to understand when and why DML statements are used and how they may contrast with similar DDL commands. - - -## Types of DML Statements - -The primary DML statements are `SELECT`, `INSERT`, `DELETE`, and `UPDATE`. With the exception of `SELECT` statements, all of the others are only applicable to data within tables in a database. 
The primary difference between `SELECT` and all the other DML statements is its impact on row-level data:

- To *change* the actual data that lives in tables, use `INSERT`, `DELETE`, and `UPDATE` statements
- To *access* the data in database objects, use `SELECT` statements

:::important Important
For the most part, the syntax for DML statements is pretty universal across [Supported Data Platforms](https://docs.getdbt.com/docs/supported-data-platforms) including Google Bigquery, Databricks, Postgres, Amazon Redshift, and Snowflake. Regardless, please use the "Further Reading" section to see the specifics on how the following DML statements would be implemented in your database of interest!
:::

### SELECT

Ah, our favorite of DML statements! This is the SQL we all know and love (most of the time). Because the `SELECT` statement allows you to access and manipulate data that exists in database objects, it's the true powerhouse in data analysis and analytics engineering.

You write `SELECT` statements to create queries that build data models and perform robust analysis. With `SELECT` statements, you can join different views and tables, qualify data by setting filters, apply functions and operators on the data, and more. `SELECT` statements, unlike `INSERT`, `DELETE`, and `UPDATE`, don't actually change the row-level values stored in the tables/views. Instead, you write `SELECT` statements to express the business logic needed to perform analysis.

All `SELECT` statements need three elements: a `SELECT` clause in the beginning, the actual field selection and manipulation, and a `FROM` statement specifying which database object you're trying to access.

Here's an example `SELECT` statement:

```sql
select

    payment_method,
    sum(amount) AS amount

from {{ ref('raw_payments') }}
group by 1
```

In this example, your selection of the `payment_method` column and summation of the `amount` column is the meat of your query. The `from {{ ref('raw_payments') }}` specifies the actual table you want to do the selecting from.

### INSERT

Using the `INSERT` DML command, you can add rows to a table that exists in your database. To be honest, data folks are rarely inserting data into tables manually with the `INSERT` command. Instead, data team members will most often use data that's already been inserted by an ETL tool or other data ingestion process.

You can insert a record [in jaffle_shop's](https://github.com/dbt-labs/jaffle_shop) `raw_customers` table like this:

```sql
INSERT INTO raw_customers VALUES (101, 'Kira', 'F.');
```

As you can see from this example, you clearly set all the column values that exist in your `raw_customers` table. For `INSERT` statements, you can explicitly specify the values you want to insert or use a query result to set the column values.

### DELETE

The `DELETE` command will remove rows in an existing table in your database. In practice, you will usually specify a `WHERE` clause with your `DELETE` statement to only remove specific rows from a table. But, you shouldn't really ever delete rows from tables. Instead, you should apply filters on queries themselves to remove rows from your modeling or analysis.

For the most part, if you wanted to remove all existing rows in a table, but keep the underlying table structure, you would use the `TRUNCATE` DDL command. If you wanted to remove all rows and drop the entire table, you could use the `DROP` DDL command.

You can delete the record for any Henry W.
in jaffle_shop’s `customers` table by executing this statement: - -```sql -DELETE FROM customers WHERE first_name = 'Henry' AND last_name = 'W.'; -``` - -### UPDATE - -With the `UPDATE` statement, you can change the actual data in existing rows in a table. Unlike the `ALTER` DDL command that changes the underlying structure or naming of database objects, the `UPDATE` statement will alter the actual row-level data. You can qualify an `UPDATE` command with a `WHERE` statement to change the values of columns of only specific rows. - -You can manually update the status column of an order in your orders table like this: - -```sql -UPDATE orders SET status = 'returned' WHERE order_id = 7; -``` - -:::tip Tip -The `UPDATE` statement is often compared to the `MERGE` statement. With `MERGE` statements, you can insert, update, *and* delete records in a single command. Merges are often utilized when there is data between two tables that needs to be reconciled or updated. You'll see merges most commonly executed when a source table is updated and a downstream table needs to be updated as a result of this change. Learn more about [how dbt uses merges in incremental models here](https://docs.getdbt.com/docs/build/incremental-overview#how-incremental-models-work-in-dbt). -::: - -## Conclusion - -DML statements allow you to query, edit, add, and remove data stored in database objects. The primary DML commands are `SELECT`, `INSERT`, `DELETE`, and `UPDATE`. Using DML statements, you can perform powerful actions on the actual data stored in your system. You'll typically see DML `SELECT` statements written in data models to conduct data analysis or create new tables and views. In many ways, DML is the air that us data folks breathe! - -## Further reading - -For more resources on why people who use dbt don’t write DML, check out the following: - -- [Why not write DML](/faqs/Project/why-not-write-dml) -- [SQL dialect](/faqs/Models/sql-dialect) - -For database-specific DML documents, please check out the resources below: - -- [DML in Snowflake](https://docs.snowflake.com/en/sql-reference/sql-dml.html) -- [Updating tables with DML commands in Redshift](https://docs.aws.amazon.com/redshift/latest/dg/t_Updating_tables_with_DML_commands.html) -- [DML in Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-manipulation-language) -- [Delta Lake DML for Databricks](https://databricks.com/blog/2020/09/29/diving-into-delta-lake-dml-internals-update-delete-merge.html) diff --git a/website/docs/terms/dry.md b/website/docs/terms/dry.md deleted file mode 100644 index 04b83642a08..00000000000 --- a/website/docs/terms/dry.md +++ /dev/null @@ -1,97 +0,0 @@ ---- -id: dry -title: DRY -description: DRY is a software development principle that stands for “Don’t Repeat Yourself.” Living by this principle means that your aim is to reduce repetitive patterns and code. -displayText: DRY -hoverSnippet: DRY is a software development principle that stands for “Don’t Repeat Yourself.” Living by this principle means that your aim is to reduce repetitive patterns and duplicate code and logic in favor of modular and referenceable code. ---- - - - What is DRY? Hint: It makes for great code - dbt Labs - - -DRY is a software development principle that stands for “Don’t Repeat Yourself.” Living by this principle means that your aim is to reduce repetitive patterns and duplicate code and logic in favor of modular and referenceable code. 
- -The DRY code principle was originally made with software engineering in mind and coined by Andy Hunt and Dave Thomas in their book, _The Pragmatic Programmer_. They believed that “every piece of knowledge must have a single, unambiguous, authoritative representation within a system.” As the field of analytics engineering and [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) develops, there’s a growing need to adopt [software engineering best practices](https://www.getdbt.com/product/what-is-dbt/), including writing DRY code. - -## Why write DRY code? - -DRY code is one of the practices that makes a good developer, a great developer. Solving a problem by any means is great to a point, but eventually, you need to be able to write code that's maintainable by people other than yourself and scalable as system load increases. That's the essence of DRY code. - -But what's so great about being DRY as a bone anyway, when you can be WET? - -### Don’t be WET - -WET, which stands for “Write Everything Twice,” is the opposite of DRY. It's a tongue-in-cheek reference to code that doesn’t exactly meet the DRY standard. In a practical sense, WET code typically involves the repeated _writing_ of the same code throughout a project, whereas DRY code would represent the repeated _reference_ of that code. - -Well, how would you know if your code isn't DRY enough? That’s kind of subjective and will vary by the norms set within your organization. That said, a good rule of thumb is [the Rule of Three](https://en.wikipedia.org/wiki/Rule_of_three_(writing)#:~:text=The%20rule%20of%20three%20is,or%20effective%20than%20other%20numbers.). This rule states that the _third_ time you encounter a certain pattern, you should probably abstract it into some reusable unit. - -There is, of course, a tradeoff between simplicity and conciseness in code. The more abstractions you create, the harder it can be for others to understand and maintain your code without proper documentation. So, the moral of the story is: DRY code is great as long as you [write great documentation.](https://docs.getdbt.com/docs/build/documentation) - -### Save time & energy - -DRY code means you get to write duplicate code less often. You're saving lots of time writing the same thing over and over. Not only that, but you're saving your cognitive energy for bigger problems you'll end up needing to solve, instead of wasting that time and energy on tedious syntax. - -Sure, you might have to frontload some of your cognitive energy to create a good abstraction. But in the long run, it'll save you a lot of headaches. Especially if you're building something complex and one typo can be your undoing. - -### Create more consistent definitions - -Let's go back to what Andy and Dave said in _The Pragmatic Programmer_: “Every piece of knowledge must have a single, unambiguous, authoritative representation within a system.” As a data person, the words “single” and “unambiguous” might have stood out to you. - -Most teams have essential business logic that defines the successes and failures of a business. For a subscription-based DTC company, this could be [monthly recurring revenue (MRR)](https://www.getdbt.com/blog/modeling-subscription-revenue/) and for a SaaS product, this could look like customer lifetime value (CLV). Standardizing the SQL that generates those metrics is essential to creating consistent definitions and values. 
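As a hypothetical sketch, that standardization might look like a single model that every downstream report selects from, rather than each report re-deriving the metric on its own (the model and column names below are made up for illustration):

```sql
-- Hypothetical single source of truth for monthly recurring revenue (MRR);
-- the model and column names are illustrative
select
    date_trunc('month', billing_date) as revenue_month,
    sum(amount) as mrr
from {{ ref('recurring_charges') }}
group by 1
```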
By writing DRY definitions for key business logic and metrics that are referenced throughout a dbt project and/or BI (business intelligence) tool, data teams can create those single, unambiguous, and authoritative representations for their essential transformations. Gone are the days of 15 different definitions and values for churn, and in are the days of standardization and DRYness.

:::important dbt Semantic Layer, powered by MetricFlow

The [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), powered by [MetricFlow](/docs/build/about-metricflow), simplifies the process of defining and using critical business metrics, like revenue, in the modeling layer (your dbt project). By centralizing metric definitions, data teams can ensure consistent self-service access to these metrics in downstream data tools and applications. The dbt Semantic Layer eliminates duplicate coding by allowing data teams to define metrics on top of existing models and automatically handles data joins.

:::

## Tools to help you write DRY code

Let's just say it: Writing DRY code is easier said than done. For classical software engineers, there's a ton of resources out there to help them write DRY code. In the world of data transformation, there are also some tools and methodologies that can help folks in [the field of analytics engineering](https://www.getdbt.com/what-is-analytics-engineering/) write more DRY and [modular code](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/).

### Common Table Expressions (CTEs)

CTEs are a great way to help you write more DRY code in your data analysis and dbt models. In a formal sense, a CTE is a temporary result set that can be used in a query. In a much more human and practical sense, we like to think of CTEs as separate, smaller queries within the larger query you're building up. Essentially, you can use CTEs to break up complex queries into simpler blocks of code that are easier to debug and can connect and build off of each other.

If you're referencing a specific query, perhaps for aggregations that join back to an unaggregated view, the CTE can simply be referenced throughout a query by its CTE_EXPRESSION_NAME.

### View materializations

View [materializations](https://docs.getdbt.com/docs/build/materializations) are also extremely useful for abstracting code that might otherwise be repeated often. A view is a defined passthrough SQL query that can be run against a database. Unlike a table, it doesn't store data, but it defines the logic that you need to use to fetch the underlying data.

If you're referencing the same query, CTE, or block of code, throughout multiple data models, that's probably a good sign that the code should be its own view.

For example, you might define a SQL view to count new users created in a day:

```sql
select
    created_date,
    count(distinct(user_id)) as new_users
from {{ ref('users') }}
group by created_date
```

While this is a simple query, writing this logic every time you need it would be super tedious. And what if the `user_id` field changed to a new name? If you'd written this in a WET way, you'd have to find every instance of this code and make the change to the new field versus just updating it once in the code for the view.

To make any subsequent references to this view DRY-er, you simply reference the view in your data model or query.
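For example, assuming the view above were saved as a model called `new_users_daily` (a hypothetical name), a downstream query would reference it instead of repeating the aggregation logic:

```sql
-- Reference the existing view instead of rewriting the aggregation
select
    created_date,
    new_users
from {{ ref('new_users_daily') }}
```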
### dbt macros and packages

dbt also supports the use of [macros](/docs/build/jinja-macros) and [packages](https://docs.getdbt.com/docs/build/packages) to help data folks write DRY code in their dbt projects. Macros are Jinja-supported functions that can be reused and applied throughout a dbt project. Packages are libraries of dbt code, typically models, macros, and/or tests, that can be referenced and used in a dbt project. They are a great way to use transformations for common data sources (like [ad platforms](https://hub.getdbt.com/dbt-labs/facebook_ads/latest/)) or use more [custom tests for your data models](https://hub.getdbt.com/calogica/dbt_expectations/0.1.2/) _without having to write out the code yourself_. At the end of the day, is there really anything more DRY than that?

## Conclusion

DRY code is a principle that you should always be striving for. It saves you time and energy. It makes your code more maintainable and extensible. And potentially most importantly, it's the fine line that can help transform you from a good analytics engineer to a great one.

## Further reading

* [Data modeling technique for more modularity](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/)
* [Why we use so many CTEs](https://docs.getdbt.com/docs/best-practices)
* [Glossary: CTE](https://docs.getdbt.com/terms/cte)
* [Glossary: Materialization](https://docs.getdbt.com/terms/materialization)
* [Glossary: View](https://docs.getdbt.com/terms/view)
diff --git a/website/docs/terms/edw.md b/website/docs/terms/edw.md
deleted file mode 100644
index 1ac0f37ee47..00000000000
--- a/website/docs/terms/edw.md
+++ /dev/null
@@ -1,63 +0,0 @@
---
id: edw
title: EDW
description: The primary difference between an EDW and a regular data warehouse is, well, semantics and perspective. An EDW, like any other data warehouse, is a collection of databases that centralize a business's data
displayText: EDW
hoverSnippet: An Enterprise Data Warehouse (EDW), like any other data warehouse, is a collection of databases that centralize a business's information from multiple sources and applications.
---

What does an EDW (Enterprise Data Warehouse) really mean?

An Enterprise Data Warehouse (EDW), like any other data warehouse, is a collection of databases that centralize a business's information from multiple sources and applications. The primary difference between an EDW and a regular data warehouse is, well, semantics and perspective.

The data stored in an EDW comes from all different functions of a company—marketing, finance, engineering, product, and more. The primary goal of an EDW is to provide a central and organized home for both the raw and transformed version of this data. EDWs in modern data practices are typically set up in the cloud, meaning that the servers used to run the warehouse are owned and managed by a cloud provider such as Snowflake, AWS Redshift, or Google BigQuery.

## Data warehouse vs enterprise data warehouse

![](/img/docs/terms/edw/meme.png)

In an EDW, all departments of an organization store their raw and transformed data in databases within a main warehouse. For organizations that are not calling their data warehouse an EDW and have a more siloed setup, there's a chance each department has *their own separate* data warehouse for storage and computation.
**But practically, the difference between a data warehouse and an enterprise data warehouse is semantics.** - -Organization size, distribution, data complexity, and business needs can all determine whether a company wants a centralized data warehouse or distributed warehouses per function. Nonetheless, if your organization only has one data warehouse that centrally houses all of your data sources, the distinction isn't really necessary, but *technically* that could be called an EDW. - -In the world of analytics engineering, most teams have one central data warehouse that houses data from all of their different departments and functions. - - -### Why is this distinction necessary? - -One of the main distinctions is in an organization’s users and distribution. If an organization has multiple databases, a central data warehouse is used to create separate entities between raw and source data, staging work, and ready-for-use analytics datasets. In this EDW and classic data warehouse setup, data is accessible across an organization, data teams can create tables that join data from multiple sources, and users can gain enriched perspectives into their data. - -If a company has very siloed departments that manage their own data, budgets, and have little need for crossover with other departments or data sources, emphasizing the difference between a central EDW data warehouse and their own data warehouse could be a necessity for budgeting and governance reasons. - -Lastly, the somewhat exponential adoption of cloud data warehouses in the last decade has shifted the terminology from what many people called an EDW to a data warehouse. - - -## Enterprise data warehouse use cases - -There are a variety of reasons why an organization might opt to have an EDW or data warehouse. A centralized and organized data warehouse provide advantages for the following use cases: - -- Create clear partitions between raw, staging, and heavily transformed data -- Standardize data definitions and metrics across multiple data sources -- Connect a BI tool to one central data warehouse and surface that data to users across a business - -### Benefits of an EDW - -Like most other data warehouses, the benefit of an EDW is the ability to store raw and transformed data from multiple sources in one single data warehouse. Users across different departments and data team members embedded in different functions can all have access to the same data. Cloud data warehouses also scale with data and users, making EDWs an appropriate place for organizations to grow their analytics work. - -EDWs also help in building a 360-degree view of the company by combining different sources of information, such as customer feedback, financial records, product inventory, and marketing insights. All of this information can then be organized in data marts, schemas, and tables within one EDW that are eventually exposed to a BI tool. - -In addition, because all of an organization’s data is stored in one place, data teams can provide access to only those who need access to specific schemas and tables. Keeping these access patterns and changes in only one data warehouse will limit the amount of data needed to go through for auditing and other security regulations. - -## Conclusion - -An enterprise data warehouse is, in general, like any other data warehouse; it acts as a central home for multiple departments’ raw and transformed data. An EDW is often composed of multiple databases to store raw, staging, development, and production-ready data. 
The primary benefits for an EDW are centralization, standardization, and accessibility. You probably have a data warehouse set up like an EDW; you're likely just not calling it that 😉

## Additional reading
EDW, data warehouse, or something different altogether? Check out some of our favorite resources on the fundamentals of data storage and organization:

- [Glossary: Dimensional modeling](https://docs.getdbt.com/terms/dimensional-modeling)
- [Glossary: Data warehouse](https://docs.getdbt.com/terms/data-warehouse)
\ No newline at end of file
diff --git a/website/docs/terms/elt.md b/website/docs/terms/elt.md
deleted file mode 100644
index 59cfc77778c..00000000000
--- a/website/docs/terms/elt.md
+++ /dev/null
@@ -1,139 +0,0 @@
---
id: elt
title: What is ELT (Extract, Load, Transform)?
description: ELT is the process of first extracting data from different sources, then loading it into a data warehouse, and finally transforming it.
displayText: ELT
hoverSnippet: Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, loading it into a target data warehouse, and finally transforming it.
---

What is ELT (Extract, Load, Transform)? How does it differ from ETL?

Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, then loading it into a target data warehouse, and finally transforming it.

ELT has emerged as a paradigm for how to manage information flows in a modern data warehouse. This represents a fundamental shift from how data was previously handled when Extract, Transform, Load (ETL) was the data workflow most companies implemented.

Transitioning from ETL to ELT means that you no longer have to capture your transformations during the initial loading of the data into your data warehouse. Rather, you are able to load all of your data, then build transformations on top of it. Data teams report that the ELT workflow has several advantages over the traditional ETL workflow, which we'll go over [in-depth later in this glossary](#benefits-of-elt).

## How ELT works

In an ELT process, data is extracted from data sources, loaded into a target data platform, and finally transformed for analytics use. We'll go over the three components (extract, load, transform) in detail here.

![Diagram depicting the ELT workflow. Data is depicted being extracted from example data sources like an Email CRM, Facebook Ads platform, Backend databases, and Netsuite. The data is then loaded as raw data into a data warehouse. From there, the data is transformed within the warehouse by renaming, casting, joining, or enriching the raw data. The result is then modeled data inside your data warehouse.](/img/docs/terms/elt/elt-diagram.png)

### Extract

In the extraction process, data is extracted from multiple data sources. The data extracted is, for the most part, data that teams eventually want to use for analytics work. Some examples of data sources can include:

- Backend application databases
- Marketing platforms
- Email and sales CRMs
- and more!

Accessing these data sources using Application Programming Interface (API) calls can be a challenge for individuals and teams who don't have the technical expertise or resources to create their own scripts and automated processes. However, the recent development of certain open-source and Software as a Service (SaaS) products has removed the need for this custom development work.
By establishing the option to create and manage pipelines in an automated way, you can extract the data from data sources and load it into data warehouses via a user interface. - -Since not every data source will integrate with SaaS tools for extraction and loading, it’s sometimes inevitable that teams will write custom ingestion scripts in addition to their SaaS tools. - -### Load - -During the loading stage, data that was extracted is loaded into the target data warehouse. Some examples of modern data warehouses include Snowflake, Amazon Redshift, and Google BigQuery. Examples of other data storage platforms include data lakes such as Databricks’s Data Lakes. Most of the SaaS applications that extract data from your data sources will also load it into your target data warehouse. Custom or in-house extraction and load processes usually require strong data engineering and technical skills. - -At this point in the ELT process, the data is mostly unchanged from its point of extraction. If you use an extraction and loading tool like Fivetran, there may have been some light normalization on your data. But for all intents and purposes, the data loaded into your data warehouse at this stage is in its raw format. - -### Transform - -In the final transformation step, the raw data that has been loaded into your data warehouse is finally ready for modeling! When you first look at this data, you may notice a few things about it… - -- Column names may or may not be clear -- Some columns are potentially the incorrect data type -- Tables are not joined to other tables -- Timestamps may be in the incorrect timezone for your reporting -- fields may need to be unnested -- Tables may be missing primary keys -- And more! - -...hence the need for transformation! During the transformation process, data from your data sources is usually: - -- **Lightly Transformed**: Fields are cast correctly, timestamp fields’ timezones are made uniform, tables and fields are renamed appropriately, and more. -- **Heavily Transformed**: Business logic is added, appropriate materializations are established, data is joined together, etc. -- **QA’d**: Data is tested according to business standards. In this step, data teams may ensure primary keys are unique, model relations match-up, column values are appropriate, and more. - -Common ways to transform your data include leveraging modern technologies such as dbt, writing custom SQL scripts that are automated by a scheduler, utilizing stored procedures, and more. - -## ELT vs ETL - -The primary difference between the traditional ETL and the modern ELT workflow is when [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) and loading take place. In ETL workflows, data extracted from data sources is transformed prior to being loaded into target data platforms. Newer ELT workflows have data being transformed after being loaded into the data platform of choice. Why is this such a big deal? - -| | ELT | ETL | -|---|---|---| -| Programming skills required| Often little to no code to extract and load data into your data warehouse. | Often requires custom scripts or considerable data engineering lift to extract and transform data prior to load. | -| Separation of concerns | Extraction, load, and transformation layers can be explicitly separated out by different products. | ETL processes are often encapsulated in one product. | -| Distribution of transformations | Since transformations take place last, there is greater flexibility in the modeling process. 
Worry first about getting your data in one place, then you have time to explore the data to understand the best way to transform it. | Because transformation occurs before data is loaded into the target location, teams must conduct thorough work prior to make sure data is transformed properly. Heavy transformations often take place downstream in the BI layer. | -| [Data team distribution](https://www.getdbt.com/data-teams/analytics-job-descriptions/) | ELT workflows empower data team members who know SQL to create their own extraction and loading pipelines and transformations. | ETL workflows often require teams with greater technical skill to create and maintain pipelines. | - -Why has ELT adoption grown so quickly in recent years? A few reasons: - -- **The abundance of cheap cloud storage with modern data warehouses.** The creation of modern data warehouses such Redshift and Snowflake has made it so teams of all sizes can store and scale their data at a more efficient cost. This was a huge enabler for the ELT workflow. -- **The development of low-code or no-code data extractors and loaders.** Products that require little technical expertise such as Fivetran and Stitch, which can extract data from many data sources and load it into many different data warehouses, have helped lower the barrier of entry to the ELT workflow. Data teams can now relieve some of the data engineering lift needed to extract data and create complex transformations. -- **A true code-based, version-controlled transformation layer with the development of dbt.** Prior to the development of dbt, there was no singular transformation layer product. dbt helps data analysts apply software engineering best practices (version control, CI/CD, and testing) to data transformation, ultimately allowing for anyone who knows SQL to be a part of the ELT process. -- **Increased compatibility between ELT layers and technology in recent years.** With the expansion of extraction, loading, and transformation layers that integrate closely together and with cloud storage, the ELT workflow has never been more accessible. For example, Fivetran creates and maintains [dbt packages](https://hub.getdbt.com/) to help write dbt transformations for the data sources they connect to. - -## Benefits of ELT - -You often hear about the benefits of the ELT workflow to data, but you can sometimes forget to talk about the benefits it brings to people. There are a variety of benefits that this workflow brings to the actual data (which we’ll outline in detail below), such as the ability to recreate historical transformations, test data and data models, and more. We'll also want to use this section to emphasize the empowerment the ELT workflow brings to both data team members and business stakeholders. - -### ELT benefit #1: Data as code - -Ok we said it earlier: The ELT workflow allows data teams to function like software engineers. But what does this really mean? How does it actually impact your data? - -#### Analytics code can now follow the same best practices as software code - -At its core, data transformations that occur last in a data pipeline allow for code-based and version-controlled transformations. These two factors alone permit data team members to: - -- Easily recreate historical transformations by rolling back commits -- Establish code-based tests -- Implement CI/CD workflows -- Document data models like typical software code. 
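To make the "code-based tests" point in the list above a bit more tangible, here's a minimal, hypothetical example of a singular test: a query that returns any rows violating an assumption, so a run can fail when the result set isn't empty (the model and column names are illustrative):

```sql
-- Hypothetical singular test: return any rows that violate the assumption,
-- so the run fails when the result set is not empty
select
    order_id,
    amount
from {{ ref('orders') }}
where amount < 0
```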
- -#### Scaling, made sustainable - -As your business grows, the number of data sources correspondingly increases along with it. As such, so do the number of transformations and models needed for your business. Managing a high number of transformations without version control or automation is not scalable. - -The ELT workflow capitalizes on transformations occurring last to provide flexibility and software engineering best practices to data transformation. Instead of having to worry about how your extraction scripts scale as your data increases, data can be extracted and loaded automatically with a few clicks. - -### ELT benefit #2: Bring the power to the people - -The ELT workflow opens up a world of opportunity for the people that work on that data, not just the data itself. - -#### Empowers data team members - -Data analysts, analytics engineers, and even data scientists no longer have to be dependent on data engineers to create custom pipelines and models. Instead, they can use point-and-click products such as Fivetran and Airbyte to extract and load the data for them. - -Having the transformation as the final step in the ELT workflow also allows data folks to leverage their understanding of the data and SQL to focus more on actually modeling the data. - -#### Promotes greater transparency for end busines users - -Data teams can expose the version-controlled code used to transform data for analytics to end business users by no longer having transformations hidden in the ETL process. Instead of having to manually respond to the common question, “How is this data generated?” data folks can direct business users to documentation and repositories. Having end business users involved or viewing the data transformations promote greater collaboration and awareness between business and data folks. - -## ELT tools - -As mentioned earlier, the recent development of certain technologies and products has helped lower the barrier of entry to implementing the ELT workflow. Most of these new products act as one or two parts of the ELT process, but some have crossover across all three parts. We’ll outline some of the current tools in the ELT ecosystem below. - -| Product | E/L/T? | Description | Open source option? | -|---|---|---|---| -| Fivetran/HVR | E, some T, L | Fivetran is a SaaS company that helps data teams extract, load, and perform some transformation on their data. Fivetran easily integrates with modern data warehouses and dbt. They also offer transformations that leverage dbt Core. | :x: | -| Stitch by Talend | E, L | Stitch (part of Talend) is another SaaS product that has many data connectors to extract data and load it into data warehouses. | :x: | -| Airbyte | E, L | Airbyte is an open-source and cloud service that allows teams to create data extraction and load pipelines. | :white_check_mark: | -| Funnel | E, some T, L | Funnel is another product that can extract and load data. Funnel’s data connectors are primarily focused around marketing data sources. | :x: | -| dbt | T | dbt is the transformation tool that enables data analysts and engineers to transform, test, and document data in the cloud data warehouse. dbt offers both an open-source and cloud-based product. | :white_check_mark: | - -## Conclusion - -The past few years have been a whirlwind for the data world. 
The increased accessibility and affordability of cloud warehouses, no-code data extractors and loaders, and a true transformation layer with dbt has allowed for the ELT workflow to become the preferred analytics workflow. ETL predates ELT and differs in when data is transformed. In both processes, data is first extracted from different sources. However, in ELT processes, data is loaded into the target data platform and then transformed. The ELT workflow ultimately allows for data team members to extract, load, and model their own data in a flexible, accessible, and scalable way. - -## Further reading - -Here's some of our favorite content about the ELT workflow: - -- [The case for the ELT workflow](https://www.getdbt.com/analytics-engineering/case-for-elt-workflow/) -- [A love letter to ETL tools](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/) -- [What is dbt?](https://getdbt.com/product/what-is-dbt/) \ No newline at end of file diff --git a/website/docs/terms/etl.md b/website/docs/terms/etl.md deleted file mode 100644 index 321f59a65d0..00000000000 --- a/website/docs/terms/etl.md +++ /dev/null @@ -1,130 +0,0 @@ ---- -id: etl -title: What is ETL (Extract, Transform, Load)? -description: ETL is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. -displayText: ETL -hoverSnippet: Extract, Transform, Load (ETL) is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. ---- - - - What is ETL (Extract, Transform, Load)? How has it evolved? - - -ETL, or “Extract, Transform, Load”, is the process of first extracting data from a data source, transforming it, and then loading it into a target . In ETL workflows, much of the meaningful [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) occurs outside this primary pipeline in a downstream business intelligence (BI) platform. - -ETL is contrasted with the newer (Extract, Load, Transform) workflow, where transformation occurs after data has been loaded into the target data warehouse. In many ways, the ETL workflow could have been renamed the ETLT workflow, because a considerable portion of meaningful data transformations happen outside the data pipeline. The same transformations can occur in both ETL and ELT workflows, the primary difference is *when* (inside or outside the primary ETL workflow) and *where* the data is transformed (ETL platform/BI tool/data warehouse). - -It’s important to talk about ETL and understand how it works, where it provides value, and how it can hold people back. If you don’t talk about the benefits and drawbacks of systems, how can you expect to improve them? - -## How ETL works - -In an ETL process, data is first extracted from a source, transformed, and then loaded into a target data platform. We’ll go into greater depth for all three steps below. - -![A diagram depicting the ETL workflow. The diagram starts by depicting raw data being extracted from various example data sources like an email CRM, Facebook Ads platform, a backend database, and Netsuite. Once the data is extracted, the raw data is transformed within the data pipeline via renaming, casting, joining, and enriching. After the data is transformed within the data pipeline, the modeled data is loaded into a data warehouse.](/img/docs/terms/etl/etl-diagram.png) - -### Extract - -In this first step, data is extracted from different data sources. 
Data that is extracted at this stage is likely going to be eventually used by end business users to make decisions. Some examples of these data sources include: - -- Ad platforms (Facebook Ads, Google Ads, etc.) -- Backend application databases -- Sales CRMs -- And more! - -To actually get this data, data engineers may write custom scripts that make Application Programming Interface (API) calls to extract all the relevant data. Because making and automating these API calls gets harder as data sources and data volume grows, this method of extraction often requires strong technical skills. In addition, these extraction scripts also involve considerable maintenance since APIs change relatively often. Data engineers are often incredibly competent at using different programming languages such as Python and Java. Data teams can also extract from these data sources with open source and Software as a Service (SaaS) products. - -### Transform - -At this stage, the raw data that has been extracted is normalized and modeled. In ETL workflows, much of the actual meaningful business logic, metric calculations, and entity joins tend to happen further down in a downstream BI platform. As a result, the transformation stage here is focused on data cleanup and normalization – renaming of columns, correct casting of fields, timestamp conversions. - -To actually transform the data, there’s two primary methods teams will use: - -- **Custom solutions**: In this solution, data teams (typically data engineers on the team), will write custom scripts and create automated pipelines to transform the data. Unlike ELT transformations that typically use SQL for modeling, ETL transformations are often written in other programming languages such as Python or Scala. Data engineers may leverage technologies such as Apache Spark or Hadoop at this point to help process large volumes of data. -- **ETL products**: There are ETL products that will extract, transform, and load your data in one platform. [These tools](#etl-tools) often involve little to no code and instead use Graphical User Interfaces (GUI) to create pipelines and transformations. - -### Load - -In the final stage, the transformed data is loaded into your target data warehouse. Once this transformed data is in its final destination, it’s most commonly exposed to end business users either in a BI tool or in the data warehouse directly. - -The ETL workflow implies that your raw data does not live in your data warehouse. *Because transformations occur before load, only transformed data lives in your data warehouse in the ETL process.* This can make it harder to ensure that transformations are performing the correct functionality. - -## How ETL is being used - -While ELT adoption is growing, we still see ETL use cases for processing large volumes of data and adhering to strong data governance principles. - -### ETL to efficiently normalize large volumes of data - -ETL can be an efficient way to perform simple normalizations across large data sets. Doing these lighter transformations across a large volume of data during loading can help get the data formatted properly and quickly for downstream use. In addition, end business users sometimes need quick access to raw or somewhat normalized data. Through an ETL workflow, data teams can conduct lightweight transformations on data sources and quickly expose them in their target data warehouse and downstream BI tool. 
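As a rough illustration of what these lightweight normalizations tend to look like in SQL (the table and column names here are invented), the work is mostly renaming, casting, and timestamp conversion rather than heavy business logic:

```sql
-- Illustrative cleanup-style transformation: rename, cast, and standardize types
-- before the data is exposed to end users.
select
    id                              as order_id,      -- rename to something descriptive
    cast(amount as numeric(12, 2))  as order_amount,  -- enforce a consistent numeric type
    lower(status)                   as order_status,  -- normalize casing
    cast(created_at as timestamp)   as ordered_at     -- standardize the timestamp type
from raw_orders
```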
- -### ETL for hashing PII prior to load - -Some companies will want to mask, hash, or remove PII values before it enters their data warehouse. In an ETL workflow, teams can transform PII to hashed values or remove them completely during the loading process. This limits where PII is available or accessible in an organization’s data warehouse. - -## ETL challenges - -There are reasons ETL has persisted as a workflow for over twenty years. However, there are also reasons why there’s been such immense innovation in this part of the data world in the past decade. From our perspective, the technical and human limitations we describe below are some of the reasons ELT has surpassed ETL as the preferred workflow. - -### ETL challenge #1: Technical limitations - -**Limited or lack of version control** - -When transformations exist as standalone scripts or deeply woven in ETL products, it can be hard to version control the transformations. Not having version control on transformation as code means that data teams can’t easily recreate or rollback historical transformations and perform code reviews. - -**Immense amount of business logic living in BI tools** - -Some teams with ETL workflows only implement much of their business logic in their BI platform versus earlier in their transformation phase. While most organizations have some business logic in their BI tools, an excess of this logic downstream can make rendering data in the BI tool incredibly slow and potentially hard to track if the code in the BI tool is not version controlled or exposed in documentation. - -**Challenging QA processes** - -While data quality testing can be done in ETL processes, not having the raw data living somewhere in the data warehouse inevitably makes it harder to ensure data models are performing the correct functionality. In addition, quality control continually gets harder as the number of data sources and pipelines within your system grows. - -### ETL challenge #2: Human limitations - -**Data analysts can be excluded from ETL work** - -Because ETL workflows often involve incredibly technical processes, they've restricted data analysts from being involved in the data workflow process. One of the greatest strengths of data analysts is their knowledge of the data and SQL, and when extractions and transformations involve unfamiliar code or applications, they and their expertise can be left out of the process. Data analysts and scientists also become dependent on other people to create the schemas, tables, and datasets they need for their work. - -**Business users are kept in the dark** - -Transformations and business logic can often be buried deep in custom scripts, ETL tools, and BI platforms. At the end of the day, this can hurt business users: They're kept out of the data modeling process and have limited views into how data transformation takes place. As a result, end business users often have little clarity on data definition, quality, and freshness, which ultimately can decrease trust in the data and data team. - -## ETL vs ELT - -You may read other articles or technical documents that use ETL and ELT interchangeably. On paper, the only difference is the order in which the T and the L appear. However, this mere switching of letters dramatically changes the way data exists in and flows through a business’ system. - -In both processes, data from different data sources is extracted in similar ways. However, in ELT, data is then directly loaded into the target data platform versus being transformed in ETL. 
Now, via ELT workflows, both raw and transformed data can live in a data warehouse. In ELT workflows, data folks have the flexibility to model the data after they’ve had the opportunity to explore and analyze the raw data. ETL workflows can be more constraining since transformations happen immediately after extraction. We break down some of the other major differences between the two below:
-
-| | ELT | ETL |
-|---|---|---|
-| Programming skills required | Often requires little to no code to extract and load data into your data warehouse. | Often requires custom scripts or considerable data engineering lift to extract and transform data prior to load. |
-| Separation of concerns | Extraction, load, and transformation layers can be explicitly separated out by different products. | ETL processes are often encapsulated in one product. |
-| Distribution of transformations | Since transformations take place last, there is greater flexibility in the modeling process. Worry first about getting your data in one place; then you have time to explore the data and understand the best way to transform it. | Because transformation occurs before data is loaded into the target location, teams must conduct thorough work beforehand to make sure data is transformed properly. Heavy transformations often take place downstream in the BI layer. |
-| [Data team roles](https://www.getdbt.com/data-teams/analytics-job-descriptions/) | ELT workflows empower data team members who know SQL to create their own extraction and loading pipelines and transformations. | ETL workflows often require teams with greater technical skill to create and maintain pipelines. |
-
-While ELT is growing in adoption, it’s still important to talk about when ETL might be appropriate and where you'll see challenges with the ETL workflow.
-
-## ETL tools
-
-A variety of ETL technologies exist to help teams get data into their data warehouse. A good portion of the ETL tools on the market today are geared toward enterprise businesses and teams, but some are also applicable for smaller organizations.
-
-| Platform | E/T/L? | Description | Open source option? |
-|---|---|---|---|
-| Informatica | E, T, L | An all-purpose ETL platform that supports low- or no-code extraction, transformations, and loading. Informatica also offers a broad suite of data management solutions beyond ETL and is often leveraged by enterprise organizations. | :x: |
-| Integrate.io | E, T, L | A newer ETL product focused on both low-code ETL and reverse ETL pipelines. | :x: |
-| Matillion | E, T, L | Matillion is an end-to-end ETL solution with a variety of native data connectors and GUI-based transformations. | :x: |
-| Microsoft SSIS | E, T, L | Microsoft’s SQL Server Integration Services (SSIS) offers a robust, GUI-based platform for ETL services. SSIS is often used by larger enterprise teams. | :x: |
-| Talend Open Studio | E, T, L | An open source suite of GUI-based ETL tools. | :white_check_mark: |
-
-## Conclusion
-
-ETL, or “Extract, Transform, Load,” is the process of extracting data from different data sources, transforming it, and loading that transformed data into a data warehouse. ETL typically keeps transformations light prior to loading and leaves the more meaningful transformations to downstream BI tools. We’re seeing now that ETL is fading out and the newer ELT workflow is replacing it as a practice for many data teams.
However, it’s important to note that ETL got us to where we are today: capable of building workflows that extract data within simple UIs, store data in scalable cloud data warehouses, and write data transformations like software engineers.
-
-## Further Reading
-
-Please check out some of our favorite reads regarding ETL and ELT below:
-
-- [Glossary: ELT](https://docs.getdbt.com/terms/elt)
-- [The case for the ELT workflow](https://www.getdbt.com/analytics-engineering/case-for-elt-workflow/)
-- [A love letter to ETL tools](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/)
-- [Reverse ETL](https://www.getdbt.com/analytics-engineering/use-cases/operational-analytics/)
-
diff --git a/website/docs/terms/grain.md b/website/docs/terms/grain.md
deleted file mode 100644
index 608a5c6391d..00000000000
--- a/website/docs/terms/grain.md
+++ /dev/null
@@ -1,36 +0,0 @@
----
-id: grain
-title: Data grain
-description: Grain is the combination of columns at which records in a table are unique. Ideally, this is captured in a single column or a unique primary key.
-displayText: grain
-hoverSnippet: Your data’s grain is the combination of columns at which records in a table are unique. Ideally, this is captured in a single column and a unique primary key.
----
-
- Data grain: What granularity means in terms of data modeling
-
-Grain is the combination of columns at which records in a table are unique. Ideally, this is captured in a single column, a unique primary key, but even then, there is descriptive grain behind that unique id. Let’s look at some examples to better understand this concept.
-
-| user_id | address |
-| --- | --- |
-| 1 | 123 Jaffle Ln |
-| 2 | 456 Waffle St |
-| 3 | 789 Raffle Rd |
-
-In the above table, each `user_id` is unique. This table is at the *user* *grain*.
-
-| user_id | address |
-| --- | --- |
-| 1 | 123 Jaffle Ln |
-| 1 | 420 Jaffle Ln |
-| 2 | 456 Waffle St |
-| 3 | 789 Raffle Rd |
-
-In the above table, `user_id` is no longer unique. The combination of `user_id` and `address` creates a unique combination, so this table is at the *user* *address* *grain*. We generally describe the grain conceptually based on the names of the columns that make it unique. A more realistic combination you might see in the wild would be a table that captures the state of all users at midnight every day. The combination of the captured `updated_date` and `user_id` would be unique, meaning our table is at *user per day* grain.
-
-In both examples listed in the previous paragraph, we’d want to create a surrogate key of some sort from the combination of columns that comprise the grain. This gives our table a primary key, which is crucial for testing and optimization, and always a best practice. Typically, we’ll name this primary key based on the verbal description of the grain. For the latter example, we’d have `user_per_day_id`. This will be more solid and efficient to test than repeatedly relying on the combination of those two columns.
-
-Thinking deeply about grain is a crucial part of data modeling. As we design models, we need to consider the entities we’re describing, and what dimensions (time, attributes, etc.) might fan those entities out so they’re no longer unique, as well as how we want to deal with those. Do we need to apply transformations to deduplicate and collapse the grain? Or do we intentionally want to expand the grain out, like in our *user per day* example?
-
-There’s no right answer here; we have the power to do either as it meets our needs.
The key is just to make sure we have a clear sense of our grain for every model we create, that we’ve captured it in a primary key, and that we’re applying tests to ensure that our primary key column is unique and not null. \ No newline at end of file diff --git a/website/docs/terms/idempotent.md b/website/docs/terms/idempotent.md deleted file mode 100644 index ea3ef0a099b..00000000000 --- a/website/docs/terms/idempotent.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -id: idempotent -title: Idempotent -description: Idempotent is an adjective to describe a process that gives you the same result no matter how many times you run it. -displayText: idempotent -hoverSnippet: Idempotent describes a process that gives you the same result no matter how many times you run it. ---- - - - What is idempotency and why is the concept important in data? - - -Idempotent is an adjective to describe a process that gives you the same result no matter how many times you run it. - -For a mathematical example, adding 1 changes the results, but multiplying by 1 is idempotent. When you add 1 to a number and then add 1 again, you get different results. If you multiply a number by 1 and multiply by 1 again, you do get the same result. - -A more real-world example of idempotency is the process of saving a file in a word processor. Given the same inputs (i.e. the same document contents), clicking "_Save_" one time will leave your system in the exact same state as clicking "_Save_" five times in a row. - -A non-idempotent version of the "_Save_" button might do something like "Append the paragraph I just wrote to the end of the file". Doing _that_ five times in a row will _not_ leave you in the same state as doing it one time; your most recent paragraph would have duplicates. - -If word processors only gave us non-idempotent "Append paragraph" / "Update paragraph" / "Delete paragraph" operations, then saving our document changes would be a lot more difficult! We'd have to keep track of which paragraphs we previously saved, and either make sure to not save them again or have a process in place to regularly clean up duplicate paragraphs. The implementation of the "_Save_" button in word processors takes the collection of low-level non-idempotent filesystem operations (read/append/overwrite/delete), and systematically runs them in a certain order so that the _user_ doesn't have to deal with the non-idempotency. The user can just focus on writing -- choosing words, editing for clarity, ensuring paragraphs aren't too long, etc. -- and the word processor deals with making sure the words get persisted properly to disk. - -This word processing analogy is very similar to what dbt does for [data transformation](https://www.getdbt.com/analytics-engineering/transformation/): it takes the collection of low-level non-idempotent database operations (`SELECT`/`INSERT`/`UPDATE`/`DELETE` -- collectively known as DML statements), and systematically runs them in a certain order so that analytics engineers don't have to deal with non-idempotency. We can just focus on the data -- [choosing good model and column names](https://docs.getdbt.com/blog/on-the-importance-of-naming), [documenting them](/community/resources/viewpoint#documentation), [ensuring data consumers can understand them](https://docs.getdbt.com/docs/best-practices#consider-the-information-architecture-of-your-data-warehouse), etc. -- and [`dbt run`](https://docs.getdbt.com/reference/commands/run) will make sure the database ends up in the right state. 
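To sketch the difference in SQL terms (the object names are made up, and syntax varies slightly by warehouse): an `INSERT` that appends rows is not idempotent, because running it twice duplicates data, while rebuilding a table from its sources with a `CREATE OR REPLACE … AS` statement, roughly the pattern dbt’s table materialization follows, leaves the warehouse in the same state no matter how many times it runs.

```sql
-- Not idempotent: every run appends the same rows again.
insert into analytics.orders
select * from raw.jaffle_shop.orders;

-- Idempotent: every run rebuilds the table from scratch, so running it once
-- or fifty times leaves analytics.orders in the same end state.
create or replace table analytics.orders as
select * from raw.jaffle_shop.orders;
```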
diff --git a/website/docs/terms/json.md b/website/docs/terms/json.md deleted file mode 100644 index 652fb58cbe3..00000000000 --- a/website/docs/terms/json.md +++ /dev/null @@ -1,103 +0,0 @@ ---- -id: json -title: JSON -description: JSON (JavaScript Object Notation) is a minimal format for semi-structured data used to capture relationships between fields and values. -displayText: JSON -hoverSnippet: JSON (JavaScript Object Notation) is a minimal format for semi-structured data used to capture relationships between fields and values. ---- - -JSON stands for JavaScript Object Notation. JSON is a minimal format which is great for processing data for applications. It can capture many types of relationships in a concise format and is a commonly used format for semi-structured data. The tables in your contain structured data (as opposed to semi-structured) where for each row, each field typically contains one value. Structured data, or tabular data, is intuitive and easy to read, but semi-structured data offers more flexibility. - -Let’s talk through what that looks like in practice so you can get a better sense of what we mean. - -## JSON syntax example - -When looking at data formatted in JSON, we say that the data is stored in **JSON objects**. These are composed of key-value pairs. JSON objects are enclosed in curly brackets (`{ }`) and each key-value pair is separated by a comma. Here’s an example: - -```json -order = {"customer_id":2947, "order_id":4923, "order_items":"cheesecake"} -``` - -`order` is the JSON object. `"customer_id":2947` is one of the key-value pairs within this JSON object. - -If I wanted to find the `customer_id`, I could return that value with `order["customer_id"]` or `order.customer_id`. It’s easy for us to simply read the `customer_id` just by looking at the JSON object in this example, but what if your JSON object contains hundreds of key-value pairs or complex nesting? Being aware of how to pull information out of JSON is essential if you’re working with it in the wild. - -A key feature of JSON is that it can contain data types that aren’t normally found in relational databases, namely **dictionaries** and **arrays**. Let’s break down what that means and then we’ll look at an example to pull everything together. - -### Dictionaries and arrays in JSON - -JSON inherits its syntax from JavaScript (JS) so dictionaries and arrays are formatted in the same way as they are in JS. Dictionaries are formatted just like JSON objects and consist of key-value pairs. Arrays are lists of values and they’re enclosed in square brackets (`[ ]`) and each value is separated by a comma, like so: - -```json -menu_items = ["cheesecake", "danish", "coffee"] -``` - -Individual values from an array can be called by referencing the location of a value within the array. Arrays are zero-indexed which means that the first item is at position 0 and we count up from there. - -- `menu_items[0]` will return “cheesecake” -- `menu_items[1]` will return “danish” -- `menu_items[2]` will return “coffee” - -Dictionaries and arrays can be nested in JSON objects as well as nested in each other. **Dictionaries and arrays can only be values. They can never be keys.** - -Here’s an example of a JSON object describing a tweet from [Twitter’s developer platform](https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/overview). 
- -```json -tweet = -{ - "created_at": "Thu Apr 06 15:24:15 +0000 2017", - "id_str": "850006245121695744", - "text": "1\/ Today we\u2019re sharing our vision for the future of the Twitter API platform!\nhttps:\/\/t.co\/XweGngmxlP", - "user": { - "id": 2244994945, - "name": "Twitter Dev", - "screen_name": "TwitterDev", - "location": "Internet", - "url": "https:\/\/dev.twitter.com\/", - "description": "Your official source for Twitter Platform news, updates & events. Need technical help? Visit https:\/\/twittercommunity.com\/ \u2328\ufe0f #TapIntoTwitter" - }, - "place": { - }, - "entities": { - "hashtags": [ - ], - "urls": [ - { - "url": "https:\/\/t.co\/XweGngmxlP", - "unwound": { - "url": "https:\/\/cards.twitter.com\/cards\/18ce53wgo4h\/3xo1c", - "title": "Building the Future of the Twitter API Platform" - } - } - ], - "user_mentions": [ - ] - } -} -``` - -Here's a quick quiz to see if you're understanding the file's structure: - -
-**How would you call the user ID?** `tweet['user']['id']`
-
-**How would you call the unwound url?** `tweet['entities']['urls'][0]['unwound']['url']`
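For reference, pulling those same values out of a warehouse that stores the tweet in a semi-structured column might look roughly like the query below. It uses Snowflake-style bracket-and-cast syntax, and the `raw.tweets` table and `tweet` column are hypothetical; other warehouses rely on functions such as `JSON_VALUE` or operators like `->>` instead.

```sql
-- Assumes a hypothetical raw.tweets table with a semi-structured column named tweet.
select
    tweet['user']['id']::number                             as user_id,
    tweet['entities']['urls'][0]['unwound']['url']::string  as unwound_url
from raw.tweets
```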
- -As you can see, JSON objects can get complex pretty quickly. - -## Why is JSON important in modern analytics? - -Semi-structured data offers flexibility with the trade-off of being more complex. JSON doesn’t require a pre-defined schema. It allows nesting, values can be different data types, and it lends itself well to changes in the shape of the incoming data. As you can imagine, the above Tweet object would look very different if we tried to restructure it so it could fit into a table. It would be hard to read or we would lose information or both. - -## Use cases for JSON - -JSON is lightweight and often used to transfer data over a network connection. As we’ve seen, data from social media sites are often stored as JSON objects. JSON is also commonly how data from IoT sensors is formatted and you’ll often see JSON when using an API. - -## Conclusion - -The greatest strength of JSON also acts as its weakness—the data it contains informs the shape the object takes, rather than the other way around. Structured data is the bread and butter of analytics work, but a semi-structured format is an alternative option when a tabular format becomes too rigid to describe the relationships between different entities. \ No newline at end of file diff --git a/website/docs/terms/materialization.md b/website/docs/terms/materialization.md deleted file mode 100644 index 328076f1483..00000000000 --- a/website/docs/terms/materialization.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -id: materialization -title: Materialization -description: A materialization is the exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a data warehouse. -displayText: materialization -hoverSnippet: The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a data warehouse. ---- - - - What does materialization mean in the context of dbt? - - -:::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! -::: - -The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a . It's the manner in which the data is represented, and each of those options is defined either canonically (tables, views, incremental), or bespoke. - -It is important to consider the downstream impacts of your materialization choice on query run times and macro capabilities. - diff --git a/website/docs/terms/model.md b/website/docs/terms/model.md deleted file mode 100644 index 83871d1339e..00000000000 --- a/website/docs/terms/model.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -id: model -title: Model -description: A model is an essential building block of the DAG -displayText: model -hoverSnippet: A model is an essential building block of the DAG ---- - -A model is an essential building block of the DAG that lives in a single file and contains logic that transforms data. This logic can be expressed as a SQL `select` statement or a Python dataframe operation. Models can be materialized in the warehouse in different ways — most of these [materialization](/terms/materialization) require models to be built in the warehouse. 
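As a small, hypothetical sketch of what that looks like in practice, a SQL model is just a `select` statement saved in its own file; the example below imagines a model that counts orders per customer and references an upstream staging model with dbt’s `ref` function:

```sql
-- models/customer_order_counts.sql (hypothetical model file)
select
    customer_id,
    count(*) as number_of_orders
from {{ ref('stg_orders') }}
group by customer_id
```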
- -For more information, refer to: - -* [About dbt models](/docs/build/models) -* [Quickstart guides](/guides?tags=Quickstart) -* [Model properties](/reference/model-properties) -* [Materializations](/reference/resource-configs/materialized) diff --git a/website/docs/terms/monotonically-increasing.md b/website/docs/terms/monotonically-increasing.md deleted file mode 100644 index b4e3987995d..00000000000 --- a/website/docs/terms/monotonically-increasing.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -id: monotonically-increasing -title: Monotonically increasing -description: A monotonically increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. -displayText: monotonically increasing -hoverSnippet: A monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. ---- - -Monotonicity means unchanging (think monotone); a monotonic sequence is a sequence where the order of the value of the elements does not change. In other words, a monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example the sequences `[1, 6, 7, 11, 131]` or `[2, 5, 5, 5, 6, 10]`.. - -Monotonically-increasing values often appear in primary keys generated by production systems. In an analytics engineering context, you should avoid generating such values or assuming their existence in your models, because they make it more difficult to create an data model. Instead you should create a which is derived from the unique component(s) of a row. diff --git a/website/docs/terms/predicate-pushdown.md b/website/docs/terms/predicate-pushdown.md deleted file mode 100644 index 8e9bad85b6b..00000000000 --- a/website/docs/terms/predicate-pushdown.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -id: predicate-pushdown -title: predicate pushdown -description: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query -displayText: Predicate pushdown -hoverSnippet: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query ---- - -A predicate pushdown is an expression used to determine what rows in a database apply to a particular query. For example, if you filter in a `WHERE` clause based on a specific dimension value, the database searches to determine what values in the database apply to the query. The optimization known as "predicate pushdown" involves applying this filtering process to the database, leading to enhanced and faster query performance. - diff --git a/website/docs/terms/primary-key.md b/website/docs/terms/primary-key.md deleted file mode 100644 index d67d928a218..00000000000 --- a/website/docs/terms/primary-key.md +++ /dev/null @@ -1,148 +0,0 @@ ---- -id: primary-key -title: Primary key -description: A primary key is a non-null column in a database object that uniquely identifies each row. Primary keys take the form of a natural or surrogate key. -displayText: primary key -hoverSnippet: A primary key is a non-null column in a database object that uniquely identifies each row. ---- - - - Primary key in SQL (AKA Constraints) — dbt Labs - - -A primary key is a non-null column in a database object that uniquely identifies each row. Primary keys take the form of a natural or . 
It’s important to note that for each or in your database, there must only be one primary key column per database object. - -At their core, you create and use these row-level unique identifiers to: - -* Ensure a lack of duplicate rows in your tables -* Identify table grains easily -* Help unpack how tables join together -* Establish a consistent naming system for primary keys across your data models - -One of the great things about data modeling is that there are very few rules to it. You have the flexibility to create the models and columns that are applicable to your business and the SQL you use to accomplish that is pretty much up to you and your team. _Having a primary key in each data model is pretty much the one rule you can’t break._ Without primary keys that are tested for non-nullness and uniqueness, duplicate or null records can slip undetected into your data models and cause counts to be incorrect. These two reasons coupled together can create a sense of distrust in the data and data team. - -Use this glossary page to understand the importance of primary keys, how natural keys and surrogate keys differ, and how support for primary keys varies. - -## Types of primary keys - -Primary keys can be established two ways: naturally or derived through the data in a surrogate key. - -* A **natural key** is a primary key that is innate to the data. Perhaps in tables there’s a unique `id` field in each table that would act as the natural key. You can use documentation like entity relationship diagrams (ERDs) to help understand natural keys in APIs or tables. In a perfect world, all of our primary keys would be natural keys… _but this is an imperfect world!_ -* A **surrogate key** is a hashed value of multiple fields in a dataset that create a uniqueness constraint on that dataset. You’ll essentially need to make a surrogate key in every table that lacks a natural key. An example of this could be a custom table that reports daily performance per `ad_id` from an ad platform. You can derive a surrogate key by hashing the `date` and `ad_id` fields to create a unique value per row. - -A note on primary key data types: natural keys will often take the form of an integer or other numeric value (ex. 45932). Surrogate keys, on the other hand, are usually alphanumeric strings since they are hashed values (ex. ‘62aef884fbe3470ce7d9a92140b09b17’). - -:::tip Tip -dbt supports [packages](https://docs.getdbt.com/docs/build/packages), libraries of open-source macros and data models, to help data teams avoid doing duplicative work. One of these packages, [dbt_utils](https://github.com/dbt-labs/dbt-utils), contains a series of macros that are built to alleviate common struggles in data modeling. The [surrogate_key](https://github.com/dbt-labs/dbt-utils#surrogate_key-source) macro offers a DRY (don’t repeat yourself) solution to creating surrogate keys across different data warehouses in the event that your data doesn’t contain natural keys. -::: - -## Data warehouse support for primary keys - -What do we mean when we say a primary key is supported in a database? What does it mean if primary keys are enforced? - -* **Support**: If a primary key is supported in a database, that means they allow you to explicitly let the system know if a specific field is a primary key. This will happen in the DDL (data definition language) to create the table, like in the example below, or an `ALTER` statement that specifies which field is the primary key. 
-* **Enforcement**: If a database enforces primary keys, that means it would raise an error if one of the constraints on primary keys (uniqueness and non-null) was broken during an `INSERT` or `UPDATE` statement. - -The table below gives an overview of primary key support and enforcement in some of the major data warehouses. Below the table you’ll additionally see a breakdown of some details around primary key implementation for these data warehouses. - -
-
-| Data warehouse | Supports primary keys? | Fully enforces primary keys? |
-|---|---|---|
-| Snowflake | :white_check_mark: | :x: |
-| Amazon Redshift | :white_check_mark: | :x: |
-| Google BigQuery | :x: | :x: |
-| Databricks | :white_check_mark: | :x: |
-| Postgres | :white_check_mark: | :white_check_mark: |
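Where primary keys are supported, marking an existing column as the primary key is typically a one-line `ALTER` statement; the sketch below uses a hypothetical table, and exact syntax varies by warehouse:

```sql
-- Hypothetical: declare an existing column as the table's primary key.
alter table prod.jaffle_shop.jaffles
    add primary key (id);
```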
- -### Snowflake - -Snowflake allows for data folks to explicitly identify primary keys during table creation or using an `ALTER` statement. To see identified primary keys in your database, you can run the `SHOW PRIMARY KEYS` command. It’s important to note, however, that Snowflake primary key indicators are purely descriptive–meaning they don’t enforce either non-nullness or uniqueness requirements. However, Snowflake offers a separate `NOT NULL` constraint that will be enforced for specified fields. - -### Amazon Redshift - -With Redshift, you can specify primary keys constraints on tables, but Redshift won’t provide out-of-the-box primary key enforcement. Similar to Snowflake, Redshift does allow users to add a [`NOT NULL` constraint](https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html) that is actually enforced. - -In general for Redshift, it’s still good practice to define your primary keys (regardless of the lack of uniqueness enforcement) because they can help the [query planner](https://docs.getdbt.com/blog/redshift-configurations-dbt-model-optimizations) more quickly identify uniqueness and foreign key relationships. - -### Google BigQuery - -BigQuery is pretty unique here in that it doesn’t support or enforce primary keys. If your team is on BigQuery, you’ll need to have some [pretty solid data testing](/docs/build/data-tests) in place to ensure your primary key fields are unique and non-null. - -### Databricks - -Databricks’ Delta tables in Unity Catalog provide support for declaring [informational primary keys](https://docs.databricks.com/tables/constraints.html#declare-primary-key-and-foreign-key-relationships). These primary key constraints are not enforced. Databricks currently offers [two enforced constraint](https://docs.databricks.com/tables/constraints.html#enforced-constraints-on-databricks) types: `not-null` and `check`. The `not-null` one is pretty straightforward, but the `check` constraint is more unique to Databricks. With the `check` constraint, you can test that a certain boolean expression executes as `true` for each row in a table. This constraint is more likely to be helpful for ensuring accepted values are met for fields rather than for primary key requirements. - -### Postgres - -Postgres is the true standout here in that it both supports and enforces primary keys! However, you shouldn’t be too surprised about this. One of the primary use cases for Postgres is that it often serves as the home for backend application tables and is usually managed by a [team of backend developers](https://docs.getdbt.com/blog/when-backend-devs-spark-joy). Since these tables often act as a source of truth for many businesses, it’s critical that primary key fields must exist, be non-null, and unique. - -## How to indicate primary keys - -For data warehouses that support primary keys (like Snowflake, Amazon Redshift, and Postgres), you can add a primary key indicator to the column you want to use as a primary key in the DDL to create the table. You may also use an `ALTER` DDL statement to set a column as a primary key if the table is already created. - -In the example below, you can add a new `jaffles` table to the [jaffle_shop](https://github.com/dbt-labs/jaffle_shop) project and make the `id` field the primary key. 
- -```sql -CREATE TABLE prod.jaffle_shop.jaffles ( - id varchar(255) primary key, - jaffle_name varchar(255) - created_at timestamp, - ingredients_list varchar(255), - is_active boolean -); -``` - -:::note Note -If you don't have a field in your table that would act as a natural primary key, you’ll need to[ create a surrogate key](https://docs.getdbt.com/blog/sql-surrogate-keys) for it. -::: - -If your data warehouse doesn’t provide out-of-the box support and enforcement for primary keys, it’s important to clearly label and put your own constraints on primary key fields. This could look like: - -* **Creating a consistent naming convention for your primary keys**: You may see an `id` field or fields prefixed with `pk_` (ex. `pk_order_id`) to identify primary keys. You may also see the primary key be named as the obvious table grain (ex. In the jaffle shop’s `orders` table, the primary key is called `order_id`). -* **Adding automated [data tests](/docs/build/data-tests) to your data models**: Use a data tool, such as dbt, to create not null and unique tests for your primary key fields. - -## Testing primary keys - -When we talk about testing our primary keys, we really mean testing their uniqueness and non-nullness. Given that not all modern data warehouses support or enforce primary key constraints, your data team will likely fall under two scenarios: - -1. For databases that support primary key enforcement, you should receive failures when your constraints are broken. -2. For databases that don’t offer support and enforcement of primary keys, you’re going to need to regularly test that primary keys aren’t violating their golden rule of uniqueness and non-nullness. To do this, we recommend implementing a tool like dbt that allows you to define version-controlled and code-based tests on your data models. Using these tests, you should create [not null](https://docs.getdbt.com/reference/resource-properties/tests#not_null) and [unique](https://docs.getdbt.com/reference/resource-properties/tests#unique) tests for every primary key field throughout your dbt project. Other methods for primary key testing may look like writing custom tests or ad hoc queries that check for uniqueness and non-nullness. - -:::tip Tip -You can use dbt’s [documentation](https://docs.getdbt.com/docs/build/documentation) and [testing](https://docs.getdbt.com/reference/resource-properties/tests) capabilities to clearly identify and QA primary keys in your data models. For your primary key column, you should mention that the field is the unique identifier for that table and test for uniqueness and non-nullness. -::: - -## Conclusion - -Say it with me or get it tattooed on your lower back: every database object in your data warehouse needs a primary key. At their core, primary keys are fields that uniquely identify each row in a table and help ensure there are no duplicates in the data. Primary keys take shape as either natural keys, fields that are innate to the data, or as surrogate keys, hashed column values that create a uniqueness constraint on the data. Not every modern data warehouse provides explicit support or enforcement of primary keys, so it’s incredibly important to have a method to test that your primary keys are unique and not null. 
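One such method, sketched below against a hypothetical `analytics.orders` table, is an ad hoc query that surfaces any primary key values that are null or appear more than once; it should return zero rows, and in practice you’d codify the same checks as `unique` and `not_null` tests in dbt.

```sql
-- Hypothetical ad hoc check on analytics.orders, where order_id is the primary key.
-- Any returned row is either a duplicate or a null primary key value.
select
    order_id,
    count(*) as occurrences
from analytics.orders
group by order_id
having count(*) > 1
    or order_id is null
```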
- -## Further reading - -* [Testing primary keys in dbt](https://docs.getdbt.com/blog/primary-key-testing) -* [Surrogate keys and dbt](https://docs.getdbt.com/blog/sql-surrogate-keys) -* [dbt Constraints Snowflake Labs package](https://hub.getdbt.com/snowflake-labs/dbt_constraints/latest/) diff --git a/website/docs/terms/relational-database.md b/website/docs/terms/relational-database.md deleted file mode 100644 index 8f05e5f4944..00000000000 --- a/website/docs/terms/relational-database.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -id: relational-database -title: Relational database -description: A relational database provides a structured way to store data into tables consisting of rows and columns. Different tables in a relational database can be joined together using common columns from each table, forming relationships. -displayText: relational database -hoverSnippet: A relational database provides a structured way to store data into tables consisting of rows and columns. Different tables in a relational database can be joined together using common columns from each table, forming relationships. ---- - - - Relational database: A way to get order out of data chaos - - -A relational database provides a structured way to store data into tables consisting of rows and columns. Different tables in a relational database can be joined together using common columns from each table, forming relationships. - -Analytics engineers use relational database models to process high volumes of data that, in its rawest form, is too difficult for an end user or analyst to read and comprehend. Thanks to these models, people can easily query, interpret, and derive insight out of data using the accessible SQL. - -Anyone who’s ever managed or modeled data will tell you that data points are only meaningful in relation to each other. The very philosophy behind data management and data analytics has centered on forming a narrative out of seemingly disparate elements. - -At the heart of this notion sits the relational database, which was first introduced by computer scientist E.F. Codd in the year 1970 — 13 years before the internet was even invented! - -## How relational databases work - -The legwork behind relational databases lies in establishing pre-defined relationships between tables, also called “entities”. For example, in the [jaffle_shop](https://github.com/dbt-labs/jaffle_shop) ecommerce store database where customers’ information is stored in a `customers` table and orders information is stored in an `orders` table, a relationship is defined such that each order is attributed to a customer. - -![](/img/docs/terms/relational-database/relation.png) - -The way relationships are defined is via primary keys and foreign keys. - -By definition, a is a column (or combination of columns as a surrogate key) which identifies a unique record. There can be only one primary key per table, and the primary key should be unique and not null. - -On the other hand, a foreign key is a column (or combination of columns) in one table that references the primary key in another table. In the above example, multiple orders can belong to one customer. Assuming that `id` is defined as the primary key for the `customers` table, `user_id` in the `orders` table would be the foreign key. - -In analytics engineering, where the focus is geared towards data modeling and creating a reporting layer for a BI tool, relational databases are a great fit. 
Data modeling defines how the data elements are related to each other, and a well-organized database is the cornerstone of effective data querying. - -## Use cases for relational databases - -Relational databases are best for structured data that can be organized into tables made up of rows and columns. Data teams rely on relational databases for storing transactional data, and also when data querying and data analysis is needed. - -### Transactional processing - -As mentioned earlier, relational databases are a great fit for transaction-oriented systems such as CRM tools, e-commerce platforms, or finance software. Companies tend to use relational databases when transactional consistency is required, as they offer a near failsafe environment for data accuracy and completion. When a transaction consists of several steps, the system treats the steps as a single transaction and assures that the operation follows an ‘all-or-nothing’ scenario, ie: the steps either all survive or all fail. - -### Modeling data and organizing it for analysis - -Relational databases support common data modeling techniques such as , Data Vault, or sometimes hybrid approaches that combine different modeling techniques. Such methodologies allow teams to organize their data into useful data structures. - -A data model is the overarching conceptual layer that organizes data entities and their relationships. The specific physical implementation of that data model including the definitions of data types and constraints constitutes the database schema. - -Having organized data entities also helps analytics engineers and analysts build meaningful queries that derive data in a format and granularity that is otherwise not directly available in the base database. - -Most analytics engineers have to deal with both relational (typically structured data) and non-relational data (typically unstructured data) coming in from multiple sources. The data is then transformed until it ultimately gets modeled into data entities using relational modeling approaches. More on non-relational databases in the following section, but in a nutshell, structured data is data that can be easily stored in a relational database system, while unstructured data is composed of formats that cannot easily (or at all) be broken down into tabular data. Common examples of unstructured data include video files, PDFs, audio files, and social media posts. - -Another popular format is semi-structured data which is inherently difficult to organize into rows and columns, but contains semantic markup that makes it possible to extract the underlying information. Some examples include XML and . - -Relational data warehouses provide relational databases that are specifically optimized for analytical querying rather than transaction processing. Increasingly, data warehouses are providing better support for unstructured data, or data that cannot be stored in relational tables. . - -Even when analytics engineers do not physically enforce relationships at the database level (many modern data warehouses allow for defining relational constraints but do not actually enforce them), they do follow a relational process. This process enables them to still organize the data into logical entities whenever possible, and in order to make sure that the data is not redundant and easily queryable. - -## Relational database vs. non-relational database - -The main difference between a relational and non-relational database is in how they store information. 
Relational databases are well-suited for data that is structured and store values in tables, and non-relational databases store data in a non-tabular form called unstructured data. - -As datasets are becoming dramatically more complex and less structured, the format of the ingested data can sometimes be unpredictable which makes the case for non-relational databases (also called NoSQL). - -NoSQL databases are also typically better suited for granular real-time monitoring. On the other hand, relational databases make it easier to look at transformed and aggregated data, making them a more appropriate fit for reporting and analytics. - -The below table summarizes the main differences between a relational and a non-relational database: - -| | Relational Database | Non-Relational Database | -|---|---|---| -| Data storage | Data is stored in tables. | Data is stored in document files, graph stores, key-value stores, or wide-column stores. | -| Data format | Data is structured. | Data is mainly unstructured. | -| Usage | Mainly used for recording transactions, data modeling, and data analysis. | Mainly used to ingest large volume real-time data streams. | -| Data Integrity | The relationships and constraints defined help ensure higher data integrity. | Non-relational databases do not guarantee data integrity. | -| Scalability | Scalable at a high price tag. | Highly scalable. | - -## Conclusion - -Relational databases store data in a systematic way, and support querying multiple tables together in order to generate business insights. - -Often starting off with unorganized and chaotic data, analytics engineers leverage relational databases to bring structure and consistency to their data. - -Relational databases also have a strong record of transactional consistency. While some companies are racing to embrace non-relational databases in order to handle the big volume of unstructured data, most of their workloads likely remain transactional and analytical in nature which is why relational databases are very common. - -## Further reading - -- [Glossary: Primary key](/terms/primary-key) -- [Glossary: Data warehouse](/terms/data-warehouse) diff --git a/website/docs/terms/reverse-etl.md b/website/docs/terms/reverse-etl.md deleted file mode 100644 index a3ccd0b0f70..00000000000 --- a/website/docs/terms/reverse-etl.md +++ /dev/null @@ -1,94 +0,0 @@ ---- -id: reverse-etl -title: Reverse ETL -description: Reverse ETL is the process of getting your transformed data stored in your data warehouse to end business platforms, such as sales CRMs and ad platforms. -displayText: reverse ETL -hoverSnippet: Reverse ETL is the process of getting your transformed data stored in your data warehouse to end business platforms, such as sales CRMs and ad platforms. ---- - - - Reverse ETL, demystified: What it is in plain english - - -Reverse ETL is the process of getting your transformed data stored in your data warehouse to end business platforms, such as sales CRMs and ad platforms. Once in an end platform, that data is often used to drive meaningful business actions, such as creating custom audiences in ad platforms, personalizing email campaigns, or supplementing data in a sales CRM. You may also hear about reverse ETL referred to as operational analytics or data activation. - -Reverse ETL efforts typically happen after data teams have set up their [modern data stack](https://www.getdbt.com/blog/future-of-the-modern-data-stack/) and ultimately have a consistent and automated way to extract, load, and transform data. 
Data teams are also often responsible for setting up the pipelines to send down data to business platforms, and business users are typically responsible for *using the data* once it gets to their end platform. - -Ultimately, reverse ETL is a way to put data where the work is already happening, support self-service efforts, and help business users derive real action out of their data. - -## How reverse ETL works - -In the reverse ETL process, transformed data is synced from a data warehouse to external tools in order to be leveraged by different business teams. - -![A diagram depicting how the reverse ETL process works. It starts with data being extract from data sources like email CRMs, Facebook Ad platforms, backend databases, and NetSuite. The raw data is then loaded into a data warehouse. After loading, the data is transformed and modeled. The modeled data is then loaded directly back into the tools that created the data, like Email CRMs, Facebook Ad platforms, and others so the insights are more accessible to business users.](/img/docs/terms/reverse-etl/reverse-etl-diagram.png) - -The power of reverse ETL comes from sending down *already transformed data* to business platforms. Raw data, while beautiful in its own way, typically lacks the structure, aggregations, and aliasing to be useful for end business users off the bat. After data teams transform data for business use in pipelines, typically to expose in an end business intelligence (BI) tool, they can also send this cleaned and meaningful data to other platforms where business users can derive value using [reverse ETL tools](#reverse-etl-tools). - -Data teams can choose to write additional transformations that may need to happen for end business tools in reverse ETL tools themselves or by creating [additional models in dbt](https://getdbt.com/open-source-data-culture/reverse-etl-playbook/). - -## Why use reverse ETL? - -There’s a few reasons why your team may want to consider using reverse ETL: - -### Putting data where the work is happening - -While most data teams would love it if business users spent a significant portion of their time in their BI tool, that’s neither practical nor necessarily the most efficient use of their time. In the real world, many business users will spend some time in a BI tool, identify the data that could be useful in a platform they spend a significant amount of time in, and work with the data team to get that data where they need it. Users feel comfortable and confident in the systems they use everyday—why not put the data in the places that allow them to thrive? - -### Manipulating data to fit end platform requirements - -Reverse ETL helps you to put data your business users need *in the format their end tool expects*. Oftentimes, end platforms expect data fields to be named or cast in a certain way. Instead of business users having to manually input those values in the correct format, you can transform your data using a product like dbt or directly in a reverse ETL tool itself, and sync down that data in an automated way. - -### Supporting self-service efforts - -By sending down data-team approved data in reverse ETL pipelines, your business users have the flexibility to use that data however they see fit. Soon, your business users will be making audiences, testing personalization efforts, and running their end platform like a well-oiled, data-powered machine. 
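As an illustrative sketch (the destination requirements and field names here are invented), the model a team syncs down often just reshapes warehouse data into exactly the columns and formats the end tool expects:

```sql
-- Hypothetical model prepared for a reverse ETL sync to an email or ads platform.
-- Columns are renamed and cast to match what the destination tool expects to receive.
select
    email                      as external_id,           -- the tool's required identifier
    first_name                 as personalization_name,
    round(lifetime_value, 0)   as customer_lifetime_value,
    case
        when orders_last_90_days > 0 then 'active'
        else 'lapsed'
    end                        as lifecycle_stage
from analytics.customer_profiles
```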
- - -## Reverse ETL use cases - -Just as there are almost endless opportunities with data, there are many potential different use cases for reverse ETL. We won’t go into every possible option, but we’ll cover some of the common use cases that exist for reverse ETL efforts. - -### Personalization - -Reverse ETL allows business users to access data that they normally would only have access to in a BI tool *in the platforms they use every day*. As a result, business users can now use this data to personalize how they create ads, send emails, and communicate with customers. - -Personalization was all the hype a few years ago and now, you rarely ever see an email come into your inbox without some sort of personalization in-place. Data teams using reverse ETL are able to pass down important customer information, such as location, customer lifetime value (CLV), tenure, and other fields, that can be used to create personalized emails, establish appropriate messaging, and segment email flows. All we can say: the possibilities for personalization powered by reverse ETL are endless. - -### Sophisticated paid marketing initiatives - -At the end of the day, businesses want to serve the right ads to the right people (and at the right cost). A common use case for reverse ETL is for teams to use their customer data to create audiences in ad platforms to either serve specific audiences or create lookalikes. While ad platforms have gotten increasingly sophisticated with their algorithms to identify high-value audiences, it usually never hurts to try supplementing those audiences with your own data to create sophisticated audiences or lookalikes. - -### Self-service analytics culture - -We hinted at it earlier, but reverse ETL efforts can be an effective way to promote a self-service analytics culture. When data teams put the data where business users need it, business users can confidently access it on their own, driving even faster insights and action. Instead of requesting a data pull from a data team member, they can find the data they need directly within the platform that they use. Reverse ETL allows business users to act on metrics that have already been built out and validated by data teams without creating ad-hoc requests. - -### “Real-time” data - -It would be amiss if we didn’t mention reverse ETL and the notion of “real-time” data. While you can have the debate over the meaningfulness and true value-add of real-time data another time, reverse ETL can be a mechanism to bring data to end business platforms in a more “real-time” way. - -Data teams can set up syncs in reverse ETL tools at higher cadences, allowing business users to have the data they need, faster. Obviously, there’s some cost-benefit analysis on how often you want to be loading data via [ETL tools](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/) and hitting your data warehouse, but reverse ETL can help move data into external tools at a quicker cadence if deemed necessary. - -All this to say: move with caution in the realm of “real-time”, understand your stakeholders’ wants and decision-making process for real-time data, and work towards a solution that’s both practical and impactful. - -## Reverse ETL tools - -Reverse ETL tools typically establish the connection between your data warehouse and end business tools, offer an interface to create additional transformations or audiences, and support automation of downstream syncs. Below are some examples of tools that support reverse ETL pipelines. 
- -| Tool | Description | Open source option? | -|:---:|:---:|:---:| -| Hightouch | A platform to sync data models and create custom audiences for downstream business platforms. | :x: | -| Polytomic | A unified sync platform for syncing to and from data warehouses (ETL and Reverse ETL), databases, business apps, APIs, and spreadsheets. | :x: | -| Census | Another reverse ETL tool that can sync data from your data warehouse to your go-to-market tools. | :x: | -| Rudderstack | Also a CDP (customer data platform), Rudderstack additionally supports pushing down data and audience to external tools, such as ad platforms and email CRMs. | :white_check_mark: | -| Grouparoo | Grouparoo, part of Airbyte, is an open source framework to move data from data warehouses to different cloud-based tools. | :white_check_mark: | - -## Conclusion - -Reverse ETL enables you to sync your transformed data stored in your data warehouse to external platforms often used by marketing, sales, and product teams. It allows you to leverage your data in a whole new way. Reverse ETL pipelines can support personalization efforts, sophisticated paid marketing initiatives, and ultimately offer new ways to leverage your data. In doing this, it creates a self-service analytics culture where stakeholders can receive the data they need in, in the places they need, in an automated way. - -## Further reading - -If you’re interested learning more about reverse ETL and the impact it could have on your team, check out the following: - -- [How dbt Labs’s data team approaches reverse ETL](https://getdbt.com/open-source-data-culture/reverse-etl-playbook/) -- [The operational data warehouse in action: Reverse ETL, CDPs, and the future of data activation](https://www.getdbt.com/coalesce-2021/operational-data-warehouse-reverse-etl-cdp-data-activation/) -- [The analytics engineering guide: Operational analytics](https://www.getdbt.com/analytics-engineering/use-cases/operational-analytics/) diff --git a/website/docs/terms/subquery.md b/website/docs/terms/subquery.md deleted file mode 100644 index d7aecdd52cc..00000000000 --- a/website/docs/terms/subquery.md +++ /dev/null @@ -1,224 +0,0 @@ ---- -id: subquery -title: Subquery in SQL -description: "A subquery is what the name suggests: a query within another query. The true inception of SQL. Subqueries are often used when you need to process data in several steps." -displayText: subquery -hoverSnippet: A subquery is a query within another query. Subqueries are often used when you need to process data in multiple steps. ---- - - - What is a Subquery in SQL and when are they useful? - dbt Labs - -A subquery is what the name suggests: a query within another query. _The true inception of SQL_. Subqueries are often used when you need to process data in several steps. For the majority of subqueries you’ll see in actual practice, the inner query will execute first and pass its result to the outer query it's nested in. - -Subqueries are usually contrasted with Common Table Expressions (CTEs) as they have similar use cases. Unlike CTEs, which are usually separate `SELECT` statements within a query, subqueries are usually `SELECT` statements nested within a `JOIN`, `FROM`, or `WHERE` statement in a query. - -To be honest, we rarely write subqueries here at dbt Labs since we prefer to use CTEs. We find that CTEs, in general, support better query readability, organization, and debugging. However, subqueries are a foundational concept in SQL and still widely used. 
We hope you can use this glossary to better understand how to use subqueries and how they differ from CTEs. - -## Subquery syntax - -While there are technically several types of subqueries, the general syntax to build them is the same. A subquery usually consists of the following: - -- Enclosing parentheses -- A name -- An actual SELECT statement -- A main query it is nested in via a FROM, WHERE, or JOIN clause - -Let’s take this to an example, using the [sample jaffle_shop dataset](https://github.com/dbt-labs/jaffle_shop). - -```sql -select customer_id, count(order_id) as cnt_orders - from ( - select * from {{ ref('orders') }} - ) all_orders -group by 1 -``` - -Given the elements of subqueries laid out in the beginning, let’s break down this example into its respective parts. - -| Subquery elements | Example | -|---|---| -| Enclosing parentheses | :white_check_mark: | -| Subquery name | `all_orders` | -| `SELECT` statement | `select * from {{ ref('orders') }}` | -| Main query it is nested in | `select customer_id, count(order_id) as cnt_orders from all_orders group by 1` | - -When this query is actually executed, it will start by running the innermost query first. In this case, it would run `select * from {{ ref('orders') }}` first. Then, it would pass those results to the outer query, which is where you grab the count of orders by `customer_id`. - -```note Note -If you want to learn more about what a `ref` is, [check out our documentation on it.](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) -``` - -This is a relatively straightforward example, but should hopefully show you that subqueries start off like most other queries. As you nest more subqueries together, that’s when you unearth the power of subqueries, but also when you start to notice some readability tradeoffs. If you are using subqueries regularly, you'll want to leverage indenting and [strong naming conventions](https://docs.getdbt.com/blog/on-the-importance-of-naming) for your subqueries to clearly distinguish code functionality. - -## Types of subqueries - -In your day-to-day, you won’t normally formalize the names of the different types of subqueries you can write, but when someone uses the term “correlated subquery” at a data conference, you'll want to know what that means! - -### Nested subqueries - -Nested subqueries are subqueries like the one you saw in the first example: a subquery where the inner query is executed first (and once) and passes its result to the main query. The majority of subqueries you will see in the real world are likely to be a nested subquery. These are most useful when you need to process data in multiple steps. - -:::tip Debugging subqueries tip -It’s important to note that since the inner query is executed first in a nested subquery, the inner query must be able to execute by itself. If it’s unable to successfully run independently, it cannot pass results to the outer query. -::: - -### Correlated subqueries - -A correlated subquery is a nested subquery’s counterpart. If nested subqueries execute the inner query first and pass their result to the outer query, correlated subqueries execute the outer query first and pass their result to their inner query. For correlated subqueries, it’s useful to think about how the code is actually executed. - -In a correlated subquery, the outer query will execute row-by-row. For each row, that result from the outer query will be passed to the inner query. 
Compare this to nested queries: in a nested query, the inner query is executed first and only once before being passed to the outer query. - -These types of subqueries are most useful when you need to conduct analysis on a row-level. - -### Scalar and non-scalar subqueries - -Scalar subqueries are queries that only return a single value. More specifically, this means if you execute a scalar subquery, it would return one column value of one specific row. Non-scalar subqueries, however, can return single or multiple rows and may contain multiple columns. - -You may want to use a scalar subquery if you’re interested in passing only a single-row value into an outer query. This type of subquery can be useful when you’re trying to remove or update a specific row’s value using a Data Manipulation Language (DML) statement. - -## Subquery examples - -You may often see subqueries in joins and DML statements. The following sections contain examples for each scenario. - -### Subquery in a join - -In this example, you want to get the lifetime value per customer using your `raw_orders` and `raw_payments` table. Let’s take a look at how you can do that with a subquery in a join: - -```sql -select - - orders.user_id, - sum(payments.amount) as lifetime_value - -from {{ ref('raw_orders') }} as orders -left join ( - - select - - order_id, - amount - - from {{ ref('raw_payments') }} - -) all_payments -on orders.id = payments.order_id -group by 1 -``` - -Similar to what you saw in the first example, let’s break down the elements of this query. - -| Subquery elements | Example | -|---|---| -| Enclosing parentheses | :white_check_mark: | -| Subquery name | `all_payments` | -| `SELECT` statement | `select order_id, amount from {{ ref('raw_payments') }}` | -| Main query it is nested in | `select orders.user_id, sum(payments.amount) as lifetime_value from {{ ref('raw_orders') }} as orders...` | - -In this example, the `all_payments` subquery will execute first. you use the data from this query to join on the `raw_orders` table to calculate lifetime value per user. Unlike the first example, the subquery happens in the join statement. Subqueries can happen in `JOIN`, `FROM`, and `WHERE` clauses. - -### Subquery in a DML command - -You may also see subqueries used in DML commands. As a jogger, DML commands are a series of SQL statements that you can write to access and manipulate row-level data in database objects. Oftentimes, you’ll want to use a query result in a qualifying `WHERE` clause to only delete, update, or manipulate certain rows of data. - -In the following example, you'll attempt to update the status of certain orders based on the payment method used in the `raw_payments` table. - -```sql -UPDATE raw_orders -set status = 'returned' -where order_id in ( -select order_id -from raw_payments -where payment_method = 'bank_transfer') -``` - -## Subquery vs CTE - -A subquery is a nested query that can oftentimes be used in place of a CTE. Subqueries have different syntax than CTEs, but often have similar use cases. The content won’t go too deep into CTEs here, but it’ll highlight some of the main differences between CTEs and subqueries below. 
- -| CTE | Subquery | -|---|---| -| Typically more readable since CTEs can be used to give structure to your query | Typically less readable, especially if there are many nested queries | -| Reusable in the same query | Must declare the subquery everytime it is used in a query | -| Allows for recursiveness | Does not allow for recursiveness | -| CTEs must have unique CTE_EXPRESSION_NAMES when used in a query | Subqueries don’t always have to be explicitly named | -| CTEs cannot be used in a `WHERE` clause | Subqueries can be used in a `WHERE` clause | - -### Subquery vs CTE example - -The following example demonstrates the similarities and differences between subqueries and CTEs. Using the [first subquery example](#subquery-in-a-join), you can compare how you would perform that query using subquery or a CTE: - - - - -```sql Subquery example -select customer_id, count(order_id) as cnt_orders - from ( - - select * from {{ ref('orders') }} - - ) all_orders -group by 1 -``` - - - -```sql CTE example -with all_orders as ( - -select * from {{ ref('orders') }} - -), -aggregate_orders as ( - - select - - customer_id, - count(order_id) as cnt_orders - - from all_orders - group by 1 - -) -select * from aggregate_orders -``` - - - - -While the code for the query involving CTEs may be longer in lines, it also allows us to explicitly define code functionality using the CTE name. Unlike the subquery example that executes its inner query and then the outer query, the query using CTEs executes moving down the code. - -Again, choosing to use CTEs over subqueries is a personal choice. It may help to write out the same code functionality in a subquery and with CTEs and see what is more understandable to you. - -## Data warehouse support for subqueries - -Subqueries are likely to be supported across most, if not all, modern data warehouses. Please use this table to see more information about using subqueries in your specific data warehouse. - -| Data warehouse | Supports subqueries? | -|---|---| -| [Snowflake](https://docs.snowflake.com/en/user-guide/querying-subqueries.html) | :white_check_mark: | -| [Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_Subquery_examples.html) | :white_check_mark: | -| [Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/subqueries) | :white_check_mark: | -| [Databricks](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-qry-query.html) | :white_check_mark: | -| [Postgres](https://www.postgresqltutorial.com/postgresql-subquery/) | :white_check_mark: | - -## Conclusion - -I’m going to be honest, I was hesitant to start writing the glossary page for SQL subqueries. As someone who has been using CTEs almost exclusively in their data career, I was intimidated by this concept. However, I am excited to say: Subqueries are not as scary as I expected them to be! - -At their core, subqueries are nested queries within a main query. They are often implemented in `FROM`, `WHERE`, and `JOIN` clauses and are used to write code that builds on itself. Despite the fact that subqueries are SQL like any other query, it is important to note that subqueries can struggle in their readability, structure, and debugging process due to their nested nature. Because of these downsides, we recommend leveraging CTEs over subqueries whenever possible. - -I have not been made a subquery convert, but I’m walking away from this a little less intimidated by subqueries and I hope you are too. 
- -## Further reading - -Please check out some of our favorite readings related to subqueries! - -- [Glossary: CTE](https://docs.getdbt.com/terms/cte) -- [On the importance of naming: model naming conventions (Part 1)](https://docs.getdbt.com/blog/on-the-importance-of-naming) diff --git a/website/docs/terms/surrogate-key.md b/website/docs/terms/surrogate-key.md deleted file mode 100644 index a53db3090cd..00000000000 --- a/website/docs/terms/surrogate-key.md +++ /dev/null @@ -1,196 +0,0 @@ ---- -id: surrogate-key -title: Surrogate key -description: A surrogate key is a unique identifier derived from the data itself. It's commonly a hashed value of multiple columns that will create a unique id for each row. -displayText: surrogate key -hoverSnippet: A surrogate key is a unique identifier derived from the data itself. It often takes the form of a hashed value of multiple columns that will create a uniqueness constraint for each row. ---- - - - What is a surrogate key in database table? - dbt Labs - - -A surrogate key is a unique identifier derived from the data itself. It often takes the form of a hashed value of multiple columns that will create a uniqueness constraint for each row. You will need to create a surrogate key for every table that doesn't have a natural . - -Why would you ever need to make a surrogate key? Shouldn’t all tables innately just have a field that uniquely identifies each row? Now that would be too easy… - -Let’s say you have a table with all license plate numbers and the state of the plate. While license plate numbers are unique to their state, there could be duplicate license plate numbers across different states. So by default, there’s no natural key that can uniquely identify each row here. In order to uniquely identify each record in this table, you could create a surrogate key based on the unique combination of license plate number and its state. - -## Surrogate keys, natural keys, and primary keys oh my! - -Primary keys can be established two ways: naturally or derived through the data in a surrogate key. - -* A __natural key__ is a primary key that is innate to the data. Perhaps in some tables there’s a unique `id` field in each table that would act as the natural key. You can use documentation like entity relationship diagrams (ERDs) to help understand natural keys in APIs or backend application database tables. -* A __surrogate key__ is a hashed value of multiple fields in a dataset that create a uniqueness constraint on that dataset. You’ll essentially need to make a surrogate key in every table that lacks a natural key. - -:::note Note -You may also hear about primary keys being a form of a _constraint_ on a database object. Column constraints are specified in the to create or alter a database object. For data warehouses that support the enforcement of primary key constraints, this means that an error would be raised if a field's uniqueness or non-nullness was broken upon an `INSERT` or `UPDATE` statement. Most modern data warehouses don’t support _and_ enforce [primary key constraints](https://docs.getdbt.com/terms/primary-key#Data-warehouse-support-for-primary-keys), so it’s important to have [automated testing](https://docs.getdbt.com/blog/primary-key-testing#how-to-test-primary-keys-with-dbt) in-place to ensure your primary keys are unique and not null. -::: - -## How surrogate keys are created - -In analytics engineering, you can generate surrogate keys using a hashing method of your choice. 
Remember, in order to truly create a uniqueness constraint on a database object, you’ll need to hash the fields together that _make each row unique_; when you generate a correct surrogate key for a dataset, you’re really establishing the true of that dataset. - -Let’s take this to an example. Below, there is a table you pull from an ad platform that collects `calendar_date`, `ad_id`, and some performance columns. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
calendar_datead_idimpressionsspendclicksconversions
2022-05-16212887444523.009432166
2022-05-162143236.4940
2022-05-05212125600117244.561731856
- - -In this state, this table has no natural key that can act as a primary key. You know the grain of this table: this is showing performance for each `ad_id` per `calendar_date`. Therefore, hashing those two fields will create a uniqueness constraint on this table. - -To create a surrogate key for this table using the MD5 function, run the following: - -```sql -select - md5(calendar_date || ad_id) as unique_id, - * -from {{ source('ad_platform', 'custom_daily_report')}} -``` - -After executing this, the table would now have the `unique_id` field now uniquely identifying each row. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
unique_idcalendar_datead_idimpressionsspendclicksconversions
62aef884fbe3470ce7d9a92140b09b172022-05-16212887444523.009432166
ea385f7a5e560ef4d8a78f7d913927e42022-05-162143236.4940
53a33f257d1d4f2446469ac5adad1c0c2022-05-05212125600117244.561731856
- -## Testing surrogate keys - -Amazing, you just made a surrogate key! You can just move on to the next data model, right? No!! It’s critically important to test your surrogate keys for uniqueness and non-null values to ensure that the correct fields were chosen to create the surrogate key. - -In order to test for null and unique values you can utilize code-based data tests like [dbt tests](/docs/build/data-tests), that can check fields for nullness and uniqueness. You can additionally utilize simple SQL queries or unit tests to check if surrogate key count and non-nullness is correct. - -## A note on hashing algorithms - -Depending on your data warehouse, there’s several cryptographic hashing options to create surrogate keys. The primary hashing methods include MD5 or other algorithms, like HASH or SHA. Choosing the appropriate hashing function is dependent on your dataset and what your warehouse supports. - - - - - - - - - - - - - - - - - - - - - - -
| Hashing algorithm | Bit length | Known collisions? |
|---|---|---|
| HASH | 64 bits | Yes, past ~4 billion elements |
| MD5 | 128 bits | Yes, but incredibly unlikely |
| SHA256 | 256 bits | No |
- -:::note Note -A collision occurs when two pieces of data that are different end up hashing to the same value. If a collision occurs, a different hashing method should be used. -::: - - -## Why we like surrogate keys - -Let’s keep it brief: surrogate keys allow data folks to quickly understand the grain of the database object and are compatible across many different data warehouses. - - -### Readability - -Because surrogate keys are comprised of the fields that make a uniqueness constraint on the data, you can quickly identify the grain of the data. For example, if you see in your data model that the surrogate key field is created by hashing the `ad_id` and `calendar_date` fields, you can immediately know the true grain of the data. When you clearly understand the grain of a database object, this can make for an easier understanding of how entities join together and fan out. - - -### Compatibility - -Making a surrogate key involves a relatively straightforward usage of SQL: maybe some coalescing, concatenation, and a hashing method. Most, if not all, modern data warehouses support both the ability to concat, coalesce, and hash fields. They may not have the exact same syntax or hashing functions available, but their core functionality is the same. - -:::tip Tip -dbt supports several macros to help data folks write DRY (don’t repeat yourself) code. The [surrogate_key macro](https://github.com/dbt-labs/dbt-utils#surrogate_key-source) helps you create surrogate keys with the MD5 function without having to worry about coalescing potentially null field values. -::: - - -## Performance concerns for surrogate keys - -In the past, you may have seen surrogate keys take the form of integers (ex. 1, 2, 3, 4). These surrogate keys were often limited to 4-bit integers that could be indexed quickly. However, in the practice of analytics engineering, surrogate keys derived from the data often take the form of a hashed string value. Given this form, these surrogate keys are not necessarily optimized for performance for large table scans and complex joins. For large data models (millions, billions, trillions of rows) that have surrogate keys, you should materialize them as tables or [incremental models](https://docs.getdbt.com/docs/build/incremental-models) to help make joining entities more efficient. - -## Conclusion - -Surrogate keys are unique row identifiers that are created by using columns in a database object to create a uniqueness constraint on the data. To create a surrogate key, you will use a cryptographic algorithm usually in the form of the MD5 function to hash together fields that create a uniqueness constraint on the dataset. Ultimately, surrogate keys are a great way to create unique row identifiers for database objects that lack them naturally and allow folks to easily identify the grain of the data. - -## Further reading - -Want to learn more about keys, dbt, and everything in-between? 
Check out the following: - -* [Glossary: Primary keys](https://docs.getdbt.com/terms/primary-key) -* [Generating surrogate keys across warehouses](https://docs.getdbt.com/blog/sql-surrogate-keys) -* [Generating an auto-incrementing ID in dbt](https://discourse.getdbt.com/t/generating-an-auto-incrementing-id-in-dbt/579/2) -* [The most underutilized function in SQL](https://www.getdbt.com/blog/the-most-underutilized-function-in-sql/) diff --git a/website/docs/terms/table.md b/website/docs/terms/table.md deleted file mode 100644 index bfc4e680660..00000000000 --- a/website/docs/terms/table.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -id: table -title: Table -description: "Read this guide to understand how tables work in dbt." -displayText: table -hoverSnippet: In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. ---- - -In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. - -Here is an example of a table: - -| character_id | first_name | last_name | email | -| ------------ | ------------ | --------- | --------------------- | -| 01 | Frodo | Baggins | frodo@lotr.com | -| 02 | Bilbo | Baggins | bilbo@theshire.co.uk | -| 03 | Gandalf | The Grey | greywizard1@gmail.com | - -Tables do use storage in your . The data can be queried directly because you are directly pulling from the raw data itself. If a particular table was created by underlying data, the table will not be automatically updated. - -This table definition applies to most data warehouses, however, there are different flavors of tables for different warehouses. For example, Snowflake has transient and temporary tables that support different features. - -## Why are tables useful? - -Tables are an excellent choice for persisting transformed data in your warehouse at the time of execution. However, if the underlying data used is changed, the table will not reflect the underlying changes. If that is something you need, dbt Labs recommends views. diff --git a/website/docs/terms/view.md b/website/docs/terms/view.md deleted file mode 100644 index 53c122ca9e6..00000000000 --- a/website/docs/terms/view.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -id: view -title: View -description: Read this guide to understand how views work in dbt. -displayText: view -hoverSnippet: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or data warehouse). ---- -:::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! -::: - -A view (as opposed to a ) is a defined passthrough SQL query that can be run against a database (or ). A view doesn’t store data, like a table does, but it defines the logic that you need to fetch the underlying data. - -For example, you might define a SQL view to count new users in a day: - -```sql - select - created_date, - count(distinct(user_id)) as new_users - from users - group by created_date -``` - -But this SQL might get tedious to write over and over again, so instead you could define it as a view called `new_users`, and instead query `select * from new_users`. - -When that `new_users` query runs, the underlying view compiles and runs against the database. 
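
To make that concrete, here is a minimal sketch of what defining that view looks like in plain SQL — the exact DDL syntax can vary slightly by warehouse:

```sql
create or replace view new_users as
    select
        created_date,
        count(distinct user_id) as new_users
    from users
    group by created_date;
```

In a dbt project you typically wouldn't hand-write this DDL at all — configuring the model with `materialized='view'` tells dbt to generate the equivalent statement for you.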
- -## Tips on using views - -A healthy relationship with views is built on expectations. - -You shouldn’t expect a view in itself to be your final destination in terms of data modeling (they’re slow + often more costly to query than tables, not great for connecting to a downstream process like reporting), but you should trust them to get you from point A to point B. - -## Further reading - -- [Best practices guide on choosing table vs view materializations](/best-practices) diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 41953c754e8..60e6889f21c 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -21,7 +21,7 @@ export default function Term({ id, children = undefined }) { }) // Get terms file - const file = require('../../../docs/terms/terms.md') + const file = require('../../../docs/terms.md') // Get term by id const term = file?.frontMatter?.[id] From 0b8fce0e7917efc4d34d26ea19d357484115a720 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 15:55:48 -0400 Subject: [PATCH 07/21] update demo test-terms page --- website/docs/docs/test-terms.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/docs/test-terms.md b/website/docs/docs/test-terms.md index d33b6aa261d..be21d2bca1c 100644 --- a/website/docs/docs/test-terms.md +++ b/website/docs/docs/test-terms.md @@ -17,4 +17,4 @@ This should show noDisplayText term id as children and displayText not set: -This should NOT show a term (but should fail gracefully): Demo +This should NOT show a hover snippet (but should still show children): Demo From 46c23a50134cd516bd6b333037d39d82a279f946 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 15:57:00 -0400 Subject: [PATCH 08/21] remove link component from term component --- website/src/components/term/index.js | 1 - 1 file changed, 1 deletion(-) diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 60e6889f21c..6ec098221e6 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -1,5 +1,4 @@ import React, { useState, useEffect } from 'react' -import Link from '@docusaurus/Link'; import ReactTooltip from "react-tooltip"; import styles from './styles.module.css'; From 7034b34d2e311630705dfbf4792df823c999036f Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 15:58:34 -0400 Subject: [PATCH 09/21] adjust pageReady handling in terms component --- website/src/components/term/index.js | 37 ++++++++++------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 6ec098221e6..41846612c42 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -39,30 +39,19 @@ export default function Term({ id, children = undefined }) { return ( <> - {pageReady ? 
( - <> - - {displayValue} - - {hoverSnippet && ( - - {hoverSnippet} - - )} - - ) : ( - {displayValue} + + {displayValue} + + {pageReady && hoverSnippet && ( + + {hoverSnippet} + )} ); From 06f2c755fd77cd9efa7b2eedfa6e0528b36eacf3 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 16:12:26 -0400 Subject: [PATCH 10/21] update name of hover-terms file --- website/docs/{terms.md => hover-terms.md} | 0 website/docusaurus.config.js | 2 +- website/src/components/term/index.js | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename website/docs/{terms.md => hover-terms.md} (100%) diff --git a/website/docs/terms.md b/website/docs/hover-terms.md similarity index 100% rename from website/docs/terms.md rename to website/docs/hover-terms.md diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index f820da39e8e..82eb6df54f4 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -243,7 +243,7 @@ var siteSettings = { sidebarCollapsible: true, exclude: [ - 'terms/**' + 'hover-terms.md' ] }, blog: { diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 41846612c42..4dcd4d876d6 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -20,7 +20,7 @@ export default function Term({ id, children = undefined }) { }) // Get terms file - const file = require('../../../docs/terms.md') + const file = require('../../../docs/hover-terms.md') // Get term by id const term = file?.frontMatter?.[id] From 238844f030588f1167c14d4512835918b532477f Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 16:12:54 -0400 Subject: [PATCH 11/21] update comment --- website/src/components/term/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 4dcd4d876d6..9080205a63a 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -4,7 +4,7 @@ import styles from './styles.module.css'; {/* Props: - id: maps to term in website/docs/terms/terms.md + id: maps to term in website/docs/hover-terms.md children (optional): to display different text other than displayText property for term */} From 575c176764eb959c6d38194977d6d01b8460458e Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Fri, 20 Sep 2024 16:25:43 -0400 Subject: [PATCH 12/21] update term links on two pages --- website/docs/docs/build/incremental-models.md | 6 +++--- website/docs/sql-reference/aggregate-functions/sql-sum.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/docs/build/incremental-models.md b/website/docs/docs/build/incremental-models.md index 2f8bbc46c3a..c48030cc32d 100644 --- a/website/docs/docs/build/incremental-models.md +++ b/website/docs/docs/build/incremental-models.md @@ -94,7 +94,7 @@ Not specifying a `unique_key` will result in append-only behavior, which means d The optional `unique_key` parameter specifies a field (or combination of fields) that defines the grain of your model. That is, the field(s) identify a single unique row. You can define `unique_key` in a configuration block at the top of your model, and it can be a single column name or a list of column names. -The `unique_key` should be supplied in your model definition as a string representing a single column or a list of single-quoted column names that can be used together, for example, `['col1', 'col2', …])`. 
Columns used in this way should not contain any nulls, or the incremental model run may fail. Either ensure that each column has no nulls (for example with `coalesce(COLUMN_NAME, 'VALUE_IF_NULL')`), or define a single-column [surrogate key](/terms/surrogate-key) (for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)). +The `unique_key` should be supplied in your model definition as a string representing a single column or a list of single-quoted column names that can be used together, for example, `['col1', 'col2', …])`. Columns used in this way should not contain any nulls, or the incremental model run may fail. Either ensure that each column has no nulls (for example with `coalesce(COLUMN_NAME, 'VALUE_IF_NULL')`), or define a single-column [surrogate key](https://www.getdbt.com/blog/guide-to-surrogate-key) (for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)). :::tip In cases where you need multiple columns in combination to uniquely identify each row, we recommend you pass these columns as a list (`unique_key = ['user_id', 'session_number']`), rather than a string expression (`unique_key = 'concat(user_id, session_number)'`). @@ -103,7 +103,7 @@ By using the first syntax, which is more universal, dbt can ensure that the colu When you pass a list in this way, please ensure that each column does not contain any nulls, or the incremental model run may fail. -Alternatively, you can define a single-column [surrogate key](/terms/surrogate-key), for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source). +Alternatively, you can define a single-column [surrogate key](https://www.getdbt.com/blog/guide-to-surrogate-key), for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source). ::: When you define a `unique_key`, you'll see this behavior for each row of "new" data returned by your dbt model: @@ -111,7 +111,7 @@ When you define a `unique_key`, you'll see this behavior for each row of "new" d * If the same `unique_key` is present in the "new" and "old" model data, dbt will update/replace the old row with the new row of data. The exact mechanics of how that update/replace takes place will vary depending on your database, [incremental strategy](/docs/build/incremental-strategy), and [strategy specific configs](/docs/build/incremental-strategy#strategy-specific-configs). * If the `unique_key` is _not_ present in the "old" data, dbt will insert the entire row into the table. -Please note that if there's a unique_key with more than one row in either the existing target table or the new incremental rows, the incremental model may fail depending on your database and [incremental strategy](/docs/build/incremental-strategy). If you're having issues running an incremental model, it's a good idea to double check that the unique key is truly unique in both your existing database table and your new incremental rows. You can [learn more about surrogate keys here](/terms/surrogate-key). +Please note that if there's a unique_key with more than one row in either the existing target table or the new incremental rows, the incremental model may fail depending on your database and [incremental strategy](/docs/build/incremental-strategy). 
If you're having issues running an incremental model, it's a good idea to double check that the unique key is truly unique in both your existing database table and your new incremental rows. You can [learn more about surrogate keys here](https://www.getdbt.com/blog/guide-to-surrogate-key). :::info While common incremental strategies, such as`delete+insert` + `merge`, might use `unique_key`, others don't. For example, the `insert_overwrite` strategy does not use `unique_key`, because it operates on partitions of data rather than individual rows. For more information, see [About incremental_strategy](/docs/build/incremental-strategy). diff --git a/website/docs/sql-reference/aggregate-functions/sql-sum.md b/website/docs/sql-reference/aggregate-functions/sql-sum.md index 494a3863ad3..8216e3f790b 100644 --- a/website/docs/sql-reference/aggregate-functions/sql-sum.md +++ b/website/docs/sql-reference/aggregate-functions/sql-sum.md @@ -11,7 +11,7 @@ slug: /sql-reference/sum The SQL SUM function is handy and ever-present in data work. Let’s unpack what it is, how to use it, and why it's valuable. -Jumping into it, the SUM aggregate function allows you to calculate the sum of a numeric column or across a set of rows for a column. Ultimately, the SUM function is incredibly useful for calculating meaningful business metrics, such as Lifetime Value (LTV), and creating key numeric fields in [`fct_` and `dim_` models](/terms/dimensional-modeling). +Jumping into it, the SUM aggregate function allows you to calculate the sum of a numeric column or across a set of rows for a column. Ultimately, the SUM function is incredibly useful for calculating meaningful business metrics, such as Lifetime Value (LTV), and creating key numeric fields in [`fct_` and `dim_` models](https://www.getdbt.com/blog/guide-to-dimensional-modeling). 
## How to use the SUM function in a query From 115d50feaa24dc6a4bdd1487e717bd26dff1ebbc Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Mon, 23 Sep 2024 10:04:12 -0400 Subject: [PATCH 13/21] move hover-terms into terms directory to suppress partial warning --- website/docs/{ => terms}/hover-terms.md | 0 website/src/components/term/index.js | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename website/docs/{ => terms}/hover-terms.md (100%) diff --git a/website/docs/hover-terms.md b/website/docs/terms/hover-terms.md similarity index 100% rename from website/docs/hover-terms.md rename to website/docs/terms/hover-terms.md diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 9080205a63a..9abcd1fa006 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -4,7 +4,7 @@ import styles from './styles.module.css'; {/* Props: - id: maps to term in website/docs/hover-terms.md + id: maps to term in website/docs/terms/hover-terms.md children (optional): to display different text other than displayText property for term */} @@ -20,7 +20,7 @@ export default function Term({ id, children = undefined }) { }) // Get terms file - const file = require('../../../docs/hover-terms.md') + const file = require('../../../docs/terms/hover-terms.md') // Get term by id const term = file?.frontMatter?.[id] From a4aa9b65d771b5e3d8d8fe6fbe824ddf59c25fee Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Mon, 23 Sep 2024 10:18:36 -0400 Subject: [PATCH 14/21] fix text color when term used in callout --- website/src/components/term/styles.module.css | 2 -- 1 file changed, 2 deletions(-) diff --git a/website/src/components/term/styles.module.css b/website/src/components/term/styles.module.css index 22603d6c058..8cc86743941 100644 --- a/website/src/components/term/styles.module.css +++ b/website/src/components/term/styles.module.css @@ -1,10 +1,8 @@ .term { position: relative; text-decoration: underline dotted var(--ifm-font-color-base); - color: var(--ifm-font-color-base); } .term:hover { - color: var(--ifm-link-color); text-decoration: underline dotted var(--ifm-link-color); } .termToolTip { From 769ca87ccb12515524e7994bc1f5f756e8b268f2 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Mon, 23 Sep 2024 10:22:21 -0400 Subject: [PATCH 15/21] revert pageReady edits to Term component --- website/src/components/term/index.js | 32 +++++++++++++++++----------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 9abcd1fa006..99f354957cc 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -39,19 +39,25 @@ export default function Term({ id, children = undefined }) { return ( <> - - {displayValue} - - {pageReady && hoverSnippet && ( - - {hoverSnippet} - + {pageReady ? 
( + <> + + {displayValue} + + {hoverSnippet && ( + + {hoverSnippet} + + )} + + ) : ( + {displayValue} )} ); From 93288c9a2b2949488ed378f9da7aea7c3474bdcd Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Mon, 23 Sep 2024 10:32:08 -0400 Subject: [PATCH 16/21] only use ReactTooltip if hoverSnippet set --- website/src/components/term/index.js | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 99f354957cc..0c9a1b8b623 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -39,22 +39,20 @@ export default function Term({ id, children = undefined }) { return ( <> - {pageReady ? ( + {pageReady && hoverSnippet ? ( <> {displayValue} - {hoverSnippet && ( - - {hoverSnippet} - - )} + + {hoverSnippet} + ) : ( {displayValue} From d8e8999cb874f795cb65d45153616e21aeac4d31 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Mon, 23 Sep 2024 10:43:37 -0400 Subject: [PATCH 17/21] add redirects --- website/vercel.json | 60 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/website/vercel.json b/website/vercel.json index defdc449276..b530e838857 100644 --- a/website/vercel.json +++ b/website/vercel.json @@ -3530,6 +3530,66 @@ "source": "/best-practices/how-we-structure/5-semantic-layer-marts", "destination": "/best-practices/how-we-build-our-metrics/semantic-layer-7-semantic-structure", "permanent": true + }, + { + "source": "/terms/cte", + "destination": "https://www.getdbt.com/blog/guide-to-cte", + "permanent": true + }, + { + "source": "/terms/dag", + "destination": "https://www.getdbt.com/blog/guide-to-dag", + "permanent": true + }, + { + "source": "/terms/data-lineage", + "destination": "https://www.getdbt.com/blog/guide-to-data-lineage", + "permanent": true + }, + { + "source": "/terms/ddl", + "destination": "https://www.getdbt.com/blog/guide-to-ddl", + "permanent": true + }, + { + "source": "/terms/dimensional-modeling", + "destination": "https://www.getdbt.com/blog/guide-to-dimensional-modeling", + "permanent": true + }, + { + "source": "/terms/dml", + "destination": "https://www.getdbt.com/blog/guide-to-dml", + "permanent": true + }, + { + "source": "/terms/dry", + "destination": "https://www.getdbt.com/blog/guide-to-dry", + "permanent": true + }, + { + "source": "/terms/grain", + "destination": "https://www.getdbt.com/blog/guide-to-grain", + "permanent": true + }, + { + "source": "/terms/subquery", + "destination": "https://www.getdbt.com/blog/guide-to-subquery", + "permanent": true + }, + { + "source": "/terms/surrogate-key", + "destination": "https://www.getdbt.com/blog/guide-to-surrogate-key", + "permanent": true + }, + { + "source": "/glossary", + "destination": "https://www.getdbt.com/blog", + "permanent": true + }, + { + "source": "/terms/:path*", + "destination": "https://www.getdbt.com/blog", + "permanent": true } ], "headers": [ From b5e9a92e53b4737bf728ab486087b0cf858d844c Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Tue, 24 Sep 2024 15:59:03 -0400 Subject: [PATCH 18/21] update data-grain redirect --- website/vercel.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/vercel.json b/website/vercel.json index b530e838857..c240bbec877 100644 --- a/website/vercel.json +++ b/website/vercel.json @@ -3568,7 +3568,7 @@ }, { "source": "/terms/grain", - "destination": "https://www.getdbt.com/blog/guide-to-grain", + "destination": 
"https://www.getdbt.com/blog/guide-to-data-grain", "permanent": true }, { From 4ec0649f09f88dbbacf4191fcfdfa5a5bd1e1e59 Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Thu, 26 Sep 2024 11:01:09 -0400 Subject: [PATCH 19/21] bring back 3 etl pages so these remain live while working on new blog post --- website/docs/terms/elt.md | 139 ++++++++++++++++++++++++++++++ website/docs/terms/etl.md | 130 ++++++++++++++++++++++++++++ website/docs/terms/reverse-etl.md | 94 ++++++++++++++++++++ website/vercel.json | 2 +- 4 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 website/docs/terms/elt.md create mode 100644 website/docs/terms/etl.md create mode 100644 website/docs/terms/reverse-etl.md diff --git a/website/docs/terms/elt.md b/website/docs/terms/elt.md new file mode 100644 index 00000000000..0e7d11bf7dd --- /dev/null +++ b/website/docs/terms/elt.md @@ -0,0 +1,139 @@ +--- +id: elt +title: What is ELT (Extract, Load, Transform)? +description: ELT is the process of first extraction data from different sources, then loading it into a data warehouse, and finally transforming it. +displayText: ELT +hoverSnippet: Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, loading it into a target data warehouse, and finally transforming it. +--- + + What is ELT (Extract, Load, Transform)? How does it differ from ETL? + +Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, then loading it into a target , and finally transforming it. + +ELT has emerged as a paradigm for how to manage information flows in a modern data warehouse. This represents a fundamental shift from how data previously was handled when Extract, Transform, Load (ETL) was the data workflow most companies implemented. + +Transitioning from ETL to ELT means that you no longer have to capture your transformations during the initial loading of the data into your data warehouse. Rather, you are able to load all of your data, then build transformations on top of it. Data teams report that the ELT workflow has several advantages over the traditional ETL workflow which we’ll go over [in-depth later in this glossary](#benefits-of-elt). + +## How ELT works + +In an ELT process, data is extracted from data sources, loaded into a target data platform, and finally transformed for analytics use. We’ll go over the three components (extract, load, transform) in detail here. + +![Diagram depicting the ELT workflow. Data is depicted being extracted from example data sources like an Email CRM, Facebook Ads platform, Backend databases, and Netsuite. The data is then loaded as raw data into a data warehouse. From there, the data is transformed within the warehouse by renaming, casting, joining, or enriching the raw data. The result is then modeled data inside your data warehouse.](/img/docs/terms/elt/elt-diagram.png) + +### Extract + +In the extraction process, data is extracted from multiple data sources. The data extracted is, for the most part, data that teams eventually want to use for analytics work. Some examples of data sources can include: + +- Backend application databases +- Marketing platforms +- Email and sales CRMs +- and more! + +Accessing these data sources using Application Programming Interface (API) calls can be a challenge for individuals and teams who don't have the technical expertise or resources to create their own scripts and automated processes. 
However, the recent development of certain open-source and Software as a Service (SaaS) products has removed the need for this custom development work. By establishing the option to create and manage pipelines in an automated way, you can extract the data from data sources and load it into data warehouses via a user interface. + +Since not every data source will integrate with SaaS tools for extraction and loading, it’s sometimes inevitable that teams will write custom ingestion scripts in addition to their SaaS tools. + +### Load + +During the loading stage, data that was extracted is loaded into the target data warehouse. Some examples of modern data warehouses include Snowflake, Amazon Redshift, and Google BigQuery. Examples of other data storage platforms include data lakes such as Databricks’s Data Lakes. Most of the SaaS applications that extract data from your data sources will also load it into your target data warehouse. Custom or in-house extraction and load processes usually require strong data engineering and technical skills. + +At this point in the ELT process, the data is mostly unchanged from its point of extraction. If you use an extraction and loading tool like Fivetran, there may have been some light normalization on your data. But for all intents and purposes, the data loaded into your data warehouse at this stage is in its raw format. + +### Transform + +In the final transformation step, the raw data that has been loaded into your data warehouse is finally ready for modeling! When you first look at this data, you may notice a few things about it… + +- Column names may or may not be clear +- Some columns are potentially the incorrect data type +- Tables are not joined to other tables +- Timestamps may be in the incorrect timezone for your reporting +- fields may need to be unnested +- Tables may be missing primary keys +- And more! + +...hence the need for transformation! During the transformation process, data from your data sources is usually: + +- **Lightly Transformed**: Fields are cast correctly, timestamp fields’ timezones are made uniform, tables and fields are renamed appropriately, and more. +- **Heavily Transformed**: Business logic is added, appropriate materializations are established, data is joined together, etc. +- **QA’d**: Data is tested according to business standards. In this step, data teams may ensure primary keys are unique, model relations match-up, column values are appropriate, and more. + +Common ways to transform your data include leveraging modern technologies such as dbt, writing custom SQL scripts that are automated by a scheduler, utilizing stored procedures, and more. + +## ELT vs ETL + +The primary difference between the traditional ETL and the modern ELT workflow is when [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) and loading take place. In ETL workflows, data extracted from data sources is transformed prior to being loaded into target data platforms. Newer ELT workflows have data being transformed after being loaded into the data platform of choice. Why is this such a big deal? + +| | ELT | ETL | +|---|---|---| +| Programming skills required| Often little to no code to extract and load data into your data warehouse. | Often requires custom scripts or considerable data engineering lift to extract and transform data prior to load. | +| Separation of concerns | Extraction, load, and transformation layers can be explicitly separated out by different products. 
| ETL processes are often encapsulated in one product. | +| Distribution of transformations | Since transformations take place last, there is greater flexibility in the modeling process. Worry first about getting your data in one place, then you have time to explore the data to understand the best way to transform it. | Because transformation occurs before data is loaded into the target location, teams must conduct thorough work prior to make sure data is transformed properly. Heavy transformations often take place downstream in the BI layer. | +| [Data team distribution](https://www.getdbt.com/data-teams/analytics-job-descriptions/) | ELT workflows empower data team members who know SQL to create their own extraction and loading pipelines and transformations. | ETL workflows often require teams with greater technical skill to create and maintain pipelines. | + +Why has ELT adoption grown so quickly in recent years? A few reasons: + +- **The abundance of cheap cloud storage with modern data warehouses.** The creation of modern data warehouses such Redshift and Snowflake has made it so teams of all sizes can store and scale their data at a more efficient cost. This was a huge enabler for the ELT workflow. +- **The development of low-code or no-code data extractors and loaders.** Products that require little technical expertise such as Fivetran and Stitch, which can extract data from many data sources and load it into many different data warehouses, have helped lower the barrier of entry to the ELT workflow. Data teams can now relieve some of the data engineering lift needed to extract data and create complex transformations. +- **A true code-based, version-controlled transformation layer with the development of dbt.** Prior to the development of dbt, there was no singular transformation layer product. dbt helps data analysts apply software engineering best practices (version control, CI/CD, and testing) to data transformation, ultimately allowing for anyone who knows SQL to be a part of the ELT process. +- **Increased compatibility between ELT layers and technology in recent years.** With the expansion of extraction, loading, and transformation layers that integrate closely together and with cloud storage, the ELT workflow has never been more accessible. For example, Fivetran creates and maintains [dbt packages](https://hub.getdbt.com/) to help write dbt transformations for the data sources they connect to. + +## Benefits of ELT + +You often hear about the benefits of the ELT workflow to data, but you can sometimes forget to talk about the benefits it brings to people. There are a variety of benefits that this workflow brings to the actual data (which we’ll outline in detail below), such as the ability to recreate historical transformations, test data and data models, and more. We'll also want to use this section to emphasize the empowerment the ELT workflow brings to both data team members and business stakeholders. + +### ELT benefit #1: Data as code + +Ok we said it earlier: The ELT workflow allows data teams to function like software engineers. But what does this really mean? How does it actually impact your data? + +#### Analytics code can now follow the same best practices as software code + +At its core, data transformations that occur last in a data pipeline allow for code-based and version-controlled transformations. 
Code-based, version-controlled transformations like this permit data team members to:

- Easily recreate historical transformations by rolling back commits
- Establish code-based tests
- Implement CI/CD workflows
- Document data models like typical software code

#### Scaling, made sustainable

As your business grows, the number of data sources grows along with it. As such, so does the number of transformations and models needed for your business. Managing a high number of transformations without version control or automation is not scalable.

The ELT workflow capitalizes on transformations occurring last to provide flexibility and software engineering best practices to data transformation. Instead of having to worry about how your extraction scripts scale as your data increases, data can be extracted and loaded automatically with a few clicks.

### ELT benefit #2: Bring the power to the people

The ELT workflow opens up a world of opportunity for the people who work on that data, not just the data itself.

#### Empowers data team members

Data analysts, analytics engineers, and even data scientists no longer have to be dependent on data engineers to create custom pipelines and models. Instead, they can use point-and-click products such as Fivetran and Airbyte to extract and load the data for them.

Having the transformation as the final step in the ELT workflow also allows data folks to leverage their understanding of the data and SQL to focus more on actually modeling the data.

#### Promotes greater transparency for end business users

Because transformations are no longer hidden in the ETL process, data teams can expose the version-controlled code used to transform data for analytics to end business users. Instead of having to manually respond to the common question, “How is this data generated?” data folks can direct business users to documentation and repositories. Having end business users involved in or viewing the data transformations promotes greater collaboration and awareness between business and data folks.

## ELT tools

As mentioned earlier, the recent development of certain technologies and products has helped lower the barrier of entry to implementing the ELT workflow. Most of these new products act as one or two parts of the ELT process, but some have crossover across all three parts. We’ll outline some of the current tools in the ELT ecosystem below.

| Product | E/L/T? | Description | Open source option? |
|---|---|---|---|
| Fivetran/HVR | E, some T, L | Fivetran is a SaaS company that helps data teams extract, load, and perform some transformation on their data. Fivetran easily integrates with modern data warehouses and dbt. They also offer transformations that leverage dbt Core. | :x: |
| Stitch by Talend | E, L | Stitch (part of Talend) is another SaaS product that has many data connectors to extract data and load it into data warehouses. | :x: |
| Airbyte | E, L | Airbyte is an open-source and cloud service that allows teams to create data extraction and load pipelines. | :white_check_mark: |
| Funnel | E, some T, L | Funnel is another product that can extract and load data. Funnel’s data connectors are primarily focused around marketing data sources. | :x: |
| dbt | T | dbt is the transformation tool that enables data analysts and engineers to transform, test, and document data in the cloud data warehouse. dbt offers both an open-source and cloud-based product.
| :white_check_mark: | + +## Conclusion + +The past few years have been a whirlwind for the data world. The increased accessibility and affordability of cloud warehouses, no-code data extractors and loaders, and a true transformation layer with dbt has allowed for the ELT workflow to become the preferred analytics workflow. ETL predates ELT and differs in when data is transformed. In both processes, data is first extracted from different sources. However, in ELT processes, data is loaded into the target data platform and then transformed. The ELT workflow ultimately allows for data team members to extract, load, and model their own data in a flexible, accessible, and scalable way. + +## Further reading + +Here's some of our favorite content about the ELT workflow: + +- [The case for the ELT workflow](https://www.getdbt.com/analytics-engineering/case-for-elt-workflow/) +- [A love letter to ETL tools](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/) +- [What is dbt?](https://getdbt.com/product/what-is-dbt/) diff --git a/website/docs/terms/etl.md b/website/docs/terms/etl.md new file mode 100644 index 00000000000..321f59a65d0 --- /dev/null +++ b/website/docs/terms/etl.md @@ -0,0 +1,130 @@ +--- +id: etl +title: What is ETL (Extract, Transform, Load)? +description: ETL is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. +displayText: ETL +hoverSnippet: Extract, Transform, Load (ETL) is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. +--- + + + What is ETL (Extract, Transform, Load)? How has it evolved? + + +ETL, or “Extract, Transform, Load”, is the process of first extracting data from a data source, transforming it, and then loading it into a target . In ETL workflows, much of the meaningful [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) occurs outside this primary pipeline in a downstream business intelligence (BI) platform. + +ETL is contrasted with the newer (Extract, Load, Transform) workflow, where transformation occurs after data has been loaded into the target data warehouse. In many ways, the ETL workflow could have been renamed the ETLT workflow, because a considerable portion of meaningful data transformations happen outside the data pipeline. The same transformations can occur in both ETL and ELT workflows, the primary difference is *when* (inside or outside the primary ETL workflow) and *where* the data is transformed (ETL platform/BI tool/data warehouse). + +It’s important to talk about ETL and understand how it works, where it provides value, and how it can hold people back. If you don’t talk about the benefits and drawbacks of systems, how can you expect to improve them? + +## How ETL works + +In an ETL process, data is first extracted from a source, transformed, and then loaded into a target data platform. We’ll go into greater depth for all three steps below. + +![A diagram depicting the ETL workflow. The diagram starts by depicting raw data being extracted from various example data sources like an email CRM, Facebook Ads platform, a backend database, and Netsuite. Once the data is extracted, the raw data is transformed within the data pipeline via renaming, casting, joining, and enriching. 
After the data is transformed within the data pipeline, the modeled data is loaded into a data warehouse.](/img/docs/terms/etl/etl-diagram.png)

### Extract

In this first step, data is extracted from different data sources. Data that is extracted at this stage is likely going to be eventually used by end business users to make decisions. Some examples of these data sources include:

- Ad platforms (Facebook Ads, Google Ads, etc.)
- Backend application databases
- Sales CRMs
- And more!

To actually get this data, data engineers may write custom scripts that make Application Programming Interface (API) calls to extract all the relevant data. Because making and automating these API calls gets harder as data sources and data volume grow, this method of extraction often requires strong technical skills. In addition, these extraction scripts also involve considerable maintenance since APIs change relatively often. Data engineers are often incredibly competent at using different programming languages such as Python and Java. Data teams can also extract from these data sources with open source and Software as a Service (SaaS) products.

### Transform

At this stage, the raw data that has been extracted is normalized and modeled. In ETL workflows, much of the actual meaningful business logic, metric calculations, and entity joins tend to happen further down in a downstream BI platform. As a result, the transformation stage here is focused on data cleanup and normalization: renaming columns, casting fields correctly, and converting timestamps.

To actually transform the data, there are two primary methods teams will use:

- **Custom solutions**: Data teams (typically data engineers on the team) write custom scripts and create automated pipelines to transform the data. Unlike ELT transformations that typically use SQL for modeling, ETL transformations are often written in other programming languages such as Python or Scala. Data engineers may leverage technologies such as Apache Spark or Hadoop at this point to help process large volumes of data.
- **ETL products**: There are ETL products that will extract, transform, and load your data in one platform. [These tools](#etl-tools) often involve little to no code and instead use graphical user interfaces (GUIs) to create pipelines and transformations.

### Load

In the final stage, the transformed data is loaded into your target data warehouse. Once this transformed data is in its final destination, it’s most commonly exposed to end business users either in a BI tool or in the data warehouse directly.

The ETL workflow implies that your raw data does not live in your data warehouse. *Because transformations occur before load, only transformed data lives in your data warehouse in the ETL process.* This can make it harder to ensure that transformations are performing the correct functionality.

## How ETL is being used

While ELT adoption is growing, we still see ETL use cases for processing large volumes of data and adhering to strong data governance principles.

### ETL to efficiently normalize large volumes of data

ETL can be an efficient way to perform simple normalizations across large data sets. Doing these lighter transformations across a large volume of data during loading can help get the data formatted properly and quickly for downstream use. In addition, end business users sometimes need quick access to raw or somewhat normalized data.
Through an ETL workflow, data teams can conduct lightweight transformations on data sources and quickly expose them in their target data warehouse and downstream BI tool.

### ETL for hashing PII prior to load

Some companies will want to mask, hash, or remove PII values before they enter their data warehouse. In an ETL workflow, teams can transform PII to hashed values or remove them completely during the loading process. This limits where PII is available or accessible in an organization’s data warehouse.

## ETL challenges

There are reasons ETL has persisted as a workflow for over twenty years. However, there are also reasons why there’s been such immense innovation in this part of the data world in the past decade. From our perspective, the technical and human limitations we describe below are some of the reasons ELT has surpassed ETL as the preferred workflow.

### ETL challenge #1: Technical limitations

**Limited or lack of version control**

When transformations exist as standalone scripts or are deeply woven into ETL products, it can be hard to version control the transformations. Not having version control on transformations as code means that data teams can’t easily recreate or roll back historical transformations, or perform code reviews.

**Immense amount of business logic living in BI tools**

Some teams with ETL workflows implement much of their business logic in their BI platform rather than earlier, in their transformation phase. While most organizations have some business logic in their BI tools, an excess of this logic downstream can make rendering data in the BI tool incredibly slow and potentially hard to track if the code in the BI tool is not version controlled or exposed in documentation.

**Challenging QA processes**

While data quality testing can be done in ETL processes, not having the raw data living somewhere in the data warehouse inevitably makes it harder to ensure data models are performing the correct functionality. In addition, quality control continually gets harder as the number of data sources and pipelines within your system grows.

### ETL challenge #2: Human limitations

**Data analysts can be excluded from ETL work**

Because ETL workflows often involve incredibly technical processes, they’ve restricted data analysts from being involved in the data workflow process. One of the greatest strengths of data analysts is their knowledge of the data and SQL, and when extractions and transformations involve unfamiliar code or applications, they and their expertise can be left out of the process. Data analysts and scientists also become dependent on other people to create the schemas, tables, and datasets they need for their work.

**Business users are kept in the dark**

Transformations and business logic can often be buried deep in custom scripts, ETL tools, and BI platforms. At the end of the day, this can hurt business users: They’re kept out of the data modeling process and have limited views into how data transformation takes place. As a result, end business users often have little clarity on data definition, quality, and freshness, which ultimately can decrease trust in the data and data team.

## ETL vs ELT

You may read other articles or technical documents that use ETL and ELT interchangeably. On paper, the only difference is the order in which the T and the L appear. However, this mere switching of letters dramatically changes the way data exists in and flows through a business’ system.
In both processes, data from different data sources is extracted in similar ways. However, in ELT, data is then directly loaded into the target data platform versus being transformed in ETL. Now, via ELT workflows, both raw and transformed data can live in a data warehouse. In ELT workflows, data folks have the flexibility to model the data after they’ve had the opportunity to explore and analyze the raw data. ETL workflows can be more constraining since transformations happen immediately after extraction. We break down some of the other major differences between the two below:

| | ELT | ETL |
|---|---|---|
| Programming skills required | Often requires little to no code to extract and load data into your data warehouse. | Often requires custom scripts or considerable data engineering lift to extract and transform data prior to load. |
| Separation of concerns | Extraction, load, and transformation layers can be explicitly separated out by different products. | ETL processes are often encapsulated in one product. |
| Distribution of transformations | Since transformations take place last, there is greater flexibility in the modeling process. Worry first about getting your data in one place, then you have time to explore the data to understand the best way to transform it. | Because transformation occurs before data is loaded into the target location, teams must conduct thorough work beforehand to make sure data is transformed properly. Heavy transformations often take place downstream in the BI layer. |
| [Data team roles](https://www.getdbt.com/data-teams/analytics-job-descriptions/) | ELT workflows empower data team members who know SQL to create their own extraction and loading pipelines and transformations. | ETL workflows often require teams with greater technical skill to create and maintain pipelines. |

While ELT is growing in adoption, it’s still important to talk about when ETL might be appropriate and where you’ll run into challenges with the ETL workflow.

## ETL tools

A variety of ETL technologies exist to help teams get data into their data warehouse. A good portion of ETL tools on the market today are geared toward enterprise businesses and teams, but some are also applicable for smaller organizations.

| Platform | E/T/L? | Description | Open source option? |
|---|---|---|---|
| Informatica | E, T, L | An all-purpose ETL platform that supports low or no-code extraction, transformation, and loading. Informatica also offers a broad suite of data management solutions beyond ETL and is often leveraged by enterprise organizations. | :x: |
| Integrate.io | E, T, L | A newer ETL product focused on both low-code ETL as well as reverse ETL pipelines. | :x: |
| Matillion | E, T, L | Matillion is an end-to-end ETL solution with a variety of native data connectors and GUI-based transformations. | :x: |
| Microsoft SSIS | E, T, L | Microsoft’s SQL Server Integration Services (SSIS) offers a robust, GUI-based platform for ETL services. SSIS is often used by larger enterprise teams. | :x: |
| Talend Open Studio | E, T, L | An open source suite of GUI-based ETL tools. | :white_check_mark: |

## Conclusion

ETL, or “Extract, Transform, Load,” is the process of extracting data from different data sources, transforming it, and loading that transformed data into a data warehouse. In ETL, lighter transformations typically happen prior to loading, while more meaningful transformations take place in downstream BI tools.
We’re seeing now that ETL is fading out and the newer ELT workflow is replacing it as a practice for many data teams. However, it’s important to note that ETL got us to where we are today: capable of building workflows that extract data within simple UIs, store data in scalable cloud data warehouses, and write data transformations like software engineers.

## Further Reading

Please check out some of our favorite reads regarding ETL and ELT below:

- [Glossary: ELT](https://docs.getdbt.com/terms/elt)
- [The case for the ELT workflow](https://www.getdbt.com/analytics-engineering/case-for-elt-workflow/)
- [A love letter to ETL tools](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/)
- [Reverse ETL](https://www.getdbt.com/analytics-engineering/use-cases/operational-analytics/)

diff --git a/website/docs/terms/reverse-etl.md b/website/docs/terms/reverse-etl.md
new file mode 100644
index 00000000000..a3ccd0b0f70
--- /dev/null
+++ b/website/docs/terms/reverse-etl.md
@@ -0,0 +1,94 @@
+---
+id: reverse-etl
+title: Reverse ETL
+description: Reverse ETL is the process of getting your transformed data stored in your data warehouse to end business platforms, such as sales CRMs and ad platforms.
+displayText: reverse ETL
+hoverSnippet: Reverse ETL is the process of getting your transformed data stored in your data warehouse to end business platforms, such as sales CRMs and ad platforms.
+---

 Reverse ETL, demystified: What it is in plain English

Reverse ETL is the process of getting your transformed data stored in your data warehouse to end business platforms, such as sales CRMs and ad platforms. Once in an end platform, that data is often used to drive meaningful business actions, such as creating custom audiences in ad platforms, personalizing email campaigns, or supplementing data in a sales CRM. You may also hear about reverse ETL referred to as operational analytics or data activation.

Reverse ETL efforts typically happen after data teams have set up their [modern data stack](https://www.getdbt.com/blog/future-of-the-modern-data-stack/) and ultimately have a consistent and automated way to extract, load, and transform data. Data teams are also often responsible for setting up the pipelines to send down data to business platforms, and business users are typically responsible for *using the data* once it gets to their end platform.

Ultimately, reverse ETL is a way to put data where the work is already happening, support self-service efforts, and help business users derive real action out of their data.

## How reverse ETL works

In the reverse ETL process, transformed data is synced from a data warehouse to external tools in order to be leveraged by different business teams.

![A diagram depicting how the reverse ETL process works. It starts with data being extracted from data sources like email CRMs, Facebook Ad platforms, backend databases, and NetSuite. The raw data is then loaded into a data warehouse. After loading, the data is transformed and modeled. The modeled data is then loaded directly back into the tools that created the data, like Email CRMs, Facebook Ad platforms, and others so the insights are more accessible to business users.](/img/docs/terms/reverse-etl/reverse-etl-diagram.png)

The power of reverse ETL comes from sending down *already transformed data* to business platforms.
Raw data, while beautiful in its own way, typically lacks the structure, aggregations, and aliasing to be useful for end business users off the bat. After data teams transform data for business use in pipelines, typically to expose in an end business intelligence (BI) tool, they can also send this cleaned and meaningful data to other platforms where business users can derive value using [reverse ETL tools](#reverse-etl-tools). + +Data teams can choose to write additional transformations that may need to happen for end business tools in reverse ETL tools themselves or by creating [additional models in dbt](https://getdbt.com/open-source-data-culture/reverse-etl-playbook/). + +## Why use reverse ETL? + +There’s a few reasons why your team may want to consider using reverse ETL: + +### Putting data where the work is happening + +While most data teams would love it if business users spent a significant portion of their time in their BI tool, that’s neither practical nor necessarily the most efficient use of their time. In the real world, many business users will spend some time in a BI tool, identify the data that could be useful in a platform they spend a significant amount of time in, and work with the data team to get that data where they need it. Users feel comfortable and confident in the systems they use everyday—why not put the data in the places that allow them to thrive? + +### Manipulating data to fit end platform requirements + +Reverse ETL helps you to put data your business users need *in the format their end tool expects*. Oftentimes, end platforms expect data fields to be named or cast in a certain way. Instead of business users having to manually input those values in the correct format, you can transform your data using a product like dbt or directly in a reverse ETL tool itself, and sync down that data in an automated way. + +### Supporting self-service efforts + +By sending down data-team approved data in reverse ETL pipelines, your business users have the flexibility to use that data however they see fit. Soon, your business users will be making audiences, testing personalization efforts, and running their end platform like a well-oiled, data-powered machine. + + +## Reverse ETL use cases + +Just as there are almost endless opportunities with data, there are many potential different use cases for reverse ETL. We won’t go into every possible option, but we’ll cover some of the common use cases that exist for reverse ETL efforts. + +### Personalization + +Reverse ETL allows business users to access data that they normally would only have access to in a BI tool *in the platforms they use every day*. As a result, business users can now use this data to personalize how they create ads, send emails, and communicate with customers. + +Personalization was all the hype a few years ago and now, you rarely ever see an email come into your inbox without some sort of personalization in-place. Data teams using reverse ETL are able to pass down important customer information, such as location, customer lifetime value (CLV), tenure, and other fields, that can be used to create personalized emails, establish appropriate messaging, and segment email flows. All we can say: the possibilities for personalization powered by reverse ETL are endless. + +### Sophisticated paid marketing initiatives + +At the end of the day, businesses want to serve the right ads to the right people (and at the right cost). 
A common use case for reverse ETL is for teams to use their customer data to create audiences in ad platforms to either serve specific audiences or create lookalikes. While ad platforms have gotten increasingly sophisticated with their algorithms to identify high-value audiences, it rarely hurts to try supplementing those audiences with your own data to create sophisticated audiences or lookalikes.

### Self-service analytics culture

We hinted at it earlier, but reverse ETL efforts can be an effective way to promote a self-service analytics culture. When data teams put the data where business users need it, business users can confidently access it on their own, driving even faster insights and action. Instead of requesting a data pull from a data team member, they can find the data they need directly within the platform that they use. Reverse ETL allows business users to act on metrics that have already been built out and validated by data teams without creating ad-hoc requests.

### “Real-time” data

It would be remiss if we didn’t mention reverse ETL and the notion of “real-time” data. While you can have the debate over the meaningfulness and true value-add of real-time data another time, reverse ETL can be a mechanism to bring data to end business platforms in a more “real-time” way.

Data teams can set up syncs in reverse ETL tools at higher cadences, allowing business users to have the data they need, faster. Obviously, there’s some cost-benefit analysis on how often you want to be loading data via [ETL tools](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/) and hitting your data warehouse, but reverse ETL can help move data into external tools at a quicker cadence if deemed necessary.

All this to say: move with caution in the realm of “real-time”, understand your stakeholders’ wants and decision-making process for real-time data, and work towards a solution that’s both practical and impactful.

## Reverse ETL tools

Reverse ETL tools typically establish the connection between your data warehouse and end business tools, offer an interface to create additional transformations or audiences, and support automation of downstream syncs. Below are some examples of tools that support reverse ETL pipelines.

| Tool | Description | Open source option? |
|:---:|:---:|:---:|
| Hightouch | A platform to sync data models and create custom audiences for downstream business platforms. | :x: |
| Polytomic | A unified sync platform for syncing to and from data warehouses (ETL and Reverse ETL), databases, business apps, APIs, and spreadsheets. | :x: |
| Census | Another reverse ETL tool that can sync data from your data warehouse to your go-to-market tools. | :x: |
| Rudderstack | Also a CDP (customer data platform), Rudderstack additionally supports pushing down data and audiences to external tools, such as ad platforms and email CRMs. | :white_check_mark: |
| Grouparoo | Grouparoo, part of Airbyte, is an open source framework to move data from data warehouses to different cloud-based tools. | :white_check_mark: |

## Conclusion

Reverse ETL enables you to sync the transformed data stored in your data warehouse to external platforms often used by marketing, sales, and product teams. Reverse ETL pipelines can support personalization efforts, sophisticated paid marketing initiatives, and ultimately offer new ways to leverage your data.
In doing this, it creates a self-service analytics culture where stakeholders can receive the data they need in, in the places they need, in an automated way. + +## Further reading + +If you’re interested learning more about reverse ETL and the impact it could have on your team, check out the following: + +- [How dbt Labs’s data team approaches reverse ETL](https://getdbt.com/open-source-data-culture/reverse-etl-playbook/) +- [The operational data warehouse in action: Reverse ETL, CDPs, and the future of data activation](https://www.getdbt.com/coalesce-2021/operational-data-warehouse-reverse-etl-cdp-data-activation/) +- [The analytics engineering guide: Operational analytics](https://www.getdbt.com/analytics-engineering/use-cases/operational-analytics/) diff --git a/website/vercel.json b/website/vercel.json index c240bbec877..c775221bff0 100644 --- a/website/vercel.json +++ b/website/vercel.json @@ -3587,7 +3587,7 @@ "permanent": true }, { - "source": "/terms/:path*", + "source": "/terms/:path((?!elt|etl|reverse-etl).*)", "destination": "https://www.getdbt.com/blog", "permanent": true } From 33bdcc374f853144253247f7b4109557f587febe Mon Sep 17 00:00:00 2001 From: Jason Karlavige Date: Thu, 26 Sep 2024 12:49:40 -0400 Subject: [PATCH 20/21] remove test terms and test page --- website/docs/docs/test-terms.md | 20 -------------------- website/docs/terms/hover-terms.md | 11 ----------- 2 files changed, 31 deletions(-) delete mode 100644 website/docs/docs/test-terms.md diff --git a/website/docs/docs/test-terms.md b/website/docs/docs/test-terms.md deleted file mode 100644 index be21d2bca1c..00000000000 --- a/website/docs/docs/test-terms.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -title: "Testing Terms component" -id: "test-terms" -pagination_next: null -pagination_prev: null ---- - -This should show a term: Deploying - -This should show a term: Data wrangling - -This should show a term: Model - -This should show demoTerm: - -This should show noDisplayText term id as children and displayText not set: - -This has displayText set but no hoverSnippet set: - -This should NOT show a hover snippet (but should still show children): Demo diff --git a/website/docs/terms/hover-terms.md b/website/docs/terms/hover-terms.md index f1e1d118d16..d0b024a941d 100644 --- a/website/docs/terms/hover-terms.md +++ b/website/docs/terms/hover-terms.md @@ -122,15 +122,4 @@ table: view: displayText: view hoverSnippet: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or data warehouse). - -# Test terms (DELETE BEFORE MERGE) -demoTerm: - hoverSnippet: This is a demo term hover snippet - displayText: Yay demo term - -noDisplayText: - hoverSnippet: No Display Text hover snip - -noHoverSnippet: - displayText: No Hover Snip --- From a59ad5965b012e69fe76a5c6874d3d9451522575 Mon Sep 17 00:00:00 2001 From: Doug Beatty <44704949+dbeatty10@users.noreply.github.com> Date: Wed, 2 Oct 2024 07:54:31 -0600 Subject: [PATCH 21/21] Snapshot config can be set in schema / properties YAML file starting v1.9 (#6206) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Preview](https://docs-getdbt-com-git-dbeatty10-patch-2-dbt-labs.vercel.app/reference/snapshot-configs) resolves: https://github.com/dbt-labs/docs.getdbt.com/issues/5627 ## What are you changing in this pull request and why? 
Snapshot config can be set in schema / properties YAML file starting v1.9 ## 🎩 v1.8 and below: image Versionless and v1.9 and above: image ## Checklist - [x] I have reviewed the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) so my content adheres to these guidelines. - [x] The topic I'm writing about is for specific dbt version(s) and I have versioned it according to the [version a whole page](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) and/or [version a block of content](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#versioning-blocks-of-content) guidelines. - [ ] Add a note to the prerelease version [Migration Guide](https://github.com/dbt-labs/docs.getdbt.com/tree/current/website/docs/docs/dbt-versions/core-upgrade) --------- Co-authored-by: Mirna Wong <89008547+mirnawong1@users.noreply.github.com> --- .../docs/docs/dbt-versions/release-notes.md | 1 + website/docs/reference/snapshot-configs.md | 26 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/website/docs/docs/dbt-versions/release-notes.md b/website/docs/docs/dbt-versions/release-notes.md index 11fdfd4dedf..f98cd199ad7 100644 --- a/website/docs/docs/dbt-versions/release-notes.md +++ b/website/docs/docs/dbt-versions/release-notes.md @@ -20,6 +20,7 @@ Release notes are grouped by month for both multi-tenant and virtual private clo ## October 2024 +- **Enhancement**: In dbt Cloud Versionless, snapshots defined in SQL files can now use `config` defined in `schema.yml` YAML files. This update resolves the previous limitation that required snapshot properties to be defined exclusively in `dbt_project.yml` and/or a `config()` block within the SQL file. This enhancement will be included in the upcoming dbt Core v1.9 release. - **New:** dbt Explorer now includes trust signal icons, which is currently available as a [Preview](/docs/dbt-versions/product-lifecycles#dbt-cloud). Trust signals offer a quick, at-a-glance view of data health when browsing your dbt models in Explorer. These icons indicate whether a model is **Healthy**, **Caution**, **Degraded**, or **Unknown**. For accurate health data, ensure the resource is up-to-date and has had a recent job run. Refer to [Trust signals](/docs/collaborate/explore-projects#trust-signals-for-resources) for more information. - **New:** Auto exposures are now available in Preview in dbt Cloud. Auto-exposures helps users understand how their models are used in downstream analytics tools to inform investments and reduce incidents. It imports and auto-generates exposures based on Tableau dashboards, with user-defined curation. To learn more, refer to [Auto exposures](/docs/collaborate/auto-exposures). diff --git a/website/docs/reference/snapshot-configs.md b/website/docs/reference/snapshot-configs.md index 5afe429cfb4..ed4e990953a 100644 --- a/website/docs/reference/snapshot-configs.md +++ b/website/docs/reference/snapshot-configs.md @@ -80,7 +80,31 @@ snapshots: -**Note:** Required snapshot properties _will not_ work when defined in `config` YAML blocks. We recommend that you define these in `dbt_project.yml` or a `config()` block within the snapshot `.sql` file. + + +**Note:** Required snapshot properties _will not_ work when only defined in `config` YAML blocks. 
We recommend that you define these in `dbt_project.yml` or a `config()` block within the snapshot `.sql` file or upgrade to v1.9. + + + + + + + +```yml +snapshots: + - name: + config: + database: + schema: + unique_key: + strategy: timestamp | check + updated_at: + check_cols: [] | all + +``` + + +