diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 870dadcd183..d2bb72552bd 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -9,6 +9,7 @@ To learn more about the writing conventions used in the dbt Labs docs, see the [ - [ ] I have reviewed the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) so my content adheres to these guidelines. - [ ] The topic I'm writing about is for specific dbt version(s) and I have versioned it according to the [version a whole page](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) and/or [version a block of content](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#versioning-blocks-of-content) guidelines. - [ ] I have added checklist item(s) to this list for anything anything that needs to happen before this PR is merged, such as "needs technical review" or "change base branch." +- [ ] The content in this PR requires a dbt release note, so I added one to the [release notes page](https://docs.getdbt.com/docs/dbt-versions/dbt-cloud-release-notes). '; + const endMarker = ''; + + // Get the deployment URL and links from environment variables + const deploymentUrl = process.env.DEPLOYMENT_URL; + const links = process.env.LINKS; + + // Build the deployment content without leading whitespace + const deploymentContent = [ + `${startMarker}`, + '---', + '🚀 Deployment available! Here are the direct links to the updated files:', + '', + `${links}`, + '', + `${endMarker}` + ].join('\n'); + + // Remove existing deployment content between markers + const regex = new RegExp(`${startMarker}[\\s\\S]*?${endMarker}`, 'g'); + body = body.replace(regex, '').trim(); + + // Append the new deployment content + body = `${body}\n\n${deploymentContent}`; + + // Update the PR description + await github.rest.pulls.update({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: prNumber, + body: body, + }); + env: + DEPLOYMENT_URL: ${{ steps.vercel_url.outputs.deployment_url }} + LINKS: ${{ steps.links.outputs.links }} diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml new file mode 100644 index 00000000000..5feaaa12a20 --- /dev/null +++ b/.github/workflows/vale.yml @@ -0,0 +1,80 @@ +name: Vale linting + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - 'website/docs/**/*' + - 'website/blog/**/*' + - 'website/**/*' + +jobs: + vale: + name: Vale linting + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: List repository contents + run: | + pwd + ls -R + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install Vale + run: pip install vale==2.27.0 # Install a stable version of Vale + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v34 + with: + files: | + website/**/*.md + separator: ' ' + + - name: Debugging - Print changed files + if: ${{ steps.changed-files.outputs.any_changed == 'true' }} + run: | + echo "Changed files:" + echo "${{ steps.changed-files.outputs.all_changed_and_modified_files }}" + + - name: Confirm files exist + if: ${{ steps.changed-files.outputs.any_changed == 'true' }} + run: | + echo "Checking if files exist..." 
+ for file in ${{ steps.changed-files.outputs.all_changed_and_modified_files }}; do + if [ -f "$file" ]; then + echo "Found: $file" + else + echo "File not found: $file" + exit 1 + fi + done + + - name: Run vale + if: ${{ steps.changed-files.outputs.any_changed == 'true' }} + uses: errata-ai/vale-action@reviewdog + with: + token: ${{ secrets.GITHUB_TOKEN }} + reporter: github-check + files: ${{ steps.changed-files.outputs.all_changed_and_modified_files }} + separator: ' ' + version: '2.27.0' + +# - name: Post summary comment +# if: ${{ steps.changed-files.outputs.any_changed == 'true' }} +# run: | +# COMMENT="❗️Oh no, some Vale linting found issues! Please check the **Files change** tab for detailed results and make the necessary updates." +# COMMENT+=$'\n' +# COMMENT+=$'\n\n' +# COMMENT+="➡️ Link to detailed report: [Files changed](${{ github.event.pull_request.html_url }}/files)" +# gh pr comment ${{ github.event.pull_request.number }} --body "$COMMENT" +# env: +# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.hyperlint/config.yaml b/.hyperlint/config.yaml new file mode 100644 index 00000000000..03082114ae1 --- /dev/null +++ b/.hyperlint/config.yaml @@ -0,0 +1,10 @@ +content_dir: /docs +authorized_users: + - mirnawong1 + - matthewshaver + - nghi-ly + - runleonarun + - nataliefiann + +vale: + enabled: true diff --git a/.vale.ini b/.vale.ini new file mode 100644 index 00000000000..58aff923afe --- /dev/null +++ b/.vale.ini @@ -0,0 +1,7 @@ +StylesPath = styles +MinAlertLevel = warning + +Vocab = EN + +[*.md] +BasedOnStyles = custom diff --git a/contributing/adding-page-components.md b/contributing/adding-page-components.md index 68294e7d149..7a92d627995 100644 --- a/contributing/adding-page-components.md +++ b/contributing/adding-page-components.md @@ -4,7 +4,7 @@ You can use the following components to provide code snippets for each supported Identify code by labeling with the warehouse names: -```code +```sql
@@ -32,7 +32,7 @@ You can use the following components to provide code snippets in a tabbed view. Identify code and code files by labeling with the component they are describing: -```code +```sql ` tag. This allows you to share a link to a page with a pre-selected tab so that clicking on a tab creates a unique hyperlink for that tab. However, this feature doesn't provide an anchor link, which means the browser won't scroll to the tab. Additionally, you can define the search parameter name to use. If the tabs content is under a header, you can alternatively link to the header itself, instaed of the `queryString` prop. +You can use the [queryString](https://docusaurus.io/docs/next/markdown-features/tabs?current-os=ios#query-string) prop in the `` tag. This allows you to share a link to a page with a pre-selected tab so that clicking on a tab creates a unique hyperlink for that tab. However, this feature doesn't provide an anchor link, which means the browser won't scroll to the tab. Additionally, you can define the search parameter name to use. If the tabs content is under a header, you can alternatively link to the header itself, instead of the `queryString` prop. In the following example, clicking a tab adds a search parameter to the end of the URL: `?current-os=android or ?current-os=ios`. -``` +```sql Android @@ -105,3 +105,48 @@ In the following example, clicking a tab adds a search parameter to the end of t ``` + +## Markdown Links + +Refer to the Links section of the Content Style Guide to read about how you can use links in the dbt product documentation. + +## Collapsible header + + +
+

Shows and hides children elements

+
+
+ +```markdown + +
+

Shows and hides children elements

+
+
+
+``` + +## File component + +```yml + + +```yaml +password: hunter2 +``` + +``` + +## LoomVideo component + +
{``}
+ + + +## YoutubeVideo component + +
{``}
+ + + diff --git a/contributing/content-style-guide.md b/contributing/content-style-guide.md index 022efa127a5..a8520bc0e0d 100644 --- a/contributing/content-style-guide.md +++ b/contributing/content-style-guide.md @@ -624,6 +624,12 @@ When describing icons that appear on-screen, use the [_Google Material Icons_](h :white_check_mark:Click on the menu icon +#### Upload icons +If you're using icons to document things like [third-party vendors](https://docs.getdbt.com/docs/cloud-integrations/avail-sl-integrations), etc. — you need to add the icon file in the following locations to ensure the icons render correctly in light and dark mode: + +- website/static/img/icons +- website/static/img/icons/white + ### Image names Two words that are either adjectives or nouns describing the name of a file separated by an underscore `_` (known as `snake_case`). The two words can also be separated by a hyphen (`kebab-case`). diff --git a/contributing/lightbox.md b/contributing/lightbox.md index 5f35b4d9639..95feccbe779 100644 --- a/contributing/lightbox.md +++ b/contributing/lightbox.md @@ -25,4 +25,9 @@ You can use the Lightbox component to add an image or screenshot to your page. I /> ``` +Note that if you're using icons to document things like third party vendors, etc, — you need to add the icon file in the following locations to ensure the icons render correctly in light and dark mode: + +- `website/static/img/icons` +- `website/static/img/icons/white` + diff --git a/contributing/single-sourcing-content.md b/contributing/single-sourcing-content.md index 537980ebdfb..6dc14d760b1 100644 --- a/contributing/single-sourcing-content.md +++ b/contributing/single-sourcing-content.md @@ -90,7 +90,7 @@ This component can be added directly to a markdown file in a similar way as othe Both properties can be used together to set a range where the content should show. In the example below, this content will only show if the selected version is between **0.21** and **1.0**: ```markdown - + Versioned content here diff --git a/styles/Vocab/EN/accept.txt b/styles/Vocab/EN/accept.txt new file mode 100644 index 00000000000..e673e2ef83d --- /dev/null +++ b/styles/Vocab/EN/accept.txt @@ -0,0 +1,67 @@ +dbt Cloud +dbt Core +dbt Semantic Layer +dbt Explorer +dbt +dbt-tonic +dbtonic +IDE +CLI +Config +info +docs +yaml +YAML +SQL +bash +shell +MetricFlow +jinja +jinja2 +sqlmesh +Snowflake +Databricks +Fabric +Redshift +Azure +DevOps +Athena +Amazon +UI +CSV +S3 +SCD +repo +dbt_project.yml +boolean +defaultValue= +DWH +DWUs +shoutout +ADF +BQ +gcloud +MSFT +DDL +APIs +API +SSIS +PBI +PowerBI +datetime +PySpark +:::caution +:::note +:::info +:::tip +:::warning +\<[^>]+\> +\b[A-Z]{2,}(?:/[A-Z]{2,})?\b +\w+-\w+ +\w+/\w+ +n/a +N/A +\ diff --git a/styles/custom/LatinAbbreviations.yml b/styles/custom/LatinAbbreviations.yml new file mode 100644 index 00000000000..44a3c9d6e8c --- /dev/null +++ b/styles/custom/LatinAbbreviations.yml @@ -0,0 +1,15 @@ +# LatinAbbreviations.yml +extends: substitution +message: "Avoid Latin abbreviations: '%s'. Consider using '%s' instead." +level: warning + +swap: + 'e.g.': 'for example' + 'e.g': 'for example' + 'eg': 'for example' + 'i.e.': 'that is' + 'i.e': 'that is' + 'etc.': 'and so on' + 'etc': 'and so on' + 'N.B.': 'Note' + 'NB': 'Note' diff --git a/styles/custom/Repitition.yml b/styles/custom/Repitition.yml new file mode 100644 index 00000000000..4cd620146cf --- /dev/null +++ b/styles/custom/Repitition.yml @@ -0,0 +1,6 @@ +extends: repetition +message: "'%s' is repeated!" 
+level: warning +alpha: true +tokens: + - '[^\s]+' diff --git a/styles/custom/SentenceCaseHeaders.yml b/styles/custom/SentenceCaseHeaders.yml new file mode 100644 index 00000000000..d1d6cd97c67 --- /dev/null +++ b/styles/custom/SentenceCaseHeaders.yml @@ -0,0 +1,34 @@ +extends: capitalization +message: "'%s' should use sentence-style capitalization. Try '%s' instead." +level: warning +scope: heading +match: $sentence # Enforces sentence-style capitalization +indicators: + - ":" +exceptions: + - '\bdbt\b' + - '\bdbt\s+Cloud\b' + - '\bdbt\s+Core\b' + - '\bdbt\s+Cloud\s+CLI\b' + - Snowflake + - Databricks + - Azure + - GCP + - AWS + - SQL + - CLI + - API + - YAML + - JSON + - HTML + - Redshift + - Google + - BigQuery + - SnowSQL + - Snowsight + - Snowpark + - Fabric + - Microsoft + - Postgres + - Explorer + - IDE diff --git a/styles/custom/Typos.yml b/styles/custom/Typos.yml new file mode 100644 index 00000000000..456517950a9 --- /dev/null +++ b/styles/custom/Typos.yml @@ -0,0 +1,39 @@ +extends: spelling + +message: "Oops there's a typo -- did you really mean '%s'? " +level: warning + +action: + name: suggest + params: + - spellings + +custom: true +filters: + - '\bdbt\b' + - '\bdbt\s+Cloud\b' + - '\bdbt\s+Core\b' + - '\bdbt\s+Cloud\s+CLI\b' + - '\bdbt\s+.*?\b' + - '<[^>]+>' # Ignore all HTML-like components starting with < and ending with > + - '<[^>]+>.*<\/[^>]+>' + +--- + +extends: existence + +message: "Ignore specific patterns" +level: skip +tokens: + - '\bdbt\b' + - '\bdbt\s+Cloud\b' + - '\bdbt\s+Core\b' + - '\bdbt\s+Cloud\s+CLI\b' + - '\bdbt\s+.*?\b' + - '<[^>]+>' # Ignore all HTML-like components starting with < and ending with > + - '<[^>]+>.*<\/[^>]+>' + - '\w+-\w+' + - '\w+/\w+' + - '\w+/\w+|\w+-\w+|n/a' + - 'n/a' + - 'N/A' diff --git a/styles/custom/UIElements.yml b/styles/custom/UIElements.yml new file mode 100644 index 00000000000..ff9c5f86187 --- /dev/null +++ b/styles/custom/UIElements.yml @@ -0,0 +1,19 @@ +extends: existence +message: "UI elements like '%s' should be bold." +level: warning +tokens: + # Match UI elements that are not bolded (i.e., not within **) + - '(? + +dbt Mesh is powerful because it allows teams to operate _independently_ and _collaboratively_, each team free to build on their own but contributing to a larger, shared set of data outputs. + +The flexibility of dbt Mesh means that it can support [a wide variety of patterns and designs](/best-practices/how-we-mesh/mesh-3-structures). Today, let’s dive into one pattern that is showing promise as a way to enable teams working on very different dbt deployments to work together. + +## How Hybrid Mesh enables collaboration between dbt Core and dbt Cloud teams + +**_Scenario_** — A company with a central data team uses dbt Core. The setup is working well for that team. They want to scale their impact to enable faster decision-making, organization-wide. The current dbt Core setup isn't well suited for onboarding a larger number of less-technical, nontechnical, or less-frequent contributors. + +**_The goal_** — Enable three domain teams of less-technical users to leverage and extend the central data models, with full ownership over their domain-specific dbt models. + + - **Central data team:** Data engineers comfortable using dbt Core and the command line interface (CLI), building and maintaining foundational data models for the entire organization. 
+ + - **Domain teams:** Data analysts comfortable working in SQL but not using the CLI and prefer to start working right away without managing local dbt Core installations or updates. The team needs to build transformations specific to their business context. Some of these users may have tried dbt in the past, but they were not able to successfully onboard to the central team's setup. + +**_Solution: Hybrid Mesh_** — Data teams can use dbt Mesh to connect projects *across* dbt Core and dbt Cloud, creating a workflow where everyone gets to work in their preferred environment while creating a shared lineage that allows for visibility, validation, and ownership across the data pipeline. + +Each team will fully own its dbt code, from development through deployment, using the product that is appropriate to their needs and capabilities _while sharing data products across teams using both dbt Core and dbt Cloud._ + + + +Creating a Hybrid Mesh is mostly the same as creating any other [dbt Mesh](/guides/mesh-qs?step=1) workflow — there are a few considerations but mostly _it just works_. We anticipate it will continue to see adoption as more central data teams look to onboard their downstream domain teams. + +A Hybrid Mesh can be adopted as a stable long-term pattern, or as an intermediary while you perform a [migration from dbt Core to dbt Cloud](/guides/core-cloud-2?step=1). + +## How to build a Hybrid Mesh +Enabling a Hybrid Mesh is as simple as a few additional steps to import the metadata from your Core project into dbt Cloud. Once you’ve done this, you should be able to operate your dbt Mesh like normal and all of our [standard recommendations](/best-practices/how-we-mesh/mesh-1-intro) still apply. + +### Step 1: Prepare your Core project for access through dbt Mesh + +Configure public models to serve as stable interfaces for downstream dbt Projects. + +- Decide which models from your Core project will be accessible in your Mesh. For more information on how to configure public access for those models, refer to the [model access page.](/docs/collaborate/govern/model-access) +- Optionally set up a [model contract](/docs/collaborate/govern/model-contracts) for all public models for better governance. +- Keep dbt Core and dbt Cloud projects in separate repositories to allow for a clear separation between upstream models managed by the dbt Core team and the downstream models handled by the dbt Cloud team. + +### Step 2: Mirror each "producer" Core project in dbt Cloud +This allows dbt Cloud to know about the contents and metadata of your project, which in turn allows for other projects to access its models. + +- [Create a dbt Cloud account](https://www.getdbt.com/signup/) and a dbt project for each upstream Core project. + - Note: If you have [environment variables](/docs/build/environment-variables) in your project, dbt Cloud environment variables must be prefixed with `DBT_ `(including `DBT_ENV_CUSTOM_ENV_` or `DBT_ENV_SECRET`). Follow the instructions in [this guide](https://docs.getdbt.com/guides/core-to-cloud-1?step=8#environment-variables) to convert them for dbt Cloud. +- Each upstream Core project has to have a production [environment](/docs/dbt-cloud-environments) in dbt Cloud. You need to configure credentials and environment variables in dbt Cloud just so that it will resolve relation names to the same places where your dbt Core workflows are deploying those models. +- Set up a [merge job](/docs/deploy/merge-jobs) in a production environment to run `dbt parse`. 
This will enable connecting downstream projects in dbt Mesh by producing the necessary [artifacts](/reference/artifacts/dbt-artifacts) for cross-project referencing. + - Note: Set up a regular job to run `dbt build` instead of using a merge job for `dbt parse`, and centralize your dbt orchestration by moving production runs to dbt Cloud. Check out [this guide](/guides/core-to-cloud-1?step=9) for more details on converting your production runs to dbt Cloud. +- Optional: Set up a regular job (for example, daily) to run `source freshness` and `docs generate`. This will hydrate dbt Cloud with additional metadata and enable features in [dbt Explorer](/docs/collaborate/explore-projects) that will benefit both teams, including [Column-level lineage](/docs/collaborate/column-level-lineage). + +### Step 3: Create and connect your downstream projects to your Core project using dbt Mesh +Now that dbt Cloud has the necessary information about your Core project, you can begin setting up your downstream projects, building on top of the public models from the project you brought into Cloud in [Step 2](#step-2-mirror-each-producer-core-project-in-dbt-cloud). To do this: +- Initialize each new downstream dbt Cloud project and create a [`dependencies.yml` file](/docs/collaborate/govern/project-dependencies#use-cases). +- In that `dependencies.yml` file, add the dbt project name from the `dbt_project.yml` of the upstream project(s). This sets up cross-project references between different dbt projects: + + ```yaml + # dependencies.yml file in dbt Cloud downstream project + projects: + - name: upstream_project_name + ``` +- Use [cross-project references](/reference/dbt-jinja-functions/ref#ref-project-specific-models) for public models in upstream project. Add [version](/reference/dbt-jinja-functions/ref#versioned-ref) to references of versioned models: + ```yaml + select * from {{ ref('upstream_project_name', 'monthly_revenue') }} + ``` + +And that’s all it takes! From here, the domain teams that own each dbt Project can build out their models to fit their own use cases. You can now build out your Hybrid Mesh however you want, accessing the full suite of dbt Cloud features. +- Orchestrate your Mesh to ensure timely delivery of data products and make them available to downstream consumers. +- Use [dbt Explorer](/docs/collaborate/explore-projects) to trace the lineage of your data back to its source. +- Onboard more teams and connect them to your Mesh. +- Build [semantic models](/docs/build/semantic-models) and [metrics](/docs/build/metrics-overview) into your projects to query them with the [dbt Semantic Layer](https://www.getdbt.com/product/semantic-layer). + + +## Conclusion + +In a world where organizations have complex and ever-changing data needs, there is no one-size fits all solution. Instead, data practitioners need flexible tooling that meets them where they are. The Hybrid Mesh presents a model for this approach, where teams that are comfortable and getting value out of dbt Core can collaborate frictionlessly with domain teams on dbt Cloud. 
diff --git a/website/blog/2024-10-04-iceberg-is-an-implementation-detail.md b/website/blog/2024-10-04-iceberg-is-an-implementation-detail.md new file mode 100644 index 00000000000..dc9b78bba8d --- /dev/null +++ b/website/blog/2024-10-04-iceberg-is-an-implementation-detail.md @@ -0,0 +1,84 @@ +--- +title: "Iceberg Is An Implementation Detail" +description: "This blog will talk about iceberg table support and why it both matters and doesn't" +slug: icebeg-is-an-implementation-detail + +authors: [amy_chen] + +tags: [table formats, iceberg] +hide_table_of_contents: false + +date: 2024-10-04 +is_featured: false +--- + +If you haven’t paid attention to the data industry news cycle, you might have missed the recent excitement centered around an open table format called Apache Iceberg™. It’s one of many open table formats like Delta Lake, Hudi, and Hive. These formats are changing the way data is stored and metadata accessed. They are groundbreaking in many ways. + +But I have to be honest: **I don’t care**. But not for the reasons you think. + + + +## What is Iceberg? + +To have this conversation, we need to start with the same foundational understanding of Iceberg. Apache Iceberg is a high-performance open table format developed for modern data lakes. It was designed for large-scale datasets, and within the project, there are many ways to interact with it. When people talk about Iceberg, it often means multiple components including but not limited to: + +1. Iceberg Table Format - an open-source table format with large-scale data. Tables materialized in iceberg table format are stored on a user’s infrastructure, such as S3 Bucket. +2. Iceberg Data Catalog - an open-source metadata management system that tracks the schema, partition, and versions of Iceberg tables. +3. Iceberg REST Protocol (also called Iceberg REST API) is how engines can support and speak to other Iceberg-compatible catalogs. + +If you have been in the industry, you also know that everything I just wrote above about Iceberg could easily be replaced by `Hive,` `Hudi,` or `Delta.` This is because they were all designed to solve essentially the same problem. Ryan Blue (creator of Iceberg) and Michael Armbrust (creator of Delta Lake) recently sat down for this [fantastic chat](https://vimeo.com/1012543474) and said two points that resonated with me: + +- “We never intended for people to pay attention to this area. It’s something we wanted to fix, but people should be able to not pay attention and just work with their data. Storage systems should just work.” +- “We solve the same challenges with different approaches.” + +At the same time, the industry is converging on Apache Iceberg. [Iceberg has the highest availability of read and write support](https://medium.com/sundeck/2024-lakehouse-format-rundown-7edd75015428). + + + + +Snowflake launched Iceberg support in 2022. Databricks launched Iceberg support via Uniform last year. Microsoft announced Fabric support for Iceberg in September 2024 at Fabric Con. **Customers are demanding interoperability, and vendors are listening**. + +Why does this matter? Standardization of the industry benefits customers. When the industry standardizes - customers have the gift of flexibility. Everyone has a preferred way of working, and with standardization — they can always bring their preferred tools to their organization’s data. + +## Just another implementation detail + +I’m not saying open table formats aren't important. 
The metadata management and performance make them very meaningful and should be paid attention to. Our users are already excited to use it to create data lakes to save on storage costs, create more abstraction from their computing, etc. + +But when building data models or focusing on delivering business value through analytics, my primary concern is not *how* the data is stored—it's *how* I can leverage it to generate insights and drive decisions. The analytics development lifecycle is hard enough without having to take into every detail. dbt abstracts the underlying platform and lets me focus on writing SQL and orchestrating my transformations. It’s a feature that I don’t need to think about how tables are stored or optimized—I just need to know that when I reference dim_customers or fct_sales, the correct data is there and ready to use. **It should just work.** + +## Sometimes the details do matter + +While table formats are an implementation detail for data transformation — Iceberg can impact dbt developers when the implementation details aren’t seamless. Currently, using Iceberg requires a significant amount of upfront configuration and integration work beyond just creating tables to get started. + +One of the biggest hurdles is managing Iceberg’s metadata layer. This metadata often needs to be synced with external catalogs, which requires careful setup and ongoing maintenance to prevent inconsistencies. Permissions and access controls add another layer of complexity—because multiple engines can access Iceberg tables, you have to ensure that all systems have the correct access to both the data files and the metadata catalog. Currently, setting up integrations between these engines is also far from seamless; while some engines natively support Iceberg, others require brittle workarounds to ensure the metadata is synced correctly. This fragmented landscape means you could land with a web of interconnected components. + +## Fixing it + +**Today, we announced official support for the Iceberg table format in dbt.** By supporting the Iceberg table format, it’s one less thing you have to worry about on your journey to adopting Iceberg. + +With support for Iceberg Table Format, it is now easier to convert your dbt models using proprietary table formats to Iceberg by updating your configuration. After you have set up your external storage for Iceberg and connected it to your platforms, you will be able to jump into your dbt model and update the configuration to look something like this: + + + +It is available on these adapters: + +- Athena +- Databricks +- Snowflake +- Spark +- Starburst/Trino +- Dremio + +As with the beauty of any open-source project, Iceberg support grew organically, so the implementations vary. However, this will change in the coming months as we converge onto one dbt standard. This way, no matter which adapter you jump into, the configuration will always be the same. + +## dbt the Abstraction Layer + +dbt is more than about abstracting away the DDL to create and manage objects. It’s also about ensuring an opinionated approach to managing and optimizing your data. That remains true for our strategy around Iceberg Support. + +In our dbt-snowflake implementation, we have already started to [enforce best practices centered around how to manage the base location](https://docs.getdbt.com/reference/resource-configs/snowflake-configs#base-location) to ensure you don’t create technical debt accidentally, ensuring your Iceberg implementation scales over time. And we aren’t done yet. 
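
To make that concrete, here is a minimal sketch of what opting a single model into the Iceberg table format can look like with the dbt-snowflake implementation. The model body, external volume name, and base location subpath below are placeholders for whatever your own setup uses:

```sql
-- Hypothetical model file: swap in your own external volume and base location.
{{
  config(
    materialized = 'table',
    table_format = 'iceberg',
    external_volume = 'my_external_volume',
    base_location_subpath = 'marts'
  )
}}

select * from {{ ref('stg_orders') }}
```

The table format becomes a configuration change; the transformation logic itself doesn't move.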
+ +That said, while we can create the models, there is a *lot* of initial work to get to that stage. dbt developers must still consider the implementation, like how their external volume has been set up or where dbt can access the metadata. We have to make this better. + +Given the friction of getting launched on Iceberg, over the coming months, we will enable more capabilities to empower users to adopt Iceberg. It should be easier to read from foreign Iceberg catalogs. It should be easier to mount your volume. It should be easier to manage refreshes. And you should also trust that permissions and governance are consistently enforced. + +And this work doesn’t stop at Iceberg. The framework we are building is also compatible with other table formats, ensuring that whatever table format works for you is supported on dbt. This way — dbt users can also stop caring about table formats. **It’s just another implementation detail.** diff --git a/website/blog/2024-10-05-snowflake-feature-store.md b/website/blog/2024-10-05-snowflake-feature-store.md new file mode 100644 index 00000000000..cf5c55be1b5 --- /dev/null +++ b/website/blog/2024-10-05-snowflake-feature-store.md @@ -0,0 +1,273 @@ +--- +title: "Snowflake feature store and dbt: A bridge between data pipelines and ML" +description: A deep-dive into the workflow steps you can take to build and deploy ML models within a single platform. +slug: snowflake-feature-store +authors: [randy_pettus, luis_leon] +tags: [snowflake ML] +hide_table_of_contents: false +date: 2024-10-08 +is_featured: true +--- + +Flying home into Detroit this past week working on this blog post on a plane and saw for the first time, the newly connected deck of the Gordie Howe International [bridge](https://www.freep.com/story/news/local/michigan/detroit/2024/07/24/gordie-howe-bridge-deck-complete-work-moves-to-next-phase/74528258007/) spanning the Detroit River and connecting the U.S. and Canada. The image stuck out because, in one sense, a feature store is a bridge between the clean, consistent datasets and the machine learning models that rely upon this data. But, more interesting than the bridge itself is the massive process of coordination needed to build it. This construction effort — I think — can teach us more about processes and the need for feature stores in machine learning (ML). + +Think of the manufacturing materials needed as our data and the building of the bridge as the building of our ML models. There are thousands of engineers and construction workers taking materials from all over the world, pulling only the specific pieces needed for each part of the project. However, to make this project truly work at this scale, we need the warehousing and logistics to ensure that each load of concrete rebar and steel meets the standards for quality and safety needed and is available to the right people at the right time — as even a single fault can have catastrophic consequences or cause serious delays in project success. This warehouse and the associated logistics play the role of the feature store, ensuring that data is delivered consistently where and when it is needed to train and run ML models. + + + +## What is a feature? + +A feature is a transformed or enriched data that serves as an input into a machine learning model to make predictions. In machine learning, a data scientist derives features from various data sources to build a model that makes predictions based on historical data. 
To capture the value from this model, the enterprise must operationalize the data pipeline, ensuring that the features being used in production at inference time match those being used in training and development. + +## What role does dbt play in getting data ready for ML models? + +dbt is the standard for data transformation in the enterprise. Organizations leverage dbt at scale to deliver clean and well-governed datasets wherever and whenever they are needed. Using dbt to manage the data transformation processes to cleanse and prepare datasets used in feature development will ensure consistent datasets of guaranteed data quality — meaning that feature development will be consistent and reliable. + + +## Who is going to use this and what benefits will they see? + +Snowflake and dbt are already a well-established and trusted combination for delivering data excellence across the enterprise. The ability to register dbt pipelines in the Snowflake Feature Store further extends this combination for ML and AI workloads, while fitting naturally into the data engineering and feature pipelines already present in dbt. + + +Some of the key benefits are: + +- **Feature collaboration** — Data scientists, data analysts, data engineers, and machine learning engineers collaborate on features used in machine learning models in both Python and SQL, enabling teams to share and reuse features. As a result, teams can improve the time to value of models while improving the understanding of their components. This is all backed by Snowflake’s role-based access control (RBAC) and governance. +- **Feature consistency** — Teams are assured that features generated for training sets and those served for model inference are consistent. This can especially be a concern for large organizations where multiple versions of the truth might persist. Much like how dbt and Snowflake help enterprises have a single source of data truth, now they can have a single source of truth for features. +- **Feature visibility and use** — The Snowflake Feature Store provides an intuitive SDK to work with ML features and their associated metadata. In addition, users can browse and search for features in the Snowflake UI, providing an easy way to identify features +- **Point-in-time correctness** — Snowflake retrieves point-in-time correct features using ASOF Joins, removing the significant complexity in generating the right feature value for a given time period whether for training or batch prediction retrieval. +- **Integration with data pipelines** — Teams that have already built data pipelines in dbt can continue to use these with the Snowflake Feature Store. No additional migration or feature re-creation is necessary as teams plug into the same pipelines. + +## Why did we integrate/build this with Snowflake? + +How does dbt help with ML workloads today? dbt plays a pivotal role in preparing data for ML models by transforming raw data into a format suitable for feature engineering. It helps orchestrate and automate these transformations, ensuring that data is clean, consistent, and ready for ML applications. The combination of Snowflake’s powerful AI Data Cloud and dbt’s transformation prowess makes it an unbeatable pair for organizations aiming to scale their ML operations efficiently. + +## Making it easier for ML/Data Engineers to both build & deploy ML data & models + +dbt is a perfect tool to promote collaboration between data engineers, ML engineers, and data scientists. 
dbt is designed to support collaboration and quality of data pipelines through features including version control, environments and development life cycles, as well as built-in data and pipeline testing. Leveraging dbt means that data engineers and data scientists can collaborate and develop new models and features while maintaining the rigorous governance and high quality that's needed. + +Additionally, dbt Mesh makes maintaining domain ownership extremely easy by breaking up portions of our data projects and pipelines into connected projects where critical models can be published for consumption by others with strict data contracts enforcing quality and governance. This paradigm supports rapid development as each project can be kept to a maintainable size for its contributors and developers. Contracting on published models used between these projects ensures the consistency of the integration points between them. + +Finally, dbt Cloud also provides [dbt Explorer](/docs/collaborate/explore-projects) — a perfect tool to catalog and share knowledge about organizational data across disparate teams. dbt Explorer provides a central place for information on data pipelines, including lineage information, data freshness, and quality. Best of all, dbt Explorer updates every time dbt jobs run, ensuring this information is always up-to-date and relevant. + +## What tech is at play? + +Here’s what you need from dbt. dbt should be used to manage data transformation pipelines and generate the datasets needed by ML engineers and data scientists maintaining the Snowflake Feature Store. dbt Cloud Enterprise users should leverage dbt Mesh to create different projects with clear owners for these different domains of data pipelines. This Mesh design will promote easier collaboration by keeping each dbt project smaller and more manageable for the people building and maintaining it. dbt also supports both SQL and Python-based transformations making it an ideal fit for AI/ML workflows, which commonly leverage both languages. + +Using dbt for the data transformation pipelines will also ensure the quality and consistency of data products, which is critical for ensuring successful AI/ML efforts. + +## Snowflake ML overview + +The Feature Store is one component of [Snowflake ML’s](https://www.snowflake.com/en/data-cloud/snowflake-ml/) integrated suite of machine learning features that powers end-to-end machine learning within a single platform. Data scientists and ML engineers leverage ready-to-use ML functions or build custom ML workflows all without any data movement or without sacrificing governance. Snowflake ML includes scalable feature engineering and model training capabilities. Meanwhile, the Feature Store and Model Registry allow teams to store and use features and models in production, providing an end-to-end suite for operating ML workloads at scale. + + +## What do you need to do to make it all work? + +dbt Cloud offers the fastest and easiest way to run dbt. It offers a Cloud-based IDE, Cloud-attached CLI, and even a low-code visual editor option (currently in beta), meaning it’s perfect for connecting users across different teams with different workflows and tooling preferences, which is very common in AI/ML workflows. This is the tool you will use to prepare and manage data for AI/ML, promote collaboration across the different teams needed for a successful AI/ML workflow, and ensure the quality and consistency of the underlying data that will be used to create features and train models. 
+ +Organizations interested in AI/ML workflows through Snowflake should also look at the new dbt Snowflake Native App — a Snowflake Native Application that extends the functionality of dbt Cloud into Snowflake. Of particular interest is Ask dbt — a chatbot that integrates directly with Snowflake Cortex and the dbt Semantic Layer to allow natural language questions of Snowflake data. + + +## How to power ML pipelines with dbt and Snowflake’s Feature Store + +Let’s provide a brief example of what this workflow looks like in dbt and Snowflake to build and use the powerful capabilities of a Feature Store. For this example, consider that we have a data pipeline in dbt to process customer transaction data. Various data science teams in the organization need to derive features from these transactions to use in various models, including to predict fraud and perform customer segmentation and personalization. These different use cases all benefit from having related features, such as the count of transactions or purchased amounts over different periods of time (for example, the last day, 7 days, or 30 days) for a given customer. + +Instead of the data scientists building out their own workflows to derive these features, let’s look at the flow of using dbt to manage the feature pipeline and Snowflake’s Feature Store to solve this problem. The following subsections describe the workflow step by step. + +### Create feature tables as dbt models + +The first step consists of building out a feature table as a dbt model. Data scientists and data engineers plug in to existing dbt pipelines and derive a table that includes the underlying entity (for example, customer id, timestamp and feature values). The feature table aggregates the needed features at the appropriate timestamp for a given entity. Note that Snowflake provides various common feature and query patterns available [here](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/examples). So, in our example, we would see a given customer, timestamp, and features representing transaction counts and sums over various periods. Data scientists can use SQL or Python directly in dbt to build this table, which will push down the logic into Snowflake, allowing data scientists to use their existing skill set. + +Window aggregations play an important role in the creation of features. Because the logic for these aggregations is often complex, let’s see how Snowflake and dbt make this process easier by leveraging Don’t Repeat Yourself (DRY) principles. We’ll create a macro that will allow us to use Snowflake’s `range between` syntax in a repeatable way: + +```sql +{% macro rolling_agg(column, partition_by, order_by, interval='30 days', agg_function='sum') %} + {{ agg_function }}({{ column }}) over ( + partition by {{ partition_by }} + order by {{ order_by }} + range between interval '{{ interval }}' preceding and current row + ) +{% endmacro %} + +``` + +Now, we use this macro in our feature table to build out various aggregations of customer transactions over the last day, 7 days, and 30 days. Snowflake has just taken significant complexity away in generating appropriate feature values and dbt has just made the code even more readable and repeatable. While the following example is built in SQL, teams can also build these pipelines using Python directly. 
+ +```sql + +select + tx_datetime, + customer_id, + tx_amount, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "1 days", "sum") }} + as tx_amount_1d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "7 days", "sum") }} + as tx_amount_7d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "30 days", "sum") }} + as tx_amount_30d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "1 days", "avg") }} + as tx_amount_avg_1d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "7 days", "avg") }} + as tx_amount_avg_7d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "30 days", "avg") }} + as tx_amount_avg_30d, + {{ rolling_agg("*", "CUSTOMER_ID", "TX_DATETIME", "1 days", "count") }} + as tx_cnt_1d, + {{ rolling_agg("*", "CUSTOMER_ID", "TX_DATETIME", "7 days", "count") }} + as tx_cnt_7d, + {{ rolling_agg("*", "CUSTOMER_ID", "TX_DATETIME", "30 days", "count") }} + as tx_cnt_30d +from {{ ref("stg_transactions") }} + +``` + +### Create or connect to a Snowflake Feature Store + +Once a feature table is built in dbt, data scientists use Snowflake’s [snowflake-ml-python](https://docs.snowflake.com/en/developer-guide/snowflake-ml/snowpark-ml) package to create or connect to an existing Feature Store in Snowflake. Data scientists can do this all in Python, including in Jupyter Notebooks or directly in Snowflake using [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks). + +Let’s go ahead and create the Feature Store in Snowflake: + + +```sql +from snowflake.ml.feature_store import ( + FeatureStore, + FeatureView, + Entity, + CreationMode +) + +fs = FeatureStore( + session=session, + database=fs_db, + name=fs_schema, + default_warehouse='WH_DBT', + creation_mode=CreationMode.CREATE_IF_NOT_EXIST, +) + +``` + +### Create and register feature entities + +The next step consists of creating and registering [entities](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/entities). These represent the underlying objects that features are associated with, forming the join keys used for feature lookups. In our example, the data scientist can register various entities, including for the customer, a transaction id, or other necessary attributes. + +Let’s create some example entities. + +```python +customer = Entity(name="CUSTOMER", join_keys=["CUSTOMER_ID"]) +transaction = Entity(name="TRANSACTION", join_keys=["TRANSACTION_ID"]) +fs.register_entity(customer) +fs.register_entity(transaction) + +``` + +### Register feature tables as feature views + +After registering entities, the next step is to register a [feature view](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/feature-views). This represents a group of related features that stem from the features tables created in the dbt model. In this case, note that the feature logic, refresh, and consistency is managed by the dbt pipeline. The feature view in Snowflake enables versioning of the features while providing discoverability among teams. 
+ +```python +# Create a dataframe from our feature table produced in dbt +customers_transactions_df = session.sql(f""" + SELECT + CUSTOMER_ID, + TX_DATETIME, + TX_AMOUNT_1D, + TX_AMOUNT_7D, + TX_AMOUNT_30D, + TX_AMOUNT_AVG_1D, + TX_AMOUNT_AVG_7D, + TX_AMOUNT_AVG_30D, + TX_CNT_1D, + TX_CNT_7D, + TX_CNT_30D + FROM {fs_db}.{fs_data_schema}.ft_customer_transactions + """) + +# Create a feature view on top of these features +customer_transactions_fv = FeatureView( + name="customer_transactions_fv", + entities=[customer], + feature_df=customers_transactions_df, + timestamp_col="TX_DATETIME", + refresh_freq=None, + desc="Customer transaction features with window aggregates") + +# Register the feature view for use beyond the session +customer_transactions_fv = fs.register_feature_view( + feature_view=customer_transactions_fv, + version="1", + #overwrite=True, + block=True) + +``` + +### Search and discover features in the Snowflake UI + +Now, with features created, teams can view their features directly in the Snowflake UI, as shown below. This enables teams to easily search and browse features, all governed through Snowflake’s role-based access control (RBAC). + + + +### Generate training dataset + +Now that the feature view is created, data scientists produce a [training dataset](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/modeling#generating-tables-for-training) that uses the feature view. In our example, whether the data scientist is building a fraud or segmentation model, they will retrieve point-in-time correct features for a customer at a specific point in time using the Feature Store’s `generate_training_set` method. + +To generate the training set, we need to supply a spine dataframe, representing the entities and timestamp values that we will need to retrieve features for. The following example shows this using a few records, although teams can leverage other tables to produce this spine. + +```python +spine_df = session.create_dataframe( + [ + ('1', '3937', "2019-05-01 00:00"), + ('2', '2', "2019-05-01 00:00"), + ('3', '927', "2019-05-01 00:00"), + ], + schema=["INSTANCE_ID", "CUSTOMER_ID", "EVENT_TIMESTAMP"]) + +train_dataset = fs.generate_dataset( + name= "customers_fv", + version= "1_0", + spine_df=spine_df, + features=[customer_transactions_fv], + spine_timestamp_col= "EVENT_TIMESTAMP", + spine_label_cols = [] +) + +``` + +Now that we have produced the training dataset, let’s see what it looks like. + + + +### Train and deploy a model + +Now with this training set, data scientists can use [Snowflake Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/index) and [Snowpark ML Modeling](https://docs.snowflake.com/en/developer-guide/snowflake-ml/modeling) to use familiar Python frameworks for additional preprocessing, feature engineering, and model training all within Snowflake. The model can be registered in the Snowflake [Model Registry](https://docs.snowflake.com/en/developer-guide/snowflake-ml/model-registry/overview) for secure model management. Note that we will leave the model training for you as part of this exercise. + +### Retrieve features for predictions + +For inference, data pipelines retrieve feature values using the [retrieve_feature_values](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/modeling#retrieving-features-and-making-predictions) method. 
These retrieved values can be fed directly to a model’s predict capability in your Python session using a developed model or by invoking a model’s predict method from Snowflake’s Model Registry. For batch scoring purposes, teams can build this entire pipeline using [Snowflake ML](https://docs.snowflake.com/en/developer-guide/snowflake-ml/overview). The following code demonstrates how the features are retrieved using this method. + +```python +infernce_spine = session.create_dataframe( + [ + ('1', '3937', "2019-07-01 00:00"), + ('2', '2', "2019-07-01 00:00"), + ('3', '927', "2019-07-01 00:00"), + ], + schema=["INSTANCE_ID", "CUSTOMER_ID", "EVENT_TIMESTAMP"]) + +inference_dataset = fs.retrieve_feature_values( + spine_df=infernce_spine, + features=[customer_transactions_fv], + spine_timestamp_col="EVENT_TIMESTAMP", +) + +inference_dataset.to_pandas() + +``` + +Here’s an example view of our features produced for model inferencing. + + + +## Conclusion + +We’ve just seen how quickly and easily you can begin to develop features through dbt and leverage the Snowflake Feature Store to deliver predictive modeling as part of your data pipelines. The ability to build and deploy ML models, including integrating feature storage, data transformation, and ML logic within a single platform, simplifies the entire ML life cycle. Combining this new power with the well-established partnership of dbt and Snowflake unlocks even more potential for organizations to safely build and explore new AI/ML use cases and drive further collaboration in the organization. + +The code used in the examples above is publicly available on [GitHub](https://github.com/sfc-gh-rpettus/dbt-feature-store). Also, you can run a full example yourself in this [quickstart guide](https://quickstarts.snowflake.com/guide/getting-started-with-feature-store-and-dbt/index.html?index=..%2F..index#0) from the Snowflake docs. diff --git a/website/blog/authors.yml b/website/blog/authors.yml index 85f05a545f9..271130a477d 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -1,7 +1,7 @@ --- amy_chen: image_url: /img/blog/authors/achen.png - job_title: Product Ecosystem Manager + job_title: Product Manager links: - icon: fa-linkedin url: https://www.linkedin.com/in/yuanamychen/ @@ -386,6 +386,14 @@ lucas_bergodias: job_title: Analytics Engineer name: Lucas Bergo Dias organization: Indicium Tech +luis_leon: + image_url: /img/blog/authors/luis-leon.png + job_title: Partner Solutions Architect + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/luis-leon-03965463/ + name: Luis Leon + organization: dbt Labs matt_winkler: description: Matt is an ex-data scientist who chose to embrace the simplicity of using SQL to manage and testing data pipelines with dbt. He previously worked as a hands-on ML practitioner, and consulted with Fortune 500 clients to build and maintain ML Ops pipelines using (mostly) AWS Sagemaker. He lives in the Denver area, and you can say hello on dbt Slack or on LinkedIn. 
image_url: /img/blog/authors/matt-winkler.jpeg @@ -449,6 +457,14 @@ pedro_brito_de_sa: url: https://www.linkedin.com/in/pbritosa/ name: Pedro Brito de Sa organization: Sage +randy_pettus: + image_url: /img/blog/authors/randy-pettus.png + job_title: Senior Partner Sales Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/randypettus/ + name: Randy Pettus + organization: Snowflake rastislav_zdechovan: image_url: /img/blog/authors/rastislav-zdechovan.png job_title: Analytics Engineer diff --git a/website/blog/ctas.yml b/website/blog/ctas.yml index ac56d4cc749..1f9b13afa7b 100644 --- a/website/blog/ctas.yml +++ b/website/blog/ctas.yml @@ -25,3 +25,8 @@ subheader: Coalesce is the premiere analytics engineering conference! Sign up now for innovation, collaboration, and inspiration. Don't miss out! button_text: Register now url: https://coalesce.getdbt.com/register +- name: coalesce_2024_catchup + header: Missed Coalesce 2024? + subheader: Catch up on Coalesce 2024 and register to access a select number of on-demand sessions. + button_text: Register and watch + url: https://coalesce.getdbt.com/register/online diff --git a/website/blog/metadata.yml b/website/blog/metadata.yml index d0009fd62c4..8b53a7a2a04 100644 --- a/website/blog/metadata.yml +++ b/website/blog/metadata.yml @@ -2,7 +2,7 @@ featured_image: "" # This CTA lives in right sidebar on blog index -featured_cta: "coalesce_2024_signup" +featured_cta: "coalesce_2024_catchup" # Show or hide hero title, description, cta from blog index show_title: true diff --git a/website/dbt-versions.js b/website/dbt-versions.js index 60efef64f75..825af8ac6ee 100644 --- a/website/dbt-versions.js +++ b/website/dbt-versions.js @@ -15,20 +15,20 @@ */ exports.versions = [ { - version: "1.9.1", + version: "1.10", customDisplay: "Cloud (Versionless)", }, + { + version: "1.9", + isPrerelease: true, + }, { version: "1.8", EOLDate: "2025-04-15", }, { version: "1.7", - EOLDate: "2024-10-30", - }, - { - version: "1.6", - EOLDate: "2024-07-31", + EOLDate: "2024-11-01", }, ]; @@ -42,6 +42,14 @@ exports.versions = [ * @property {string} lastVersion The last version the page is visible in the sidebar */ exports.versionedPages = [ + { + page: "docs/build/incremental-microbatch", + firstVersion: "1.9", + }, + { + page: "reference/resource-configs/snapshot_meta_column_names", + firstVersion: "1.9", + }, { page: "reference/resource-configs/target_database", lastVersion: "1.8", @@ -54,134 +62,6 @@ exports.versionedPages = [ page: "reference/global-configs/indirect-selection", firstVersion: "1.8", }, - { - page: "reference/resource-configs/store_failures_as", - firstVersion: "1.7", - }, - { - page: "docs/build/build-metrics-intro", - firstVersion: "1.6", - }, - { - page: "docs/build/sl-getting-started", - firstVersion: "1.6", - }, - { - page: "docs/build/about-metricflow", - firstVersion: "1.6", - }, - { - page: "docs/build/join-logic", - firstVersion: "1.6", - }, - { - page: "docs/build/validation", - firstVersion: "1.6", - }, - { - page: "docs/build/semantic-models", - firstVersion: "1.6", - }, - { - page: "docs/build/group-by", - firstVersion: "1.6", - }, - { - page: "docs/build/entities", - firstVersion: "1.6", - }, - { - page: "docs/build/metrics-overview", - firstVersion: "1.6", - }, - { - page: "docs/build/cumulative", - firstVersion: "1.6", - }, - { - page: "docs/build/derived", - firstVersion: "1.6", - }, - { - page: "docs/build/measure-proxy", - firstVersion: "1.6", - }, - { - page: "docs/build/ratio", - firstVersion: "1.6", - }, - { - page: 
"reference/commands/clone", - firstVersion: "1.6", - }, - { - page: "docs/collaborate/govern/project-dependencies", - firstVersion: "1.6", - }, - { - page: "reference/dbt-jinja-functions/thread_id", - firstVersion: "1.6", - }, - { - page: "reference/resource-properties/deprecation_date", - firstVersion: "1.6", - }, - { - page: "reference/commands/retry", - firstVersion: "1.6", - }, - { - page: "docs/build/groups", - firstVersion: "1.5", - }, - { - page: "docs/collaborate/govern/model-contracts", - firstVersion: "1.5", - }, - { - page: "reference/commands/show", - firstVersion: "1.5", - }, - { - page: "docs/collaborate/govern/model-access", - firstVersion: "1.5", - }, - { - page: "docs/collaborate/govern/model-versions", - firstVersion: "1.5", - }, - { - page: "reference/programmatic-invocations", - firstVersion: "1.5", - }, - { - page: "reference/resource-configs/contract", - firstVersion: "1.5", - }, - { - page: "reference/resource-configs/group", - firstVersion: "1.5", - }, - { - page: "reference/resource-properties/access", - firstVersion: "1.5", - }, - { - page: "reference/resource-properties/constraints", - firstVersion: "1.5", - }, - { - page: "reference/resource-properties/latest_version", - firstVersion: "1.5", - }, - { - page: "reference/resource-properties/versions", - firstVersion: "1.5", - }, - { - page: "reference/resource-configs/on_configuration_change", - firstVersion: "1.6", - }, ]; /** @@ -194,12 +74,5 @@ exports.versionedPages = [ * @property {string} firstVersion The first version the category is visible in the sidebar */ exports.versionedCategories = [ - { - category: "Model governance", - firstVersion: "1.5", - }, - { - category: "Build your metrics", - firstVersion: "1.6", - }, + ]; diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md index 7990cf6752f..da882dba6c5 100644 --- a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md +++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md @@ -241,7 +241,9 @@ measures: ## Reviewing our work -Our completed code will look like this, our first semantic model! +Our completed code will look like this, our first semantic model! Here are two examples showing different organizational approaches: + + @@ -288,6 +290,68 @@ semantic_models: description: The total tax paid on each order. agg: sum ``` + + + + + + +```yml +semantic_models: + - name: orders + defaults: + agg_time_dimension: ordered_at + description: | + Order fact table. This table is at the order grain with one row per order. + + model: ref('stg_orders') + + entities: + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + + dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using BigQuery + type: time + type_params: + time_granularity: day + - name: is_large_order + type: categorical + expr: case when order_total > 50 then true else false end + + measures: + - name: order_total + description: The total revenue for each order. + agg: sum + - name: order_count + description: The count of individual orders. + expr: 1 + agg: sum + - name: tax_paid + description: The total tax paid on each order. 
+ agg: sum +``` + + +As you can see, the content of the semantic model is identical in both approaches. The key differences are: + +1. **File location** + - Co-located approach: `models/marts/orders.yml` + - Parallel sub-folder approach: `models/semantic_models/sem_orders.yml` + +2. **File naming** + - Co-located approach: Uses the same name as the corresponding mart (`orders.yml`) + - Parallel sub-folder approach: Prefixes the file with `sem_` (`sem_orders.yml`) + +Choose the approach that best fits your project structure and team preferences. The co-located approach is often simpler for new projects, while the parallel sub-folder approach can be clearer for migrating large existing projects to the Semantic Layer. ## Next steps diff --git a/website/docs/best-practices/how-we-mesh/mesh-2-who-is-dbt-mesh-for.md b/website/docs/best-practices/how-we-mesh/mesh-2-who-is-dbt-mesh-for.md index b6fadc2d7a6..4c8adfa86a1 100644 --- a/website/docs/best-practices/how-we-mesh/mesh-2-who-is-dbt-mesh-for.md +++ b/website/docs/best-practices/how-we-mesh/mesh-2-who-is-dbt-mesh-for.md @@ -23,9 +23,6 @@ Is dbt Mesh a good fit in this scenario? Absolutely! There is no other way to sh - Onboarding hundreds of people and dozens of projects is full of friction! The challenges of a scaled, global organization are not to be underestimated. To start the migration, prioritize teams that have strong dbt familiarity and fundamentals. dbt Mesh is an advancement of core dbt deployments, so these teams are likely to have a smoother transition. Additionally, prioritize teams that manage strategic data assets that need to be shared widely. This ensures that dbt Mesh will help your teams deliver concrete value quickly. -- Bi-directional project dependencies -- currently, projects in dbt Mesh are treated like dbt resources in that they cannot depend on each other. However, many teams may want to be able to share data assets back and forth between teams. - - We've added support for [enabling bidirectional dependencies](/best-practices/how-we-mesh/mesh-3-structures#cycle-detection) across projects. If this sounds like your organization, dbt Mesh is the architecture you should pursue. ✅ diff --git a/website/docs/best-practices/how-we-mesh/mesh-3-structures.md b/website/docs/best-practices/how-we-mesh/mesh-3-structures.md index c75c566610b..38066811d8a 100644 --- a/website/docs/best-practices/how-we-mesh/mesh-3-structures.md +++ b/website/docs/best-practices/how-we-mesh/mesh-3-structures.md @@ -66,7 +66,7 @@ Since the launch of dbt Mesh, the most common pattern we've seen is one where pr Users may need to contribute models across multiple projects and this is fine. There will be some friction doing this, versus a single repo, but this is _useful_ friction, especially if upstreaming a change from a “spoke” to a “hub.” This should be treated like making an API change, one that the other team will be living with for some time to come. You should be concerned if your teammates find they need to make a coordinated change across multiple projects very frequently (every week), or as a key prerequisite for ~20%+ of their work. 
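The hub-and-spoke relationship described above is wired together with cross-project references. The sketch below is illustrative only, assuming a hub project named `core_platform` that exposes a public `orders` model; the project, folder, and model names are hypothetical.

```yml
# dependencies.yml in the spoke project (sketch; the project name is hypothetical)
projects:
  - name: core_platform
```

```sql
-- models/marts/finance/finance_orders.sql in the spoke project (hypothetical model)
-- The two-argument ref resolves a public model from the upstream hub project.
select *
from {{ ref('core_platform', 'orders') }}
```

Because the upstream model has to be marked `access: public`, the spoke team consumes a stable, documented interface rather than reaching into the hub's internals, which is what makes treating upstream changes like API changes practical.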
-### Cycle detection +### Cycle detection import CycleDetection from '/snippets/_mesh-cycle-detection.md'; diff --git a/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md b/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md index f1fb7422acf..a884de90c49 100644 --- a/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md +++ b/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md @@ -80,7 +80,7 @@ models: ## Split your projects 1. **Move your grouped models into a subfolder**. This will include any model in the selected group, it's associated YAML entry, as well as its parent or child resources as appropriate depending on where this group sits in your DAG. - 1. Note that just like in your dbt project, circular refereneces are not allowed! Project B cannot have parents and children in Project A, for example. + 1. Note that just like in your dbt project, circular references are not allowed! Project B cannot have parents and children in Project A, for example. 2. **Create a new `dbt_project.yml` file** in the subdirectory. 3. **Copy any macros** used by the resources you moved. 4. **Create a new `packages.yml` file** in your subdirectory with the packages that are used by the resources you moved. diff --git a/website/docs/best-practices/how-we-mesh/mesh-5-faqs.md b/website/docs/best-practices/how-we-mesh/mesh-5-faqs.md index 1ae49928ae5..9f12f7d2c20 100644 --- a/website/docs/best-practices/how-we-mesh/mesh-5-faqs.md +++ b/website/docs/best-practices/how-we-mesh/mesh-5-faqs.md @@ -215,7 +215,7 @@ There’s model-level access within dbt, role-based access for users and groups First things first: access to underlying data is always defined and enforced by the underlying data platform (for example, BigQuery, Databricks, Redshift, Snowflake, Starburst, etc.) This access is managed by executing “DCL statements” (namely `grant`). dbt makes it easy to [configure `grants` on models](/reference/resource-configs/grants), which provision data access for other roles/users/groups in the data warehouse. However, dbt does _not_ automatically define or coordinate those grants unless they are configured explicitly. Refer to your organization's system for managing data warehouse permissions. -[dbt Cloud Enterprise plans](https://www.getdbt.com/pricing) support [role-based access control (RBAC)](/docs/cloud/manage-access/enterprise-permissions#how-to-set-up-rbac-groups-in-dbt-cloud) that manages granular permissions for users and user groups. You can control which users can see or edit all aspects of a dbt Cloud project. A user’s access to dbt Cloud projects also determines whether they can “explore” that project in detail. Roles, users, and groups are defined within the dbt Cloud application via the UI or by integrating with an identity provider. +[dbt Cloud Enterprise plans](https://www.getdbt.com/pricing) support [role-based access control (RBAC)](/docs/cloud/manage-access/about-user-access#role-based-access-control-) that manages granular permissions for users and user groups. You can control which users can see or edit all aspects of a dbt Cloud project. A user’s access to dbt Cloud projects also determines whether they can “explore” that project in detail. Roles, users, and groups are defined within the dbt Cloud application via the UI or by integrating with an identity provider. [Model access](/docs/collaborate/govern/model-access) defines where models can be referenced. It also informs the discoverability of those projects within dbt Explorer. 
Model `access` is defined in code, just like any other model configuration (`materialized`, `tags`, etc). diff --git a/website/docs/best-practices/how-we-structure/4-marts.md b/website/docs/best-practices/how-we-structure/4-marts.md index 21de31a9e0d..995dea7e96f 100644 --- a/website/docs/best-practices/how-we-structure/4-marts.md +++ b/website/docs/best-practices/how-we-structure/4-marts.md @@ -26,7 +26,8 @@ models/marts ✅ **Group by department or area of concern.** If you have fewer than 10 or so marts you may not have much need for subfolders, so as with the intermediate layer, don’t over-optimize too early. If you do find yourself needing to insert more structure and grouping though, use useful business concepts here. In our marts layer, we’re no longer worried about source-conformed data, so grouping by departments (marketing, finance, etc.) is the most common structure at this stage. -✅ **Name by entity.** Use plain English to name the file based on the concept that forms the grain of the mart `customers`, `orders`. Note that for pure marts, there should not be a time dimension (`orders_per_day`) here, that is typically best captured via metrics. +✅ **Name by entity.** Use plain English to name the file based on the concept that forms the grain of the mart’s `customers`, `orders`. Marts that don't include any time-based rollups (pure marts) should not have a time dimension (`orders_per_day`) here, typically best captured via metrics. + ❌ **Build the same concept differently for different teams.** `finance_orders` and `marketing_orders` is typically considered an anti-pattern. There are, as always, exceptions — a common pattern we see is that, finance may have specific needs, for example reporting revenue to the government in a way that diverges from how the company as a whole measures revenue day-to-day. Just make sure that these are clearly designed and understandable as _separate_ concepts, not departmental views on the same concept: `tax_revenue` and `revenue` not `finance_revenue` and `marketing_revenue`. diff --git a/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md b/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md index c7522bf12eb..9358b507acc 100644 --- a/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md +++ b/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md @@ -102,12 +102,14 @@ We’ve focused heavily thus far on the primary area of action in our dbt projec ### Project splitting -One important, growing consideration in the analytics engineering ecosystem is how and when to split a codebase into multiple dbt projects. Our present stance on this for most projects, particularly for teams starting out, is straightforward: you should avoid it unless you have no other option or it saves you from an even more complex workaround. If you do have the need to split up your project, it’s completely possible through the use of private packages, but the added complexity and separation is, for most organizations, a hindrance, not a help, at present. That said, this is very likely subject to change! [We want to create a world where it’s easy to bring lots of dbt projects together into a cohesive lineage](https://github.com/dbt-labs/dbt-core/discussions/5244). 
In a world where it’s simple to break up monolithic dbt projects into multiple connected projects, perhaps inside of a modern mono repo, the calculus will be different, and the below situations we recommend against may become totally viable. So watch this space! +One important, growing consideration in the analytics engineering ecosystem is how and when to split a codebase into multiple dbt projects. Currently, our advice for most teams, especially those just starting, is fairly simple: in most cases, we recommend doing so with [dbt Mesh](/best-practices/how-we-mesh/mesh-1-intro)! dbt Mesh allows organizations to handle complexity by connecting several dbt projects rather than relying on one big, monolithic project. This approach is designed to speed up development while maintaining governance. -- ❌ **Business groups or departments.** Conceptual separations within the project are not a good reason to split up your project. Splitting up, for instance, marketing and finance modeling into separate projects will not only add unnecessary complexity but destroy the unifying effect of collaborating across your organization on cohesive definitions and business logic. -- ❌ **ML vs Reporting use cases.** Similarly to the point above, splitting a project up based on different use cases, particularly more standard BI versus ML features, is a common idea. We tend to discourage it for the time being. As with the previous point, a foundational goal of implementing dbt is to create a single source of truth in your organization. The features you’re providing to your data science teams should be coming from the same marts and metrics that serve reports on executive dashboards. +As breaking up monolithic dbt projects into smaller, connected projects, potentially within a modern mono repo becomes easier, the scenarios we currently advise against may soon become feasible. So watch this space! + +- ✅ **Business groups or departments.** Conceptual separations within the project are the primary reason to split up your project. This allows your business domains to own their own data products and still collaborate using dbt Mesh. For more information about dbt Mesh, please refer to our [dbt Mesh FAQs](/best-practices/how-we-mesh/mesh-5-faqs). - ✅ **Data governance.** Structural, organizational needs — such as data governance and security — are one of the few worthwhile reasons to split up a project. If, for instance, you work at a healthcare company with only a small team cleared to access raw data with PII in it, you may need to split out your staging models into their own projects to preserve those policies. In that case, you would import your staging project into the project that builds on those staging models as a [private package](https://docs.getdbt.com/docs/build/packages/#private-packages). - ✅ **Project size.** At a certain point, your project may grow to have simply too many models to present a viable development experience. If you have 1000s of models, it absolutely makes sense to find a way to split up your project. +- ❌ **ML vs Reporting use cases.** Similarly to the point above, splitting a project up based on different use cases, particularly more standard BI versus ML features, is a common idea. We tend to discourage it for the time being. As with the previous point, a foundational goal of implementing dbt is to create a single source of truth in your organization. 
The features you’re providing to your data science teams should be coming from the same marts and metrics that serve reports on executive dashboards. ## Final considerations diff --git a/website/docs/community/spotlight/bruno-de-lima.md b/website/docs/community/spotlight/bruno-de-lima.md index f5ffaa6a970..3c373db06e8 100644 --- a/website/docs/community/spotlight/bruno-de-lima.md +++ b/website/docs/community/spotlight/bruno-de-lima.md @@ -2,42 +2,39 @@ id: bruno-de-lima title: Bruno de Lima description: | - Hi all! I'm a Data Engineer, deeply fascinated by the awesomeness dbt. I love talking about dbt, creating content from daily tips to blogposts and engaging with this vibrant community! - - Started my career at the beginning of 2022 at Indicium as an Analytics Engineer, working with dbt from day 1. By 2023, my path took a global trajectory as I joined phData as a Data Engineer, expanding my experiences and forging connections beyond Brazil. While dbt is at the heart of my expertise, I've also delved into data warehouses such as Snowflake, Databricks, and BigQuery; visualization tools like Power BI and Tableau; and several minor modern data stack tools. - - I actively participate in the dbt community, having attended two dbt Meetups in Brazil organized by Indicium; writing about dbt-related topics in my Medium and LinkedIn profiles; contributing to the code; and frequently checking dbt Slack and Discourse, helping (and being helped by) other dbt practitioners. If you are a community member, you may have seen me around! -image: /img/community/spotlight/bruno-de-lima.jpg + Hey all! I was born and raised in Florianopolis, Brazil, and I'm a Senior Data Engineer at phData. I live with my fiancée and I enjoy music, photography, and powerlifting. + + I started my career in early 2022 at Indicium as an Analytics Engineer, working with dbt from day 1. By 2023, my path took a global trajectory as I joined phData as a Data Engineer, expanding my experiences and creating connections beyond Brazil. While dbt is my main expertise, because of my work in consultancy I have experience with a large range of tools, specially the ones related to Snowflake, Databricks, AWS and GCP; but I have already tried several other modern data stack tools too. + + I actively participate in the dbt community, having organized dbt Meetups in Brazil (in Floripa and São Paulo); writing about dbt-related topics in my Medium and LinkedIn profiles; contributing to the dbt Core code and to the docs; and frequently checking dbt Slack and Discourse, helping (and being helped by) other dbt practitioners. If you are a community member, you may have seen me around! +image: /img/community/spotlight/bruno-souza-de-lima-newimage.jpg pronouns: he/him location: Florianópolis, Brazil -jobTitle: Data Engineer +jobTitle: Senior Data Engineer companyName: phData -organization: "" socialLinks: - name: LinkedIn link: https://www.linkedin.com/in/brunoszdl/ - name: Medium link: https://medium.com/@bruno.szdl -dateCreated: 2023-11-05 +dateCreated: 2024-11-03 hide_table_of_contents: true communityAward: true -communityAwardYear: 2023 +communityAwardYear: 2024 --- ## When did you join the dbt community and in what way has it impacted your career? -I was not truly happy with my academic life. My career took a new turn when I enrolled in the Analytics Engineer course by Indicium. That was my first contact with dbt, and I didn't realize how much it would transform my career. 
After that, I was hired at the company as an Analytics Engineer and worked extensively with dbt from day one. +I was not truly happy with my academic life. My career took a new turn when I enrolled in the Analytics Engineer course by Indicium. That was my first contact with dbt, and I didn't realize how much it would transform my career. After that, I was hired at the company as an Analytics Engineer and worked extensively with dbt from day one. It took me some time to become an active member of the dbt community. I started working with dbt at the beginning of 2022 and became more involved towards the end of that year, encouraged by Daniel Avancini. I regret not doing this earlier, because being an active community member has been a game-changer for me, as my knowledge of dbt has grown exponentially just by participating in daily discussions on Slack. I have found #advice-dbt-help and #advice-dbt-for-power-users channels particularly useful, as well as the various database-specific channels. Additionally, the #i-made-this and #i-read-this channels have allowed me to learn about the innovative things that community members are doing. Inspired by other members, especially Josh Devlin and Owen Prough, I began answering questions on Slack and Discourse. For questions I couldn't answer, I would try engaging in discussions about possible solutions or provide useful links. I also started posting dbt tips on LinkedIn to help practitioners learn about new features or to refresh their memories about existing ones. -By being more involved in the community, I felt more connected and supported. I received help from other members, and now, I could help others, too. I was happy with this arrangement, but more unexpected surprises came my way. My active participation in Slack, Discourse, and LinkedIn opened doors to new connections and career opportunities. I had the pleasure of meeting a lot of incredible people and receiving exciting job offers, including the one for working at phData. +By being more involved in the community, I felt more connected and supported. I received help from other members, and now, I could help others, too. I was happy with this arrangement, but more unexpected surprises came my way. My active participation in Slack, Discourse, and LinkedIn opened doors to new connections and career opportunities. I had the pleasure of meeting a lot of incredible people and receiving exciting job offers, including the ones for working at phData and teaching at Zach Wilson's data engineering bootcamp. Thanks to the dbt community, I went from feeling uncertain about my career prospects to having a solid career and being surrounded by incredible people. -I would like to thank the Indicium folks for opening the first door for me for this career in data, and not just for me but for lots of people in Brazil trying to migrate from different fields who would not have this opportunity otherwise. - ## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? I identify with Gwen Windflower and Joel Labes, or at least they are the kind of leader I admire. Their strong presence and continuous interaction with all types of dbt enthusiasts make everyone feel welcomed in the community. They uplift those who contribute to the community, whether it's through a LinkedIn post or answering a question, and provide constructive feedback to help them improve. 
And of course they show a very strong knowledge about dbt and data in general, which is reflected in their contributions. diff --git a/website/docs/community/spotlight/christophe-oudar.md b/website/docs/community/spotlight/christophe-oudar.md new file mode 100644 index 00000000000..2381d88a381 --- /dev/null +++ b/website/docs/community/spotlight/christophe-oudar.md @@ -0,0 +1,35 @@ +--- +id: christophe-oudar +title: Christophe Oudar +description: | + I joined the dbt Community in November 2021 after exchanging some issues in Github. I currently work as a staff engineer at a scaleup in the ad tech industry called Teads, which I joined 11 years ago as a new grad. I've been using dbt Core on BigQuery since then. I write about data engineering both on Medium and Substack. I contribute on dbt-bigquery. I wrote an article that was then featured on the Developer Blog called BigQuery ingestion-time partitioning and partition copy with dbt. +image: /img/community/spotlight/christophe-oudar.jpg +pronouns: he/him +location: Montpellier, France +jobTitle: Staff Engineer +companyName: Teads +socialLinks: + - name: X + link: https://x.com/Kayrnt + - name: LinkedIn + link: https://www.linkedin.com/in/christopheoudar/ + - name: Substack + link: https://smallbigdata.substack.com/ +dateCreated: 2024-11-08 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the community in November 2021 as a way to explore how to move our in-house data modeling layer to dbt. The transition took over a year while we ensured we could cover all our bases and add missing features to dbt-bigquery. That project was one of stepping stones that helped me to move from senior to staff level at my current job. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I identify with leaders that have strong convictions about how data engineering should move forward but remain open to innovation and ideas from everyone to bring the best to the field and make it as inclusive as possible to all cultures and profiles. I think that could mean people like Jordan Tigani or Mark Raasveldt. In the dbt community, my leadership has looked like helping people struggling and offering better ways to simplify one's day to day work when possible. + +## What have you learned from community members? What do you hope others can learn from you? + +I read a lot of articles about dbt, especially when I got started with it. It helped me a lot to build a proper Slim CI that could fit my company's ways of working. I also got to see how data pipelines were done in other companies and the pros and cons of my approaches. I hope I can share more of that knowledge for people to pick what's best for their needs. +​ diff --git a/website/docs/community/spotlight/fabiyi-opeyemi.md b/website/docs/community/spotlight/fabiyi-opeyemi.md index 18a311fa437..b5b4bf8c9e0 100644 --- a/website/docs/community/spotlight/fabiyi-opeyemi.md +++ b/website/docs/community/spotlight/fabiyi-opeyemi.md @@ -2,13 +2,11 @@ id: fabiyi-opeyemi title: Opeyemi Fabiyi description: | - I'm an Analytics Engineer with Data Culture, a Data Consulting firm where I use dbt regularly to help clients build quality-tested data assets. I've also got a background in financial services and supply chain. 
I'm passionate about helping organizations to become data-driven and I majorly use dbt for data modeling, while the other aspect of the stack is largely dependent on the client infrastructure I'm working for, so I often say I'm tool-agnostic. 😀 - - I'm the founder of Nigeria's Young Data Professional Community. I'm also the organizer of the Lagos dbt Meetup which I started, and one of the organizers of the DataFest Africa Conference. I became an active member of the dbt Community in 2021 & spoke at Coalesce 2022. + I’m an Analytics Engineer with Data Culture, a Data Consulting firm where I use dbt regularly to help clients build quality-tested data assets. Before Data Culture, I worked at Cowrywise, one of the leading Fintech companies in Nigeria, where I was a solo data team member, and that was my first introduction to dbt and Analytics Engineering. Before that, I was doing Data Science and Analytics at Deloitte Nigeria. It’s been an exciting journey since I started using dbt and joining the community.Outside of work, I’m very passionate about Community building and Data Advocacy. I founded one of Nigeria’s most vibrant Data communities, “The Young Data Professional Community.” I’m also the Founder of the Lagos dbt Meetup and one of the organizers of the Largest Data Conference in Africa, DataFest Africa Conference. I became an active member of the dbt community in 2021 & spoke at Coalesce 2022. So when I’m not actively working I’m involved in one community activity or the other. image: /img/community/spotlight/fabiyi-opeyemi.jpg pronouns: he/him location: Lagos, Nigeria -jobTitle: Senior Analytics Engineer +jobTitle: Analytics Manager companyName: Data Culture organization: Young Data Professionals (YDP) socialLinks: @@ -16,10 +14,10 @@ socialLinks: link: https://twitter.com/Opiano_1 - name: LinkedIn link: https://www.linkedin.com/in/opeyemifabiyi/ -dateCreated: 2023-11-06 +dateCreated: 2024-11-02 hide_table_of_contents: true communityAward: true -communityAwardYear: 2023 +communityAwardYear: 2024 --- ## When did you join the dbt community and in what way has it impacted your career? @@ -40,4 +38,4 @@ I've learned how to show empathy as a data professional and be a great engineer ## Anything else interesting you want to tell us? -Maybe, I will consider DevRel as a career sometime because of my innate passion and love for community and people. Several folks tell me I'm a strong DevRel talent and a valuable asset for any product-led company. If you need someone to bounce ideas off of or discuss😃 your community engagement efforts, please feel free to reach out. +Maybe I will consider DevRel as a career sometime because of my innate passion and love for community and people. Several folks tell me I’m a strong DevRel talent and a valuable asset for any product-led company. If you need someone to bounce ideas off of or discuss your community engagement efforts, please feel free to reach out. On a side note, it was really exciting for me to attend Coalesce 2024 in Vegas in person, which allowed me not only to learn but, most importantly, to meet amazing persons I’ve only interacted with online, like Bruno, Kuberjain, Dakota and many more; shout-out to Zenlytic and Lightdash for making that possible and, most importantly, a huge shout-out to the dbt Lab community team: Amada, Natasha and everyone on the community team for their constant supports to helping out with making the dbt Lagos (Nigeria) meetup a success. 
diff --git a/website/docs/community/spotlight/jenna-jordan.md b/website/docs/community/spotlight/jenna-jordan.md new file mode 100644 index 00000000000..86f19f125f8 --- /dev/null +++ b/website/docs/community/spotlight/jenna-jordan.md @@ -0,0 +1,36 @@ +--- +id: jenna-jordan +title: Jenna Jordan +description: | + I am a Senior Data Management Consultant with Analytics8, where I advise clients on dbt best practices (especially regarding dbt Mesh and the various shifts in governance and strategy that come with it). My experiences working within a dbt Mesh architecture and all of the difficulties organizations could run into with such a major paradigm shift inspired my peer exchange (role-playing/simulation game) at Coalesce 2024: "Governance co-lab: We the people, in order to govern data, do establish processes." I also experimented with bringing role-playing scenarios to data problems at the September 2024 Chicago dbt Meetup, hosted by Analytics8. I occasionally write long blog posts on my website, if you're up for the read. +image: /img/community/spotlight/jenna-jordan.jpg +pronouns: she/her +location: Asheville, USA +jobTitle: Senior Data Management Consultant +companyName: Analytics8 +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/jennajordan1/ + - name: Personal website + link: https://jennajordan.me/ +dateCreated: 2024-11-01 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +My dbt learning journey kicked off with the CoRise (now Uplimit) course Analytics Engineering with dbt, with Emily Hawkins and Jake Hannan, in February 2022 – less than a month after starting as a data engineer with the City of Boston Analytics Team. About a year later, I spearheaded the adoption of dbt at the City and got to build the project and associated architecture from scratch – which is probably the best learning experience you could ask for! I saw the value dbt could bring to improving data management processes at the City, and I knew there were other cities and local governments that could benefit from dbt as well, which motivated me to find my fellow co-speakers Ian Rose and Laurie Merrell to give a talk at Coalesce 2023 called "From Coast to Coast: Implementing dbt in the public sector." As a part of our goal to identify and cultivate a community of dbt practitioners in the public (and adjacent) sectors, we also started the dbt Community Slack channel #industry-public-sector. That experience allowed me to continue to grow my career and find my current role - as well as connect with so many amazing data folks! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +There are many leaders in the dbt community that I admire and identify with – I won’t list them all out because I will invariably miss someone (but… you probably know who you are). Technical prowess is always enviable, but I most admire those who bring the human element to data work: those who aren’t afraid to be their authentic selves, cultivate a practice of empathy and compassion, and are driven by curiosity and a desire to help others. I’ve never set out to be a leader, and I still don’t really consider myself to be a leader – I’m much more comfortable in the role of a librarian. I just want to help people by connecting them to the information and resources that they may need. + +## What have you learned from community members? 
What do you hope others can learn from you? + +Pretty much everything I’ve learned about dbt and working in a mature analytics ecosystem I’ve learned from dbt community members. The dbt Community Slack is full of useful information and advice, and has also helped me identify experts about certain topics that I can chat with to learn even more. When I find someone sharing useful information, I usually try to find and follow them on social media so I can see more of their content. If there is one piece of advice I want to share, it is this: don’t be afraid to engage. Ask for help when you need it, but also offer help freely. Engage with the community with the same respect and grace you would offer your friends and coworkers. + +## Anything else interesting you want to tell us? + +Library Science is so much more than the Dewey Decimal System (seriously, ask a librarian about Dewey for a juicy rant). RDF triples (for knowledge graphs) are queried using SPARQL (pronounced “sparkle”). An antelope can be a document. The correct way to write a date/time is ISO-8601. The oldest known table (of the spreadsheet variety) is from 5,000 years ago – record-keeping predates literature by a significant margin. Zip codes aren’t polygons – they don’t contain an area or have boundaries. Computers don’t always return 0.3 when asked to add 0.1 + 0.2. SQL was the sequel to SQUARE. Before computers, people programmed looms (weaving is binary). What? You asked!! On a more serious note – data teams: start hiring librarians. No, seriously. No degree could have prepared me better for what I do in the data field than my M.S. in Library & Information Science. I promise, you want the skillset & mindset that a librarian will bring to your team. diff --git a/website/docs/community/spotlight/meagan-palmer.md b/website/docs/community/spotlight/meagan-palmer.md index ff45a3d6b7d..fffc2a6e0d6 100644 --- a/website/docs/community/spotlight/meagan-palmer.md +++ b/website/docs/community/spotlight/meagan-palmer.md @@ -3,8 +3,11 @@ id: meagan-palmer title: Meagan Palmer description: | I first started using dbt in 2016 or 2017 (I can't remember exactly). Since then, I have moved into data and analytics consulting and have dipped in and out of the dbt Community. + Late last year, I started leading dbt Cloud training courses and spending more time in the dbt Slack. + In consulting, I get to use a range of stacks. I've used dbt with Redshift, Snowflake, and Databricks in production settings with a range of loaders & reporting tools, and I've been enjoying using DuckDB for some home experimentation. + To share some of the experiences, I regularly post to LinkedIn and have recently started Analytics Engineering Today, a twice monthly newsletter about dbt in practice. image: /img/community/spotlight/Meagan-Palmer.png pronouns: she/her @@ -14,9 +17,10 @@ companyName: Altis Consulting socialLinks: - name: LinkedIn link: https://www.linkedin.com/in/meaganpalmer/ -dateCreated: 2024-07-29 +dateCreated: 2024-11-04 hide_table_of_contents: true -communityAward: false +communityAward: true +communityAwardYear: 2024 --- ## When did you join the dbt community and in what way has it impacted your career? @@ -27,9 +31,9 @@ I was fortunate that Jon Bradley at Nearmap had the vision to engage the then Fi Being in Australia, I often see replies from Jeremy Yeo to people in the dbt Slack. His clarity of communication is impressive. -For growth, I'm hoping that others can benefit from the wide range of experience I have. 
My newsletter, Analytics Engineering Today on LinkedIn aims to upskill the dbt Community and shed some light on some useful features that might not be well known. +For growth, I'm hoping that others can benefit from the wide range of experience I have. My LinkedIn Newsletter, Analytics Engineering Today aims to upskill the dbt Community and shed some light on some useful features that might not be well known. -I'll be at Coalesce and am doing some webinars/events later in the year. Come say hi, I love talking dbt and analytics engineering with people. +I was at Coalesce Onlineand am doing some webinars/events later in the year. Come say hi, I love talking dbt and analytics engineering with people. ## What have you learned from community members? What do you hope others can learn from you? diff --git a/website/docs/community/spotlight/mike-stanley.md b/website/docs/community/spotlight/mike-stanley.md new file mode 100644 index 00000000000..853b0e2f704 --- /dev/null +++ b/website/docs/community/spotlight/mike-stanley.md @@ -0,0 +1,30 @@ +--- +id: mike-stanley +title: Mike Stanley +description: | + I've split my time between financial services and the video games industry. Back when I wrote code every day, I worked in marketing analytics and marketing technology. I've been in the dbt community for about two years. I haven't authored any extensions to dbt's adapters yet but I've given feedback on proposed changes! +image: /img/community/spotlight/mike-stanley.jpg +pronouns: he/him +location: London, United Kingdom +jobTitle: Manager, Data +companyName: Freetrade +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/mike-stanley-31616994/ +dateCreated: 2024-11-05 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I've led data teams for almost ten years now and it can be a challenge to stay current on new technology when you're spending a lot of time on leadership and management. I joined the dbt Community to learn how to get more from it, how to solve problems and use more advanced features, and to learn best practices. I find that answering questions is the way I learn best, so I started helping people! + +## Which dbt Community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I hope that we can all continue to level up our dbt skills and leave the data environments that we work in better than we found them. + +## What have you learned from community members? What do you hope others can learn from you? + +Everything! People share so much about their best practices and when and how to deviate from them, interesting extensions to dbt that they've worked on, common bugs and problems, and how to think in a "dbtish" way. I couldn't have learned any of that without the community! 
diff --git a/website/docs/community/spotlight/original-dbt-athena-maintainers.md b/website/docs/community/spotlight/original-dbt-athena-maintainers.md new file mode 100644 index 00000000000..b3728a71d63 --- /dev/null +++ b/website/docs/community/spotlight/original-dbt-athena-maintainers.md @@ -0,0 +1,44 @@ +--- +id: original-dbt-athena-maintainers +title: The Original dbt-athena Maintainers +description: | + The original dbt-athena Maintainers is a group of 5 people—Jérémy Guiselin, Mattia, Jesse Dobbelaere, Serhii Dimchenko, and Nicola Corda—who met via dbt Slack in the #db-athena channel, with the aim to make make dbt-athena a production-ready adapter. + + In the first periods, Winter 2022 and Spring 2023, we focused on contributing directly to the adapter, adding relevant features like Iceberg and Lake Formation support, and stabilizing some internal behaviour. + + On a second iteration our role was triaging, providing community support and bug fixing. We encouraged community members to make their first contributions, and helped them to merge their PRs. +image: /img/community/spotlight/dbt-athena-groupheadshot.jpg +location: Europe +jobTitle: A group of data-engineers +companyName: Mix of companies +organization: dbt-athena (since November 2022) +socialLinks: + - name: Jérémy's LinkedIn + link: https://www.linkedin.com/in/jrmyy/ + - name: Mattia's LinkedIn + link: https://www.linkedin.com/in/mattia-sappa/ + - name: Jesse's LinkedIn + link: https://www.linkedin.com/in/dobbelaerejesse/ + - name: Serhii's LinkedIn + link: https://www.linkedin.com/in/serhii-dimchenko-075b3061/ + - name: Nicola's LinkedIn + link: https://www.linkedin.com/in/nicolacorda/ +dateCreated: 2024-11-06 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +The dbt community allowed the dbt-athena maintainers to meet each other, and share the common goal of making the dbt-athena adapter production-ready. + +## Which dbt Community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +As we grow, we are looking to embody democratic leadership. + +## What have you learned from community members? What do you hope others can learn from you? + +We learned that the power of the community was endless. People started to share best practises, and some of the best practises were incorporated directly in dbt-athena, allowing people to run the adapter smoothly in their production environment. +We reached a point where people started to ask advice for their AWS architecture, which we found pretty awesome. + diff --git a/website/docs/community/spotlight/ruth-onyekwe.md b/website/docs/community/spotlight/ruth-onyekwe.md new file mode 100644 index 00000000000..cf07e98a4f7 --- /dev/null +++ b/website/docs/community/spotlight/ruth-onyekwe.md @@ -0,0 +1,31 @@ +--- +id: ruth-onyekwe +title: Ruth Onyekwe +description: | + I've been working in the world of Data Analytics for over 5 years and have been part of the dbt community for the last 4. With a background in International Business and Digital Marketing, I experienced first hand the need for reliable data to fuel business decisions. This inspired a career move into the technology space to be able to work with the tools and the people that were facilitating this process. Today I am leading teams to deliver data modernization projects, as well as helping grow the analytics arm of my company on a day to day basis. 
I also have the privilege of organising the dbt Meetups in Barcelona, Spain - and am excited to continue to grow the community across Europe. +image: /img/community/spotlight/ruth-onyekwe.jpeg +pronouns: she/her +location: Madrid, Spain +jobTitle: Data Analytics Manager +companyName: Spaulding Ridge +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/ruth-onyekwe/ +dateCreated: 2024-11-07 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community in 2021, after meeting dbt Labs reps at a conference. Through partnering with dbt Labs and learning the technology, we (Spaulding Ridge) were able to open a whole new offering in our service catalogue, and meet the growing needs of our customers. + +## Which dbt Community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I identify with the transparent leaders - those willing to share their learnings, knowledge, and experiences. I want to encourage other dbt enthusiasts to stretch themselves professionally and actively participate in the analytics community. + +## What have you learned from community members? What do you hope others can learn from you? + +I've learnt that most of us working in data have experienced the same struggles, be it searching for the best testing frameworks, or deciding how to build optimised and scalable models, or searching for the answers to non-technical questions like how to best organise teams or how to communicate with business stakeholders and translate their needs - we're all faced with the same dilemmas. And the great thing I've learned being in the dbt community, is that if you're brave enough to share your stories, you'll connect with someone who has already gone through those experiences, and can help you reach a solution a lot faster than if you tried to start from scratch. + diff --git a/website/docs/docs/build/cumulative-metrics.md b/website/docs/docs/build/cumulative-metrics.md index 056ff79c6eb..b44918d2fbd 100644 --- a/website/docs/docs/build/cumulative-metrics.md +++ b/website/docs/docs/build/cumulative-metrics.md @@ -18,21 +18,21 @@ Note that we use the double colon (::) to indicate whether a parameter is nested -| Parameter |
Description
| Type | -| --------- | ----------- | ---- | -| `name` | The name of the metric. | Required | -| `description` | The description of the metric. | Optional | -| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | -| `label` | Required string that defines the display value in downstream tools. Accepts plain text, spaces, and quotes (such as `orders_total` or `"orders_total"`). | Required | -| `type_params` | The type parameters of the metric. Supports nested parameters indicated by the double colon, such as `type_params::measure`. | Required | -| `type_params::cumulative_type_params` | Allows you to add a `window`, `period_agg`, and `grain_to_date` configuration. Nested under `type_params`. | Optional | -| `cumulative_type_params::window` | The accumulation window, such as 1 month, 7 days, 1 year. This can't be used with `grain_to_date`. | Optional | -| `cumulative_type_params::grain_to_date` | Sets the accumulation grain, such as `month`, which will accumulate data for one month and then restart at the beginning of the next. This can't be used with `window`. | Optional | -| `cumulative_type_params::period_agg` | Specifies how to aggregate the cumulative metric when summarizing data to a different granularity. Can be used with grain_to_date. Options are
- `first` (Takes the first value within the period) <br /> - `last` (Takes the last value within the period) <br /> - `average` (Calculates the average value within the period).

Defaults to `first` if no `window` is specified. | Optional | -| `type_params::measure` | A dictionary describing the measure you will use. | Required | -| `measure::name` | The measure you are referencing. | Optional | -| `measure::fill_nulls_with` | Set the value in your metric definition instead of null (such as zero). | Optional | -| `measure::join_to_timespine` | Boolean that indicates if the aggregated measure should be joined to the time spine table to fill in missing dates. Default `false`. | Optional | +| Parameter |
Description
| Type | +|-------------|---------------------------------------------------|-----------| +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | Required string that defines the display value in downstream tools. Accepts plain text, spaces, and quotes (such as `orders_total` or `"orders_total"`). | Required | +| `type_params` | The type parameters of the metric. Supports nested parameters indicated by the double colon, such as `type_params::measure`. | Required | +| `type_params::measure` | The measure associated with the metric. Supports both shorthand (string) and object syntax. The shorthand is used if only the name is needed, while the object syntax allows specifying additional attributes. | Required | +| `measure::name` | The name of the measure being referenced. Required if using object syntax for `type_params::measure`. | Optional | +| `measure::fill_nulls_with` | Sets a value (for example, 0) to replace nulls in the metric definition. | Optional | +| `measure::join_to_timespine` | Boolean indicating if the aggregated measure should be joined to the time spine table to fill in missing dates. Default is `false`. | Optional | +| `type_params::cumulative_type_params` | Configures the attributes like `window`, `period_agg`, and `grain_to_date` for cumulative metrics. | Optional | +| `cumulative_type_params::window` | Specifies the accumulation window, such as `1 month`, `7 days`, or `1 year`. Cannot be used with `grain_to_date`. | Optional | +| `cumulative_type_params::grain_to_date` | Sets the accumulation grain, such as `month`, restarting accumulation at the beginning of each specified grain period. Cannot be used with `window`. | Optional | +| `cumulative_type_params::period_agg` | Defines how to aggregate the cumulative metric when summarizing data to a different granularity: `first`, `last`, or `average`. Defaults to `first` if `window` is not specified. | Optional |
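To make the parameters above concrete, a cumulative metric that uses the object-style measure input together with `cumulative_type_params` could look like the following sketch; the metric and measure names are examples only.

```yaml
metrics:
  - name: cumulative_order_total
    label: Cumulative order total
    description: The cumulative value of all orders.
    type: cumulative
    type_params:
      measure:
        name: order_total        # a measure defined in a semantic model
        fill_nulls_with: 0
        join_to_timespine: true
      cumulative_type_params:
        window: 1 month          # mutually exclusive with grain_to_date
        period_agg: first        # first | last | average
```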
@@ -45,15 +45,34 @@ Note that we use the double colon (::) to indicate whether a parameter is nested | `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | | `label` | Required string that defines the display value in downstream tools. Accepts plain text, spaces, and quotes (such as `orders_total` or `"orders_total"`). | Required | | `type_params` | The type parameters of the metric. Supports nested parameters indicated by the double colon, such as `type_params::measure`. | Required | -| `window` | The accumulation window, such as 1 month, 7 days, 1 year. This can't be used with `grain_to_date`. | Optional | +| `window` | The accumulation window, such as `1 month`, `7 days`, or `1 year`. This can't be used with `grain_to_date`. | Optional | | `grain_to_date` | Sets the accumulation grain, such as `month`, which will accumulate data for one month and then restart at the beginning of the next. This can't be used with `window`. | Optional | | `type_params::measure` | A list of measure inputs | Required | -| `measure:name` | The measure you are referencing. | Optional | +| `measure:name` | The name of the measure being referenced. Required if using object syntax for `type_params::measure`. | Optional | | `measure:fill_nulls_with` | Set the value in your metric definition instead of null (such as zero).| Optional | | `measure:join_to_timespine` | Boolean that indicates if the aggregated measure should be joined to the time spine table to fill in missing dates. Default `false`. | Optional |
+ + +The`type_params::measure` configuration can be written in different ways: +- Shorthand syntax — To only specify the name of the measure, use a simple string value. This is a shorthand approach when no other attributes are required. + ```yaml + type_params: + measure: revenue + ``` +- Object syntax — To add more details or attributes to the measure (such as adding a filter, handling `null` values, or specifying whether to join to a time spine), you need to use the object syntax. This allows for additional configuration beyond just the measure's name. + + ```yaml + type_params: + measure: + name: order_total + fill_nulls_with: 0 + join_to_timespine: true + ``` + + ### Complete specification The following displays the complete specification for cumulative metrics, along with an example: diff --git a/website/docs/docs/build/data-tests.md b/website/docs/docs/build/data-tests.md index 59d716b4ca9..afe4719768c 100644 --- a/website/docs/docs/build/data-tests.md +++ b/website/docs/docs/build/data-tests.md @@ -66,11 +66,25 @@ having total_amount < 0 -The name of this test is the name of the file: `assert_total_payment_amount_is_positive`. Simple enough. +The name of this test is the name of the file: `assert_total_payment_amount_is_positive`. -Singular data tests are easy to write—so easy that you may find yourself writing the same basic structure over and over, only changing the name of a column or model. By that point, the test isn't so singular! In that case, we recommend... +To add a description to a singular test in your project, add a `.yml` file to your `tests` directory, for example, `tests/schema.yml` with the following content: + +```yaml +version: 2 +data_tests: + - name: assert_total_payment_amount_is_positive + description: > + Refunds have a negative amount, so the total amount should always be >= 0. + Therefore return records where total amount < 0 to make the test fail. + +``` + + + +Singular data tests are so easy that you may find yourself writing the same basic structure repeatedly, only changing the name of a column or model. By that point, the test isn't so singular! In that case, we recommend generic data tests. ## Generic data tests Certain data tests are generic: they can be reused over and over again. A generic data test is defined in a `test` block, which contains a parametrized query and accepts arguments. It might look like: @@ -304,7 +318,6 @@ data_tests: -To suppress warnings about the rename, add `TestsConfigDeprecation` to the `silence` block of the `warn_error_options` flag in `dbt_project.yml`, [as described in the Warnings documentation](https://docs.getdbt.com/reference/global-configs/warnings).
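Since the revised passage above points readers toward generic data tests, a minimal sketch of that pattern may help; the test, model, and column names here are illustrative.

```sql
-- tests/generic/is_positive.sql (illustrative)
{% test is_positive(model, column_name) %}

select *
from {{ model }}
where {{ column_name }} < 0

{% endtest %}
```

```yaml
# models/schema.yml (illustrative usage)
version: 2
models:
  - name: payments
    columns:
      - name: total_amount
        data_tests:
          - is_positive
```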
diff --git a/website/docs/docs/build/dimensions.md b/website/docs/docs/build/dimensions.md index affd74f81aa..5026f4c45cd 100644 --- a/website/docs/docs/build/dimensions.md +++ b/website/docs/docs/build/dimensions.md @@ -12,7 +12,7 @@ Dimensions represent the non-aggregatable columns in your data set, which are th Groups are defined within semantic models, alongside entities and measures, and correspond to non-aggregatable columns in your dbt model that provides categorical or time-based context. In SQL, dimensions is typically included in the GROUP BY clause.--> -All dimensions require a `name`, `type`, and can optionally include an `expr` parameter. The `name` for your Dimension must be unique wihtin the same semantic model. +All dimensions require a `name`, `type`, and can optionally include an `expr` parameter. The `name` for your Dimension must be unique within the same semantic model. | Parameter | Description | Type | | --------- | ----------- | ---- | @@ -41,7 +41,7 @@ Refer to the following example to see how dimensions are used in a semantic mode semantic_models: - name: transactions description: A record for every transaction that takes place. Carts are considered multiple transactions for each SKU. - model: {{ ref("fact_transactions") }} + model: {{ ref('fact_transactions') }} defaults: agg_time_dimension: order_date # --- entities --- @@ -67,7 +67,7 @@ semantic_models: type: categorical ``` -Dimensions are bound to the primary entity of the semantic model they are defined in. For example the dimensoin `type` is defined in a model that has `transaction` as a primary entity. `type` is scoped to the `transaction` entity, and to reference this dimension you would use the fully qualified dimension name i.e `transaction__type`. +Dimensions are bound to the primary entity of the semantic model they are defined in. For example the dimension `type` is defined in a model that has `transaction` as a primary entity. `type` is scoped to the `transaction` entity, and to reference this dimension you would use the fully qualified dimension name i.e `transaction__type`. MetricFlow requires that all semantic models have a primary entity. This is to guarantee unique dimension names. If your data source doesn't have a primary entity, you need to assign the entity a name using the `primary_entity` key. It doesn't necessarily have to map to a column in that table and assigning the name doesn't affect query generation. We recommend making these "virtual primary entities" unique across your semantic model. An example of defining a primary entity for a data source that doesn't have a primary entity column is below: @@ -122,7 +122,7 @@ dbt sl query --metrics users_created,users_deleted --group-by metric_time__year mf query --metrics users_created,users_deleted --group-by metric_time__year --order-by metric_time__year ``` -You can set `is_partition` for time to define specific time spans. Additionally, use the `type_params` section to set `time_granularity` to adjust aggregation details (hourly, daily, weekly, and so on). +You can set `is_partition` for time to define specific time spans. Additionally, use the `type_params` section to set `time_granularity` to adjust aggregation details (daily, weekly, and so on). @@ -161,6 +161,8 @@ measures: + + `time_granularity` specifies the grain of a time dimension. MetricFlow will transform the underlying column to the specified granularity. 
For example, if you add hourly granularity to a time dimension column, MetricFlow will run a `date_trunc` function to convert the timestamp to hourly. You can easily change the time grain at query time and aggregate it to a coarser grain, for example, from hourly to monthly. However, you can't go from a coarser grain to a finer grain (monthly to hourly). Our supported granularities are: @@ -172,6 +174,7 @@ Our supported granularities are: * hour * day * week +* month * quarter * year @@ -204,6 +207,50 @@ measures: agg: sum ``` + + + + +`time_granularity` specifies the grain of a time dimension. MetricFlow will transform the underlying column to the specified granularity. For example, if you add daily granularity to a time dimension column, MetricFlow will run a `date_trunc` function to convert the timestamp to daily. You can easily change the time grain at query time and aggregate it to a coarser grain, for example, from daily to monthly. However, you can't go from a coarser grain to a finer grain (monthly to daily). + +Our supported granularities are: +* day +* week +* month +* quarter +* year + +Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the coarsest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. + +```yaml +dimensions: + - name: created_at + type: time + label: "Date of creation" + expr: ts_created # ts_created is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day + - name: deleted_at + type: time + label: "Date of deletion" + expr: ts_deleted # ts_deleted is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day + +measures: + - name: users_deleted + expr: 1 + agg: sum + agg_time_dimension: deleted_at + - name: users_created + expr: 1 + agg: sum +``` + + + @@ -313,7 +360,7 @@ Additionally, the entity is tagged as `natural` to differentiate it from a `prim semantic_models: - name: sales_person_tiers description: SCD Type II table of tiers for salespeople - model: {{ref(sales_person_tiers)}} + model: {{ ref('sales_person_tiers') }} defaults: agg_time_dimension: tier_start @@ -355,7 +402,7 @@ semantic_models: There is a transaction, product, sales_person, and customer id for every transaction. There is only one transaction id per transaction. The `metric_time` or date is reflected in UTC. - model: {{ ref(fact_transactions) }} + model: {{ ref('fact_transactions') }} defaults: agg_time_dimension: metric_time diff --git a/website/docs/docs/build/documentation.md b/website/docs/docs/build/documentation.md index d040d3c5bef..6f7c6c27f31 100644 --- a/website/docs/docs/build/documentation.md +++ b/website/docs/docs/build/documentation.md @@ -101,7 +101,18 @@ The events in this table are recorded by [Snowplow](http://github.com/snowplow/s In the above example, a docs block named `table_events` is defined with some descriptive markdown contents. There is nothing significant about the name `table_events` — docs blocks can be named however you like, as long as the name only contains alphanumeric and underscore characters and does not start with a numeric character. ### Placement -Docs blocks should be placed in files with a `.md` file extension. By default, dbt will search in all resource paths for docs blocks (i.e. 
the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [macro-paths](/reference/project-configs/macro-paths) and [snapshot-paths](/reference/project-configs/snapshot-paths)) — you can adjust this behavior using the [docs-paths](/reference/project-configs/docs-paths) config. + + + +Docs blocks should be placed in files with a `.md` file extension. By default, dbt will search in all resource paths for docs blocks (for example, the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [test-paths](/reference/project-configs/test-paths), [macro-paths](/reference/project-configs/macro-paths), and [snapshot-paths](/reference/project-configs/snapshot-paths)) — you can adjust this behavior using the [docs-paths](/reference/project-configs/docs-paths) config. + + + + + +Docs blocks should be placed in files with a `.md` file extension. By default, dbt will search in all resource paths for docs blocks (for example, the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [macro-paths](/reference/project-configs/macro-paths), and [snapshot-paths](/reference/project-configs/snapshot-paths)) — you can adjust this behavior using the [docs-paths](/reference/project-configs/docs-paths) config. + + ### Usage diff --git a/website/docs/docs/build/environment-variables.md b/website/docs/docs/build/environment-variables.md index 01601ce7eb8..b87786ac596 100644 --- a/website/docs/docs/build/environment-variables.md +++ b/website/docs/docs/build/environment-variables.md @@ -97,49 +97,56 @@ While all environment variables are encrypted at rest in dbt Cloud, dbt Cloud ha dbt Cloud has a number of pre-defined variables built in. Variables are set automatically and cannot be changed. -**dbt Cloud IDE details** +#### dbt Cloud IDE details The following environment variable is set automatically for the dbt Cloud IDE: -- `DBT_CLOUD_GIT_BRANCH`: Provides the development Git branch name in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). - - Available in dbt v 1.6 and later. +- `DBT_CLOUD_GIT_BRANCH` — Provides the development Git branch name in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). + - Available in dbt v1.6 and later. - The variable changes when the branch is changed. - Doesn't require restarting the IDE after a branch change. - Currently not available in the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation). Use case — This is useful in cases where you want to dynamically use the Git branch name as a prefix for a [development schema](/docs/build/custom-schemas) ( `{{ env_var ('DBT_CLOUD_GIT_BRANCH') }}` ). -**dbt Cloud context** +#### dbt Cloud context -The following environment variables are set automatically for deployment runs: +The following environment variables are set automatically: -- `DBT_ENV`: This key is reserved for the dbt Cloud application and will always resolve to 'prod' +- `DBT_ENV` — This key is reserved for the dbt Cloud application and will always resolve to 'prod'. For deployment runs only. +- `DBT_CLOUD_ENVIRONMENT_NAME` — The name of the dbt Cloud environment in which `dbt` is running. 
+- `DBT_CLOUD_ENVIRONMENT_TYPE` — The type of dbt Cloud environment in which `dbt` is running. The valid values are `development` or `deployment`. -**Run details** -- `DBT_CLOUD_PROJECT_ID`: The ID of the dbt Cloud Project for this run -- `DBT_CLOUD_JOB_ID`: The ID of the dbt Cloud Job for this run -- `DBT_CLOUD_RUN_ID`: The ID of this particular run -- `DBT_CLOUD_RUN_REASON_CATEGORY`: The "category" of the trigger for this run (one of: `scheduled`, `github_pull_request`, `gitlab_merge_request`, `azure_pull_request`, `other`) -- `DBT_CLOUD_RUN_REASON`: The specific trigger for this run (eg. `Scheduled`, `Kicked off by `, or custom via `API`) -- `DBT_CLOUD_ENVIRONMENT_ID`: The ID of the environment for this run -- `DBT_CLOUD_ACCOUNT_ID`: The ID of the dbt Cloud account for this run +#### Run details -**Git details** +- `DBT_CLOUD_PROJECT_ID` — The ID of the dbt Cloud Project for this run +- `DBT_CLOUD_JOB_ID` — The ID of the dbt Cloud Job for this run +- `DBT_CLOUD_RUN_ID` — The ID of this particular run +- `DBT_CLOUD_RUN_REASON_CATEGORY` — The "category" of the trigger for this run (one of: `scheduled`, `github_pull_request`, `gitlab_merge_request`, `azure_pull_request`, `other`) +- `DBT_CLOUD_RUN_REASON` — The specific trigger for this run (eg. `Scheduled`, `Kicked off by `, or custom via `API`) +- `DBT_CLOUD_ENVIRONMENT_ID` — The ID of the environment for this run +- `DBT_CLOUD_ACCOUNT_ID` — The ID of the dbt Cloud account for this run + +#### Git details _The following variables are currently only available for GitHub, GitLab, and Azure DevOps PR builds triggered via a webhook_ -- `DBT_CLOUD_PR_ID`: The Pull Request ID in the connected version control system -- `DBT_CLOUD_GIT_SHA`: The git commit SHA which is being run for this Pull Request build +- `DBT_CLOUD_PR_ID` — The Pull Request ID in the connected version control system +- `DBT_CLOUD_GIT_SHA` — The git commit SHA which is being run for this Pull Request build ### Example usage Environment variables can be used in many ways, and they give you the power and flexibility to do what you want to do more easily in dbt Cloud. -#### Clone private packages + + Now that you can set secrets as environment variables, you can pass git tokens into your package HTTPS URLs to allow for on-the-fly cloning of private repositories. Read more about enabling [private package cloning](/docs/build/packages#private-packages). -#### Dynamically set your warehouse in your Snowflake connection + + + + Environment variables make it possible to dynamically change the Snowflake virtual warehouse size depending on the job. Instead of calling the warehouse name directly in your project connection, you can reference an environment variable which will get set to a specific virtual warehouse at runtime. For example, suppose you'd like to run a full-refresh job in an XL warehouse, but your incremental job only needs to run in a medium-sized warehouse. Both jobs are configured in the same dbt Cloud environment. In your connection configuration, you can use an environment variable to set the warehouse name to `{{env_var('DBT_WAREHOUSE')}}`. Then in the job settings, you can set a different value for the `DBT_WAREHOUSE` environment variable depending on the job's workload. @@ -160,7 +167,10 @@ However, there are some limitations when using env vars with Snowflake OAuth Con Something to note, if you supply an environment variable in the account/host field, Snowflake OAuth Connection will **fail** to connect. 
This happens because the field doesn't pass through Jinja rendering, so dbt Cloud simply passes the literal `env_var` code into a URL string like `{{ env_var("DBT_ACCOUNT_HOST_NAME") }}.snowflakecomputing.com`, which is an invalid hostname. Use [extended attributes](/docs/deploy/deploy-environments#deployment-credentials) instead. ::: -#### Audit your run metadata + + + + Here's another motivating example that uses the dbt Cloud run ID, which is set automatically at each run. This additional data field can be used for auditing and debugging: ```sql @@ -186,3 +196,13 @@ select *, from users_aggregated ``` + + + + + +import SLEnvVars from '/snippets/_sl-env-vars.md'; + + + + diff --git a/website/docs/docs/build/incremental-microbatch.md b/website/docs/docs/build/incremental-microbatch.md new file mode 100644 index 00000000000..e1c39e6ae47 --- /dev/null +++ b/website/docs/docs/build/incremental-microbatch.md @@ -0,0 +1,309 @@ +--- +title: "About microbatch incremental models" +description: "Learn about the 'microbatch' strategy for incremental models." +id: "incremental-microbatch" +--- + +# About microbatch incremental models + +:::info Microbatch + +The `microbatch` strategy is available in beta for [dbt Cloud Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) and dbt Core v1.9. We have been developing it behind a flag to prevent unintended interactions with existing custom incremental strategies. To enable this feature, [set the environment variable](/docs/build/environment-variables#setting-and-overriding-environment-variables) `DBT_EXPERIMENTAL_MICROBATCH` to `True` in your dbt Cloud environments or wherever you're running dbt Core. + +Read and participate in the discussion: [dbt-core#10672](https://github.com/dbt-labs/dbt-core/discussions/10672) + +Refer to [Supported incremental strategies by adapter](/docs/build/incremental-strategy#supported-incremental-strategies-by-adapter) for a list of supported adapters. + +::: + +## What is "microbatch" in dbt? + +Incremental models in dbt are a [materialization](/docs/build/materializations) designed to efficiently update your data warehouse tables by only transforming and loading _new or changed data_ since the last run. Instead of reprocessing an entire dataset every time, incremental models process a smaller number of rows, and then append, update, or replace those rows in the existing table. This can significantly reduce the time and resources required for your data transformations. + +Microbatch incremental models make it possible to process transformations on very large time-series datasets with efficiency and resiliency. When dbt runs a microbatch model — whether for the first time, during incremental runs, or in specified backfills — it will split the processing into multiple queries (or "batches"), based on the `event_time` and `batch_size` you configure. + +Each "batch" corresponds to a single bounded time period (by default, a single day of data). Where other incremental strategies operate only on "old" and "new" data, microbatch models treat every batch as an atomic unit that can be built or replaced on its own. Each batch is independent and idempotent. This is a powerful abstraction that makes it possible for dbt to run batches separately — in the future, concurrently — and to retry them independently. + +### Example + +A `sessions` model aggregates and enriches data that comes from two other models. +- `page_views` is a large, time-series table.
It contains many rows, new records almost always arrive after existing ones, and existing records rarely update. +- `customers` is a relatively small dimensional table. Customer attributes update often, and not in a time-based manner — that is, older customers are just as likely to change column values as newer customers. + +The `page_view_start` column in `page_views` is configured as that model's `event_time`. The `customers` model does not configure an `event_time`. Therefore, each batch of `sessions` will filter `page_views` to the equivalent time-bounded batch, and it will not filter `customers` (a full scan for every batch). + + + +```yaml +models: + - name: page_views + config: + event_time: page_view_start +``` + + +We run the `sessions` model on October 1, 2024, and then again on October 2. It produces the following queries: + + + + + +The `event_time` for the `sessions` model is set to `session_start`, which marks the beginning of a user’s session on the website. This setting allows dbt to combine multiple page views (each tracked by their own `page_view_start` timestamps) into a single session. This way, `session_start` differentiates the timing of individual page views from the broader timeframe of the entire user session. + + + +```sql +{{ config( + materialized='incremental', + incremental_strategy='microbatch', + event_time='session_start', + begin='2020-01-01', + batch_size='day' +) }} + +with page_views as ( + + -- this ref will be auto-filtered + select * from {{ ref('page_views') }} + +), + +customers as ( + + -- this ref won't + select * from {{ ref('customers') }} + +) + +select + page_views.id as session_id, + page_views.page_view_start as session_start, + customers.* + from page_views + left join customers + on page_views.customer_id = customers.id +``` + + + + + + + + + +```sql + +with page_views as ( + + select * from ( + -- filtered on configured event_time + select * from "analytics"."page_views" + where page_view_start >= '2024-10-01 00:00:00' -- Oct 1 + and page_view_start < '2024-10-02 00:00:00' + ) + +), + +customers as ( + + select * from "analytics"."customers" + +), + +... +``` + + + + + + + + + +```sql + +with page_views as ( + + select * from ( + -- filtered on configured event_time + select * from "analytics"."page_views" + where page_view_start >= '2024-10-02 00:00:00' -- Oct 2 + and page_view_start < '2024-10-03 00:00:00' + ) + +), + +customers as ( + + select * from "analytics"."customers" + +), + +... +``` + + + + + + + +dbt will instruct the data platform to take the result of each batch query and insert, update, or replace the contents of the `analytics.sessions` table for the same day of data. To perform this operation, dbt will use the most efficient atomic mechanism for "full batch" replacement that is available on each data platform. + +It does not matter whether the table already contains data for that day. Given the same input data, the resulting table is the same no matter how many times a batch is reprocessed. + + + +### Relevant configs + +Several configurations are relevant to microbatch models, and some are required: + +| Config | Type | Description | Default | +|----------|------|---------------|---------| +| `event_time` | Column (required) | The column indicating "at what time did the row occur." Required for your microbatch model and any direct parents that should be filtered. | N/A | +| `begin` | Date (required) | The "beginning of time" for the microbatch model. This is the starting point for any initial or full-refresh builds.
For example, a daily-grain microbatch model run on `2024-10-01` with `begin = '2023-10-01` will process 366 batches (it's a leap year!) plus the batch for "today." | N/A | +| `batch_size` | String (required) | The granularity of your batches. Supported values are `hour`, `day`, `month`, and `year` | N/A | +| `lookback` | Integer (optional) | Process X batches prior to the latest bookmark to capture late-arriving records. | `1` | + + + +As a best practice, we recommend configuring `full_refresh: False` on microbatch models so that they ignore invocations with the `--full-refresh` flag. If you need to reprocess historical data, do so with a targeted backfill that specifies explicit start and end dates. + +### Usage + +**You must write your model query to process (read and return) exactly one "batch" of data**. This is a simplifying assumption and a powerful one: +- You don’t need to think about `is_incremental` filtering +- You don't need to pick among DML strategies (upserting/merging/replacing) +- You can preview your model, and see the exact records for a given batch that will appear when that batch is processed and written to the table + +When you run a microbatch model, dbt will evaluate which batches need to be loaded, break them up into a SQL query per batch, and load each one independently. + +dbt will automatically filter upstream inputs (`source` or `ref`) that define `event_time`, based on the `lookback` and `batch_size` configs for this model. + +During standard incremental runs, dbt will process batches according to the current timestamp and the configured `lookback`, with one query per batch. + + + +**Note:** If there’s an upstream model that configures `event_time`, but you *don’t* want the reference to it to be filtered, you can specify `ref('upstream_model').render()` to opt-out of auto-filtering. This isn't generally recommended — most models that configure `event_time` are fairly large, and if the reference is not filtered, each batch will perform a full scan of this input table. + +### Backfills + +Whether to fix erroneous source data or retroactively apply a change in business logic, you may need to reprocess a large amount of historical data. + +Backfilling a microbatch model is as simple as selecting it to run or build, and specifying a "start" and "end" for `event_time`. Note that `--event-time-start` and `--event-time-end` are mutually necessary, meaning that if you specify one, you must specify the other. + +As always, dbt will process the batches between the start and end as independent queries. + +```bash +dbt run --event-time-start "2024-09-01" --event-time-end "2024-09-04" +``` + + + + +### Retry + +If one or more of your batches fail, you can use `dbt retry` to reprocess _only_ the failed batches. + +![Partial retry](https://github.com/user-attachments/assets/f94c4797-dcc7-4875-9623-639f70c97b8f) + +### Timezones + +For now, dbt assumes that all values supplied are in UTC: + +- `event_time` +- `begin` +- `--event-time-start` +- `--event-time-end` + +While we may consider adding support for custom time zones in the future, we also believe that defining these values in UTC makes everyone's lives easier. + +## How `microbatch` compares to other incremental strategies? + +Most incremental models rely on the end user (you) to explicitly tell dbt what "new" means, in the context of each model, by writing a filter in an `{% if is_incremental() %}` conditional block. 
You are responsible for crafting this SQL in a way that queries [`{{ this }}`](/reference/dbt-jinja-functions/this) to check when the most recent record was last loaded, with an optional look-back window for late-arriving records. + +Other incremental strategies will control _how_ the data is being added into the table — whether append-only `insert`, `delete` + `insert`, `merge`, `insert overwrite`, etc — but they all have this in common. + +As an example: + +```sql +{{ + config( + materialized='incremental', + incremental_strategy='delete+insert', + unique_key='date_day' + ) +}} + +select * from {{ ref('stg_events') }} + + {% if is_incremental() %} + -- this filter will only be applied on an incremental run + -- add a lookback window of 3 days to account for late-arriving records + where date_day >= (select {{ dbt.dateadd("day", -3, "max(date_day)") }} from {{ this }}) + {% endif %} + +``` + +For this incremental model: + +- "New" records are those with a `date_day` greater than the maximum `date_day` that has previously been loaded +- The lookback window is 3 days +- When there are new records for a given `date_day`, the existing data for `date_day` is deleted and the new data is inserted + +Let’s take our same example from before, and instead use the new `microbatch` incremental strategy: + + + +```sql +{{ + config( + materialized='incremental', + incremental_strategy='microbatch', + event_time='event_occured_at', + batch_size='day', + lookback=3, + begin='2020-01-01', + full_refresh=false + ) +}} + +select * from {{ ref('stg_events') }} -- this ref will be auto-filtered +``` + + + +Where you’ve also set an `event_time` for the model’s direct parents - in this case, `stg_events`: + + + +```yaml +models: + - name: stg_events + config: + event_time: my_time_field +``` + + + +And that’s it! + +When you run the model, each batch templates a separate query. For example, if you were running the model on October 1, dbt would template separate queries for each day between September 28 and October 1, inclusive — four batches in total. + +The query for `2024-10-01` would look like: + + + +```sql +select * from ( + select * from "analytics"."stg_events" + where my_time_field >= '2024-10-01 00:00:00' + and my_time_field < '2024-10-02 00:00:00' +) +``` + + + +Based on your data platform, dbt will choose the most efficient atomic mechanism to insert, update, or replace these four batches (`2024-09-28`, `2024-09-29`, `2024-09-30`, and `2024-10-01`) in the existing table. diff --git a/website/docs/docs/build/incremental-models-overview.md b/website/docs/docs/build/incremental-models-overview.md index 16c950eb331..bddc6b0a55d 100644 --- a/website/docs/docs/build/incremental-models-overview.md +++ b/website/docs/docs/build/incremental-models-overview.md @@ -42,4 +42,5 @@ Transaction management, a process used in certain data platforms, ensures that a ## Related docs - [Incremental models](/docs/build/incremental-models) to learn how to configure incremental models in dbt. - [Incremental strategies](/docs/build/incremental-strategy) to understand how dbt implements incremental models on different databases. +- [Microbatch](/docs/build/incremental-strategy) to understand a new incremental strategy intended for efficient and resilient processing of very large time-series datasets. - [Materializations best practices](/best-practices/materializations/1-guide-overview) to learn about the best practices for using materializations in dbt. 
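To tie the microbatch walkthrough above back to the command line, here is a minimal sketch of building and then selectively retrying batches. The model name `my_microbatch_model` is a placeholder; the commands and flags (`dbt run`, `dbt retry`, `--event-time-start`, `--event-time-end`) are the ones described earlier on the microbatch page.

```bash
# Build the microbatch model; dbt templates one query per batch
# based on the configured batch_size and lookback.
dbt run --select my_microbatch_model   # model name is a placeholder

# If one or more batches fail, reprocess only the failed batches.
dbt retry

# Targeted backfill of a specific window; both flags must be supplied together.
dbt run --select my_microbatch_model --event-time-start "2024-09-01" --event-time-end "2024-09-04"
```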
diff --git a/website/docs/docs/build/incremental-models.md b/website/docs/docs/build/incremental-models.md index 2f8bbc46c3a..2968496290a 100644 --- a/website/docs/docs/build/incremental-models.md +++ b/website/docs/docs/build/incremental-models.md @@ -94,7 +94,7 @@ Not specifying a `unique_key` will result in append-only behavior, which means d The optional `unique_key` parameter specifies a field (or combination of fields) that defines the grain of your model. That is, the field(s) identify a single unique row. You can define `unique_key` in a configuration block at the top of your model, and it can be a single column name or a list of column names. -The `unique_key` should be supplied in your model definition as a string representing a single column or a list of single-quoted column names that can be used together, for example, `['col1', 'col2', …])`. Columns used in this way should not contain any nulls, or the incremental model run may fail. Either ensure that each column has no nulls (for example with `coalesce(COLUMN_NAME, 'VALUE_IF_NULL')`), or define a single-column [surrogate key](/terms/surrogate-key) (for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)). +The `unique_key` should be supplied in your model definition as a string representing a single column or a list of single-quoted column names that can be used together, for example, `['col1', 'col2', …])`. Columns used in this way should not contain any nulls, or the incremental model may fail to match rows and generate duplicate rows. Either ensure that each column has no nulls (for example with `coalesce(COLUMN_NAME, 'VALUE_IF_NULL')`) or define a single-column [surrogate key](https://www.getdbt.com/blog/guide-to-surrogate-key) (for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)). :::tip In cases where you need multiple columns in combination to uniquely identify each row, we recommend you pass these columns as a list (`unique_key = ['user_id', 'session_number']`), rather than a string expression (`unique_key = 'concat(user_id, session_number)'`). @@ -103,7 +103,7 @@ By using the first syntax, which is more universal, dbt can ensure that the colu When you pass a list in this way, please ensure that each column does not contain any nulls, or the incremental model run may fail. -Alternatively, you can define a single-column [surrogate key](/terms/surrogate-key), for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source). +Alternatively, you can define a single-column [surrogate key](https://www.getdbt.com/blog/guide-to-surrogate-key), for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source). ::: When you define a `unique_key`, you'll see this behavior for each row of "new" data returned by your dbt model: @@ -111,7 +111,7 @@ When you define a `unique_key`, you'll see this behavior for each row of "new" d * If the same `unique_key` is present in the "new" and "old" model data, dbt will update/replace the old row with the new row of data. The exact mechanics of how that update/replace takes place will vary depending on your database, [incremental strategy](/docs/build/incremental-strategy), and [strategy specific configs](/docs/build/incremental-strategy#strategy-specific-configs). 
* If the `unique_key` is _not_ present in the "old" data, dbt will insert the entire row into the table. -Please note that if there's a unique_key with more than one row in either the existing target table or the new incremental rows, the incremental model may fail depending on your database and [incremental strategy](/docs/build/incremental-strategy). If you're having issues running an incremental model, it's a good idea to double check that the unique key is truly unique in both your existing database table and your new incremental rows. You can [learn more about surrogate keys here](/terms/surrogate-key). +Please note that if there's a unique_key with more than one row in either the existing target table or the new incremental rows, the incremental model may fail depending on your database and [incremental strategy](/docs/build/incremental-strategy). If you're having issues running an incremental model, it's a good idea to double check that the unique key is truly unique in both your existing database table and your new incremental rows. You can [learn more about surrogate keys here](https://www.getdbt.com/blog/guide-to-surrogate-key). :::info While common incremental strategies, such as`delete+insert` + `merge`, might use `unique_key`, others don't. For example, the `insert_overwrite` strategy does not use `unique_key`, because it operates on partitions of data rather than individual rows. For more information, see [About incremental_strategy](/docs/build/incremental-strategy). diff --git a/website/docs/docs/build/incremental-strategy.md b/website/docs/docs/build/incremental-strategy.md index 8e86da0eba8..30de135b09b 100644 --- a/website/docs/docs/build/incremental-strategy.md +++ b/website/docs/docs/build/incremental-strategy.md @@ -10,32 +10,31 @@ There are various strategies to implement the concept of incremental materializa * The reliability of your `unique_key`. * The support of certain features in your data platform. -An optional `incremental_strategy` config is provided in some adapters that controls the code that dbt uses -to build incremental models. +An optional `incremental_strategy` config is provided in some adapters that controls the code that dbt uses to build incremental models. -### Supported incremental strategies by adapter - -Click the name of the adapter in the below table for more information about supported incremental strategies. +:::info Microbatch -The `merge` strategy is available in dbt-postgres and dbt-redshift beginning in dbt v1.6. +The [`microbatch` incremental strategy](/docs/build/incremental-microbatch) is intended for large time-series datasets. dbt will process the incremental model in multiple queries (or "batches") based on a configured `event_time` column. Depending on the volume and nature of your data, this can be more efficient and resilient than using a single query for adding new data. 
-| data platform adapter | `append` | `merge` | `delete+insert` | `insert_overwrite` | -|-----------------------------------------------------------------------------------------------------|:--------:|:-------:|:---------------:|:------------------:| -| [dbt-postgres](/reference/resource-configs/postgres-configs#incremental-materialization-strategies) | ✅ | ✅ | ✅ | | -| [dbt-redshift](/reference/resource-configs/redshift-configs#incremental-materialization-strategies) | ✅ | ✅ | ✅ | | -| [dbt-bigquery](/reference/resource-configs/bigquery-configs#merge-behavior-incremental-models) | | ✅ | | ✅ | -| [dbt-spark](/reference/resource-configs/spark-configs#incremental-models) | ✅ | ✅ | | ✅ | -| [dbt-databricks](/reference/resource-configs/databricks-configs#incremental-models) | ✅ | ✅ | | ✅ | -| [dbt-snowflake](/reference/resource-configs/snowflake-configs#merge-behavior-incremental-models) | ✅ | ✅ | ✅ | | -| [dbt-trino](/reference/resource-configs/trino-configs#incremental) | ✅ | ✅ | ✅ | | -| [dbt-fabric](/reference/resource-configs/fabric-configs#incremental) | ✅ | | ✅ | | +::: +### Supported incremental strategies by adapter -:::note Snowflake Configurations +This table represents the availability of each incremental strategy, based on the latest version of dbt Core and each adapter. -dbt has changed the default materialization for incremental table merges from `temporary table` to `view`. For more information about this change and instructions for setting the configuration to a temp table, please read about [Snowflake temporary tables](/reference/resource-configs/snowflake-configs#temporary-tables). +Click the name of the adapter in the below table for more information about supported incremental strategies. -::: +| Data platform adapter | `append` | `merge` | `delete+insert` | `insert_overwrite` | `microbatch` | +|-----------------------|:--------:|:-------:|:---------------:|:------------------:|:-------------------:| +| [dbt-postgres](/reference/resource-configs/postgres-configs#incremental-materialization-strategies) | ✅ | ✅ | ✅ | | ✅ | +| [dbt-redshift](/reference/resource-configs/redshift-configs#incremental-materialization-strategies) | ✅ | ✅ | ✅ | | | +| [dbt-bigquery](/reference/resource-configs/bigquery-configs#merge-behavior-incremental-models) | | ✅ | | ✅ | ✅ | +| [dbt-spark](/reference/resource-configs/spark-configs#incremental-models) | ✅ | ✅ | | ✅ | ✅ | +| [dbt-databricks](/reference/resource-configs/databricks-configs#incremental-models) | ✅ | ✅ | | ✅ | | +| [dbt-snowflake](/reference/resource-configs/snowflake-configs#merge-behavior-incremental-models) | ✅ | ✅ | ✅ | | ✅ | +| [dbt-trino](/reference/resource-configs/trino-configs#incremental) | ✅ | ✅ | ✅ | | | +| [dbt-fabric](/reference/resource-configs/fabric-configs#incremental) | ✅ | ✅ | ✅ | | | +| [dbt-athena](/reference/resource-configs/athena-configs#incremental-models) | ✅ | ✅ | | ✅ | | ### Configuring incremental strategy @@ -200,6 +199,7 @@ Before diving into [custom strategies](#custom-strategies), it's important to un | `delete+insert` | `get_incremental_delete_insert_sql` | | `merge` | `get_incremental_merge_sql` | | `insert_overwrite` | `get_incremental_insert_overwrite_sql` | +| `microbatch` | `get_incremental_microbatch_sql` | For example, a built-in strategy for the `append` can be defined and used with the following files: diff --git a/website/docs/docs/build/measures.md b/website/docs/docs/build/measures.md index efd732f4428..d60aa3f7e21 100644 --- a/website/docs/docs/build/measures.md +++ 
b/website/docs/docs/build/measures.md @@ -102,7 +102,7 @@ semantic_models: description: A record of every transaction that takes place. Carts are considered multiple transactions for each SKU. model: ref('schema.transactions') defaults: - agg_time_dimensions: metric_time + agg_time_dimension: transaction_date # --- entities --- entities: @@ -167,7 +167,7 @@ semantic_models: # --- dimensions --- dimensions: - - name: metric_time + - name: transaction_date type: time expr: date_trunc('day', ts) # expr refers to underlying column ts type_params: @@ -200,19 +200,19 @@ Parameters under the `non_additive_dimension` will specify dimensions that the m ```yaml semantic_models: - - name: subscription_id + - name: subscriptions description: A subscription table with one row per date for each active user and their subscription plans. model: ref('your_schema.subscription_table') defaults: - agg_time_dimension: metric_time + agg_time_dimension: subscription_date entities: - name: user_id type: foreign - primary_entity: subscription_table + primary_entity: subscription dimensions: - - name: metric_time + - name: subscription_date type: time expr: date_transaction type_params: @@ -224,21 +224,21 @@ semantic_models: expr: user_id agg: count_distinct non_additive_dimension: - name: metric_time + name: subscription_date window_choice: max - name: mrr description: Aggregate by summing all users' active subscription plans expr: subscription_value agg: sum non_additive_dimension: - name: metric_time + name: subscription_date window_choice: max - name: user_mrr description: Group by user_id to achieve each user's MRR expr: subscription_value agg: sum non_additive_dimension: - name: metric_time + name: subscription_date window_choice: max window_groupings: - user_id @@ -255,15 +255,15 @@ We can query the semi-additive metrics using the following syntax: For dbt Cloud: ```bash -dbt sl query --metrics mrr_by_end_of_month --group-by metric_time__month --order metric_time__month -dbt sl query --metrics mrr_by_end_of_month --group-by metric_time__week --order metric_time__week +dbt sl query --metrics mrr_by_end_of_month --group-by subscription__subscription_date__month --order subscription__subscription_date__month +dbt sl query --metrics mrr_by_end_of_month --group-by subscription__subscription_date__week --order subscription__subscription_date__week ``` For dbt Core: ```bash -mf query --metrics mrr_by_end_of_month --group-by metric_time__month --order metric_time__month -mf query --metrics mrr_by_end_of_month --group-by metric_time__week --order metric_time__week +mf query --metrics mrr_by_end_of_month --group-by subscription__subscription_date__month --order subscription__subscription_date__month +mf query --metrics mrr_by_end_of_month --group-by subscription__subscription_date__week --order subscription__subscription_date__week ``` import SetUpPages from '/snippets/_metrics-dependencies.md'; diff --git a/website/docs/docs/build/metricflow-commands.md b/website/docs/docs/build/metricflow-commands.md index 1f50e501261..2da5618b86f 100644 --- a/website/docs/docs/build/metricflow-commands.md +++ b/website/docs/docs/build/metricflow-commands.md @@ -8,7 +8,7 @@ tags: [Metrics, Semantic Layer] Once you define metrics in your dbt project, you can query metrics, dimensions, and dimension values, and validate your configs using the MetricFlow commands. 
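As a quick orientation before the command reference that follows, here is a hedged sketch of that workflow from the command line, reusing the `mrr_by_end_of_month` metric and `subscription__subscription_date` dimension from the example above. dbt Cloud CLI syntax is shown; dbt Core users would run the equivalent `mf` commands.

```bash
# List the metrics defined in the project, and the dimensions available for one of them
dbt sl list metrics
dbt sl list dimensions --metrics mrr_by_end_of_month

# Query a metric grouped by a time dimension
dbt sl query --metrics mrr_by_end_of_month --group-by subscription__subscription_date__month

# Validate the semantic model configurations
dbt sl validate
```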
-MetricFlow allows you to define and query metrics in your dbt project in the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation), [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud), or [dbt Core](/docs/core/installation-overview). To experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and dynamically query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. +MetricFlow allows you to define and query metrics in your dbt project in the [dbt Cloud](/docs/cloud/about-develop-dbt) or [dbt Core](/docs/core/installation-overview). To experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and dynamically query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. MetricFlow is compatible with Python versions 3.8, 3.9, 3.10, and 3.11. @@ -18,67 +18,70 @@ MetricFlow is a dbt package that allows you to define and query metrics in your Using MetricFlow with dbt Cloud means you won't need to manage versioning — your dbt Cloud account will automatically manage the versioning. -**dbt Cloud jobs** — MetricFlow commands aren't supported in dbt Cloud jobs yet. However, you can add MetricFlow validations with your git provider (such as GitHub Actions) by installing MetricFlow (`python -m pip install metricflow`). This allows you to run MetricFlow commands as part of your continuous integration checks on PRs. +dbt Cloud jobs support the `dbt sl validate` command to [automatically test your semantic nodes](/docs/deploy/ci-jobs#semantic-validations-in-ci). You can also add MetricFlow validations with your git provider (such as GitHub Actions) by installing MetricFlow (`python -m pip install metricflow`). This allows you to run MetricFlow commands as part of your continuous integration checks on PRs. - + -- MetricFlow [commands](#metricflow-commands) are embedded in the dbt Cloud CLI. This means you can immediately run them once you install the dbt Cloud CLI and don't need to install MetricFlow separately. -- You don't need to manage versioning — your dbt Cloud account will automatically manage the versioning for you. - - +In dbt Cloud, run MetricFlow commands directly in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or in the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation). - - -:::info -You can create metrics using MetricFlow in the dbt Cloud IDE and run the [dbt sl validate](/docs/build/validation#validations-command) command. Support for running more MetricFlow commands in the IDE will be available soon. -::: +For dbt Cloud CLI users, MetricFlow commands are embedded in the dbt Cloud CLI, which means you can immediately run them once you install the dbt Cloud CLI and don't need to install MetricFlow separately. You don't need to manage versioning because your dbt Cloud account will automatically manage the versioning for you. - - -:::tip Use dbt Cloud CLI for semantic layer development - -You can use the dbt Cloud CLI for the experience in defining and querying metrics in your dbt project. - -A benefit to using the dbt Cloud is that you won't need to manage versioning — your dbt Cloud account will automatically manage the versioning. -::: + You can install [MetricFlow](https://github.com/dbt-labs/metricflow#getting-started) from [PyPI](https://pypi.org/project/dbt-metricflow/). 
You need to use `pip` to install MetricFlow on Windows or Linux operating systems: + + 1. Create or activate your virtual environment `python -m venv venv` 2. Run `pip install dbt-metricflow` * You can install MetricFlow using PyPI as an extension of your dbt adapter in the command line. To install the adapter, run `python -m pip install "dbt-metricflow[your_adapter_name]"` and add the adapter name at the end of the command. For example, for a Snowflake adapter run `python -m pip install "dbt-metricflow[snowflake]"` -**Note**, you'll need to manage versioning between dbt Core, your adapter, and MetricFlow. + - + + +1. Create or activate your virtual environment `python -m venv venv` +2. Run `pip install dbt-metricflow` + * You can install MetricFlow using PyPI as an extension of your dbt adapter in the command line. To install the adapter, run `python -m pip install "dbt-metricflow[adapter_package_name]"` and add the adapter name at the end of the command. For example, for a Snowflake adapter run `python -m pip install "dbt-metricflow[dbt-snowflake]"` - + + +**Note**, you'll need to manage versioning between dbt Core, your adapter, and MetricFlow. Something to note, MetricFlow `mf` commands return an error if you have a Metafont latex package installed. To run `mf` commands, uninstall the package. + + + ## MetricFlow commands MetricFlow provides the following commands to retrieve metadata and query metrics. - + -You can use the `dbt sl` prefix before the command name to execute them in the dbt Cloud CLI. For example, to list all metrics, run `dbt sl list metrics`. For a complete list of the MetricFlow commands and flags, run the `dbt sl --help` command in your terminal. +You can use the `dbt sl` prefix before the command name to execute them in the dbt Cloud IDE or dbt Cloud CLI. For example, to list all metrics, run `dbt sl list metrics`. + +dbt Cloud CLI users can run `dbt sl --help` in the terminal for a complete list of the MetricFlow commands and flags. + +The following table lists the commands compatible with the dbt Cloud IDE and dbt Cloud CLI: + +|
Command | Description | dbt Cloud IDE | dbt Cloud CLI | +|---------|-------------|---------------|---------------| +| [`list metrics`](#list-metrics) | Lists metrics with dimensions. | ✅ | ✅ | +| [`list dimensions`](#list) | Lists unique dimensions for metrics. | ✅ | ✅ | +| [`list dimension-values`](#list-dimension-values) | List dimensions with metrics. | ✅ | ✅ | +| [`list entities`](#list-entities) | Lists all unique entities. | ✅ | ✅ | +| [`list saved-queries`](#list-saved-queries) | Lists available saved queries. Use the `--show-exports` flag to display each export listed under a saved query or `--show-parameters` to show the full query parameters each saved query uses. | ✅ | ✅ | +| [`query`](#query) | Query metrics, saved queries, and dimensions you want to see in the command line interface. Refer to [query examples](#query-examples) to help you get started. | ✅ | ✅ | +| [`validate`](#validate) | Validates semantic model configurations. | ✅ | ✅ | +| [`export`](#export) | Runs exports for a singular saved query for testing and generating exports in your development environment. You can also use the `--select` flag to specify particular exports from a saved query. | ❌ | ✅ | +| [`export-all`](#export-all) | Runs exports for multiple saved queries at once, saving time and effort. | ❌ | ✅ | -- [`list`](#list) — Retrieves metadata values. -- [`list metrics`](#list-metrics) — Lists metrics with dimensions. -- [`list dimensions`](#list) — Lists unique dimensions for metrics. -- [`list dimension-values`](#list-dimension-values) — List dimensions with metrics. -- [`list entities`](#list-entities) — Lists all unique entities. -- [`list saved-queries`](#list-saved-queries) — Lists available saved queries. Use the `--show-exports` flag to display each export listed under a saved query. -- [`query`](#query) — Query metrics, saved queries, and dimensions you want to see in the command line interface. Refer to [query examples](#query-examples) to help you get started. -- [`export`](#export) — Runs exports for a singular saved query for testing and generating exports in your development environment. You can also use the `--select` flag to specify particular exports from a saved query. -- [`export-all`](#export-all) — Runs exports for multiple saved queries at once, saving time and effort. -- [`validate`](#validate) — Validates semantic model configurations. **Query** ```bash # In dbt Core -mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order-by -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' +mf query --metrics order_total --group-by order_id__is_food_order --limit 10 --order-by -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' ``` **Result** @@ -502,8 +496,6 @@ The following tabs present additional query examples, like exporting to a CSV. S - - Add `--compile` (or `--explain` for dbt Core users) to your query to view the SQL generated by MetricFlow.
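A minimal illustration, reusing the `order_total` example from above; only the added flag changes:

```bash
# dbt Cloud CLI: print the generated SQL instead of running the query
dbt sl query --metrics order_total --group-by metric_time,order_id__is_food_order --compile

# dbt Core: the equivalent flag is --explain
mf query --metrics order_total --group-by metric_time,order_id__is_food_order --explain
```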
@@ -522,24 +514,24 @@ mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 - ```bash ✔ Success 🦄 - query completed after 0.28 seconds 🔎 SQL (remove --compile to see data or add --show-dataflow-plan to see the generated dataflow plan): -SELECT +select metric_time , is_food_order - , SUM(order_cost) AS order_total -FROM ( - SELECT - cast(ordered_at as date) AS metric_time + , sum(order_cost) as order_total +from ( + select + cast(ordered_at as date) as metric_time , is_food_order , order_cost - FROM ANALYTICS.js_dbt_sl_demo.orders orders_src_1 - WHERE cast(ordered_at as date) BETWEEN CAST('2017-08-22' AS TIMESTAMP) AND CAST('2017-08-27' AS TIMESTAMP) + from analytics.js_dbt_sl_demo.orders orders_src_1 + where cast(ordered_at as date) between cast('2017-08-22' as timestamp) and cast('2017-08-27' as timestamp) ) subq_3 -WHERE is_food_order = True -GROUP BY +where is_food_order = True +group by metric_time , is_food_order -ORDER BY metric_time DESC -LIMIT 10 +order by metric_time desc +limit 10 ``` diff --git a/website/docs/docs/build/metricflow-time-spine.md b/website/docs/docs/build/metricflow-time-spine.md index 18acf451a12..5f16af38023 100644 --- a/website/docs/docs/build/metricflow-time-spine.md +++ b/website/docs/docs/build/metricflow-time-spine.md @@ -1,55 +1,128 @@ --- title: MetricFlow time spine id: metricflow-time-spine -description: "MetricFlow expects a default timespine table called metricflow_time_spine" +description: "MetricFlow expects a default time spine table called metricflow_time_spine" sidebar_label: "MetricFlow time spine" tags: [Metrics, Semantic Layer] --- + -It's common in analytics engineering to have a date dimension or "time spine" table as a base table for different types of time-based joins and aggregations. The structure of this table is typically a base column of daily or hourly dates, with additional columns for other time grains, like fiscal quarter, defined based on the base column. You can join other tables to the time spine on the base column to calculate metrics like revenue at a point in time, or to aggregate to a specific time grain. + -MetricFlow requires you to define a time spine table as a project level configuration, which then is used for various time-based joins and aggregations, like cumulative metrics. At a minimum, you need to define a time spine table for a daily grain. You can optionally define a time spine table for a different granularity, like hourly. +It's common in analytics engineering to have a date dimension or "time spine" table as a base table for different types of time-based joins and aggregations. The structure of this table is typically a base column of daily or hourly dates, with additional columns for other time grains, like fiscal quarters, defined based on the base column. You can join other tables to the time spine on the base column to calculate metrics like revenue at a point in time, or to aggregate to a specific time grain. -If you already have a date dimension or time spine table in your dbt project, you can point MetricFlow to this table by updating the `model` configuration to use this table in the Semantic Layer. For example, given the following directory structure, you can create two time spine configurations, `time_spine_hourly` and `time_spine_daily`. +MetricFlow requires you to define at least one dbt model which provides a time-spine, and then specify (in YAML) the columns to be used for time-based joins. 
MetricFlow will join against the time-spine model for the following types of metrics and dimensions: -:::tip -Previously, you were required to create a model called `metricflow_time_spine` in your dbt project. This is no longer required. However, you can build your time spine model from this table if you don't have another date dimension table you want to use in your project. +- [Cumulative metrics](/docs/build/cumulative) +- [Metric offsets](/docs/build/derived#derived-metric-offset) +- [Conversion metrics](/docs/build/conversion) +- [Slowly Changing Dimensions](/docs/build/dimensions#scd-type-ii) +- [Metrics](/docs/build/metrics-overview) with the `join_to_timespine` configuration set to true +To see the generated SQL for the metric and dimension types that use time spine joins, refer to the respective documentation or add the `compile=True` flag when querying the Semantic Layer to return the compiled SQL. + +## Configuring time spine in YAML + + Time spine models are normal dbt models with extra configurations that tell dbt and MetricFlow how to use specific columns by defining their properties. Add the [`models` key](/reference/model-properties) for the time spine in your `models/` directory. If your project already includes a calendar table or date dimension, you can configure that table as a time spine. Otherwise, review the [example time-spine tables](#example-time-spine-tables) to create one. + + Some things to note when configuring time spine models: + +- Add the configurations under the `time_spine` key for that [model's properties](/reference/model-properties), just as you would add a description or tests. +- You only need to configure time-spine models that the Semantic Layer should recognize. +- At a minimum, define a time-spine table for a daily grain. +- You can optionally define additional time-spine tables for different granularities, like hourly. Review the [granularity considerations](#granularity-considerations) when deciding which tables to create. +- If you're looking to specify the grain of a time dimension so that MetricFlow can transform the underlying column to the required granularity, refer to the [Time granularity documentation](/docs/build/dimensions?dimension=time_gran) + +:::tip +If you previously used a model called `metricflow_time_spine`, you no longer need to create this specific model. You can now configure MetricFlow to use any date dimension or time spine table already in your project by updating the `model` setting in the Semantic Layer. + +If you don’t have a date dimension table, you can still create one by using the code snippet in the [next section](#creating-a-time-spine-table) to build your time spine model. ::: - +### Creating a time spine table + +MetricFlow supports granularities ranging from milliseconds to years. Refer to the [Dimensions page](/docs/build/dimensions?dimension=time_gran#time) (time_granularity tab) to find the full list of supported granularities. + +To create a time spine table from scratch, you can do so by adding the following code to your dbt project. +This example creates a time spine at an hourly grain and a daily grain: `time_spine_hourly` and `time_spine_daily`. + + + + +```yaml +[models:](/reference/model-properties) +# Hourly time spine + - name: time_spine_hourly + description: my favorite time spine + time_spine: + standard_granularity_column: date_hour # column for the standard grain of your table, must be date time type. 
+ custom_granularities: + - name: fiscal_year + column_name: fiscal_year_column + columns: + - name: date_hour + granularity: hour # set granularity at column-level for standard_granularity_column + +# Daily time spine + - name: time_spine_daily + time_spine: + standard_granularity_column: date_day # column for the standard grain of your table + columns: + - name: date_day + granularity: day # set granularity at column-level for standard_granularity_column +``` + + + + + -Now, break down the configuration above. It's pointing to a model called `time_spine_daily`. It sets the time spine configurations under the `time_spine` key. The `standard_granularity_column` is the lowest grain of the table, in this case, it's hourly. It needs to reference a column defined under the columns key, in this case, `date_hour`. Use the `standard_granularity_column` as the join key for the time spine table when joining tables in MetricFlow. Here, the granularity of the `standard_granularity_column` is set at the column level, in this case, `hour`. +- This example configuration shows a time spine model called `time_spine_hourly` and `time_spine_daily`. It sets the time spine configurations under the `time_spine` key. +- The `standard_granularity_column` is the column that maps to one of our [standard granularities](/docs/build/dimensions?dimension=time_gran). This column must be set under the `columns` key and should have a grain that is finer or equal to any custom granularity columns defined in the same model. + - It needs to reference a column defined under the `columns` key, in this case, `date_hour` and `date_day`, respectively. + - It sets the granularity at the column-level using the `granularity` key, in this case, `hour` and `day`, respectively. +- MetricFlow will use the `standard_granularity_column` as the join key when joining the time spine table to another source table. +- [The `custom_granularities` field](#custom-calendar), (available in Versionless and dbt v1.9 and higher) lets you specify non-standard time periods like `fiscal_year` or `retail_month` that your organization may use. +For an example project, refer to our [Jaffle shop](https://github.com/dbt-labs/jaffle-sl-template/blob/main/models/marts/_models.yml) example. -If you need to create a time spine table from scratch, you can do so by adding the following code to your dbt project. -The example creates a time spine at a daily grain and an hourly grain. A few things to note when creating time spine models: -* MetricFlow will use the time spine with the largest compatible granularity for a given query to ensure the most efficient query possible. For example, if you have a time spine at a monthly grain, and query a dimension at a monthly grain, MetricFlow will use the monthly time spine. If you only have a daily time spine, MetricFlow will use the daily time spine and date_trunc to month. -* You can add a time spine for each granularity you intend to use if query efficiency is more important to you than configuration time, or storage constraints. For most engines, the query performance difference should be minimal and transforming your time spine to a coarser grain at query time shouldn't add significant overhead to your queries. -* We recommend having a time spine at the finest grain used in any of your dimensions to avoid unexpected errors. i.e., if you have dimensions at an hourly grain, you should have a time spine at an hourly grain. 
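Once a custom granularity such as `fiscal_year` is configured on the time spine, the intent is that you can reference it at query time the same way you reference standard granularities. The following is a rough sketch only; the metric name is a placeholder and the group-by syntax is an assumption (mirroring the `metric_time__month` pattern used elsewhere on this page), not something confirmed here.

```bash
# Hypothetical example: group a metric by the custom fiscal_year granularity
# defined on the time spine above (syntax assumed to mirror standard granularities).
dbt sl query --metrics my_metric --group-by metric_time__fiscal_year
```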
+### Considerations when choosing which granularities to create{#granularity-considerations} - +- MetricFlow will use the time spine with the largest compatible granularity for a given query to ensure the most efficient query possible. For example, if you have a time spine at a monthly grain, and query a dimension at a monthly grain, MetricFlow will use the monthly time spine. If you only have a daily time spine, MetricFlow will use the daily time spine and date_trunc to month. +- You can add a time spine for each granularity you intend to use if query efficiency is more important to you than configuration time, or storage constraints. For most engines, the query performance difference should be minimal and transforming your time spine to a coarser grain at query time shouldn't add significant overhead to your queries. +- We recommend having a time spine at the finest grain used in any of your dimensions to avoid unexpected errors. For example, if you have dimensions at an hourly grain, you should have a time spine at an hourly grain. - +## Example time spine tables + +### Daily + + ```sql {{ @@ -61,7 +134,7 @@ The example creates a time spine at a daily grain and an hourly grain. A few thi with days as ( {{ - dbt_utils.date_spine( + dbt.date_spine( 'day', "to_date('01/01/2000','mm/dd/yyyy')", "to_date('01/01/2025','mm/dd/yyyy')" @@ -76,14 +149,47 @@ final as ( ) select * from final +where date_day > dateadd(year, -4, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) +``` + +### Daily (BigQuery) + +Use this model if you're using BigQuery. BigQuery supports `DATE()` instead of `TO_DATE()`: + + + +```sql + +{{config(materialized='table')}} +with days as ( + {{dbt.date_spine( + 'day', + "DATE(2000,01,01)", + "DATE(2025,01,01)" + ) + }} +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * +from final -- filter the time spine to a specific range where date_day > dateadd(year, -4, current_timestamp()) -and date_hour < dateadd(day, 30, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) ``` - + + + - +### Hourly + + ```sql {{ @@ -92,11 +198,11 @@ and date_hour < dateadd(day, 30, current_timestamp()) ) }} -with days as ( +with hours as ( {{ dbt.date_spine( - 'day', + 'hour', "to_date('01/01/2000','mm/dd/yyyy')", "to_date('01/01/2025','mm/dd/yyyy')" ) @@ -105,31 +211,51 @@ with days as ( ), final as ( - select cast(date_day as date) as date_day - from days + select cast(date_hour as timestamp) as date_hour + from hours ) select * from final +-- filter the time spine to a specific range where date_day > dateadd(year, -4, current_timestamp()) and date_hour < dateadd(day, 30, current_timestamp()) ``` + + + + + + + + +MetricFlow uses a time spine table to construct cumulative metrics. By default, MetricFlow expects the time spine table to be named `metricflow_time_spine` and doesn't support using a different name. For supported granularities, refer to the [dimensions](/docs/build/dimensions?dimension=time_gran#time) page. + +To create this table, you need to create a model in your dbt project called `metricflow_time_spine` and add the following code: + +### Daily + + -Use this model if you're using BigQuery. 
BigQuery supports `DATE()` instead of `TO_DATE()`: - - - ```sql -{{config(materialized='table')}} -with days as ( - {{dbt_utils.date_spine( - 'day', - "DATE(2000,01,01)", - "DATE(2025,01,01)" +{{ + config( + materialized = 'table', ) +}} + +with days as ( + + {{ + dbt.date_spine( + 'day', + "to_date('01/01/2000','mm/dd/yyyy')", + "to_date('01/01/2025','mm/dd/yyyy')" + ) }} + ), final as ( @@ -137,21 +263,20 @@ final as ( from days ) -select * -from final --- filter the time spine to a specific range +select * from final where date_day > dateadd(year, -4, current_timestamp()) -and date_hour < dateadd(day, 30, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) ``` + - - +### Daily (BigQuery) + +Use this model if you're using BigQuery. BigQuery supports `DATE()` instead of `TO_DATE()`: ```sql - {{config(materialized='table')}} with days as ( {{dbt.date_spine( @@ -171,43 +296,69 @@ select * from final -- filter the time spine to a specific range where date_day > dateadd(year, -4, current_timestamp()) -and date_hour < dateadd(day, 30, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) ``` + + +You only need to include the `date_day` column in the table. MetricFlow can handle broader levels of detail, but finer grains are only supported in versions 1.9 and higher. + - -## Hourly time spine - +## Custom calendar -```sql -{{ - config( - materialized = 'table', - ) -}} + -with hours as ( +The ability to configure custom calendars, such as a fiscal calendar, is available in [dbt Cloud Versionless](/docs/dbt-versions/versionless-cloud) or dbt Core [v1.9 and higher](/docs/dbt-versions/core). - {{ - dbt.date_spine( - 'hour', - "to_date('01/01/2000','mm/dd/yyyy')", - "to_date('01/01/2025','mm/dd/yyyy')" - ) - }} +To access this feature, [upgrade to Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) or your dbt Core version to v1.9 or higher. -), + -final as ( - select cast(date_hour as timestamp) as date_hour - from hours -) + -select * from final --- filter the time spine to a specific range -where date_day > dateadd(year, -4, current_timestamp()) -and date_hour < dateadd(day, 30, current_timestamp()) +Custom date transformations can be complex, and organizations often have unique needs that can’t be easily generalized. Creating a custom calendar model allows you to define these transformations in SQL, offering more flexibility than native transformations in MetricFlow. This approach lets you map custom columns back to MetricFlow granularities, ensuring consistency while giving you control over the transformations. + +For example, if you use a custom calendar in your organization, such as a fiscal calendar, you can configure it in MetricFlow using its date and time operations. + +- This is useful for calculating metrics based on a custom calendar, such as fiscal quarters or weeks. +- Use the `custom_granularities` key to define a non-standard time period for querying data, such as a `retail_month` or `fiscal_week`, instead of standard options like `day`, `month`, or `year`. +- This feature provides more control over how time-based metrics are calculated. + + + +When working with custom calendars in MetricFlow, it's important to ensure: + +- Consistent data types — Both your dimension column and the time spine column should use the same data type to allow accurate comparisons. Functions like `DATE_TRUNC` don't change the data type of the input in some databases (like Snowflake). 
Using different data types can lead to mismatches and inaccurate results. + + We recommend using `DATETIME` or `TIMESTAMP` data types for your time dimensions and time spine, as they support all granularities. The `DATE` data type may not support smaller granularities like hours or minutes. + +- Time zones — MetricFlow currently doesn't perform any timezone manipulation. When working with timezone-aware data, inconsistent time zones may lead to unexpected results during aggregations and comparisons. + +For example, if your time spine column is `TIMESTAMP` type and your dimension column is `DATE` type, comparisons between these columns might not work as intended. To fix this, convert your `DATE` column to `TIMESTAMP`, or make sure both columns are the same data type. + + + +### Add custom granularities + +To add custom granularities, the Semantic Layer supports custom calendar configurations that allow users to query data using non-standard time periods like `fiscal_year` or `retail_month`. You can define these custom granularities (all lowercased) by modifying your model's YAML configuration like this: + + + +```yaml +models: + - name: my_time_spine + description: my favorite time spine + time_spine: + standard_granularity_column: date_day + custom_granularities: + - name: fiscal_year + column_name: fiscal_year_column ``` + +#### Coming soon +Note that features like calculating offsets and period-over-period will be supported soon! + + diff --git a/website/docs/docs/build/metrics-overview.md b/website/docs/docs/build/metrics-overview.md index 38b9b22bdb2..7021a6d7330 100644 --- a/website/docs/docs/build/metrics-overview.md +++ b/website/docs/docs/build/metrics-overview.md @@ -92,7 +92,18 @@ import SLCourses from '/snippets/_sl-course.md'; ## Default granularity for metrics -It's possible to define a default time granularity for metrics if it's different from the granularity of the default aggregation time dimensions (`metric_time`). This is useful if your time dimension has a very fine grain, like second or hour, but you typically query metrics rolled up at a coarser grain. The granularity can be set using the `time_granularity` parameter on the metric, and defaults to `day`. If day is not available because the dimension is defined at a coarser granularity, it will default to the defined granularity for the dimension. + +Default time granularity for metrics is useful if your time dimension has a very fine grain, like second or hour, but you typically query metrics rolled up at a coarser grain. + +To set the default time granularity for metrics, you need to be on dbt Cloud Versionless or dbt v1.9 and higher. + + + + + +It's possible to define a default time granularity for metrics if it's different from the granularity of the default aggregation time dimensions (`metric_time`). This is useful if your time dimension has a very fine grain, like second or hour, but you typically query metrics rolled up at a coarser grain. + +The granularity can be set using the `time_granularity` parameter on the metric, and defaults to `day`. If day is not available because the dimension is defined at a coarser granularity, it will default to the defined granularity for the dimension. ### Example You have a semantic model called `orders` with a time dimension called `order_time`. You want the `orders` metric to roll up to `monthly` by default; however, you want the option to look at these metrics hourly. 
You can set the `time_granularity` parameter on the `order_time` dimension to `hour`, and then set the `time_granularity` parameter in the metric to `month`. @@ -117,6 +128,7 @@ semantic_models: name: orders time_granularity: month -- Optional, defaults to day ``` + ## Conversion metrics @@ -270,6 +282,8 @@ A filter is configured using Jinja templating. Use the following syntax to refer Refer to [Metrics as dimensions](/docs/build/ref-metrics-in-filters) for details on how to use metrics as dimensions with metric filters: + + ```yaml @@ -283,10 +297,30 @@ filter: | {{ TimeDimension('time_dimension', 'granularity') }} filter: | - {{ Metric('metric_name', group_by=['entity_name']) }} # Available in v1.8 or with [versionless (/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) dbt Cloud. + {{ Metric('metric_name', group_by=['entity_name']) }} + ``` + + + + + + + +```yaml +filter: | + {{ Entity('entity_name') }} + +filter: | + {{ Dimension('primary_entity__dimension_name') }} + +filter: | + {{ TimeDimension('time_dimension', 'granularity') }} + +``` + For example, if you want to filter for the order date dimension grouped by month, use the following syntax: diff --git a/website/docs/docs/build/packages.md b/website/docs/docs/build/packages.md index 0b69d10cee6..49cd7e00b1c 100644 --- a/website/docs/docs/build/packages.md +++ b/website/docs/docs/build/packages.md @@ -20,9 +20,10 @@ In dbt, libraries like these are called _packages_. dbt's packages are so powerf * Models to understand [Redshift](https://hub.getdbt.com/dbt-labs/redshift/latest/) privileges. * Macros to work with data loaded by [Stitch](https://hub.getdbt.com/dbt-labs/stitch_utils/latest/). -dbt _packages_ are in fact standalone dbt projects, with models and macros that tackle a specific problem area. As a dbt user, by adding a package to your project, the package's models and macros will become part of your own project. This means: +dbt _packages_ are in fact standalone dbt projects, with models, macros, and other resources that tackle a specific problem area. As a dbt user, by adding a package to your project, all of the package's resources will become part of your own project. This means: * Models in the package will be materialized when you `dbt run`. * You can use `ref` in your own models to refer to models from the package. +* You can use `source` to refer to sources in the package. * You can use macros in the package in your own project. * It's important to note that defining and installing dbt packages is different from [defining and installing Python packages](/docs/build/python-models#using-pypi-packages) @@ -82,11 +83,7 @@ packages: version: [">=0.7.0", "<0.8.0"] ``` - - -Beginning in v1.7, `dbt deps` "pins" each package by default. See ["Pinning packages"](#pinning-packages) for details. - - +`dbt deps` "pins" each package by default. See ["Pinning packages"](#pinning-packages) for details. Where possible, we recommend installing packages via dbt Hub, since this allows dbt to handle duplicate dependencies. This is helpful in situations such as: * Your project uses both the dbt-utils and Snowplow packages, and the Snowplow package _also_ uses the dbt-utils package. @@ -145,18 +142,8 @@ packages: revision: 4e28d6da126e2940d17f697de783a717f2503188 ``` - - -We **strongly recommend** ["pinning" your packages](#pinning-packages) to a specific release by specifying a release name. - - - - - By default, `dbt deps` "pins" each package. See ["Pinning packages"](#pinning-packages) for details. 
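When installing a package from git, the `revision` key controls exactly what gets installed. A minimal `packages.yml` sketch pinned to a tagged release rather than a raw commit hash (the tag below is illustrative — check the package's Releases page for real tags):

```yaml
packages:
  # Pinning to a tagged release keeps `dbt deps` installing the same code every time
  - git: "https://github.com/dbt-labs/dbt-utils.git"
    revision: 1.1.1 # a branch name or a full 40-character commit SHA also works
```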
- - ### Internally hosted tarball URL Some organizations have security requirements to pull resources only from internal services. To address the need to install packages from hosted environments such as Artifactory or cloud storage buckets, dbt Core enables you to install packages from internally-hosted tarball URLs. @@ -318,18 +305,6 @@ When you remove a package from your `packages.yml` file, it isn't automatically ### Pinning packages - - -We **strongly recommend** "pinning" your package to a specific release by specifying a tagged release name or a specific commit hash. - -If you do not provide a revision, or if you use the main branch, then any updates to the package will be incorporated into your project the next time you run `dbt deps`. While we generally try to avoid making breaking changes to these packages, they are sometimes unavoidable. Pinning a package revision helps prevent your code from changing without your explicit approval. - -To find the latest release for a package, navigate to the `Releases` tab in the relevant GitHub repository. For example, you can find all of the releases for the dbt-utils package [here](https://github.com/dbt-labs/dbt-utils/releases). - - - - - Beginning with v1.7, running [`dbt deps`](/reference/commands/deps) "pins" each package by creating or updating the `package-lock.yml` file in the _project_root_ where `packages.yml` is recorded. - The `package-lock.yml` file contains a record of all packages installed. @@ -337,8 +312,6 @@ Beginning with v1.7, running [`dbt deps`](/reference/commands/deps) "pins" each For example, if you use a branch name, the `package-lock.yml` file pins to the head commit. If you use a version range, it pins to the latest release. In either case, subsequent commits or versions will **not** be installed. To get new commits or versions, run `dbt deps --upgrade` or add `package-lock.yml` to your .gitignore file. - - As of v0.14.0, dbt will warn you if you install a package using the `git` syntax without specifying a revision (see below). ### Configuring packages diff --git a/website/docs/docs/build/ratio-metrics.md b/website/docs/docs/build/ratio-metrics.md index cc1d13b7835..fdaeb878450 100644 --- a/website/docs/docs/build/ratio-metrics.md +++ b/website/docs/docs/build/ratio-metrics.md @@ -24,6 +24,8 @@ Ratio allows you to create a ratio between two metrics. You simply specify a num The following displays the complete specification for ratio metrics, along with an example. + + ```yaml metrics: - name: The metric name # Required @@ -40,11 +42,19 @@ metrics: filter: Filter for the denominator # Optional alias: Alias for the denominator # Optional ``` + For advanced data modeling, you can use `fill_nulls_with` and `join_to_timespine` to [set null metric values to zero](/docs/build/fill-nulls-advanced), ensuring numeric values for every data row. ## Ratio metrics example +These examples demonstrate how to create ratio metrics in your model. They cover basic and advanced use cases, including applying filters to the numerator and denominator metrics. + +#### Example 1 +This example is a basic ratio metric that calculates the ratio of food orders to total orders: + + + ```yaml metrics: - name: food_order_pct @@ -55,6 +65,30 @@ metrics: numerator: food_orders denominator: orders ``` + + +#### Example 2 +This example is a ratio metric that calculates the ratio of food orders to total orders, with a filter and alias applied to the numerator. 
Note that in order to add these attributes, you'll need to use an explicit key for the name attribute too. + + + +```yaml +metrics: + - name: food_order_pct + description: "The food order count as a ratio of the total order count, filtered by location" + label: Food order ratio by location + type: ratio + type_params: + numerator: + name: food_orders + filter: location = 'New York' + alias: ny_food_orders + denominator: + name: orders + filter: location = 'New York' + alias: ny_orders +``` + ## Ratio metrics using different semantic models @@ -109,6 +143,8 @@ on Users can define constraints on input metrics for a ratio metric by applying a filter directly to the input metric, like so: + + ```yaml metrics: - name: frequent_purchaser_ratio @@ -123,6 +159,7 @@ metrics: denominator: name: distinct_purchasers ``` + Note the `filter` and `alias` parameters for the metric referenced in the numerator. - Use the `filter` parameter to apply a filter to the metric it's attached to. diff --git a/website/docs/docs/build/saved-queries.md b/website/docs/docs/build/saved-queries.md index e9beffca15f..ed56d13dcc9 100644 --- a/website/docs/docs/build/saved-queries.md +++ b/website/docs/docs/build/saved-queries.md @@ -102,24 +102,7 @@ saved_queries: ``` -Note, you can set `export_as` to both the saved query and the exports [config](/reference/resource-properties/config), with the exports config value taking precedence. If a key isn't set in the exports config, it will inherit the saved query config value. - -#### Project-level saved queries - -To enable saved queries at the project level, you can set the `saved-queries` configuration in the [`dbt_project.yml` file](/reference/dbt_project.yml). This saves you time in configuring saved queries in each file: - - - -```yaml -saved-queries: - my_saved_query: - config: - +cache: - enabled: true -``` - - -For more information on `dbt_project.yml` and config naming conventions, see the [dbt_project.yml reference page](/reference/dbt_project.yml#naming-convention). +Note, that you can set `export_as` to both the saved query and the exports [config](/reference/resource-properties/config), with the exports config value taking precedence. If a key isn't set in the exports config, it will inherit the saved query config value. #### Where clause @@ -171,6 +154,22 @@ saved_queries: +#### Project-level saved queries + +To enable saved queries at the project level, you can set the `saved-queries` configuration in the [`dbt_project.yml` file](/reference/dbt_project.yml). This saves you time in configuring saved queries in each file: + + + +```yaml +saved-queries: + my_saved_query: + +cache: + enabled: true +``` + + +For more information on `dbt_project.yml` and config naming conventions, see the [dbt_project.yml reference page](/reference/dbt_project.yml#naming-convention). + To build `saved_queries`, use the [`--resource-type` flag](/reference/global-configs/resource-type) and run the command `dbt build --resource-type saved_query`. 
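For reference, a minimal saved query that the command above would build might look like the following sketch (the metric and dimension names are placeholders rather than values from this page):

```yaml
saved_queries:
  - name: order_metrics
    description: Daily order metrics, filtered to one region.
    query_params:
      metrics:
        - orders
      group_by:
        - TimeDimension('metric_time', 'day')
      where:
        - "{{ Dimension('customer__region') }} = 'EMEA'"
```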
## Configure exports diff --git a/website/docs/docs/build/semantic-models.md b/website/docs/docs/build/semantic-models.md index d683d7cd020..609d7f1ff8d 100644 --- a/website/docs/docs/build/semantic-models.md +++ b/website/docs/docs/build/semantic-models.md @@ -119,8 +119,6 @@ semantic_models: type: categorical ``` - - Semantic models support [`meta`](/reference/resource-configs/meta), [`group`](/reference/resource-configs/group), and [`enabled`](/reference/resource-configs/enabled) [`config`](/reference/resource-properties/config) property in either the schema file or at the project level: - Semantic model config in `models/semantic.yml`: @@ -148,8 +146,6 @@ Semantic models support [`meta`](/reference/resource-configs/meta), [`group`](/r For more information on `dbt_project.yml` and config naming conventions, see the [dbt_project.yml reference page](/reference/dbt_project.yml#naming-convention). - - ### Name Define the name of the semantic model. You must define a unique name for the semantic model. The semantic graph will use this name to identify the model, and you can update it at any time. Avoid using double underscores (\_\_) in the name as they're not supported. diff --git a/website/docs/docs/build/snapshots.md b/website/docs/docs/build/snapshots.md index 82b5104fcef..f5321aa626a 100644 --- a/website/docs/docs/build/snapshots.md +++ b/website/docs/docs/build/snapshots.md @@ -18,215 +18,205 @@ Snapshots implement [type-2 Slowly Changing Dimensions](https://en.wikipedia.org | id | status | updated_at | | -- | ------ | ---------- | -| 1 | pending | 2019-01-01 | +| 1 | pending | 2024-01-01 | Now, imagine that the order goes from "pending" to "shipped". That same record will now look like: | id | status | updated_at | | -- | ------ | ---------- | -| 1 | shipped | 2019-01-02 | +| 1 | shipped | 2024-01-02 | This order is now in the "shipped" state, but we've lost the information about when the order was last in the "pending" state. This makes it difficult (or impossible) to analyze how long it took for an order to ship. dbt can "snapshot" these changes to help you understand how values in a row change over time. Here's an example of a snapshot table for the previous example: | id | status | updated_at | dbt_valid_from | dbt_valid_to | | -- | ------ | ---------- | -------------- | ------------ | -| 1 | pending | 2019-01-01 | 2019-01-01 | 2019-01-02 | -| 1 | shipped | 2019-01-02 | 2019-01-02 | `null` | +| 1 | pending | 2024-01-01 | 2024-01-01 | 2024-01-02 | +| 1 | shipped | 2024-01-02 | 2024-01-02 | `null` | -In dbt, snapshots are `select` statements, defined within a snapshot block in a `.sql` file (typically in your `snapshots` directory). You'll also need to configure your snapshot to tell dbt how to detect record changes. - +## Configuring snapshots - +:::info Previewing or compiling snapshots in IDE not supported -```sql -{% snapshot orders_snapshot %} - -{{ - config( - target_database='analytics', - target_schema='snapshots', - unique_key='id', +It is not possible to "preview data" or "compile sql" for snapshots in dbt Cloud. Instead, [run the `dbt snapshot` command](#how-snapshots-work) in the IDE. - strategy='timestamp', - updated_at='updated_at', - ) -}} +::: -select * from {{ source('jaffle_shop', 'orders') }} + -{% endsnapshot %} -``` +- To configure snapshots in versions 1.8 and earlier, refer to [Configure snapshots in versions 1.8 and earlier](#configure-snapshots-in-versions-18-and-earlier). 
These versions use an older syntax where snapshots are defined within a snapshot block in a `.sql` file, typically located in your `snapshots` directory. +- Note that defining multiple resources in a single file can significantly slow down parsing and compilation. For faster and more efficient management, consider the updated snapshot YAML syntax, [available in Versionless](/docs/dbt-versions/versionless-cloud) or [dbt Core v1.9 and later](/docs/dbt-versions/core). - - +Configure your snapshots in YAML files to tell dbt how to detect record changes. Define snapshots configurations in YAML files, alongside your models, for a cleaner, faster, and more consistent set up. + + + +```yaml +snapshots: + - name: string + relation: relation # source('my_source', 'my_table') or ref('my_model') + config: + [database](/reference/resource-configs/database): string + [schema](/reference/resource-configs/schema): string + [alias](/reference/resource-configs/alias): string + [strategy](/reference/resource-configs/strategy): timestamp | check + [unique_key](/reference/resource-configs/unique_key): column_name_or_expression + [check_cols](/reference/resource-configs/check_cols): [column_name] | all + [updated_at](/reference/resource-configs/updated_at): column_name + [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes): true | false + [snapshot_meta_column_names](/reference/resource-configs/snapshot_meta_column_names): dictionary -```sql -{% snapshot orders_snapshot %} - -{{ - config( - unique_key='id', - schema='snapshots', - strategy='timestamp', - updated_at='updated_at', - ) -}} - -select * from {{ source('jaffle_shop', 'orders') }} - -{% endsnapshot %} ``` - - -:::info Preview or Compile Snapshots in IDE - -It is not possible to "preview data" or "compile sql" for snapshots in dbt Cloud. Instead, run the `dbt snapshot` command in the IDE by completing the following steps. - -::: - -When you run the [`dbt snapshot` command](/reference/commands/snapshot): -* **On the first run:** dbt will create the initial snapshot table — this will be the result set of your `select` statement, with additional columns including `dbt_valid_from` and `dbt_valid_to`. All records will have a `dbt_valid_to = null`. -* **On subsequent runs:** dbt will check which records have changed or if any new records have been created: - - The `dbt_valid_to` column will be updated for any existing records that have changed - - The updated record and any new records will be inserted into the snapshot table. These records will now have `dbt_valid_to = null` +The following table outlines the configurations available for snapshots: -Snapshots can be referenced in downstream models the same way as referencing models — by using the [ref](/reference/dbt-jinja-functions/ref) function. +| Config | Description | Required? | Example | +| ------ | ----------- | --------- | ------- | +| [database](/reference/resource-configs/database) | Specify a custom database for the snapshot | No | analytics | +| [schema](/reference/resource-configs/schema) | Specify a custom schema for the snapshot | No | snapshots | +| [alias](/reference/resource-configs/alias) | Specify an alias for the snapshot | No | your_custom_snapshot | +| [strategy](/reference/resource-configs/strategy) | The snapshot strategy to use. 
Valid values: `timestamp` or `check` | Yes | timestamp | +| [unique_key](/reference/resource-configs/unique_key) | A column or expression for the record | Yes | id | +| [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | +| [updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | +| [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source and set `dbt_valid_to` to current time if the record no longer exists | No | True | +| [snapshot_meta_column_names](/reference/resource-configs/snapshot_meta_column_names) | Customize the names of the snapshot meta fields | No | dictionary | -## Example +- In versions prior to v1.9, the `target_schema` (required) and `target_database` (optional) configurations defined a single schema or database to build a snapshot across users and environment. This created problems when testing or developing a snapshot, as there was no clear separation between development and production environments. In v1.9, `target_schema` became optional, allowing snapshots to be environment-aware. By default, without `target_schema` or `target_database` defined, snapshots now use the `generate_schema_name` or `generate_database_name` macros to determine where to build. Developers can still set a custom location with [`schema`](/reference/resource-configs/schema) and [`database`](/reference/resource-configs/database) configs, consistent with other resource types. +- A number of other configurations are also supported (for example, `tags` and `post-hook`). For the complete list, refer to [Snapshot configurations](/reference/snapshot-configs). +- You can configure snapshots from both the `dbt_project.yml` file and a `config` block. For more information, refer to the [configuration docs](/reference/snapshot-configs). -To add a snapshot to your project: -1. Create a file in your `snapshots` directory with a `.sql` file extension, e.g. `snapshots/orders.sql` -2. Use a `snapshot` block to define the start and end of a snapshot: +### Add a snapshot to your project - +To add a snapshot to your project follow these steps. For users on versions 1.8 and earlier, refer to [Configure snapshots in versions 1.8 and earlier](#configure-snapshots-in-versions-18-and-earlier). -```sql -{% snapshot orders_snapshot %} +1. Create a YAML file in your `snapshots` directory: `snapshots/orders_snapshot.yml` and add your configuration details. You can also configure your snapshot from your `dbt_project.yml` file ([docs](/reference/snapshot-configs)). -{% endsnapshot %} -``` + - + ```yaml + snapshots: + - name: orders_snapshot + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + database: analytics + unique_key: id + strategy: timestamp + updated_at: updated_at -3. Write a `select` statement within the snapshot block (tips for writing a good snapshot query are below). This select statement defines the results that you want to snapshot over time. You can use `sources` and `refs` here. + ``` + - +2. Since snapshots focus on configuration, the transformation logic is minimal. Typically, you'd select all data from the source. If you need to apply transformations (like filters, deduplication), it's best practice to define an ephemeral model and reference it in your snapshot configuration. 
-```sql -{% snapshot orders_snapshot %} + -select * from {{ source('jaffle_shop', 'orders') }} + ```yaml + {{ config(materialized='ephemeral') }} -{% endsnapshot %} -``` + select * from {{ source('jaffle_shop', 'orders') }} + ``` + - +3. Check whether the result set of your query includes a reliable timestamp column that indicates when a record was last updated. For our example, the `updated_at` column reliably indicates record changes, so we can use the `timestamp` strategy. If your query result set does not have a reliable timestamp, you'll need to instead use the `check` strategy — more details on this below. -4. Check whether the result set of your query includes a reliable timestamp column that indicates when a record was last updated. For our example, the `updated_at` column reliably indicates record changes, so we can use the `timestamp` strategy. If your query result set does not have a reliable timestamp, you'll need to instead use the `check` strategy — more details on this below. +4. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example, a new table will be created at `analytics.snapshots.orders_snapshot`. The [`schema`](/reference/resource-configs/schema) config will utilize the `generate_schema_name` macro. -5. Add configurations to your snapshot using a `config` block (more details below). You can also configure your snapshot from your `dbt_project.yml` file ([docs](/reference/snapshot-configs)). + ``` + $ dbt snapshot + Running with dbt=1.9.0 - + 15:07:36 | Concurrency: 8 threads (target='dev') + 15:07:36 | + 15:07:36 | 1 of 1 START snapshot snapshots.orders_snapshot...... [RUN] + 15:07:36 | 1 of 1 OK snapshot snapshots.orders_snapshot..........[SELECT 3 in 1.82s] + 15:07:36 | + 15:07:36 | Finished running 1 snapshots in 0.68s. - + Completed successfully -```sql -{% snapshot orders_snapshot %} + Done. PASS=2 ERROR=0 SKIP=0 TOTAL=1 + ``` -{{ - config( - target_database='analytics', - target_schema='snapshots', - unique_key='id', +5. Inspect the results by selecting from the table dbt created (`analytics.snapshots.orders_snapshot`). After the first run, you should see the results of your query, plus the [snapshot meta fields](#snapshot-meta-fields) as described later on. - strategy='timestamp', - updated_at='updated_at', - ) -}} +6. Run the `dbt snapshot` command again and inspect the results. If any records have been updated, the snapshot should reflect this. -select * from {{ source('jaffle_shop', 'orders') }} +7. Select from the `snapshot` in downstream models using the `ref` function. -{% endsnapshot %} -``` + - + ```sql + select * from {{ ref('orders_snapshot') }} + ``` + -6. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example a new table will be created at `analytics.snapshots.orders_snapshot`. You can change the `target_database` configuration, the `target_schema` configuration and the name of the snapshot (as defined in `{% snapshot .. %}`) will change how dbt names this table. +8. Snapshots are only useful if you run them frequently — schedule the `dbt snapshot` command to run regularly. - +### Configuration best practices - + -```sql -{% snapshot orders_snapshot %} +This strategy handles column additions and deletions better than the `check` strategy. 
-{{ - config( - schema='snapshots', - unique_key='id', - strategy='timestamp', - updated_at='updated_at', - ) -}} + -select * from {{ source('jaffle_shop', 'orders') }} + -{% endsnapshot %} -``` +The unique key is used by dbt to match rows up, so it's extremely important to make sure this key is actually unique! If you're snapshotting a source, I'd recommend adding a uniqueness test to your source ([example](https://github.com/dbt-labs/jaffle_shop/blob/8e7c853c858018180bef1756ec93e193d9958c5b/models/staging/schema.yml#L26)). + - + -6. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example, a new table will be created at `analytics.snapshots.orders_snapshot`. The [`schema`](/reference/resource-configs/schema) config will utilize the `generate_schema_name` macro. + +Snapshots cannot be rebuilt. As such, it's a good idea to put snapshots in a separate schema so end users know they are special. From there, you may want to set different privileges on your snapshots compared to your models, and even run them as a different user (or role, depending on your warehouse) to make it very difficult to drop a snapshot unless you really want to. + + -``` -$ dbt snapshot -Running with dbt=1.8.0 + -15:07:36 | Concurrency: 8 threads (target='dev') -15:07:36 | -15:07:36 | 1 of 1 START snapshot snapshots.orders_snapshot...... [RUN] -15:07:36 | 1 of 1 OK snapshot snapshots.orders_snapshot..........[SELECT 3 in 1.82s] -15:07:36 | -15:07:36 | Finished running 1 snapshots in 0.68s. + -Completed successfully +Snapshots can't be rebuilt. Because of this, it's a good idea to put snapshots in a separate schema so end users know they're special. From there, you may want to set different privileges on your snapshots compared to your models, and even run them as a different user (or role, depending on your warehouse) to make it very difficult to drop a snapshot unless you really want to. -Done. PASS=2 ERROR=0 SKIP=0 TOTAL=1 -``` + -7. Inspect the results by selecting from the table dbt created. After the first run, you should see the results of your query, plus the [snapshot meta fields](#snapshot-meta-fields) as described below. + -8. Run the `snapshot` command again, and inspect the results. If any records have been updated, the snapshot should reflect this. + If you need to clean or transform your data before snapshotting, create an ephemeral model or a staging model that applies the necessary transformations. Then, reference this model in your snapshot configuration. This approach keeps your snapshot definitions clean and allows you to test and run transformations separately. -9. Select from the `snapshot` in downstream models using the `ref` function. + + - +### How snapshots work -```sql -select * from {{ ref('orders_snapshot') }} -``` - - +When you run the [`dbt snapshot` command](/reference/commands/snapshot): +* **On the first run:** dbt will create the initial snapshot table — this will be the result set of your `select` statement, with additional columns including `dbt_valid_from` and `dbt_valid_to`. All records will have a `dbt_valid_to = null`. +* **On subsequent runs:** dbt will check which records have changed or if any new records have been created: + - The `dbt_valid_to` column will be updated for any existing records that have changed + - The updated record and any new records will be inserted into the snapshot table. These records will now have `dbt_valid_to = null` -10. Schedule the `snapshot` command to run regularly — snapshots are only useful if you run them frequently. 
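Because the most recent version of each record always has `dbt_valid_to = null`, downstream models can filter on that column to work only with the current state of the data. A minimal sketch, assuming the `orders_snapshot` example used throughout this page:

```sql
-- models/current_orders.sql (illustrative model name)
-- Keep only the live version of each order captured by the snapshot
select
    id,
    status,
    updated_at
from {{ ref('orders_snapshot') }}
where dbt_valid_to is null
```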
+Note, these column names can be customized to your team or organizational conventions using the [snapshot_meta_column_names](#snapshot-meta-fields) config. +Snapshots can be referenced in downstream models the same way as referencing models — by using the [ref](/reference/dbt-jinja-functions/ref) function. ## Detecting row changes -Snapshot "strategies" define how dbt knows if a row has changed. There are two strategies built-in to dbt — `timestamp` and `check`. +Snapshot "strategies" define how dbt knows if a row has changed. There are two strategies built-in to dbt: +- [Timestamp](#timestamp-strategy-recommended) — Uses an `updated_at` column to determine if a row has changed. +- [Check](#check-strategy) — Compares a list of columns between their current and historical values to determine if a row has changed. ### Timestamp strategy (recommended) The `timestamp` strategy uses an `updated_at` field to determine if a row has changed. If the configured `updated_at` column for a row is more recent than the last time the snapshot ran, then dbt will invalidate the old record and record the new one. If the timestamps are unchanged, then dbt will not take any action. @@ -266,27 +256,19 @@ The `timestamp` strategy requires the following configurations: - - -```sql -{% snapshot orders_snapshot_timestamp %} - - {{ - config( - schema='snapshots', - strategy='timestamp', - unique_key='id', - updated_at='updated_at', - ) - }} - - select * from {{ source('jaffle_shop', 'orders') }} - -{% endsnapshot %} + + +```yaml +snapshots: + - name: orders_snapshot_timestamp + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + unique_key: id + strategy: timestamp + updated_at: updated_at ``` - - ### Check strategy @@ -298,15 +280,12 @@ The `check` strategy requires the following configurations: | ------ | ----------- | ------- | | check_cols | A list of columns to check for changes, or `all` to check all columns | `["name", "email"]` | - - :::caution check_cols = 'all' The `check` snapshot strategy can be configured to track changes to _all_ columns by supplying `check_cols = 'all'`. It is better to explicitly enumerate the columns that you want to check. Consider using a to condense many columns into a single column. 
::: - **Example Usage** @@ -336,23 +315,19 @@ The `check` snapshot strategy can be configured to track changes to _all_ column - - -```sql -{% snapshot orders_snapshot_check %} - - {{ - config( - schema='snapshots', - strategy='check', - unique_key='id', - check_cols=['status', 'is_cancelled'], - ) - }} - - select * from {{ source('jaffle_shop', 'orders') }} - -{% endsnapshot %} + + +```yaml +snapshots: + - name: orders_snapshot_check + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + unique_key: id + strategy: check + check_cols: + - status + - is_cancelled ``` @@ -397,112 +372,42 @@ For this configuration to work with the `timestamp` strategy, the configured `up - - -```sql -{% snapshot orders_snapshot_hard_delete %} - - {{ - config( - schema='snapshots', - strategy='timestamp', - unique_key='id', - updated_at='updated_at', - invalidate_hard_deletes=True, - ) - }} - - select * from {{ source('jaffle_shop', 'orders') }} - -{% endsnapshot %} + + +```yaml +snapshots: + - name: orders_snapshot_hard_delete + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + unique_key: id + strategy: timestamp + updated_at: updated_at + invalidate_hard_deletes: true ``` -## Configuring snapshots -### Snapshot configurations -There are a number of snapshot-specific configurations: - - - -| Config | Description | Required? | Example | -| ------ | ----------- | --------- | ------- | -| [target_database](/reference/resource-configs/target_database) | The database that dbt should render the snapshot table into | No | analytics | -| [target_schema](/reference/resource-configs/target_schema) | The schema that dbt should render the snapshot table into | Yes | snapshots | -| [strategy](/reference/resource-configs/strategy) | The snapshot strategy to use. One of `timestamp` or `check` | Yes | timestamp | -| [unique_key](/reference/resource-configs/unique_key) | A column or expression for the record | Yes | id | -| [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | -| [updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | -| [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source, and set `dbt_valid_to` current time if no longer exists | No | True | - -A number of other configurations are also supported (e.g. `tags` and `post-hook`), check out the full list [here](/reference/snapshot-configs). - -Snapshots can be configured from both your `dbt_project.yml` file and a `config` block, check out the [configuration docs](/reference/snapshot-configs) for more information. - -Note: BigQuery users can use `target_project` and `target_dataset` as aliases for `target_database` and `target_schema`, respectively. - - - - - -| Config | Description | Required? | Example | -| ------ | ----------- | --------- | ------- | -| [database](/reference/resource-configs/database) | Specify a custom database for the snapshot | No | analytics | -| [schema](/reference/resource-configs/schema) | Specify a custom schema for the snapshot | No | snapshots | -| [alias](/reference/resource-configs/alias) | Specify an alias for the snapshot | No | your_custom_snapshot | -| [strategy](/reference/resource-configs/strategy) | The snapshot strategy to use. 
Valid values: `timestamp` or `check` | Yes | timestamp | -| [unique_key](/reference/resource-configs/unique_key) | A column or expression for the record | Yes | id | -| [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | -| [updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | -| [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source and set `dbt_valid_to` to current time if the record no longer exists | No | True | - -In versions prior to v1.9, the `target_schema` (required) and `target_database` (optional) configurations defined a single schema or database to build a snapshot into across users and environments. This created problems when testing or developing a snapshot, as there was no clear separation between development and production environments. In v1.9, support was added for environment-aware snapshots by making `target_schema` optional. Snapshots, by default with no `target_schema` or `target_database` config defined, now resolve the schema or database to build the snapshot into using the `generate_schema_name` or `generate_database_name` macros. Developers can optionally define a custom location for snapshots to build to with the [`schema`](/reference/resource-configs/schema) and [`database`](/reference/resource-configs/database) configs, as is consistent with other resource types. - -A number of other configurations are also supported (for example, `tags` and `post-hook`). For the complete list, refer to [Snapshot configurations](/reference/snapshot-configs). - -You can configure snapshots from both the `dbt_project.yml` file and a `config` block. For more information, refer to the [configuration docs](/reference/snapshot-configs). - - - -### Configuration best practices -#### Use the `timestamp` strategy where possible -This strategy handles column additions and deletions better than the `check` strategy. - -#### Ensure your unique key is really unique -The unique key is used by dbt to match rows up, so it's extremely important to make sure this key is actually unique! If you're snapshotting a source, I'd recommend adding a uniqueness test to your source ([example](https://github.com/dbt-labs/jaffle_shop/blob/8e7c853c858018180bef1756ec93e193d9958c5b/models/staging/schema.yml#L26)). - - - -#### Use a `target_schema` that is separate to your analytics schema -Snapshots cannot be rebuilt. As such, it's a good idea to put snapshots in a separate schema so end users know they are special. From there, you may want to set different privileges on your snapshots compared to your models, and even run them as a different user (or role, depending on your warehouse) to make it very difficult to drop a snapshot unless you really want to. - - - - - -#### Use a schema that is separate to your models' schema -Snapshots can't be rebuilt. Because of this, it's a good idea to put snapshots in a separate schema so end users know they're special. From there, you may want to set different privileges on your snapshots compared to your models, and even run them as a different user (or role, depending on your warehouse) to make it very difficult to drop a snapshot unless you really want to. - - - ## Snapshot query best practices -#### Snapshot source data. 
-Your models should then select from these snapshots, treating them like regular data sources. As much as possible, snapshot your source data in its raw form and use downstream models to clean up the data +This section outlines some best practices for writing snapshot queries: + +- #### Snapshot source data + Your models should then select from these snapshots, treating them like regular data sources. As much as possible, snapshot your source data in its raw form and use downstream models to clean up the data -#### Use the `source` function in your query. -This helps when understanding data lineage in your project. +- #### Use the `source` function in your query + This helps when understanding data lineage in your project. -#### Include as many columns as possible. -In fact, go for `select *` if performance permits! Even if a column doesn't feel useful at the moment, it might be better to snapshot it in case it becomes useful – after all, you won't be able to recreate the column later. +- #### Include as many columns as possible + In fact, go for `select *` if performance permits! Even if a column doesn't feel useful at the moment, it might be better to snapshot it in case it becomes useful – after all, you won't be able to recreate the column later. -#### Avoid joins in your snapshot query. -Joins can make it difficult to build a reliable `updated_at` timestamp. Instead, snapshot the two tables separately, and join them in downstream models. +- #### Avoid joins in your snapshot query + Joins can make it difficult to build a reliable `updated_at` timestamp. Instead, snapshot the two tables separately, and join them in downstream models. -#### Limit the amount of transformation in your query. -If you apply business logic in a snapshot query, and this logic changes in the future, it can be impossible (or, at least, very difficult) to apply the change in logic to your snapshots. +- #### Limit the amount of transformation in your query + If you apply business logic in a snapshot query, and this logic changes in the future, it can be impossible (or, at least, very difficult) to apply the change in logic to your snapshots. Basically – keep your query as simple as possible! Some reasonable exceptions to these recommendations include: * Selecting specific columns if the table is wide. @@ -512,6 +417,8 @@ Basically – keep your query as simple as possible! Some reasonable exceptions Snapshot tables will be created as a clone of your source dataset, plus some additional meta-fields*. +Starting in 1.9 or with [dbt Cloud Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless), these column names can be customized to your team or organizational conventions via the [`snapshot_meta_column_names`](/reference/resource-configs/snapshot_meta_column_names) config. + | Field | Meaning | Usage | | -------------- | ------- | ----- | | dbt_valid_from | The timestamp when this snapshot row was first inserted | This column can be used to order the different "versions" of a record. | @@ -526,30 +433,30 @@ For the `timestamp` strategy, the configured `updated_at` column is used to popu
Details for the timestamp strategy -Snapshot query results at `2019-01-01 11:00` +Snapshot query results at `2024-01-01 11:00` | id | status | updated_at | | -- | ------- | ---------------- | -| 1 | pending | 2019-01-01 10:47 | +| 1 | pending | 2024-01-01 10:47 | Snapshot results (note that `11:00` is not used anywhere): | id | status | updated_at | dbt_valid_from | dbt_valid_to | dbt_updated_at | | -- | ------- | ---------------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 10:47 | 2019-01-01 10:47 | | 2019-01-01 10:47 | +| 1 | pending | 2024-01-01 10:47 | 2024-01-01 10:47 | | 2024-01-01 10:47 | -Query results at `2019-01-01 11:30`: +Query results at `2024-01-01 11:30`: | id | status | updated_at | | -- | ------- | ---------------- | -| 1 | shipped | 2019-01-01 11:05 | +| 1 | shipped | 2024-01-01 11:05 | Snapshot results (note that `11:30` is not used anywhere): | id | status | updated_at | dbt_valid_from | dbt_valid_to | dbt_updated_at | | -- | ------- | ---------------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 10:47 | 2019-01-01 10:47 | 2019-01-01 11:05 | 2019-01-01 10:47 | -| 1 | shipped | 2019-01-01 11:05 | 2019-01-01 11:05 | | 2019-01-01 11:05 | +| 1 | pending | 2024-01-01 10:47 | 2024-01-01 10:47 | 2024-01-01 11:05 | 2024-01-01 10:47 | +| 1 | shipped | 2024-01-01 11:05 | 2024-01-01 11:05 | | 2024-01-01 11:05 |
@@ -560,7 +467,7 @@ For the `check` strategy, the current timestamp is used to populate each column.
Details for the check strategy -Snapshot query results at `2019-01-01 11:00` +Snapshot query results at `2024-01-01 11:00` | id | status | | -- | ------- | @@ -570,9 +477,9 @@ Snapshot results: | id | status | dbt_valid_from | dbt_valid_to | dbt_updated_at | | -- | ------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 11:00 | | 2019-01-01 11:00 | +| 1 | pending | 2024-01-01 11:00 | | 2024-01-01 11:00 | -Query results at `2019-01-01 11:30`: +Query results at `2024-01-01 11:30`: | id | status | | -- | ------- | @@ -582,11 +489,191 @@ Snapshot results: | id | status | dbt_valid_from | dbt_valid_to | dbt_updated_at | | --- | ------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 11:00 | 2019-01-01 11:30 | 2019-01-01 11:00 | -| 1 | shipped | 2019-01-01 11:30 | | 2019-01-01 11:30 | +| 1 | pending | 2024-01-01 11:00 | 2024-01-01 11:30 | 2024-01-01 11:00 | +| 1 | shipped | 2024-01-01 11:30 | | 2024-01-01 11:30 |
+## Configure snapshots in versions 1.8 and earlier + + + +This section is for users on dbt versions 1.8 and earlier. To configure snapshots in versions 1.9 and later, refer to [Configuring snapshots](#configuring-snapshots). The latest versions use an updated snapshot configuration syntax that optimizes performance. + + + + + +- In dbt versions 1.8 and earlier, snapshots are `select` statements, defined within a snapshot block in a `.sql` file (typically in your `snapshots` directory). You'll also need to configure your snapshot to tell dbt how to detect record changes. +- The earlier dbt versions use an older syntax that allows for defining multiple resources in a single file. This syntax can significantly slow down parsing and compilation. +- For faster and more efficient management, consider[ upgrading to Versionless](/docs/dbt-versions/versionless-cloud) or the [latest version of dbt Core](/docs/dbt-versions/core), which introduces an updated snapshot configuration syntax that optimizes performance. + +The following example shows how to configure a snapshot: + + + +```sql +{% snapshot orders_snapshot %} + +{{ + config( + target_database='analytics', + target_schema='snapshots', + unique_key='id', + + strategy='timestamp', + updated_at='updated_at', + ) +}} + +select * from {{ source('jaffle_shop', 'orders') }} + +{% endsnapshot %} +``` + + + +The following table outlines the configurations available for snapshots in versions 1.8 and earlier: + +| Config | Description | Required? | Example | +| ------ | ----------- | --------- | ------- | +| [target_database](/reference/resource-configs/target_database) | The database that dbt should render the snapshot table into | No | analytics | +| [target_schema](/reference/resource-configs/target_schema) | The schema that dbt should render the snapshot table into | Yes | snapshots | +| [strategy](/reference/resource-configs/strategy) | The snapshot strategy to use. One of `timestamp` or `check` | Yes | timestamp | +| [unique_key](/reference/resource-configs/unique_key) | A column or expression for the record | Yes | id | +| [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | +| [updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | +| [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source, and set `dbt_valid_to` current time if no longer exists | No | True | + +- A number of other configurations are also supported (e.g. `tags` and `post-hook`), check out the full list [here](/reference/snapshot-configs). +- Snapshots can be configured from both your `dbt_project.yml` file and a `config` block, check out the [configuration docs](/reference/snapshot-configs) for more information. +- Note: BigQuery users can use `target_project` and `target_dataset` as aliases for `target_database` and `target_schema`, respectively. + +### Configuration example + +To add a snapshot to your project: + +1. Create a file in your `snapshots` directory with a `.sql` file extension, e.g. `snapshots/orders.sql` +2. Use a `snapshot` block to define the start and end of a snapshot: + + + +```sql +{% snapshot orders_snapshot %} + +{% endsnapshot %} +``` + + + +3. Write a `select` statement within the snapshot block (tips for writing a good snapshot query are below). 
This select statement defines the results that you want to snapshot over time. You can use `sources` and `refs` here. + + + +```sql +{% snapshot orders_snapshot %} + +select * from {{ source('jaffle_shop', 'orders') }} + +{% endsnapshot %} +``` + + + +4. Check whether the result set of your query includes a reliable timestamp column that indicates when a record was last updated. For our example, the `updated_at` column reliably indicates record changes, so we can use the `timestamp` strategy. If your query result set does not have a reliable timestamp, you'll need to instead use the `check` strategy — more details on this below. + +5. Add configurations to your snapshot using a `config` block (more details below). You can also configure your snapshot from your `dbt_project.yml` file ([docs](/reference/snapshot-configs)). + + + + + +```sql +{% snapshot orders_snapshot %} + +{{ + config( + target_database='analytics', + target_schema='snapshots', + unique_key='id', + + strategy='timestamp', + updated_at='updated_at', + ) +}} + +select * from {{ source('jaffle_shop', 'orders') }} + +{% endsnapshot %} +``` + + + +6. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example a new table will be created at `analytics.snapshots.orders_snapshot`. You can change the `target_database` configuration, the `target_schema` configuration and the name of the snapshot (as defined in `{% snapshot .. %}`) will change how dbt names this table. + + + + + + + +```sql +{% snapshot orders_snapshot %} + +{{ + config( + schema='snapshots', + unique_key='id', + strategy='timestamp', + updated_at='updated_at', + ) +}} + +select * from {{ source('jaffle_shop', 'orders') }} + +{% endsnapshot %} +``` + + + +6. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example, a new table will be created at `analytics.snapshots.orders_snapshot`. The [`schema`](/reference/resource-configs/schema) config will utilize the `generate_schema_name` macro. + + + +``` +$ dbt snapshot +Running with dbt=1.8.0 + +15:07:36 | Concurrency: 8 threads (target='dev') +15:07:36 | +15:07:36 | 1 of 1 START snapshot snapshots.orders_snapshot...... [RUN] +15:07:36 | 1 of 1 OK snapshot snapshots.orders_snapshot..........[SELECT 3 in 1.82s] +15:07:36 | +15:07:36 | Finished running 1 snapshots in 0.68s. + +Completed successfully + +Done. PASS=2 ERROR=0 SKIP=0 TOTAL=1 +``` + +7. Inspect the results by selecting from the table dbt created. After the first run, you should see the results of your query, plus the [snapshot meta fields](#snapshot-meta-fields) as described earlier. + +8. Run the `dbt snapshot` command again, and inspect the results. If any records have been updated, the snapshot should reflect this. + +9. Select from the `snapshot` in downstream models using the `ref` function. + + + +```sql +select * from {{ ref('orders_snapshot') }} +``` + + + +10. Snapshots are only useful if you run them frequently — schedule the `snapshot` command to run regularly. + + + ## FAQs diff --git a/website/docs/docs/cloud-integrations/avail-sl-integrations.md b/website/docs/docs/cloud-integrations/avail-sl-integrations.md index 04d9d55acb4..acc36623ab5 100644 --- a/website/docs/docs/cloud-integrations/avail-sl-integrations.md +++ b/website/docs/docs/cloud-integrations/avail-sl-integrations.md @@ -29,7 +29,7 @@ import AvailIntegrations from '/snippets/_sl-partner-links.md'; - {frontMatter.meta.api_name} to learn how to integrate and query your metrics in downstream tools. 
- [dbt Semantic Layer API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) -- [Hex dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex. +- [Hex dbt Semantic Layer cells](https://learn.hex.tech/docs/explore-data/cells/data-cells/dbt-metrics-cells) to set up SQL cells in Hex. - [Resolve 'Failed APN'](/faqs/Troubleshooting/sl-alpn-error) error when connecting to the dbt Semantic Layer. - [dbt Semantic Layer on-demand course](https://learn.getdbt.com/courses/semantic-layer) - [dbt Semantic Layer FAQs](/docs/use-dbt-semantic-layer/sl-faqs) diff --git a/website/docs/docs/cloud-integrations/configure-auto-exposures.md b/website/docs/docs/cloud-integrations/configure-auto-exposures.md index 41448dd5f9e..42e36e572b3 100644 --- a/website/docs/docs/cloud-integrations/configure-auto-exposures.md +++ b/website/docs/docs/cloud-integrations/configure-auto-exposures.md @@ -6,12 +6,10 @@ description: "Import and auto-generate exposures from dashboards and understand image: /img/docs/cloud-integrations/auto-exposures/explorer-lineage2.jpg --- -# Configure auto-exposures +# Configure auto-exposures As a data team, it’s critical that you have context into the downstream use cases and users of your data products. [Auto-exposures](/docs/collaborate/auto-exposures) integrates natively with Tableau and [auto-generates downstream lineage](/docs/collaborate/auto-exposures#view-auto-exposures-in-dbt-explorer) in dbt Explorer for a richer experience. -:::info Available in beta -Auto-exposures are currently available in beta to a limited group of users and are gradually being rolled out. If you're interested in gaining access or learning more, stay tuned for updates! -::: + Auto-exposures help data teams optimize their efficiency and ensure data quality by: - Helping users understand how their models are used in downstream analytics tools to inform investments and reduce incidents — ultimately building trust and confidence in data products. @@ -27,18 +25,17 @@ To access the features, you should meet the following: 3. You have set up a [production](/docs/deploy/deploy-environments#set-as-production-environment) deployment environment for each project you want to explore, with at least one successful job run. 4. You have [admin permissions](/docs/cloud/manage-access/enterprise-permissions) in dbt Cloud to edit project settings or production environment settings. 5. Use Tableau as your BI tool and enable metadata permissions or work with an admin to do so. Compatible with Tableau Cloud or Tableau Server with the Metadata API enabled. -6. Run a production job _after_ saving the Tableau collections. ## Set up in Tableau This section of the document explains the steps you need to set up the auto-exposures integration with Tableau. Once you've set this up in Tableau and dbt Cloud, you can view the [auto-exposures](/docs/collaborate/auto-exposures#view-auto-exposures-in-dbt-explorer) in dbt Explorer. -To set up [personal access tokens (PATs)](/docs/dbt-cloud-apis/user-tokens#using-the-new-personal-access-tokens) needed for auto exposures, ask a site admin to configure it for the account. +To set up [personal access tokens (PATs)](https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm) needed for auto exposures, ask a site admin to configure it for the account. 1. Ensure you or a site admin enables PATs for the account in Tableau. -2. 
Create a PAT that you can add to dbt Cloud to pull in Tableau metadata for auto exposures. +2. Create a PAT that you can add to dbt Cloud to pull in Tableau metadata for auto exposures. Ensure the user creating the PAT has access to collections/folders, as the PAT only grants access matching the creator's existing privileges. 3. Copy the **Secret** and the **Token name** and enter them in dbt Cloud. The secret is only displayed once, so store it in a safe location (like a password manager). @@ -65,7 +62,6 @@ To set up [personal access tokens (PATs)](/docs/dbt-cloud-apis/user-tokens#using dbt Cloud automatically imports and syncs any workbook within the selected collections. New additions to the collections will be added to the lineage in dbt Cloud during the next automatic sync (usually once per day). 5. Click **Save**. -6. Run a production job _after_ saving the Tableau collections. dbt Cloud imports everything in the collection(s) and you can continue to view them in Explorer. For more information on how to view and use auto-exposures, refer to [View auto-exposures from dbt Explorer](/docs/collaborate/auto-exposures) page. @@ -74,5 +70,5 @@ dbt Cloud imports everything in the collection(s) and you can continue to view t ## Refresh auto-exposures in jobs :::info Coming soon -Soon, you’ll also be able to use auto-exposures trigger refresh of the data used in your Tableau dashboards from within dbt Cloud. Stay tuned for more on this soon! +Soon, you’ll also be able to use auto-exposures to trigger the refresh of the data used in your Tableau dashboards from within dbt Cloud. Stay tuned for more on this soon! ::: diff --git a/website/docs/docs/cloud-integrations/overview.md b/website/docs/docs/cloud-integrations/overview.md index d925e3e52a7..8334632a7f8 100644 --- a/website/docs/docs/cloud-integrations/overview.md +++ b/website/docs/docs/cloud-integrations/overview.md @@ -13,7 +13,7 @@ Many data applications integrate with dbt Cloud, enabling you to leverage the po
diff --git a/website/docs/docs/cloud-integrations/semantic-layer/excel.md b/website/docs/docs/cloud-integrations/semantic-layer/excel.md index e666bda0e58..c80040dce01 100644 --- a/website/docs/docs/cloud-integrations/semantic-layer/excel.md +++ b/website/docs/docs/cloud-integrations/semantic-layer/excel.md @@ -6,8 +6,6 @@ tags: [Semantic Layer] sidebar_label: "Microsoft Excel" --- -# Microsoft Excel - The dbt Semantic Layer offers a seamless integration with Excel Online and Desktop through a custom menu. This add-on allows you to build dbt Semantic Layer queries and return data on your metrics directly within Excel. ## Prerequisites @@ -18,14 +16,15 @@ The dbt Semantic Layer offers a seamless integration with Excel Online and Deskt - You must have a dbt Cloud Team or Enterprise [account](https://www.getdbt.com/pricing). Suitable for both Multi-tenant and Single-tenant deployment. - Single-tenant accounts should contact their account representative for necessary setup and enablement. -import SLCourses from '/snippets/_sl-course.md'; +:::tip - +📹 For on-demand video learning, explore the [Querying the Semantic Layer with Excel](https://learn.getdbt.com/courses/querying-the-semantic-layer-with-excel) course to learn how to query metrics with Excel. +::: ## Installing the add-on -The dbt Semantic Layer Microsoft Excel integration is available to download directly on [Microsoft AppSource](https://appsource.microsoft.com/en-us/marketplace/apps?product=office). You can choose to download this add in for both [Excel Desktop](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationId=4132ecd1-425d-982d-efb4-de94ebc83f26) and [Excel Online](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationid=4132ecd1-425d-982d-efb4-de94ebc83f26&isWac=True) +The dbt Semantic Layer Microsoft Excel integration is available to download directly on [Microsoft AppSource](https://appsource.microsoft.com/en-us/product/office/WA200007100?tab=Overview). You can choose to download this add-on in for both [Excel Desktop](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationId=4132ecd1-425d-982d-efb4-de94ebc83f26) and [Excel Online](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationid=4132ecd1-425d-982d-efb4-de94ebc83f26&isWac=True) 1. In Excel, authenticate with your host, dbt Cloud environment ID, and service token. - Access your Environment ID, Host, and URLs in your dbt Cloud Semantic Layer settings. Generate a service token in the Semantic Layer settings or API tokens settings diff --git a/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md b/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md index cffd034ac33..49e6f90e41f 100644 --- a/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md +++ b/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md @@ -144,7 +144,7 @@ Check that the SL user has been granted access to the `dbt_sl_llm` schema and ma -If there's been an update to the dbt Cloud account ID, access URL, or API service token, you need to update the configuration for the dbt Snowflake Native App. In Snowflake, navigate to the app's configuration page and delete the existing configurations. Add the new configuration and then run `CALL app_public.restart_ap ();` in the application database in Snowsight. 
+If there's been an update to the dbt Cloud account ID, access URL, or API service token, you need to update the configuration for the dbt Snowflake Native App. In Snowflake, navigate to the app's configuration page and delete the existing configurations. Add the new configuration and then run `CALL app_public.restart_app();` in the application database in Snowsight. diff --git a/website/docs/docs/cloud/about-cloud-develop-defer.md b/website/docs/docs/cloud/about-cloud-develop-defer.md index 4e2f70b7b82..fc55edf8a38 100644 --- a/website/docs/docs/cloud/about-cloud-develop-defer.md +++ b/website/docs/docs/cloud/about-cloud-develop-defer.md @@ -50,8 +50,11 @@ The dbt Cloud CLI offers additional flexibility by letting you choose the source - ```yml -defer-env-id: '123456' +```yml +context: + active-host: ... + active-project: ... + defer-env-id: '123456' ``` @@ -60,7 +63,7 @@ defer-env-id: '123456' ```yml -dbt_cloud: +dbt-cloud: defer-env-id: '123456' ``` diff --git a/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md index 02f950111ea..d7afd424fc4 100644 --- a/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md +++ b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md @@ -24,7 +24,7 @@ dbt Cloud's [flexible plans](https://www.getdbt.com/pricing/) and features make diff --git a/website/docs/docs/cloud/about-cloud/architecture.md b/website/docs/docs/cloud/about-cloud/architecture.md index 52614f0cbcd..ecf67b83a96 100644 --- a/website/docs/docs/cloud/about-cloud/architecture.md +++ b/website/docs/docs/cloud/about-cloud/architecture.md @@ -48,7 +48,7 @@ The git repo information is stored on dbt Cloud servers to make it accessible du ### Authentication services -The default settings of dbt Cloud enable local users with credentials stored in dbt Cloud. Still, integrations with various authentication services are offered as an alternative, including [single sign-on services](/docs/cloud/manage-access/sso-overview). Access to features can be granted/restricted by role using [RBAC](/docs/cloud/manage-access/enterprise-permissions). +The default settings of dbt Cloud enable local users with credentials stored in dbt Cloud. Still, integrations with various authentication services are offered as an alternative, including [single sign-on services](/docs/cloud/manage-access/sso-overview). Access to features can be granted/restricted by role using [RBAC](/docs/cloud/manage-access/about-user-access#role-based-access-control-). SSO features are essential because they reduce the number of credentials a user must maintain. Users sign in once and the authentication token is shared among integrated services (such as dbt Cloud). The token expires and must be refreshed at predetermined intervals, requiring the user to go through the authentication process again. If the user is disabled in the SSO provider service, their access to dbt Cloud is disabled, and they cannot override this with local auth credentials. 
diff --git a/website/docs/docs/cloud/account-settings.md b/website/docs/docs/cloud/account-settings.md index 6d35da3b5f3..aaad9b28e5c 100644 --- a/website/docs/docs/cloud/account-settings.md +++ b/website/docs/docs/cloud/account-settings.md @@ -39,12 +39,12 @@ To use, select the **Enable partial parsing between deployment runs** option fro -## Account access to Advanced CI features +## Account access to Advanced CI features [Advanced CI](/docs/deploy/advanced-ci) features, such as [compare changes](/docs/deploy/advanced-ci#compare-changes), allow dbt Cloud account members to view details about the changes between what's in the production environment and the pull request. To use Advanced CI features, your dbt Cloud account must have access to them. Ask your dbt Cloud administrator to enable Advanced CI features on your account, which they can do by choosing the **Enable account access to Advanced CI** option from the account settings. -Once enabled, the **Run compare changes** option becomes available in the CI job settings for you to select. +Once enabled, the **dbt compare** option becomes available in the CI job settings for you to select. - \ No newline at end of file + diff --git a/website/docs/docs/cloud/configure-cloud-cli.md b/website/docs/docs/cloud/configure-cloud-cli.md index 854950f5d8c..2e0fc174517 100644 --- a/website/docs/docs/cloud/configure-cloud-cli.md +++ b/website/docs/docs/cloud/configure-cloud-cli.md @@ -52,21 +52,29 @@ Once you install the dbt Cloud CLI, you need to configure it to connect to a dbt The config file looks like this: - ```yaml - version: "1" - context: - active-project: "" - active-host: "" - defer-env-id: "" - projects: - - project-id: "" - account-host: "" - api-key: "" - - - project-id: "" - account-host: "" - api-key: "" - ``` + ```yaml + version: "1" + context: + active-project: "" + active-host: "" + defer-env-id: "" + projects: + - project-name: "" + project-id: "" + account-name: "" + account-id: "" + account-host: "" # for example, "cloud.getdbt.com" + token-name: "" + token-value: "" + + - project-name: "" + project-id: "" + account-name: "" + account-id: "" + account-host: "" # for example, "cloud.getdbt.com" + token-name: "" + token-value: "" + ``` 3. After downloading the config file and creating your directory, navigate to a dbt project in your terminal: @@ -100,7 +108,6 @@ To set environment variables in the dbt Cloud CLI for your dbt project: 2. Then select **Profile Settings**, then **Credentials**. 3. Click on your project and scroll to the **Environment Variables** section. 4. Click **Edit** on the lower right and then set the user-level environment variables. - - Note, when setting up the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), using [environment variables](/docs/build/environment-variables) like `{{env_var('DBT_WAREHOUSE')}}` is not supported. You should use the actual credentials instead. ## Use the dbt Cloud CLI @@ -195,4 +202,4 @@ This command moves the `dbt_cloud.yml` from the `Downloads` folder to the `.dbt` By default, [all artifacts](/reference/artifacts/dbt-artifacts) are downloaded when you execute dbt commands from the dbt Cloud CLI. To skip these files from being downloaded, add `--download-artifacts=false` to the command you want to run. This can help improve run-time performance but might break workflows that depend on assets like the [manifest](/reference/artifacts/manifest-json). 
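As a quick illustration of the flag described above, here is a minimal sketch of skipping artifact downloads for a single dbt Cloud CLI invocation — the selected model name is a placeholder, and the flag is appended to the command exactly as the note suggests:

```shell
# Run one model without downloading run artifacts (placeholder model name).
dbt run --select my_model --download-artifacts=false
```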
- \ No newline at end of file + diff --git a/website/docs/docs/cloud/connect-data-platform/about-connections.md b/website/docs/docs/cloud/connect-data-platform/about-connections.md index 83ba503657d..89dd13808ec 100644 --- a/website/docs/docs/cloud/connect-data-platform/about-connections.md +++ b/website/docs/docs/cloud/connect-data-platform/about-connections.md @@ -18,10 +18,14 @@ dbt Cloud can connect with a variety of data platform providers including: - [PostgreSQL](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) - [Snowflake](/docs/cloud/connect-data-platform/connect-snowflake) - [Starburst or Trino](/docs/cloud/connect-data-platform/connect-starburst-trino) +- [Teradata](/docs/cloud/connect-data-platform/connect-teradata) -You can connect to your database in dbt Cloud by clicking the gear in the top right and selecting **Account Settings**. From the Account Settings page, click **+ New Project**. +To connect to your database in dbt Cloud: - +1. Click your account name at the bottom of the left-side menu and click **Account settings** +2. Select **Projects** from the top left, and from there click **New Project** + + These connection instructions provide the basic fields required for configuring a data platform connection in dbt Cloud. For more detailed guides, which include demo project data, read our [Quickstart guides](https://docs.getdbt.com/guides) @@ -33,13 +37,14 @@ Starting July 2024, connection management has moved from the project level to th -The following connection management section describes these changes. +Connections created with APIs before this change cannot be accessed with the [latest APIs](https://docs.getdbt.com/dbt-cloud/api-v3#/operations/List%20Account%20Connections). dbt Labs recommends [recreating the connections](https://docs.getdbt.com/dbt-cloud/api-v3#/operations/Create%20Account%20Connection) with the latest APIs. + ::: Warehouse connections are an account-level resource. As such you can find them under **Accounts Settings** > **Connections**: - + Warehouse connections can be re-used across projects. If multiple projects all connect to the same warehouse, you should re-use the same connection to streamline your management operations. Connections are assigned to a project via an [environment](/docs/dbt-cloud-environments). @@ -95,4 +100,4 @@ dbt Cloud will always connect to your data platform from the IP addresses specif Be sure to allow traffic from these IPs in your firewall, and include them in any database grants. -Allowing these IP addresses only enables the connection to your . However, you might want to send API requests from your restricted network to the dbt Cloud API. For example, you could use the API to send a POST request that [triggers a job to run](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#operation/triggerRun). Using the dbt Cloud API requires that you allow the `cloud.getdbt.com` subdomain. For more on the dbt Cloud architecture, see [Deployment architecture](/docs/cloud/about-cloud/architecture). +Allowing these IP addresses only enables the connection to your . However, you might want to send API requests from your restricted network to the dbt Cloud API. Using the dbt Cloud API requires allowing the `cloud.getdbt.com` subdomain. For more on the dbt Cloud architecture, see [Deployment architecture](/docs/cloud/about-cloud/architecture). 
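To illustrate the kind of request that requires allowing the `cloud.getdbt.com` subdomain, here is a minimal sketch of triggering a job run through the dbt Cloud Administrative API — the account ID, job ID, and token are placeholders, and the host may differ if your account uses a custom access URL:

```shell
# Trigger a dbt Cloud job run from inside the restricted network (placeholder IDs and token).
curl -X POST "https://cloud.getdbt.com/api/v2/accounts/<ACCOUNT_ID>/jobs/<JOB_ID>/run/" \
  -H "Authorization: Token $DBT_CLOUD_API_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"cause": "Triggered via API from a restricted network"}'
```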
diff --git a/website/docs/docs/cloud/connect-data-platform/connect-amazon-athena.md b/website/docs/docs/cloud/connect-data-platform/connect-amazon-athena.md index 0b2f844ccac..f1009f61274 100644 --- a/website/docs/docs/cloud/connect-data-platform/connect-amazon-athena.md +++ b/website/docs/docs/cloud/connect-data-platform/connect-amazon-athena.md @@ -5,7 +5,7 @@ description: "Configure the Amazon Athena data platform connection in dbt Cloud. sidebar_label: "Connect Amazon Athena" --- -# Connect Amazon Athena +# Connect Amazon Athena Your environment(s) must be on ["Versionless"](/docs/dbt-versions/versionless-cloud) to use the Amazon Athena connection. diff --git a/website/docs/docs/cloud/connect-data-platform/connect-teradata.md b/website/docs/docs/cloud/connect-data-platform/connect-teradata.md new file mode 100644 index 00000000000..cf41814078b --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/connect-teradata.md @@ -0,0 +1,29 @@ +--- +title: "Connect Teradata" +id: connect-teradata +description: "Configure the Teradata platform connection in dbt Cloud." +sidebar_label: "Connect Teradata" +--- + +# Connect Teradata + +Your environment(s) must be on ["Versionless"](/docs/dbt-versions/versionless-cloud) to use the Teradata connection. + +| Field | Description | Type | Required? | Example | +| ----------------------------- | --------------------------------------------------------------------------------------------- | -------------- | --------- | ------- | +| Host | Host name of your Teradata environment. | String | Required | host-name.env.clearscape.teradata.com | +| Port | The database port number. Equivalent to the Teradata JDBC Driver DBS_PORT connection parameter.| Quoted integer | Optional | 1025 | +| Retries | Number of times to retry to connect to database upon error. | Integer | optional | 10 | +| Request timeout | The waiting period between connections attempts in seconds. Default is "1" second. | Quoted integer | Optional | 3 | + + + +### Development and deployment credentials + +| Field | Description | Type | Required? | Example | +| ------------------------------|-----------------------------------------------------------------------------------------------|----------------|-----------|--------------------| +| Username | The database username. Equivalent to the Teradata JDBC Driver USER connection parameter. | String | Required | database_username | +| Password | The database password. Equivalent to the Teradata JDBC Driver PASSWORD connection parameter. | String | Required | DatabasePassword123 | +| Schema | Specifies the initial database to use after login, rather than the user's default database. | String | Required | dbtlabsdocstest | + + diff --git a/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md b/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md index 0243bc619b1..1ce9712ab91 100644 --- a/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md +++ b/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md @@ -52,6 +52,123 @@ As an end user, if your organization has set up BigQuery OAuth, you can link a p To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [BigQuery-specific configuration](/reference/resource-configs/bigquery-configs). +### Optional configurations + +In BigQuery, optional configurations let you tailor settings for tasks such as query priority, dataset location, job timeout, and more. 
These options give you greater control over how BigQuery functions behind the scenes to meet your requirements. + +To customize your optional configurations in dbt Cloud: + +1. Click your name at the bottom of the left-side menu in dbt Cloud +2. Select **Your profile** from the menu +3. From there, click **Projects** and select your BigQuery project +4. Go to **Development Connection** and select BigQuery +5. Click **Edit** and then scroll down to **Optional settings** + + + +The following are the optional configurations you can set in dbt Cloud: + +| Configuration |
Information | Type | Example
| +|---------------------------|-----------------------------------------|---------|--------------------| +| [Priority](#priority) | Sets the priority for BigQuery jobs (either `interactive` or queued for `batch` processing) | String | `batch` or `interactive` | +| [Retries](#retries) | Specifies the number of retries for failed jobs due to temporary issues | Integer | `3` | +| [Location](#location) | Location for creating new datasets | String | `US`, `EU`, `us-west2` | +| [Maximum bytes billed](#maximum-bytes-billed) | Limits the maximum number of bytes that can be billed for a query | Integer | `1000000000` | +| [Execution project](#execution-project) | Specifies the project ID to bill for query execution | String | `my-project-id` | +| [Impersonate service account](#impersonate-service-account) | Allows users authenticated locally to access BigQuery resources under a specified service account | String | `service-account@project.iam.gserviceaccount.com` | +| [Job retry deadline seconds](#job-retry-deadline-seconds) | Sets the total number of seconds BigQuery will attempt to retry a job if it fails | Integer | `600` | +| [Job creation timeout seconds](#job-creation-timeout-seconds) | Specifies the maximum timeout for the job creation step | Integer | `120` | +| [Google cloud storage-bucket](#google-cloud-storage-bucket) | Location for storing objects in Google Cloud Storage | String | `my-bucket` | +| [Dataproc region](#dataproc-region) | Specifies the cloud region for running data processing jobs | String | `US`, `EU`, `asia-northeast1` | +| [Dataproc cluster name](#dataproc-cluster-name) | Assigns a unique identifier to a group of virtual machines in Dataproc | String | `my-cluster` | + + + + +The `priority` for the BigQuery jobs that dbt executes can be configured with the `priority` configuration in your BigQuery profile. The priority field can be set to one of `batch` or `interactive`. For more information on query priority, consult the [BigQuery documentation](https://cloud.google.com/bigquery/docs/running-queries). + + + + + +Retries in BigQuery help to ensure that jobs complete successfully by trying again after temporary failures, making your operations more robust and reliable. + + + + + +The `location` of BigQuery datasets can be set using the `location` setting in a BigQuery profile. As per the [BigQuery documentation](https://cloud.google.com/bigquery/docs/locations), `location` may be either a multi-regional location (for example, `EU`, `US`), or a regional location (like `us-west2`). + + + + + +When a `maximum_bytes_billed` value is configured for a BigQuery profile, that allows you to limit how much data your query can process. It’s a safeguard to prevent your query from accidentally processing more data than you expect, which could lead to higher costs. Queries executed by dbt will fail if they exceed the configured maximum bytes threshhold. This configuration should be supplied as an integer number of bytes. + +If your `maximum_bytes_billed` is 1000000000, you would enter that value in the `maximum_bytes_billed` field in dbt cloud. + + + + + + +By default, dbt will use the specified `project`/`database` as both: + +1. The location to materialize resources (models, seeds, snapshots, and so on), unless they specify a custom project/database config +2. The GCP project that receives the bill for query costs or slot usage + +Optionally, you may specify an execution project to bill for query execution, instead of the project/database where you materialize most resources. 
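To make these options concrete, here is a minimal sketch of how several of the settings covered so far map onto a BigQuery connection profile, assuming the field names used by the dbt-bigquery adapter; the project, dataset, and values shown are placeholders:

```yaml
# Illustrative BigQuery connection settings (placeholder values).
# Field names follow the dbt-bigquery adapter; the dbt Cloud UI surfaces
# the same options under "Optional settings".
my-bigquery-profile:
  target: dev
  outputs:
    dev:
      type: bigquery
      method: oauth
      project: my-project-id                    # where resources are materialized
      dataset: my_dataset
      priority: interactive                     # or "batch"
      retries: 3
      location: US
      maximum_bytes_billed: 1000000000
      execution_project: my-billing-project-id  # bill query costs to a different project
      job_retry_deadline_seconds: 600
      job_creation_timeout_seconds: 120
```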
+ + + + + +This feature allows users authenticating using local OAuth to access BigQuery resources based on the permissions of a service account. + +For a general overview of this process, see the official docs for [Creating Short-lived Service Account Credentials](https://cloud.google.com/iam/docs/create-short-lived-credentials-direct). + + + + + +Job retry deadline seconds is the maximum amount of time BigQuery will spend retrying a job before it gives up. + + + + + +Job creation timeout seconds is the maximum time BigQuery will wait to start the job. If the job doesn’t start within that time, it times out. + + + +#### Run dbt python models on Google Cloud Platform + +import BigQueryDataproc from '/snippets/_bigquery-dataproc.md'; + + + + + +Everything you store in Cloud Storage must be placed inside a [bucket](https://cloud.google.com/storage/docs/buckets). Buckets help you organize your data and manage access to it. + + + + + +A designated location in the cloud where you can run your data processing jobs efficiently. This region must match the location of your BigQuery dataset if you want to use Dataproc with BigQuery to ensure data doesn't move across regions, which can be inefficient and costly. + +For more information on [Dataproc regions](https://cloud.google.com/bigquery/docs/locations), refer to the BigQuery documentation. + + + + + +A unique label you give to your group of virtual machines to help you identify and manage your data processing tasks in the cloud. When you integrate Dataproc with BigQuery, you need to provide the cluster name so BigQuery knows which specific set of resources (the cluster) to use for running the data jobs. + +Have a look at [Dataproc's document on Create a cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) for an overview on how clusters work. + + + ### Account level connections and credential management You can re-use connections across multiple projects with [global connections](/docs/cloud/connect-data-platform/about-connections#migration-from-project-level-connections-to-account-level-connections). Connections are attached at the environment level (formerly project level), so you can utilize multiple connections inside of a single project (to handle dev, staging, production, etc.). @@ -147,3 +264,7 @@ For a project, you will first create an environment variable to store the secret "extended_attributes_id": FFFFF }' ``` + + + + diff --git a/website/docs/docs/cloud/dbt-assist-data.md b/website/docs/docs/cloud/dbt-assist-data.md deleted file mode 100644 index ad32c304ca8..00000000000 --- a/website/docs/docs/cloud/dbt-assist-data.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: "dbt Assist privacy and data" -sidebar_label: "dbt Assist privacy" -description: "dbt Assist’s powerful AI feature helps you deliver data that works." ---- - -# dbt Assist privacy and data - -dbt Labs is committed to protecting your privacy and data. This page provides information about how dbt Labs handles your data when you use dbt Assist. - -#### Is my data used by dbt Labs to train AI models? - -No, dbt Assist does not use client warehouse data to train any AI models. It uses API calls to an AI provider. - -#### Does dbt Labs share my personal data with third parties - -dbt Labs only shares client personal information as needed to perform the services, under client instructions, or for legal, tax, or compliance reasons. - -#### Does dbt Assist store or use personal data? - -The user clicks the AI assist button, and the user does not otherwise enter data. 
- -#### Does dbt Assist access my warehouse data? - -dbt Assist utilizes metadata, including column names, model SQL, the model's name, and model documentation. The row-level data from the warehouse is never used or sent to a third-party provider. Such output must be double-checked by the user for completeness and accuracy. - -#### Can dbt Assist data be deleted upon client written request? - -dbt Assist data, aside from usage data, does not persist on dbt Labs systems. Usage data is retained by dbt Labs. dbt Labs does not have possession of any personal or sensitive data. To the extent client identifies personal or sensitive information uploaded by or on behalf of client to dbt Labs systems, such data can be deleted within 30 days of written request. diff --git a/website/docs/docs/cloud/dbt-assist.md b/website/docs/docs/cloud/dbt-assist.md deleted file mode 100644 index eafe7d05821..00000000000 --- a/website/docs/docs/cloud/dbt-assist.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "About dbt Assist" -sidebar_label: "About dbt Assist" -description: "dbt Assist’s powerful AI co-pilot feature helps you deliver data that works." -pagination_next: "docs/cloud/enable-dbt-assist" -pagination_prev: null ---- - -# About dbt Assist - -dbt Assist is a powerful artificial intelligence (AI) co-pilot feature that helps automate development in dbt Cloud, allowing you to focus on delivering data that works. dbt Assist’s AI co-pilot generates [documentation](/docs/build/documentation) and [tests](/docs/build/data-tests) for your dbt SQL models directly in the dbt Cloud IDE, with a click of a button, and helps you accomplish more in less time. - -:::tip Beta feature -dbt Assist is an AI tool meant to _help_ developers generate documentation and tests in dbt Cloud. It's available in beta, in the dbt Cloud IDE only. - -To use dbt Assist, you must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing) and agree to use dbt Labs' OpenAI key. [Register your interest](https://docs.google.com/forms/d/e/1FAIpQLScPjRGyrtgfmdY919Pf3kgqI5E95xxPXz-8JoVruw-L9jVtxg/viewform) to join the private beta or reach out to your account team to begin this process. -::: - - - -## Feedback - -Please note: Always review AI-generated code and content as it may produce incorrect results. dbt Assist features and/or functionality may be added or eliminated as part of the beta trial. - -To give feedback, please reach out to your dbt Labs account team. We appreciate your feedback and suggestions as we improve dbt Assist. diff --git a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md index af303d0d9a0..c9d2cbbad30 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md @@ -13,7 +13,7 @@ The dbt Cloud integrated development environment (IDE) is a single web-based int The dbt Cloud IDE offers several [keyboard shortcuts](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) and [editing features](/docs/cloud/dbt-cloud-ide/ide-user-interface#editing-features) for faster and efficient development and governance: - Syntax highlighting for SQL — Makes it easy to distinguish different parts of your code, reducing syntax errors and enhancing readability. -- AI co-pilot — Use [dbt Assist](/docs/cloud/dbt-assist), a powerful AI co-pilot feature, to generate documentation and tests for your dbt SQL models. 
+- AI copilot — Use [dbt Copilot](/docs/cloud/dbt-copilot), a powerful AI engine that can generate documentation, tests, and semantic models for your dbt SQL models. - Auto-completion — Suggests table names, arguments, and column names as you type, saving time and reducing typos. - Code [formatting and linting](/docs/cloud/dbt-cloud-ide/lint-format) — Helps standardize and fix your SQL code effortlessly. - Navigation tools — Easily move around your code, jump to specific lines, find and replace text, and navigate between project files. @@ -53,9 +53,9 @@ To understand how to navigate the IDE and its user interface elements, refer to | Feature | Description | |---|---| | [**Keyboard shortcuts**](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) | You can access a variety of [commands and actions](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) in the IDE by choosing the appropriate keyboard shortcut. Use the shortcuts for common tasks like building modified models or resuming builds from the last failure. | -| **IDE version control** | The IDE version control section and git button allow you to apply the concept of [version control](/docs/collaborate/git/version-control-basics) to your project directly into the IDE.

- Create or change branches, execute git commands using the git button.
- Commit or revert individual files by right-clicking the edited file
- [Resolve merge conflicts](/docs/collaborate/git/merge-conflicts)
- Link to the repo directly by clicking the branch name
- Edit, format, or lint files and execute dbt commands in your primary protected branch, and commit to a new branch.
- Use Git diff view to view what has been changed in a file before you make a pull request.
- From dbt version 1.6 and higher, use the **Prune branches** [button](/docs/cloud/dbt-cloud-ide/ide-user-interface#prune-branches-modal) to delete local branches that have been deleted from the remote repository, keeping your branch management tidy. | +| **IDE version control** | The IDE version control section and git button allow you to apply the concept of [version control](/docs/collaborate/git/version-control-basics) to your project directly in the IDE.

- Create or change branches, execute git commands using the git button.
- Commit or revert individual files by right-clicking the edited file
- [Resolve merge conflicts](/docs/collaborate/git/merge-conflicts)
- Link to the repo directly by clicking the branch name
- Edit, format, or lint files and execute dbt commands in your primary protected branch, and commit to a new branch.
- Use Git diff view to view what has been changed in a file before you make a pull request.
- Use the **Prune branches** [button](/docs/cloud/dbt-cloud-ide/ide-user-interface#prune-branches-modal) (dbt v1.6 and higher) to delete local branches that have been deleted from the remote repository, keeping your branch management tidy.
- Sign your [git commits](/docs/cloud/dbt-cloud-ide/git-commit-signing) to mark them as 'Verified'. | | **Preview and Compile button** | You can [compile or preview](/docs/cloud/dbt-cloud-ide/ide-user-interface#console-section) code, a snippet of dbt code, or one of your dbt models after editing and saving. | -| [**dbt Assist**](/docs/cloud/dbt-assist) | A powerful AI co-pilot feature that generates documentation and tests for your dbt SQL models. Available for dbt Cloud Enterprise plans. | +| [**dbt Copilot**](/docs/cloud/dbt-copilot) | A powerful AI engine that can generate documentation, tests, and semantic models for your dbt SQL models. Available for dbt Cloud Enterprise plans. | | **Build, test, and run button** | Build, test, and run your project with a button click or by using the Cloud IDE command bar. | **Command bar** | You can enter and run commands from the command bar at the bottom of the IDE. Use the [rich model selection syntax](/reference/node-selection/syntax) to execute [dbt commands](/reference/dbt-commands) directly within dbt Cloud. You can also view the history, status, and logs of previous runs by clicking History on the left of the bar. | **Drag and drop** | Drag and drop files located in the file explorer, and use the file breadcrumb on the top of the IDE for quick, linear navigation. Access adjacent files in the same file by right-clicking on the breadcrumb file. @@ -101,6 +101,7 @@ Nice job, you're ready to start developing and building models 🎉! ### Considerations - To improve your experience using dbt Cloud, we suggest that you turn off ad blockers. This is because some project file names, such as `google_adwords.sql`, might resemble ad traffic and trigger ad blockers. - To preserve performance, there's a file size limitation for repositories over 6 GB. If you have a repo over 6 GB, please contact [dbt Support](mailto:support@getdbt.com) before running dbt Cloud. +- The IDE's idle session timeout is one hour. - ### Start-up process @@ -127,8 +128,9 @@ Nice job, you're ready to start developing and building models 🎉! - If a model or test fails, dbt Cloud makes it easy for you to view and download the run logs for your dbt invocations to fix the issue. - Use dbt's [rich model selection syntax](/reference/node-selection/syntax) to [run dbt commands](/reference/dbt-commands) directly within dbt Cloud. - Starting from dbt v1.6, leverage [environments variables](/docs/build/environment-variables#special-environment-variables) to dynamically use the Git branch name. For example, using the branch name as a prefix for a development schema. + - Run [MetricFlow commands](/docs/build/metricflow-commands) to create and manage metrics in your project with the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl). -- **Generate your YAML configurations with dbt Assist** — [dbt Assist](/docs/cloud/dbt-assist) is a powerful artificial intelligence (AI) co-pilot feature that helps automate development in dbt Cloud. It generates documentation and tests for your dbt SQL models directly in the dbt Cloud IDE, with a click of a button, and helps you accomplish more in less time. Available for dbt Cloud Enterprise plans. +- **Generate your YAML configurations with dbt Copilot** — [dbt Copilot](/docs/cloud/dbt-copilot) is a powerful artificial intelligence (AI) feature that helps automate development in dbt Cloud. 
It can generate documentation, tests, and semantic models for your dbt SQL models directly in the dbt Cloud IDE, with a click of a button, and helps you accomplish more in less time. Available for dbt Cloud Enterprise plans. - **Build and view your project's docs** — The dbt Cloud IDE makes it possible to [build and view](/docs/collaborate/build-and-view-your-docs) documentation for your dbt project while your code is still in development. With this workflow, you can inspect and verify what your project's generated documentation will look like before your changes are released to production. diff --git a/website/docs/docs/cloud/dbt-cloud-ide/git-commit-signing.md b/website/docs/docs/cloud/dbt-cloud-ide/git-commit-signing.md new file mode 100644 index 00000000000..afaa0751669 --- /dev/null +++ b/website/docs/docs/cloud/dbt-cloud-ide/git-commit-signing.md @@ -0,0 +1,80 @@ +--- +title: "Git commit signing" +description: "Learn how to sign your Git commits when using the IDE for development." +sidebar_label: Git commit signing +--- + +# Git commit signing + +To prevent impersonation and enhance security, you can sign your Git commits before pushing them to your repository. Using your signature, a Git provider can cryptographically verify a commit and mark it as "verified", providing increased confidence about its origin. + +You can configure dbt Cloud to sign your Git commits when using the IDE for development. To set up, enable the feature in dbt Cloud, follow the flow to generate a keypair, and upload the public key to your Git provider to use for signature verification. + + +## Prerequisites + +- GitHub or GitLab is your Git provider. Currently, Azure DevOps is not supported. +- You have a dbt Cloud account on the [Enterprise plan](https://www.getdbt.com/pricing/). + +## Generate GPG keypair in dbt Cloud + +To generate a GPG keypair in dbt Cloud, follow these steps: +1. Go to your **Personal profile** page in dbt Cloud. +2. Navigate to **Signed Commits** section. +3. Enable the **Sign commits originating from this user** toggle. +4. This will generate a GPG keypair. The private key will be used to sign all future Git commits. The public key will be displayed, allowing you to upload it to your Git provider. + + + +## Upload public key to Git provider + +To upload the public key to your Git provider, follow the detailed documentation provided by the supported Git provider: + +- [GitHub instructions](https://docs.github.com/en/authentication/managing-commit-signature-verification/adding-a-gpg-key-to-your-github-account) +- [GitLab instructions](https://docs.gitlab.com/ee/user/project/repository/signed_commits/gpg.html) + +Once you have uploaded the public key to your Git provider, your Git commits will be marked as "Verified" after you push the changes to the repository. + + + +## Considerations + +- The GPG keypair is tied to the user, not a specific account. There is a 1:1 relationship between the user and keypair. The same key will be used for signing commits on any accounts the user is a member of. +- The GPG keypair generated in dbt Cloud is linked to the email address associated with your account at the time of keypair creation. This email identifies the author of signed commits. +- For your Git commits to be marked as "verified", your dbt Cloud email address must be a verified email address with your Git provider. The Git provider (such as, GitHub, GitLab) checks that the commit's signed email matches a verified email in your Git provider account. 
If they don’t match, the commit won't be marked as "verified." +- Keep your dbt Cloud email and Git provider's verified email in sync to avoid verification issues. If you change your dbt Cloud email address: + - Generate a new GPG keypair with the updated email, following the [steps mentioned earlier](/docs/cloud/dbt-cloud-ide/git-commit-signing#generate-gpg-keypair-in-dbt-cloud). + - Add and verify the new email in your Git provider. + + + +## FAQs + + + + + +If you delete your GPG keypair in dbt Cloud, your Git commits will no longer be signed. You can generate a new GPG keypair by following the [steps mentioned earlier](/docs/cloud/dbt-cloud-ide/git-commit-signing#generate-gpg-keypair-in-dbt-cloud). + + + + +GitHub and GitLab support commit signing, while Azure DevOps does not. Commit signing is a [git feature](https://git-scm.com/book/ms/v2/Git-Tools-Signing-Your-Work), and is independent of any specific provider. However, not all providers support the upload of public keys, or the display of verification badges on commits. + + + + + +If your Git Provider does not explicitly support the uploading of public GPG keys, then +commits will still be signed using the private key, but no verification information will +be displayed by the provider. + + + + + +If your Git provider is configured to enforce commit verification, then unsigned commits +will be rejected. To avoid this, ensure that you have followed all previous steps to generate +a keypair, and uploaded the public key to the provider. + + diff --git a/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md index 8d80483485c..36c6cc898dc 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md @@ -35,7 +35,7 @@ The IDE streamlines your workflow, and features a popular user interface layout * Added (A) — The IDE detects added files * Deleted (D) — The IDE detects deleted files. - + 5. **Command bar —** The Command bar, located in the lower left of the IDE, is used to invoke [dbt commands](/reference/dbt-commands). When a command is invoked, the associated logs are shown in the Invocation History Drawer. @@ -107,15 +107,19 @@ Starting from dbt v1.6 or higher, when you save changes to a model, you can comp 3. **Build button —** The build button allows users to quickly access dbt commands related to the active model in the File Editor. The available commands include dbt build, dbt test, and dbt run, with options to include only the current resource, the resource and its upstream dependencies, the resource, and its downstream dependencies, or the resource with all dependencies. This menu is available for all executable nodes. -4. **Format button —** The editor has a **Format** button that can reformat the contents of your files. For SQL files, it uses either `sqlfmt` or `sqlfluff`, and for Python files, it uses `black`. +4. **Lint button** — The **Lint** button runs the [linter](/docs/cloud/dbt-cloud-ide/lint-format) on the active file in the File Editor. The linter checks for syntax errors and style issues in your code and displays the results in the **Code quality** tab. -5. **Results tab —** The Results console tab displays the most recent Preview results in tabular format. +5. **dbt Copilot** — [dbt Copilot](/docs/cloud/dbt-copilot) is a powerful artificial intelligence engine that can generate documentation, tests, and semantic models for you. 
dbt Copilot is available in the IDE for Enterprise plans. + +6. **Results tab —** The Results console tab displays the most recent Preview results in tabular format. -6. **Compiled Code tab —** The Compile button triggers a compile invocation that generates compiled code, which is displayed in the Compiled Code tab. +7. **Code quality tab** — The Code Quality tab displays the results of the linter on the active file in the File Editor. It allows you to view code errors, provides code quality visibility and management, and displays the SQLFluff version used. + +8. **Compiled Code tab —** The Compile generates the compiled code when the Compile button is executed. The Compiled Code tab displays the compiled SQL code for the active file in the File Editor. -7. **Lineage tab —** The Lineage tab in the File Editor displays the active model's lineage or . By default, it shows two degrees of lineage in both directions (`2+model_name+2`), however, you can change it to +model+ (full DAG). +9. **Lineage tab —** The Lineage tab in the File Editor displays the active model's lineage or . By default, it shows two degrees of lineage in both directions (`2+model_name+2`), however, you can change it to +model+ (full DAG). To use the lineage: - Double-click a node in the DAG to open that file in a new tab - Expand or shrink the DAG using node selection syntax. - Note, the `--exclude` flag isn't supported. @@ -158,11 +162,11 @@ Use menus and modals to interact with IDE and access useful options to help your - #### File Search You can easily search for and navigate between files using the File Navigation menu, which can be accessed by pressing Command-O or Control-O or clicking on the 🔍 icon in the File Explorer. - + - #### Global Command Palette The Global Command Palette provides helpful shortcuts to interact with the IDE, such as git actions, specialized dbt commands, and compile, and preview actions, among others. To open the menu, use Command-P or Control-P. - + - #### IDE Status modal The IDE Status modal shows the current error message and debug logs for the server. This also contains an option to restart the IDE. Open this by clicking on the IDE Status button. @@ -193,7 +197,7 @@ Use menus and modals to interact with IDE and access useful options to help your * Toggling between dark or light mode for a better viewing experience * Restarting the IDE - * Fully recloning your repository to refresh your git state and view status details + * Rollback your repo to remote, to refresh your git state and view status details * Viewing status details, including the IDE Status modal. - + diff --git a/website/docs/docs/cloud/dbt-copilot-data.md b/website/docs/docs/cloud/dbt-copilot-data.md new file mode 100644 index 00000000000..b55681542e3 --- /dev/null +++ b/website/docs/docs/cloud/dbt-copilot-data.md @@ -0,0 +1,29 @@ +--- +title: "dbt Copilot privacy and data" +sidebar_label: "dbt Copilot privacy" +description: "dbt Copilot is a powerful AI engine to help you deliver data that works." +--- + +# dbt Copilot privacy and data + +dbt Labs is committed to protecting your privacy and data. This page provides information about how the dbt Copilot AI engine handles your data. + +#### Is my data used by dbt Labs to train AI models? + +No, dbt Copilot does not use client warehouse data to train any AI models. It uses API calls to an AI provider. 
+ +#### Does dbt Labs share my personal data with third parties + +dbt Labs only shares client personal information as needed to perform the services, under client instructions, or for legal, tax, or compliance reasons. + +#### Does dbt Copilot store or use personal data? + +The user clicks the dbt Copilot button, and the user does not otherwise enter data. + +#### Does dbt Copilot access my warehouse data? + +dbt Copilot utilizes metadata, including column names, model SQL, the model's name, and model documentation. The row-level data from the warehouse is never used or sent to a third-party provider. Such output must be double-checked by the user for completeness and accuracy. + +#### Can dbt Copilot data be deleted upon client written request? + +The data from using dbt Copilot, aside from usage data, _doesn't_ persist on dbt Labs systems. Usage data is retained by dbt Labs. dbt Labs doesn't have possession of any personal or sensitive data. To the extent client identifies personal or sensitive information uploaded by or on behalf of client to dbt Labs systems, such data can be deleted within 30 days of written request. diff --git a/website/docs/docs/cloud/dbt-copilot.md b/website/docs/docs/cloud/dbt-copilot.md new file mode 100644 index 00000000000..403df86a089 --- /dev/null +++ b/website/docs/docs/cloud/dbt-copilot.md @@ -0,0 +1,25 @@ +--- +title: "About dbt Copilot" +sidebar_label: "About dbt Copilot" +description: "dbt Copilot is a powerful AI engine designed to accelerate your analytics workflows throughout your entire ADLC." +pagination_next: "docs/cloud/enable-dbt-copilot" +pagination_prev: null +--- + +# About dbt Copilot + +dbt Copilot is a powerful artificial intelligence (AI) engine that's fully integrated into your dbt Cloud experience and designed to accelerate your analytics workflows. dbt Copilot embeds AI-driven assistance across every stage of the analytics development life cycle (ADLC), empowering data practitioners to deliver data products faster, improve data quality, and enhance data accessibility. With automatic code generation, you can let the AI engine generate the [documentation](/docs/build/documentation), [tests](/docs/build/data-tests), and [semantic models](/docs/build/semantic-models) for you. + +:::tip Beta feature +dbt Copilot is designed to _help_ developers generate documentation, tests, and semantic models in dbt Cloud. It's available in beta, in the dbt Cloud IDE only. + +To use dbt Copilot, you must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing) and either agree to use dbt Labs' OpenAI key or provide your own Open AI API key. [Register here](https://docs.google.com/forms/d/e/1FAIpQLScPjRGyrtgfmdY919Pf3kgqI5E95xxPXz-8JoVruw-L9jVtxg/viewform) or reach out to the Account Team if you're interested in joining the private beta. +::: + + + +## Feedback + +Please note: Always review AI-generated code and content as it may produce incorrect results. The features and/or functionality of dbt Copilot may be added or eliminated as part of the beta trial. + +To give feedback, please contact your dbt Labs account team. We appreciate your feedback and suggestions as we improve dbt Copilot. 
diff --git a/website/docs/docs/cloud/enable-dbt-assist.md b/website/docs/docs/cloud/enable-dbt-assist.md deleted file mode 100644 index 9432f858001..00000000000 --- a/website/docs/docs/cloud/enable-dbt-assist.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -title: "Enable dbt Assist" -sidebar_label: "Enable dbt Assist" -description: "Enable dbt Assist in dbt Cloud and leverage AI to speed up your development." ---- - -# Enable dbt Assist - -This page explains how to enable dbt Assist in dbt Cloud to leverage AI to speed up your development and allow you to focus on delivering quality data. - -## Prerequisites - -- Available in the dbt Cloud IDE only. -- Must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing). -- Development environment be ["Versionless"](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless). -- Current dbt Assist deployments use a central OpenAI API key managed by dbt Labs. In the future, you may provide your own key for Azure OpenAI or OpenAI. -- Accept and sign legal agreements. Reach out to your account team to begin this process. - -## Enable dbt Assist - -dbt Assist will only be available at an account level after your organization has signed the legal requirements. It will be disabled by default. Your dbt Cloud Admin(s) will enable it by following these steps: - -1. Navigate to **Account Settings** in the navigation menu. - -2. Under **Settings**, confirm the account you're enabling. - -3. Click **Edit** in the top right corner. - -4. To turn on dbt Assist, toggle the **Enable account access to AI-powered features** switch to the right. The toggle will slide to the right side, activating dbt Assist. - -5. Click **Save** and you should now have dbt Assist AI enabled to use. - -Note: To disable (only after enabled), repeat steps 1 to 3, toggle off in step 4, and repeat step 5. - - diff --git a/website/docs/docs/cloud/enable-dbt-copilot.md b/website/docs/docs/cloud/enable-dbt-copilot.md new file mode 100644 index 00000000000..07a9f6294da --- /dev/null +++ b/website/docs/docs/cloud/enable-dbt-copilot.md @@ -0,0 +1,51 @@ +--- +title: "Enable dbt Copilot" +sidebar_label: "Enable dbt Copilot" +description: "Enable the dbt Copilot AI engine in dbt Cloud to speed up your development." +--- + +# Enable dbt Copilot + +This page explains how to enable the dbt Copilot engine in dbt Cloud, leveraging AI to speed up your development and allowing you to focus on delivering quality data. + +## Prerequisites + +- Available in the dbt Cloud IDE only. +- Must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing). +- Development environment has been upgraded to ["Versionless"](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless). +- By default, dbt Copilot deployments use a central OpenAI API key managed by dbt Labs. Alternatively, you can [provide your own OpenAI API key](#bringing-your-own-openai-api-key-byok). +- Accept and sign legal agreements. Reach out to your Account team to begin this process. + +## Enable dbt Copilot + +dbt Copilot is only available to your account after your organization has signed the required legal documents. It's disabled by default. A dbt Cloud admin can enable it by following these steps: + +1. Navigate to **Account settings** in the navigation menu. + +2. Under **Settings**, confirm the account you're enabling. + +3. Click **Edit** in the top right corner. + +4. Enable the **Enable account access to AI-powered features** option. + +5. Click **Save**. 
You should now have the dbt Copilot AI engine enabled for use. + +Note: To disable (only after enabled), repeat steps 1 to 3, toggle off in step 4, and repeat step 5. + + + +### Bringing your own OpenAI API key (BYOK) + +Once AI features have been enabled, you can provide your organization's OpenAI API key. dbt Cloud will then leverage your OpenAI account and terms to power dbt CoPilot. This will incur billing charges to your organization from OpenAI for requests made by dbt CoPilot. + +Note that Azure OpenAI is not currently supported, but will be in the future. + +A dbt Cloud admin can provide their API key by following these steps: + +1. Navigate to **Account settings** in the side menu. + +2. Find the **Settings** section and click on **Integrations**. + +3. Scroll to **AI** and select the toggle for **OpenAI** + +4. Enter your API key and click **Save**. \ No newline at end of file diff --git a/website/docs/docs/cloud/git/authenticate-azure.md b/website/docs/docs/cloud/git/authenticate-azure.md index 42028bf993b..5278c134f72 100644 --- a/website/docs/docs/cloud/git/authenticate-azure.md +++ b/website/docs/docs/cloud/git/authenticate-azure.md @@ -13,9 +13,9 @@ If you use the dbt Cloud IDE or dbt Cloud CLI to collaborate on your team's Azur Connect your dbt Cloud profile to Azure DevOps using OAuth: -1. Click the gear icon at the top right and select **Profile settings**. -2. Click **Linked Accounts**. -3. Next to Azure DevOps, click **Link**. +1. Click your account name at the bottom of the left-side menu and click **Account settings** +2. Scroll down to **Your profile** and select **Personal profile**. +3. Go to the **Linked accounts** section in the middle of the page. 4. Once you're redirected to Azure DevOps, sign into your account. diff --git a/website/docs/docs/cloud/git/connect-github.md b/website/docs/docs/cloud/git/connect-github.md index f230f70e1f6..e2bf459275e 100644 --- a/website/docs/docs/cloud/git/connect-github.md +++ b/website/docs/docs/cloud/git/connect-github.md @@ -17,7 +17,7 @@ Connecting your GitHub account to dbt Cloud provides convenience and another lay * **Note** — [Single tenant](/docs/cloud/about-cloud/tenancy#single-tenant) accounts offer enhanced connection options for integrating with an On-Premises GitHub deployment setup using the native integration. This integration allows you to use all the features of the integration, such as triggering CI builds. The dbt Labs infrastructure team will coordinate with you to ensure any additional networking configuration requirements are met and completed. To discuss details, contact dbt Labs support or your dbt Cloud account team. - You _must_ be a **GitHub organization owner** in order to [install the dbt Cloud application](/docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) in your GitHub organization. To learn about GitHub organization roles, see the [GitHub documentation](https://docs.github.com/en/organizations/managing-peoples-access-to-your-organization-with-roles/roles-in-an-organization). - The GitHub organization owner requires [_Owner_](/docs/cloud/manage-access/self-service-permissions) or [_Account Admin_](/docs/cloud/manage-access/enterprise-permissions) permissions when they log into dbt Cloud to integrate with a GitHub environment using organizations. 
-- You may need to temporarily provide an extra dbt Cloud user account with _Owner_ or _Account Admin_ [permissions](/docs/cloud/manage-access/self-service-permissions) for your GitHub organization owner until they complete the installation. +- You may need to temporarily provide an extra dbt Cloud user account with _Owner_ or _Account Admin_ [permissions](/docs/cloud/manage-access/enterprise-permissions) for your GitHub organization owner until they complete the installation. ## Installing dbt Cloud in your GitHub account diff --git a/website/docs/docs/cloud/git/setup-azure.md b/website/docs/docs/cloud/git/setup-azure.md index 6fdb2517f1a..273660ba3dd 100644 --- a/website/docs/docs/cloud/git/setup-azure.md +++ b/website/docs/docs/cloud/git/setup-azure.md @@ -17,7 +17,7 @@ To use our native integration with Azure DevOps in dbt Cloud, an account admin n 4. [Connect Azure DevOps to your new app](#connect-azure-devops-to-your-new-app). 5. [Add your Entra ID app to dbt Cloud](#add-your-azure-ad-app-to-dbt-cloud). -Once the Microsoft Entra ID app is added to dbt Cloud, an account admin must also [connect a service user](#connecting-a-service-user) via OAuth, which will be used to power headless actions in dbt Cloud such as deployment runs and CI. +Once the Microsoft Entra ID app is added to dbt Cloud, an account admin must also [connect a service user](/docs/cloud/git/setup-azure#connect-a-service-user) via OAuth, which will be used to power headless actions in dbt Cloud such as deployment runs and CI. Once the Microsoft Entra ID app is added to dbt Cloud and the service user is connected, then dbt Cloud developers can personally authenticate in dbt Cloud from Azure DevOps. For more on this, see [Authenticate with Azure DevOps](/docs/cloud/git/authenticate-azure). @@ -89,7 +89,7 @@ An Azure admin will need one of the following permissions in both the Microsoft - Azure Service Administrator - Azure Co-administrator -If your Azure DevOps account is connected to Entra ID, then you can proceed to [Connecting a service user](#connecting-a-service-user). However, if you're just getting set up, connect Azure DevOps to the Microsoft Entra ID app you just created: +If your Azure DevOps account is connected to Entra ID, then you can proceed to [Connect a service user](#connect-a-service-user). However, if you're just getting set up, connect Azure DevOps to the Microsoft Entra ID app you just created: 1. From your Azure DevOps account, select **Organization settings** in the bottom left. 2. Navigate to Microsoft Entra ID. @@ -373,6 +373,16 @@ A dbt Cloud account admin with access to the service user's Azure DevOps account Once connected, dbt Cloud displays the email address of the service user so you know which user's permissions are enabling headless actions in deployment environments. To change which account is connected, disconnect the profile in dbt Cloud, sign into the alternative Azure DevOps service account, and re-link the account in dbt Cloud. +### Using Azure AD for SSO with dbt Cloud and Microsoft tools + +If you're using Azure AD for SSO with dbt Cloud and Microsoft tools, the SSO flow may sometimes direct your account admin to their personal user account instead of the service user. If this happens, follow these steps to resolve it: + +1. Sign in to the service user's Azure DevOps account (ensure they are also connected to dbt Cloud through SSO). +2. When connected to dbt Cloud, sign out of Azure AD through the [Azure portal](https://portal.azure.com/). +3. 
Disconnect the service user in dbt Cloud, and follow the steps to set it up again. +4. You should then be prompted to enter service user credentials. + + :::info Personal Access Tokens (PATs) dbt Cloud leverages the service user to generate temporary access tokens called [PATs](https://learn.microsoft.com/en-us/azure/devops/organizations/accounts/use-personal-access-tokens-to-authenticate?toc=%2Fazure%2Fdevops%2Fmarketplace-extensibility%2Ftoc.json&view=azure-devops&tabs=Windows). diff --git a/website/docs/docs/cloud/manage-access/about-access.md b/website/docs/docs/cloud/manage-access/about-access.md index 64826531245..b9d23b28add 100644 --- a/website/docs/docs/cloud/manage-access/about-access.md +++ b/website/docs/docs/cloud/manage-access/about-access.md @@ -8,142 +8,211 @@ pagination_prev: null :::info "User access" is not "Model access" -**User groups and access** and **model groups and access** mean two different things. "Model groups and access" is a specific term used in the language of dbt-core. Refer to [Model access](/docs/collaborate/govern/model-access) for more info on what it means in dbt-core. +This page is specific to user groups and access, which includes: +- User licenses, permissions, and group memberships +- Role-based access controls for projects and environments +- Single sign-on and secure authentication -::: +"Model groups and access" is a feature specific to models and their availability across projects. Refer to [Model access](/docs/collaborate/govern/model-access) for more info on what it means for your dbt projects. -dbt Cloud administrators can use dbt Cloud's permissioning model to control -user-level access in a dbt Cloud account. This access control comes in two flavors: -License-based and Role-based. +::: -- **License-based Access Controls:** User are configured with account-wide - license types. These licenses control the specific parts of the dbt Cloud application - that a given user can access. -- **Role-based Access Control (RBAC):** Users are assigned to _groups_ that have - specific permissions on specific projects or the entire account. A user may be - a member of multiple groups, and those groups may have permissions on multiple - projects. +# About user access +You can regulate access to dbt Cloud by various measures, including licenses, groups, permissions, and role-based access control (RBAC). To understand the possible approaches to user access to dbt Cloud features and functionality, you should first know how we approach users and groups. -## License-based access control +### Users -Each user on an account is assigned a license type when the user is first -invited to a given account. This license type may change over time, but a -user can only have one type of license at any given time. +Individual users in dbt Cloud can be people you [manually invite](/docs/cloud/manage-access/invite-users) or grant access via an external identity provider (IdP), such as Microsoft Entra ID, Okta, or Google Workspace. -A user's license type controls the features in dbt Cloud that the user is able -to access. dbt Cloud's three license types are: +In either scenario, when you add a user to dbt Cloud, they are assigned a [license](#licenses). You assign licenses at the individual user or group levels. When you manually invite a user, you will assign the license in the invitation window. - - **Developer** — User may be granted _any_ permissions. 
- - **Read-Only** — User has read-only permissions applied to all dbt Cloud resources regardless of the role-based permissions that the user is assigned. - - **IT** — User has [Security Admin](/docs/cloud/manage-access/enterprise-permissions#security-admin) and [Billing Admin](/docs/cloud/manage-access/enterprise-permissions#billing-admin) permissions applied regardless of the role-based permissions that the user is assigned. + -For more information on these license types, see [Seats & Users](/docs/cloud/manage-access/seats-and-users). +You can edit an existing user's license by navigating to the **Users** section of the **Account settings**, clicking on a user, and clicking **Edit** on the user pane. Delete users from this same window to free up licenses for new users. -## Role-based access control + -:::info dbt Cloud Enterprise -Role-based access control is a feature of the dbt Cloud Enterprise plan +### Groups -::: +Groups in dbt Cloud serve much of the same purpose as they do in traditional directory tools — to gather individual users together to make bulk assignment of permissions easier. Admins use groups in dbt Cloud to assign [licenses](#licenses) and [permissions](#permissions). The permissions are more granular than licenses, and you only assign them at the group level; _you can’t assign permissions at the user level._ Every user in dbt Cloud must be assigned to at least one group. -Role-based access control allows for fine-grained permissioning in the dbt Cloud -application. With role-based access control, users can be assigned varying -permissions to different projects within a dbt Cloud account. For teams on the -Enterprise tier, role-based permissions can be generated dynamically from -configurations in an [Identity Provider](sso-overview). +There are three default groups available as soon as you create your dbt Cloud account (the person who created the account is added to all three automatically): -Role-based permissions are applied to _groups_ and pertain to _projects_. The -assignable permissions themselves are granted via _permission sets_. +- **Owner:** This group is for individuals responsible for the entire account and will give them elevated account admin privileges. You cannot change the permissions. +- **Member:** This group is for the general members of your organization, who will also have full access to the account. You cannot change the permissions. By default, dbt Cloud adds new users to this group. +- **Everyone:** A general group for all members of your organization. Customize the permissions to fit your organizational needs. By default, dbt Cloud adds new users to this group. +We recommend deleting the default `Owner`, `Member`, and `Everyone` groups before deploying and replacing them with your organizational groups. This prevents users from receiving more elevated privileges than they should and helps admins ensure they are properly placed. -### Groups +Create new groups from the **Groups & Licenses** section of the **Account settings**. If you use an external IdP for SSO, you can sync those SSO groups to dbt Cloud from the **Group details** pane when creating or editing existing groups. -A group is a collection of users. Users may belong to multiple groups. Members -of a group inherit any permissions applied to the group itself. + -Users can be added to a dbt Cloud group based on their group memberships in the -configured [Identity Provider](sso-overview) for the account. 
In this way, dbt -Cloud administrators can manage access to dbt Cloud resources via identity -management software like Microsoft Entra ID (formerly Azure AD), Okta, or GSuite. See _SSO Mappings_ below for -more information. +:::important -You can view the groups in your account or create new groups from the **Groups & Licenses** -page in your Account Settings.
+If a user is assigned licenses and permissions from multiple groups, the group that grants the most access will take precedence. You must assign a permission set to any groups created beyond the three defaults, or users assigned will not have access to features beyond their user profile. - +::: -### SSO mappings +#### SSO mappings -SSO Mappings connect Identity Provider (IdP) group membership to dbt Cloud group -membership. When a user logs into dbt Cloud via a supported identity provider, -their IdP group memberships are synced with dbt Cloud. Upon logging in -successfully, the user's group memberships (and therefore, permissions) are -adjusted accordingly within dbt Cloud automatically. +SSO Mappings connect an identity provider (IdP) group membership to a dbt Cloud group. When users log into dbt Cloud via a supported identity provider, their IdP group memberships sync with dbt Cloud. Upon logging in successfully, the user's group memberships (and permissions) will automatically adjust within dbt Cloud. :::tip Creating SSO Mappings -While dbt Cloud supports mapping multiple IdP groups to a single dbt Cloud -group, we recommend using a 1:1 mapping to make administration as simple as -possible. Consider using the same name for your dbt Cloud groups and your IdP -groups. +While dbt Cloud supports mapping multiple IdP groups to a single dbt Cloud group, we recommend using a 1:1 mapping to make administration as simple as possible. Use the same names for your dbt Cloud groups and your IdP groups. ::: +Create an SSO mapping in the group view: + +1. Open an existing group to edit or create a new group. +2. In the **SSO** portion of the group screen, enter the name of the SSO group exactly as it appears in the IdP. If the name is not the same, the users will not be properly placed into the group. +3. In the **Users** section, ensure the **Add all users by default** option is disabled. +4. Save the group configuration. New SSO users will be added to the group upon login, and existing users will be added to the group upon their next login. + + + +Refer to [role-based access control](#role-based-access-control) for more information about mapping SSO groups for user assignment to dbt Cloud groups. + +## Grant access + +dbt Cloud users have both a license (assigned to an individual user or by group membership) and permissions (by group membership only) that determine what actions they can take. Licenses are account-wide, and permissions provide more granular access or restrictions to specific features. + +### Licenses + +Every user in dbt Cloud will have a license assigned. Licenses consume "seats" which impact how your account is [billed](/docs/cloud/billing), depending on your [service plan](https://www.getdbt.com/pricing). + +There are three license types in dbt Cloud: + +- **Developer** — User can be granted _any_ permissions. +- **Read-Only** — User has read-only permissions applied to all dbt Cloud resources regardless of the role-based permissions that the user is assigned. +- **IT** — User has Security Admin and Billing Admin [permissions](/docs/cloud/manage-access/enterprise-permissions) applied, regardless of the group permissions assigned. + +Developer licenses will make up a majority of the users in your environment and have the highest impact on billing, so it's important to monitor how many you have at any given time. 
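+
+To make the group, SSO-mapping, and license rules above concrete, the following Python sketch models the documented behavior — exact SSO group-name matching, group-assigned licenses, and "most access wins." It is illustrative only (it is not dbt Cloud code), and the group names, the `DBT_GROUP_` prefix, and the license ranking are placeholder assumptions:
+
+```python
+# Illustrative sketch only -- this is not dbt Cloud's implementation.
+# Group names, the DBT_GROUP_ prefix, and the license ranking are placeholders.
+
+LICENSE_RANK = {"Read-Only": 0, "Developer": 1}  # "most access wins"; IT omitted for brevity
+
+# Example dbt Cloud groups: the SSO mapping must match the IdP group name exactly,
+# and each group assigns a license.
+DBT_CLOUD_GROUPS = [
+    {"name": "analysts", "sso_mapping": "DBT_GROUP_ANALYSTS", "license": "Read-Only"},
+    {"name": "engineers", "sso_mapping": "DBT_GROUP_ENGINEERS", "license": "Developer"},
+]
+
+def resolve_membership(idp_groups: list[str]) -> tuple[list[str], str]:
+    """Return the dbt Cloud groups a user lands in and their effective license."""
+    matched = [g for g in DBT_CLOUD_GROUPS if g["sso_mapping"] in idp_groups]
+    groups = [g["name"] for g in matched]
+    licenses = [g["license"] for g in matched]
+    # The group granting the most access takes precedence; new users default to Developer.
+    effective = max(licenses, key=LICENSE_RANK.__getitem__) if licenses else "Developer"
+    return groups, effective
+
+print(resolve_membership(["DBT_GROUP_ANALYSTS", "DBT_GROUP_ENGINEERS"]))
+# (['analysts', 'engineers'], 'Developer')
+```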
+ +For more information on these license types, see [Seats & Users](/docs/cloud/manage-access/seats-and-users) + +### Permissions + +Permissions determine what a developer-licensed user can do in your dbt Cloud account. By default, members of the `Owner` and `Member` groups have full access to all areas and features. When you want to restrict access to features, assign users to groups with stricter permission sets. Keep in mind that if a user belongs to multiple groups, the most permissive group will take precedence. + +The permissions available depends on whether you're on an [Enterprise](/docs/cloud/manage-access/enterprise-permissions) or [self-service Team](/docs/cloud/manage-access/self-service-permissions) plan. Developer accounts only have a single user, so permissions aren't applicable. + + + +Some permissions (those that don't grant full access, like admins) allow groups to be "assigned" to specific projects and environments only. Read about [environment-level permissions](/docs/cloud/manage-access/environment-permissions-setup) for more information on restricting environment access. + + + +## Role-based access control + +Role-based access control (RBAC) allows you to grant users access to features and functionality based on their group membership. With this method, you can grant users varying access levels to different projects and environments. You can take access and security to the next level by integrating dbt Cloud with a third-party identity provider (IdP) to grant users access when they authenticate with your SSO or OAuth service. + +There are a few things you need to know before you configure RBAC for SSO users: +- New SSO users join any groups with the **Add all new users by default** option enabled. By default, the `Everyone` and `Member` groups have this option enabled. Disable this option across all groups for the best RBAC experience. +- You must have the appropriate SSO groups configured in the group details SSO section. If the SSO group name does not match _exactly_, users will not be placed in the group correctly. + +- dbt Labs recommends that your dbt Cloud group names match the IdP group names. + +Let's say you have a new employee being onboarded into your organization using [Okta](/docs/cloud/manage-access/set-up-sso-okta) as the IdP and dbt Cloud groups with SSO mappings. In this scenario, users are working on `The Big Project` and a new analyst named `Euclid Ean` is joining the group. + +Check out the following example configurations for an idea of how you can implement RBAC for your organization (these examples assume you have already configured [SSO](/docs/cloud/manage-access/sso-overview)): + + -### Permission sets +You and your IdP team add `Euclid Ean` to your Okta environment and assign them to the `dbt Cloud` SSO app via a group called `The Big Project`. -Permission sets are predefined collections of granular permissions. Permission -sets combine low-level permission grants into high-level roles that can be -assigned to groups. Some examples of existing permission sets are: - - Account Admin - - Git Admin - - Job Admin - - Job Viewer - - ...and more + -For a full list of enterprise permission sets, see [Enterprise Permissions](/docs/cloud/manage-access/enterprise-permissions). -These permission sets are available for assignment to groups and control the ability -for users in these groups to take specific actions in the dbt Cloud application. +Configure the group attribute statements the `dbt Cloud` application in Okta. 
The group statements in the following example are set to the group name exactly (`The Big Project`), but yours will likely be a much broader configuration. Companies often use the same prefix across all dbt groups in their IdP. For example `DBT_GROUP_` -In the following example, the _dbt Cloud Owners_ group is configured with the -**Account Admin** permission set on _All Projects_ and the **Job Admin** permission -set on the _Internal Analytics_ project. + - + + -### Manual assignment +You and your dbt Cloud admin team configure the groups in your account's settings: +1. Navigate to the **Account settings** and click **Groups & Licenses** on the left-side menu. +2. Click **Create group** or select an existing group and click **Edit**. +3. Enter the group name in the **SSO** field. +4. Configure the **Access and permissions** fields to your needs. Select a [permission set](/docs/cloud/manage-access/enterprise-permissions), the project they can access, and [environment-level access](/docs/cloud/manage-access/environment-permissions). -dbt Cloud administrators can manually assign users to groups independently of -IdP attributes. If a dbt Cloud group is configured _without_ any -SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust -group membership automatically when users log into dbt Cloud via an identity -provider. This behavior may be desirable for teams that have connected an identity -provider, but have not yet configured SSO Mappings between dbt Cloud and the -IdP. + -If an SSO Mapping is added to an _unmanaged_ group, then it will become -_managed_, and dbt Cloud may add or remove users to the group automatically at -sign-in time based on the user's IdP-provided group membership information. +Euclid is limited to the `Analyst` role, the `Jaffle Shop` project, and the `Development`, `Staging`, and `General` environments of that project. Euclid has no access to the `Production` environment in their role. + + + + +Euclid takes the following steps to log in: + +1. Access the SSO URL or the dbt Cloud app in their Okta account. The URL can be found on the **Single sign-on** configuration page in the **Account settings**. + + + +2. Login with their Okta credentials. + + + +3. Since it's their first time logging in with SSO, Euclid Ean is presented with a message and no option to move forward until they check the email address associated with their Okta account. + + + +4. They now open their email and click the link to join dbt Labs, which completes the process. + + + +Euclid is now logged in to their account. They only have access to the `Jaffle Shop` pr, and the project selection option is removed from their UI entirely. + + + +They can now configure development credentials. The `Production` environment is visible, but it is `read-only`, and they have full access in the `Staging` environment. + + + + + + + +With RBAC configured, you now have granular control over user access to features across dbt Cloud. ## FAQs -- **When are IdP group memberships updated for SSO Mapped groups?**
- Group memberships are updated whenever a user logs into dbt Cloud via a supported SSO provider. If you've changed group memberships in your identity provider or dbt Cloud, ask your users to log back into dbt Cloud to synchronize these group memberships. -- **Can I set up SSO without RBAC?**
+ + +Group memberships are updated whenever a user logs into dbt Cloud via a supported SSO provider. If you've changed group memberships in your identity provider or dbt Cloud, ask your users to log back into dbt Cloud to synchronize these group memberships. + + + + + Yes, see the documentation on [Manual Assignment](#manual-assignment) above for more information on using SSO without RBAC. -- **Can I configure a user's License Type based on IdP Attributes?**
- Yes, see the docs on [managing license types](/docs/cloud/manage-access/seats-and-users#managing-license-types) for more information. -- **Why can't I edit a user's group membership?**
-Make sure you're not trying to edit your own user as this isn't allowed for security reasons. To edit the group membership of your own user, you'll need a different user to make those changes. +
+ + -- **How do I add or remove users**?
-Each dbt Cloud plan comes with a base number of Developer and Read-Only licenses. You can add or remove licenses by modifying the number of users in your account settings. - - If you're on an Enterprise plans and have the correct [permissions](/docs/cloud/manage-access/enterprise-permissions), you can add or remove developers by adjusting your developer user seat count in **Account settings** -> **Users**. +Yes, see the docs on [managing license types](/docs/cloud/manage-access/seats-and-users#managing-license-types) for more information. + +
+ + + +Don't try to edit your own user, as this isn't allowed for security reasons. You'll need a different user to make changes to your own user's group membership. + + + + + +Each dbt Cloud plan has a base number of Developer and Read-Only licenses. You can add or remove licenses by modifying the number of users in your account settings. + - If you're on an Enterprise plan and have the correct [permissions](/docs/cloud/manage-access/enterprise-permissions), you can add or remove developers by adjusting your developer user seat count in **Account settings** -> **Users**. - If you're on a Team plan and have the correct [permissions](/docs/cloud/manage-access/self-service-permissions), you can add or remove developers by making two changes: adjust your developer user seat count AND your developer billing seat count in **Account settings** -> **Users** and then in **Account settings** -> **Billing**. - Refer to [Users and licenses](/docs/cloud/manage-access/seats-and-users#licenses) for detailed steps. +For detailed steps, refer to [Users and licenses](/docs/cloud/manage-access/seats-and-users#licenses). + + \ No newline at end of file diff --git a/website/docs/docs/cloud/manage-access/audit-log.md b/website/docs/docs/cloud/manage-access/audit-log.md index 70ef4d66f8e..4d07afe2cde 100644 --- a/website/docs/docs/cloud/manage-access/audit-log.md +++ b/website/docs/docs/cloud/manage-access/audit-log.md @@ -32,7 +32,7 @@ On the audit log page, you will see a list of various events and their associate ### Event details -Click the event card to see the details about the activity that triggered the event. This view provides important details, including when it happened and what type of event was triggered. For example, if someone changes the settings for a job, you can use the event details to see which job was changed (type of event: `job_definition.Changed`), by whom (person who triggered the event: `actor`), and when (time it was triggered: `created_at_utc`). For types of events and their descriptions, see [Events in audit log](#events-in-audit-log). +Click the event card to see the details about the activity that triggered the event. This view provides important details, including when it happened and what type of event was triggered. For example, if someone changes the settings for a job, you can use the event details to see which job was changed (type of event: `job_definition.Changed`), by whom (person who triggered the event: `actor`), and when (time it was triggered: `created_at_utc`). For types of events and their descriptions, see [Events in audit log](#audit-log-events). The event details provide the key factors of an event: @@ -60,7 +60,6 @@ The audit log supports various events for different objects in dbt Cloud. 
You wi | Event Name | Event Type | Description | | -------------------------- | ---------------------------------------- | ------------------------------------------------------ | | Auth Provider Changed | auth_provider.Changed | Authentication provider settings changed | -| Credential Login Failed | auth.CredentialsLoginFailed | User login via username and password failed | | Credential Login Succeeded | auth.CredentialsLoginSucceeded | User successfully logged in with username and password | | SSO Login Failed | auth.SsoLoginFailed | User login via SSO failed | | SSO Login Succeeded | auth.SsoLoginSucceeded | User successfully logged in via SSO diff --git a/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md index f636be796d3..f814d58777a 100644 --- a/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md +++ b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md @@ -3,7 +3,7 @@ title: "Users and licenses" description: "Learn how dbt Cloud administrators can use licenses and seats to control access in a dbt Cloud account." id: "seats-and-users" sidebar: "Users and licenses" -pagination_next: "docs/cloud/manage-access/self-service-permissions" +pagination_next: "docs/cloud/manage-access/enterprise-permissions" pagination_prev: null --- @@ -55,7 +55,7 @@ If you're on an Enterprise plan and have the correct [permissions](/docs/cloud/m - To add a user, go to **Account Settings** and select **Users**. - Click the [**Invite Users**](/docs/cloud/manage-access/invite-users) button. - - For fine-grained permission configuration, refer to [Role based access control](/docs/cloud/manage-access/enterprise-permissions). + - For fine-grained permission configuration, refer to [Role based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control-). @@ -124,9 +124,7 @@ Great work! After completing these steps, your dbt Cloud user count and billing ## Managing license types -Licenses can be assigned manually, or automatically based on IdP configuration -(enterprise only). By default, new users in an account will be assigned a -Developer license. +Licenses can be assigned to users individually or through group membership. To assign a license via group membership, you can manually add a user to a group during the invitation process or assign them to a group after they’ve enrolled in dbt Cloud. Alternatively, with [SSO configuration](/docs/cloud/manage-access/sso-overview) and [role-based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control-) (Enterprise only), users can be automatically assigned to groups. By default, new users in an account are assigned a Developer license. ### Manual configuration @@ -142,16 +140,9 @@ change. -### Mapped configuration +### Mapped configuration -**Note:** This feature is only available on the Enterprise plan. - -If your account is connected to an Identity Provider (IdP) for [Single Sign -On](/docs/cloud/manage-access/sso-overview), you can automatically map IdP user -groups to specific license types in dbt Cloud. To configure license mappings, -navigate to the Account Settings > Team > License Mappings page. From -here, you can create or edit SSO mappings for both Read-Only and Developer -license types. 
+If your account is connected to an Identity Provider (IdP) for [Single Sign On](/docs/cloud/manage-access/sso-overview), you can automatically map IdP user groups to specific groups in dbt Cloud and assign license types to those groups. To configure license mappings, navigate to the **Account Settings** > **Groups & Licenses** > **License Mappings** page. From here, you can create or edit SSO mappings for both Read-Only and Developer license types. By default, all new members of a dbt Cloud account will be assigned a Developer license. To assign Read-Only licenses to certain groups of users, create a new diff --git a/website/docs/docs/cloud/manage-access/enterprise-permissions.md b/website/docs/docs/cloud/manage-access/enterprise-permissions.md index a1f6d795c23..5a56900d529 100644 --- a/website/docs/docs/cloud/manage-access/enterprise-permissions.md +++ b/website/docs/docs/cloud/manage-access/enterprise-permissions.md @@ -22,22 +22,14 @@ The following roles and permission sets are available for assignment in dbt Clou :::tip Licenses or Permission sets -The user's [license](/docs/cloud/manage-access/seats-and-users) type always overrides their assigned permission set. This means that even if a user belongs to a dbt Cloud group with 'Account Admin' permissions, having a 'Read-Only' license would still prevent them from performing administrative actions on the account. +The user's [license](/docs/cloud/manage-access/about-user-access) type always overrides their assigned permission set. This means that even if a user belongs to a dbt Cloud group with 'Account Admin' permissions, having a 'Read-Only' license would still prevent them from performing administrative actions on the account. ::: -## How to set up RBAC Groups in dbt Cloud +## Additional resources -Role-Based Access Control (RBAC) is helpful for automatically assigning permissions to dbt admins based on their SSO provider group associations. RBAC does not apply to [model groups](/docs/collaborate/govern/model-access#groups). +- [Grant users access](/docs/cloud/manage-access/about-user-access#grant-access) +- [Role-based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control-) +- [Environment-level permissions](/docs/cloud/manage-access/environment-permissions) -1. Click the gear icon to the top right and select **Account Settings**. Click **Groups & Licenses** - - - -2. Select an existing group or create a new group to add RBAC. Name the group (this can be any name you like, but it's recommended to keep it consistent with the SSO groups). If you have configured SSO with SAML 2.0, you may have to use the GroupID instead of the name of the group. -3. Configure the SSO provider groups you want to add RBAC by clicking **Add** in the **SSO** section. These fields are case-sensitive and must match the source group formatting. -4. Configure the permissions for users within those groups by clicking **Add** in the **Access** section of the window. - - -5. When you've completed your configurations, click **Save**. Users will begin to populate the group automatically once they have signed in to dbt Cloud with their SSO credentials. 
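+
+Because a user's license always overrides their assigned permission sets, it can be useful to periodically check which licenses are actually assigned in your account. The following Python sketch is illustrative only — it assumes the Administrative API exposes a list-users endpoint and a `license_type` field, so confirm both (and your account's access URL) against the API reference before relying on it:
+
+```python
+# Minimal sketch, not an official example. The endpoint path and the
+# "license_type" field are assumptions -- verify them in the dbt Cloud
+# Administrative API reference for your account's access URL.
+import os
+from collections import Counter
+
+import requests
+
+ACCOUNT_ID = os.environ["DBT_CLOUD_ACCOUNT_ID"]   # your dbt Cloud account ID
+API_TOKEN = os.environ["DBT_CLOUD_API_TOKEN"]     # a service token with read access
+BASE_URL = os.environ.get("DBT_CLOUD_HOST", "https://cloud.getdbt.com")  # access URL varies by account
+
+resp = requests.get(
+    f"{BASE_URL}/api/v2/accounts/{ACCOUNT_ID}/users/",  # assumed list-users endpoint
+    headers={"Authorization": f"Token {API_TOKEN}"},
+    timeout=30,
+)
+resp.raise_for_status()
+
+# Tally license types so you can see how many Developer seats are in use.
+counts = Counter(user.get("license_type", "unknown") for user in resp.json()["data"])
+print(dict(counts))
+```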
diff --git a/website/docs/docs/cloud/manage-access/environment-permissions.md b/website/docs/docs/cloud/manage-access/environment-permissions.md index 44cf2dc9a64..b99da64609c 100644 --- a/website/docs/docs/cloud/manage-access/environment-permissions.md +++ b/website/docs/docs/cloud/manage-access/environment-permissions.md @@ -77,4 +77,4 @@ If the user has the same roles across projects, you can apply environment access ## Related docs --[Environment-level permissions setup](/docs/cloud/manage-access/environment-permissions-setup) +- [Environment-level permissions setup](/docs/cloud/manage-access/environment-permissions-setup) diff --git a/website/docs/docs/cloud/manage-access/external-oauth.md b/website/docs/docs/cloud/manage-access/external-oauth.md index 7ed9e4ef446..deb23f36f09 100644 --- a/website/docs/docs/cloud/manage-access/external-oauth.md +++ b/website/docs/docs/cloud/manage-access/external-oauth.md @@ -1,20 +1,17 @@ --- -title: "Set up external Oauth" +title: "Set up external OAuth" id: external-oauth -description: "Configuration instructions for dbt Cloud and external Oauth connections" -sidebar_label: "Set up external Oauth" -unlisted: true +description: "Configuration instructions for dbt Cloud and external OAuth connections" +sidebar_label: "Set up external OAuth" pagination_next: null pagination_prev: null --- -# Set up external Oauth +# Set up external OAuth -:::note Beta feature +:::note -External OAuth for authentication is available in a limited beta. If you are interested in joining the beta, please contact your account manager. - -This feature is currently only available for the Okta and Entra ID identity providers and Snowflake connections. Only available to Enterprise accounts. +This feature is currently only available for the Okta and Entra ID identity providers and [Snowflake connections](/docs/cloud/connect-data-platform/connect-snowflake). ::: @@ -23,7 +20,7 @@ dbt Cloud Enterprise supports [external OAuth authentication](https://docs.snow ## Getting started -The process of setting up external Oauth will require a little bit of back-and-forth between your dbt Cloud, IdP, and Snowflake accounts, and having them open in multiple browser tabs will help speed up the configuration process: +The process of setting up external OAuth will require a little bit of back-and-forth between your dbt Cloud, IdP, and Snowflake accounts, and having them open in multiple browser tabs will help speed up the configuration process: - **dbt Cloud:** You’ll primarily be working in the **Account Settings** —> **Integrations** page. You will need [proper permission](/docs/cloud/manage-access/enterprise-permissions) to set up the integration and create the connections. - **Snowflake:** Open a worksheet in an account that has permissions to [create a security integration](https://docs.snowflake.com/en/sql-reference/sql/create-security-integration). @@ -34,7 +31,7 @@ If the admins that handle these products are all different people, it’s better ### Snowflake commands -The following is a template for creating the Oauth configurations in the Snowflake environment: +The following is a template for creating the OAuth configurations in the Snowflake environment: ```sql @@ -53,41 +50,45 @@ external_oauth_any_role_mode = 'ENABLE' The `external_oauth_token_user_mapping_claim` and `external_oauth_snowflake_user_mapping_attribute` can be modified based on the your organizations needs. These values point to the claim in the users’ token. 
In the example, Snowflake will look up the Snowflake user whose `email` matches the value in the `sub` claim. -**Note:** The Snowflake default roles ACCOUNTADMIN, ORGADMIN, or SECURITYADMIN, are blocked from external Oauth by default and they will likely fail to authenticate. See the [Snowflake documentation](https://docs.snowflake.com/en/sql-reference/sql/create-security-integration-oauth-external) for more information. +**Note:** The Snowflake default roles ACCOUNTADMIN, ORGADMIN, or SECURITYADMIN, are blocked from external OAuth by default and they will likely fail to authenticate. See the [Snowflake documentation](https://docs.snowflake.com/en/sql-reference/sql/create-security-integration-oauth-external) for more information. + +## Identity provider configuration -## Set up with Okta +Select a supported identity provider (IdP) for instructions on configuring external OAuth in their environment and completing the integration in dbt Cloud. + + ### 1. Initialize the dbt Cloud settings -1. In your dbt Cloud account, navigate to **Account settings** —> **Integrations**. +1. In your dbt Cloud account, navigate to **Account settings** —> **Integrations**. 2. Scroll down to **Custom integrations** and click **Add integrations** -3. Leave this window open. You can set the **Integration type** to Okta and make a note of the **Redirect URI** at the bottom of the page. Copy this to your clipboard for use in the next steps. +3. Leave this window open. You can set the **Integration type** to Okta and note the **Redirect URI** at the bottom of the page. Copy this to your clipboard for use in the next steps. ### 2. Create the Okta app -1. From the Okta dashboard, expand the **Applications** section and click **Applications.** Click the **Create app integration** button. +1. Expand the **Applications** section from the Okta dashboard and click **Applications.** Click the **Create app integration** button. 2. Select **OIDC** as the sign-in method and **Web applications** as the application type. Click **Next**. -3. Give the application an appropriate name, something like “External Oauth app for dbt Cloud” that will make it easily identifiable. +3. Give the application an appropriate name, something like “External OAuth app for dbt Cloud,” that will make it easily identifiable. 4. In the **Grant type** section, enable the **Refresh token** option. -5. Scroll down to the **Sign-in redirect URIs** option. Here, you’ll need to paste the redirect URI you gathered from dbt Cloud in step 1.3. +5. Scroll down to the **Sign-in redirect URIs** option. You’ll need to paste the redirect URI you gathered from dbt Cloud in step 1.3. - + -6. Save the app configuration. You’ll come back to it, but for now move on to the next steps. +6. Save the app configuration. You’ll come back to it, but move on to the next steps for now. ### 3. Create the Okta API -1. From the Okta sidebar menu, expand the **Security** section and clicl **API**. -2. On the API screen, click **Add authorization server**. Give the authorizations server a name (a nickname for your Snowflake account would be appropriate). For the **Audience** field, copy and paste your Snowflake login URL (for example, https://abdc-ef1234.snowflakecomputing.com). Give the server an appropriate description and click **Save**. +1. Expand the **Security** section and click **API** from the Okta sidebar menu. +2. On the API screen, click **Add authorization server**. Give the authorization server a name (a nickname for your Snowflake account would be appropriate). 
For the **Audience** field, copy and paste your Snowflake login URL (for example, https://abdc-ef1234.snowflakecomputing.com). Give the server an appropriate description and click **Save**. -3. On the authorization server config screen, open the **Metadata URI** in a new tab. You’ll need information from this screen in later steps. +3. On the authorization server config screen, open the **Metadata URI** in a new tab. You’ll need information from this screen in later steps. @@ -97,7 +98,7 @@ The `external_oauth_token_user_mapping_claim` and `external_oauth_snowflake_u -5. Open the **Access policies** tab and click **Add policy**. Give the policy a **Name** and **Description** and set **Assign to** as **The following clients**. Start typing the name of the app you created in step 2.3 and you’ll see it autofill. Select the app and click **Create Policy**. +5. Open the **Access policies** tab and click **Add policy**. Give the policy a **Name** and **Description** and set **Assign to** as **The following clients**. Start typing the name of the app you created in step 2.3, and you’ll see it autofill. Select the app and click **Create Policy**. @@ -105,13 +106,13 @@ The `external_oauth_token_user_mapping_claim` and `external_oauth_snowflake_u -7. Give the rule a descriptive name and scroll down to **token lifetimes**. Configure the **Access token lifetime is**, **Refresh token lifetime is**, and **but will expire if not used every** settings according to your organizational policies. We recommend the defaults of 1 hour and 90 days. Stricter rules increases the odds of your users having to re-authenticate. +7. Give the rule a descriptive name and scroll down to **token lifetimes**. Configure the **Access token lifetime is**, **Refresh token lifetime is**, and **but will expire if not used every** settings according to your organizational policies. We recommend the defaults of 1 hour and 90 days. Stricter rules increase the odds of your users having to re-authenticate. 8. Navigate back to the **Settings** tab and leave it open in your browser. You’ll need some of the information in later steps. -### 4. Create the Oauth settings in Snowflake +### 4. Create the OAuth settings in Snowflake 1. Open up a Snowflake worksheet and copy/paste the following: @@ -130,9 +131,9 @@ external_oauth_any_role_mode = 'ENABLE' ``` -2. Change `your_integration_name` to something appropriately descriptive. For example, `dev_OktaAccountNumber_okta`. Copy the `external_oauth_issuer` and `external_oauth_jws_keys_url` from the metadate URI in step 3.3. Use the same Snowflake URL that you entered in step 3.2 as the `external_oauth_audience_list`. +2. Change `your_integration_name` to something appropriately descriptive. For example, `dev_OktaAccountNumber_okta`. Copy the `external_oauth_issuer` and `external_oauth_jws_keys_url` from the metadata URI in step 3.3. Use the same Snowflake URL you entered in step 3.2 as the `external_oauth_audience_list`. -Adjust the other settings as needed to meet your organizations configurations in Okta and Snowflake. +Adjust the other settings as needed to meet your organization's configurations in Okta and Snowflake. @@ -140,39 +141,47 @@ Adjust the other settings as needed to meet your organizations configurations in ### 5. Configuring the integration in dbt Cloud -1. Navigate back to the dbt Cloud **Account settings** —> **Integrations** page you were on at the beginning. It’s time to start filling out all of the fields. - 1. 
`Integration name`: Give the integration a descriptive name that includes identifying information about the Okta environment so future users won’t have to guess where it belongs. - 2. `Client ID` and `Client secrets`: Retrieve these from the Okta application page. - - 3. Authorize URL and Token URL: Found in the metadata URI. - +1. Navigate back to the dbt Cloud **Account settings** —> **Integrations** page you were on at the beginning. It’s time to start filling out all of the fields. + 1. `Integration name`: Give the integration a descriptive name that includes identifying information about the Okta environment so future users won’t have to guess where it belongs. + 2. `Client ID` and `Client secrets`: Retrieve these from the Okta application page. + + 3. Authorize URL and Token URL: Found in the metadata URI. + 2. **Save** the configuration + ### 6. Create a new connection in dbt Cloud -1. Navigate the **Account settings** and click **Connections** from the menu. Click **Add connection**. -2. Configure the `Account`, `Database`, and `Warehouse` as you normally would and for the `Oauth method` select the external Oauth you just created. - +1. Navigate the **Account settings** and click **Connections** from the menu. Click **Add connection**. +2. Configure the `Account`, `Database`, and `Warehouse` as you normally would, and for the `OAuth method`, select the external OAuth you just created. + + + + -3. Scroll down to the **External Oauth** configurations box and select the config from the list. +3. Scroll down to the **External OAuth** configurations box and select the config from the list. - -4. **Save** the connection and you have now configured External Oauth with Okta and Snowflake! + -## Set up with Entra ID + +4. **Save** the connection, and you have now configured External OAuth with Okta and Snowflake! + + + + ### 1. Initialize the dbt Cloud settings -1. In your dbt Cloud account, navigate to **Account settings** —> **Integrations**. +1. In your dbt Cloud account, navigate to **Account settings** —> **Integrations**. 2. Scroll down to **Custom integrations** and click **Add integrations**. -3. Leave this window open. You can set the **Integration type** to Entra ID and make a note of the **Redirect URI** at the bottom of the page. Copy this to your clipboard for use in the next steps. +3. Leave this window open. You can set the **Integration type** to Entra ID and note the **Redirect URI** at the bottom of the page. Copy this to your clipboard for use in the next steps. ### Entra ID -You’ll create two different `apps` in the Azure portal — A resource server and a client app. +You’ll create two apps in the Azure portal: A resource server and a client app. :::important @@ -187,68 +196,74 @@ In your Azure portal, open the **Entra ID** and click **App registrations** from ### 1. Create a resource server 1. From the app registrations screen, click **New registration**. - 1. Give the app a name. - 2. Ensure **Supported account types** are set to “Accounts in this organizational directory only (`Org name` - Single Tenant).” - 3. Click **Register** and you will be taken to the apps overview. + 1. Give the app a name. + 2. Ensure **Supported account types** are set to “Accounts in this organizational directory only (`Org name` - Single Tenant).” + 3. Click **Register**to see the application’s overview. 2. From the app overview page, click **Expose an API** from the left menu. -3. Click **Add** next to **Application ID URI**. The field will automatically populate. Click **Save**. -4. 
Record the `value` field as it will be used in a future step. *This is only displayed once. Be sure to record it immediately. It will be hidden when you leave the page and come back.* +3. Click **Add** next to **Application ID URI**. The field will automatically populate. Click **Save**. +4. Record the `value` field for use in a future step. _This is only displayed once. Be sure to record it immediately. Microsoft hides the field when you leave the page and come back._ 5. From the same screen, click **Add scope**. - 1. Give the scope a name. - 2. Set “Who can consent?” to **Admins and users**. - 3. Set **Admin consent display name** session:role-any and give it a description. - 4. Ensure **State** is set to **Enabled**. - 5. Click **Add scope**. + 1. Give the scope a name. + 2. Set “Who can consent?” to **Admins and users**. + 3. Set **Admin consent display name** session:role-any and give it a description. + 4. Ensure **State** is set to **Enabled**. + 5. Click **Add scope**. ### 2. Create a client app 1. From the **App registration page**, click **New registration**. - 1. Give the app a name that uniquely identifies it as the client app. - 2. Ensure **Supported account types** are set to “Accounts in this organizational directory only (`Org name` - Single Tenant).” - 3. Set the **Redirect URI** to **Web** and copy/paste the **Redirect URI** from dbt Cloud into the field. - 4. Click **Register**. + 1. Give the app a name that uniquely identifies it as the client app. + 2. Ensure **Supported account types** are set to “Accounts in this organizational directory only (`Org name` - Single Tenant).” + 3. Set the **Redirect URI** to **Web** and copy/paste the **Redirect URI** from dbt Cloud into the field. + 4. Click **Register**. 2. From the app overview page, click **API permissions** from the left menu, and click **Add permission**. 3. From the pop-out screen, click **APIs my organization uses**, search for the resource server name from the previous steps, and click it. 4. Ensure the box for the **Permissions** `session:role-any` is enabled and click **Add permissions**. 5. Click **Grant admin consent** and from the popup modal click **Yes**. -6. From the left menu, click **Certificates and secrets** and cllick **New client secret**. Name the secret, set an expiration, and click **Add**. -**Note**: Microsoft does not allow “forever” as an expiration. The maximum time is two years. It’s essential to document the expiration date so that the secret can be refreshed before the expiration or user authorization will fail. -7. Record the `value` for use in a future step and record it immediately. -**Note**: This value will not be displayed again once you navigate away from this screen. +6. From the left menu, click **Certificates and secrets** and click **New client secret**. Name the secret, set an expiration, and click **Add**. +**Note**: Microsoft does not allow “forever” as an expiration date. The maximum time is two years. Documenting the expiration date so you can refresh the secret before the expiration or user authorization fails is essential. +7. Record the `value` for use in a future step and record it immediately. +**Note**: Entra ID will not display this value again once you navigate away from this screen. ### 3. Snowflake configuration -You'll be switching between the Entra ID site and Snowflake. Keep your Entra ID account open for this process. +You'll be switching between the Entra ID site and Snowflake. Keep your Entra ID account open for this process. 
Copy and paste the following as a template in a Snowflake worksheet: ```sql + create or replace security integration - type = external_oauth - enabled = true - external_oauth_type = azure - external_oauth_issuer = '' - external_oauth_jws_keys_url = '' - external_oauth_audience_list = ('') - external_oauth_token_user_mapping_claim = 'upn' - external_oauth_any_role_mode = 'ENABLE' - external_oauth_snowflake_user_mapping_attribute = 'login_name'; + type = external_oauth + enabled = true + external_oauth_type = azure + external_oauth_issuer = '' + external_oauth_jws_keys_url = '' + external_oauth_audience_list = ('') + external_oauth_token_user_mapping_claim = 'upn' + external_oauth_any_role_mode = 'ENABLE' + external_oauth_snowflake_user_mapping_attribute = 'login_name'; + ``` + On the Entra ID site: -1. From the Client ID app in Entra ID, click **Endpoints** and open the **Federation metadata document** in a new tab. - - The **entity ID** on this page maps to the `external_oauth_issuer` field in the Snowflake config. +1. From the Client ID +app in Entra ID, click **Endpoints** and open the **Federation metadata document** in a new tab. + - The **entity ID** on this page maps to the `external_oauth_issuer` field in the Snowflake config. 2. Back on the list of endpoints, open the **OpenID Connect metadata document** in a new tab. - - The **jwks_uri** field maps to the `external_oauth_jws_keys_url` field in Snowflake. + - The **jwks_uri** field maps to the `external_oauth_jws_keys_url` field in Snowflake. 3. Navigate to the resource server in previous steps. - - The **Application ID URI** maps to teh `external_oauth_audience_list` field in Snowflake. -4. Run the configurations. Be sure the admin who created the Microsoft apps is also a user in Snowflake, or the configuration will fail. + - The **Application ID URI** maps to the `external_oauth_audience_list` field in Snowflake. +4. Run the configurations. Be sure the admin who created the Microsoft apps is also a user in Snowflake, or the configuration will fail. ### 4. Configuring the integration in dbt Cloud -1. Navigate back to the dbt Cloud **Account settings** —> **Integrations** page you were on at the beginning. It’s time to start filling out all of the fields. There will be some back-and-forth between the Entra ID account and dbt Cloud. -2. `Integration name`: Give the integration a descriptive name that includes identifying information about the Entra ID environment so future users won’t have to guess where it belongs. -3. `Client secrets`: These are found in the Client ID from the **Certificates and secrets** page. `Value` is the `Client secret` . Note that it only appears when created; if you return later, it will be hidden, and you must recreate the secret. +1. Navigate back to the dbt Cloud **Account settings** —> **Integrations** page you were on at the beginning. It’s time to start filling out all of the fields. There will be some back-and-forth between the Entra ID account and dbt Cloud. +2. `Integration name`: Give the integration a descriptive name that includes identifying information about the Entra ID environment so future users won’t have to guess where it belongs. +3. `Client secrets`: Found in the Client ID from the **Certificates and secrets** page. `Value` is the `Client secret`. Note that it only appears when created; _Microsoft hides the secret if you return later, and you must recreate it._ 4. `Client ID`: Copy the’ Application (client) ID’ on the overview page for the client ID app. -5. 
`Authorization URL` and `Token URL`: From the client ID app, open the `Endpoints` tab. The `Oauth 2.0 authorization endpoint (v2)` and `Oauth 2.0 token endpoint (v2)` fields map to these. *You must use v2 of the `Oauth 2.0 authorization endpoint`. Do not use V1.* You can use either version of the `Oauth 2.0 token endpoint`. +5. `Authorization URL` and `Token URL`: From the client ID app, open the `Endpoints` tab. These URLs map to the `OAuth 2.0 authorization endpoint (v2)` and `OAuth 2.0 token endpoint (v2)` fields. *You must use v2 of the `OAuth 2.0 authorization endpoint`. Do not use V1.* You can use either version of the `OAuth 2.0 token endpoint`. 6. `Application ID URI`: Copy the `Application ID URI` field from the resource server’s Overview screen. + + diff --git a/website/docs/docs/cloud/manage-access/invite-users.md b/website/docs/docs/cloud/manage-access/invite-users.md index c82e15fd48f..0922b4dc991 100644 --- a/website/docs/docs/cloud/manage-access/invite-users.md +++ b/website/docs/docs/cloud/manage-access/invite-users.md @@ -17,19 +17,16 @@ You must have proper permissions to invite new users: ## Invite new users -1. In your dbt Cloud account, select the gear menu in the upper right corner and then select **Account Settings**. -2. From the left sidebar, select **Users**. - - - -3. Click on **Invite Users**. +1. In your dbt Cloud account, select your account name in the bottom left corner. Then select **Account settings**. +2. Under **Settings**, select **Users**. +3. Click on **Invite users**. -4. In the **Email Addresses** field, enter the email addresses of the users you would like to invite separated by comma, semicolon, or a new line. +4. In the **Email Addresses** field, enter the email addresses of the users you want to invite separated by a comma, semicolon, or a new line. 5. Select the license type for the batch of users from the **License** dropdown. -6. Select the group(s) you would like the invitees to belong to. -7. Click **Send Invitations**. +6. Select the group(s) you want the invitees to belong to. +7. Click **Send invitations**. - If the list of invitees exceeds the number of licenses your account has available, you will receive a warning when you click **Send Invitations** and the invitations will not be sent. diff --git a/website/docs/docs/cloud/manage-access/licenses-and-groups.md b/website/docs/docs/cloud/manage-access/licenses-and-groups.md deleted file mode 100644 index b91af80f9b3..00000000000 --- a/website/docs/docs/cloud/manage-access/licenses-and-groups.md +++ /dev/null @@ -1,145 +0,0 @@ ---- -title: "Licenses and groups" -id: "licenses-and-groups" ---- - -## Overview - -dbt Cloud administrators can use dbt Cloud's permissioning model to control -user-level access in a dbt Cloud account. This access control comes in two flavors: -License-based and Role-based. - -- **License-based Access Controls:** User are configured with account-wide - license types. These licenses control the specific parts of the dbt Cloud application - that a given user can access. -- **Role-based Access Control (RBAC):** Users are assigned to _groups_ that have - specific permissions on specific projects or the entire account. A user may be - a member of multiple groups, and those groups may have permissions on multiple - projects. - -## License-based access control - -Each user on an account is assigned a license type when the user is first -invited to a given account. This license type may change over time, but a -user can only have one type of license at any given time. 
- -A user's license type controls the features in dbt Cloud that the user is able -to access. dbt Cloud's three license types are: - - **Read-Only** - - **Developer** - - **IT** - -For more information on these license types, see [Seats & Users](/docs/cloud/manage-access/seats-and-users). -At a high-level, Developers may be granted _any_ permissions, whereas Read-Only -users will have read-only permissions applied to all dbt Cloud resources -regardless of the role-based permissions that the user is assigned. IT users will have Security Admin and Billing Admin permissions applied regardless of the role-based permissions that the user is assigned. - -## Role-based access control - -:::info dbt Cloud Enterprise - -Role-based access control is a feature of the dbt Cloud Enterprise plan - -::: - -Role-based access control allows for fine-grained permissioning in the dbt Cloud -application. With role-based access control, users can be assigned varying -permissions to different projects within a dbt Cloud account. For teams on the -Enterprise tier, role-based permissions can be generated dynamically from -configurations in an [Identity Provider](sso-overview). - -Role-based permissions are applied to _groups_ and pertain to _projects_. The -assignable permissions themselves are granted via _permission sets_. - - -### Groups - -A group is a collection of users. Users may belong to multiple groups. Members -of a group inherit any permissions applied to the group itself. - -Users can be added to a dbt Cloud group based on their group memberships in the -configured [Identity Provider](sso-overview) for the account. In this way, dbt -Cloud administrators can manage access to dbt Cloud resources via identity -management software like Microsoft Entra ID (formerly Azure AD), Okta, or GSuite. See _SSO Mappings_ below for -more information. - -You can view the groups in your account or create new groups from the **Team > Groups** -page in your Account Settings. - - - - -### SSO Mappings - -SSO Mappings connect Identity Provider (IdP) group membership to dbt Cloud group -membership. When a user logs into dbt Cloud via a supported identity provider, -their IdP group memberships are synced with dbt Cloud. Upon logging in -successfully, the user's group memberships (and therefore, permissions) are -adjusted accordingly within dbt Cloud automatically. - -:::tip Creating SSO Mappings - -While dbt Cloud supports mapping multiple IdP groups to a single dbt Cloud -group, we recommend using a 1:1 mapping to make administration as simple as -possible. Consider using the same name for your dbt Cloud groups and your IdP -groups. - -::: - - -### Permission Sets - -Permission sets are predefined collections of granular permissions. Permission -sets combine low-level permission grants into high-level roles that can be -assigned to groups. Some examples of existing permission sets are: - - Account Admin - - Git Admin - - Job Admin - - Job Viewer - - ...and more - -For a full list of enterprise permission sets, see [Enterprise Permissions](/docs/cloud/manage-access/enterprise-permissions). -These permission sets are available for assignment to groups and control the ability -for users in these groups to take specific actions in the dbt Cloud application. - -In the following example, the _dbt Cloud Owners_ group is configured with the -**Account Admin** permission set on _All Projects_ and the **Job Admin** permission -set on the _Internal Analytics_ project. 
- - - - -### Manual assignment - -dbt Cloud administrators can manually assign users to groups independently of -IdP attributes. If a dbt Cloud group is configured _without_ any -SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust -group membership automatically when users log into dbt Cloud via an identity -provider. This behavior may be desirable for teams that have connected an identity -provider, but have not yet configured SSO Mappings between dbt Cloud and the -IdP. - -If an SSO Mapping is added to an _unmanaged_ group, then it will become -_managed_, and dbt Cloud may add or remove users to the group automatically at -sign-in time based on the user's IdP-provided group membership information. - - -## FAQs -- **When are IdP group memberships updated for SSO Mapped groups?** Group memberships - are updated every time a user logs into dbt Cloud via a supported SSO provider. If - you've changed group memberships in your identity provider or dbt Cloud, ask your - users to log back into dbt Cloud for these group memberships to be synchronized. - -- **Can I set up SSO without RBAC?** Yes, see the documentation on - [Manual Assignment](#manual-assignment) above for more information on using - SSO without RBAC. - -- **Can I configure a user's License Type based on IdP Attributes?** Yes, see - the docs on [managing license types](/docs/cloud/manage-access/seats-and-users#managing-license-types) - for more information. diff --git a/website/docs/docs/cloud/manage-access/mfa.md b/website/docs/docs/cloud/manage-access/mfa.md index a06251e6468..bcddc04f072 100644 --- a/website/docs/docs/cloud/manage-access/mfa.md +++ b/website/docs/docs/cloud/manage-access/mfa.md @@ -7,6 +7,13 @@ sidebar: null # Multi-factor authentication +:::important + + +dbt Cloud enforces multi-factor authentication (MFA) for all users with username and password credentials. If MFA is not set up, you will see a notification bar prompting you to configure one of the supported methods when you log in. If you do not, you will have to configure MFA upon subsequent logins, or you will be unable to access dbt Cloud. + +::: + dbt Cloud provides multiple options for multi-factor authentication (MFA). MFA provides an additional layer of security to username and password logins for Developer and Team plan accounts. The available MFA methods are: - SMS verification code (US-based phone numbers only) diff --git a/website/docs/docs/cloud/manage-access/self-service-permissions.md b/website/docs/docs/cloud/manage-access/self-service-permissions.md index 24e1283b126..a5bdba825c2 100644 --- a/website/docs/docs/cloud/manage-access/self-service-permissions.md +++ b/website/docs/docs/cloud/manage-access/self-service-permissions.md @@ -1,42 +1,84 @@ --- -title: "Self-service permissions" -description: "Learn how dbt Cloud administrators can use self-service permissions to control access in a dbt Cloud account." +title: "Self-service Team account permissions" +description: "Learn how dbt Cloud administrators can use self-service permissions to control access in a dbt Cloud Team account." +sidebar_label: "Team permissions" id: "self-service-permissions" --- -import Permissions from '/snippets/_self-service-permissions-table.md'; +Self-service Team accounts are a quick and easy way to get dbt Cloud up and running for a small team. 
For teams looking to scale and access advanced features like SSO, group management, and support for larger user bases, upgrading to an [Enterprise](/docs/cloud/manage-access/enterprise-permissions) account unlocks these capabilities. +If you're interested in upgrading, contact [dbt Labs today](https://www.getdbt.com/contact). - +## Groups and permissions -## Read-Only vs. Developer License Types +Groups determine a user's permissions, and there are three groups available for Team plan dbt Cloud accounts: Owner, Member, and Everyone. The first Owner user is the person who created the dbt Cloud account. -Users configured with Read-Only license types will experience a restricted set of permissions in dbt Cloud. If a user is associated with a _Member_ permission set and a Read-Only seat license, then they will only have access to what a Read-Only seat allows. See [Seats and Users](/docs/cloud/manage-access/seats-and-users) for more information on the impact of licenses on these permissions. +New users are added to the Member and Everyone groups when they onboard, but this can be changed when the invitation is created. These groups only affect users with a [Developer license](#licenses) assigned. -## Owner and Member Groups in dbt Cloud Enterprise +The group access permissions are as follows: -By default, new users are added to the Member and Owner groups when they onboard to a new dbt Cloud account. Member and Owner groups are included with every new dbt Cloud account because they provide access for administrators to add users and groups, and to apply permission sets. +- **Owner** — Full access to account features. +- **Member** — Robust access to the account with restrictions on features that can alter billing or security. +- **Everyone** — A catch-all group for all users in the account. This group does not have any permission assignments beyond the user's profile. Users must be assigned to either the Member or Owner group to work in dbt Cloud. -You will need owner and member groups to help with account onboarding, but these groups can create confusion when initially setting up SSO and RBAC for dbt Cloud Enterprise accounts as described in the [Enterprise Permissions](enterprise-permissions) guide. Owner and Member groups are **account level** groups, so their permissions override any project-level permissions you wish to apply. +## Licenses -After onboarding administrative users and configuring RBAC/SSO groups, we recommend the following steps for onboarding users to a dbt Cloud Enterprise account. +You assign licenses to every user onboarded into dbt Cloud. You only assign Developer-licensed users to the Owner and Member groups. The groups have no impact on Read-only or IT licensed users. +There are three license types: -### Prerequisites +- **Developer** — The default license. Developer licenses don't restrict access to any features, so users with this license should be assigned to either the Owner or Member group. You're allotted up to 8 developer licenses per account. +- **Read-Only** — Read-only access to your project, including environments and dbt Explorer. Doesn't have access to account settings. Functions the same regardless of group assignments. You're allotted up to 5 read-only licenses per account. +- **IT** — Partial access to the account settings including users, integrations, billing, and API settings. Cannot create or edit connections or access the project. Functions the same regardless of group assignments. You're allocated 1 seat per account.
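As a rough illustration of how the license and group rules above interact, the following sketch restates this section as code. It is not dbt Cloud's implementation, and the names are only examples; the point is that group assignment changes access only for Developer-licensed users.

```python
# Illustrative sketch of how license type and group interact on a Team plan.
# Not dbt Cloud's implementation; the values simply restate the rules above.
DEVELOPER, READ_ONLY, IT = "Developer", "Read-Only", "IT"


def effective_role(license_type: str, groups: set[str]) -> str:
    """Return the role that governs a user's access."""
    if license_type in (READ_ONLY, IT):
        # Read-Only and IT licenses behave the same regardless of group membership.
        return license_type
    # Developer licenses defer to group membership (Owner over Member).
    if "Owner" in groups:
        return "Owner"
    if "Member" in groups:
        return "Member"
    # The Everyone group alone grants only profile-level access.
    return "Everyone"


assert effective_role(DEVELOPER, {"Member", "Everyone"}) == "Member"
assert effective_role(READ_ONLY, {"Owner"}) == "Read-Only"
```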
-You need to create an Account Admins group before removing any other groups. +See [Seats and Users](/docs/cloud/manage-access/seats-and-users) for more information on the impact of licenses on these permissions. -1. Create an Account Admins group. -2. Assign at least one user to the Account Admins group. The assigned user can manage future group, SSO mapping, and user or group assignment. +## Table of groups, licenses, and permissions -### Remove the Owner and Member groups +Key: -Follow these steps for both Owner and Member groups: +* (W)rite — Create new or modify existing. Includes `send`, `create`, `delete`, `allocate`, `modify`, and `read`. +* (R)ead — Can view but can not create or change any fields. +* No value — No access to the feature. + +Permissions: + +* [Account-level permissions](#account-permissions-for-account-roles) — Permissions related to management of the dbt Cloud account. For example, billing and account settings. +* [Project-level permissions](#project-permissions-for-account-roles) — Permissions related to the projects in dbt Cloud. For example, Explorer and the IDE. + +The following tables outline the access that users have if they are assigned a Developer license and the Owner or Member group, Read-only license, or IT license. + +#### Account permissions for account roles + +| Account-level permission| Owner | Member | Read-only license| IT license | +|:------------------------|:-----:|:------:|:----------------:|:------------:| +| Account settings | W | W | | W | +| Billing | W | | | W | +| Invitations | W | W | | W | +| Licenses | W | R | | W | +| Users | W | R | | W | +| Project (create) | W | W | | W | +| Connections | W | W | | W | +| Service tokens | W | | | W | +| Webhooks | W | W | | | + +#### Project permissions for account roles + +|Project-level permission | Owner | Member | Read-only | IT license | +|:------------------------|:-----:|:-------:|:---------:|:----------:| +| Adapters | W | W | R | | +| Connections | W | W | R | | +| Credentials | W | W | R | | +| Custom env. variables | W | W | R | | +| Develop (IDE or dbt Cloud CLI)| W | W | | | +| Environments | W | W | R | | +| Jobs | W | W | R | | +| dbt Explorer | W | W | R | | +| Permissions | W | R | | | +| Profile | W | W | R | | +| Projects | W | W | R | | +| Repositories | W | W | R | | +| Runs | W | W | R | | +| Semantic Layer Config | W | W | R | | -1. Log into dbt Cloud. -2. Click the gear icon at the top right and select **Account settings**. -3. Select **Groups** then select **OWNER** or **MEMBER**** group. -4. Click **Edit**. -5. At the bottom of the Group page, click **Delete**. -The Account Admin can add additional SSO mapping groups, permission sets, and users as needed. diff --git a/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md b/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md index e9c4236438e..27c09cbca09 100644 --- a/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md +++ b/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md @@ -10,7 +10,7 @@ This guide describes a feature of the dbt Cloud Enterprise plan. If you’re int ::: -dbt Cloud Enterprise supports [OAuth authentication](https://docs.snowflake.net/manuals/user-guide/oauth-intro.html) with Snowflake. When Snowflake OAuth is enabled, users can authorize their Development credentials using Single Sign On (SSO) via Snowflake rather than submitting a username and password to dbt Cloud. 
If Snowflake is setup with SSO through a third-party identity provider, developers can use this method to log into Snowflake and authorize the dbt Development credentials without any additional setup. +dbt Cloud Enterprise supports [OAuth authentication](https://docs.snowflake.net/manuals/user-guide/oauth-intro.html) with Snowflake. When Snowflake OAuth is enabled, users can authorize their Development credentials using Single Sign On (SSO) via Snowflake rather than submitting a username and password to dbt Cloud. If Snowflake is set up with SSO through a third-party identity provider, developers can use this method to log into Snowflake and authorize the dbt Development credentials without any additional setup. To set up Snowflake OAuth in dbt Cloud, admins from both are required for the following steps: 1. [Locate the redirect URI value](#locate-the-redirect-uri-value) in dbt Cloud. @@ -22,10 +22,10 @@ To use Snowflake in the dbt Cloud IDE, all developers must [authenticate with Sn ### Locate the redirect URI value To get started, copy the connection's redirect URI from dbt Cloud: -1. Navigate to **Account settings** -1. Select **Projects** and choose a project from the list -1. Select the connection to view its details abd set the **OAuth method** to "Snowflake SSO" -1. Copy the **Redirect URI** for use in later steps +1. Navigate to **Account settings**. +1. Select **Projects** and choose a project from the list. +1. Select the connection to view its details and set the **OAuth method** to "Snowflake SSO". +1. Copy the **Redirect URI** to use in the later steps. ` value with the Redirect URI (also referred to as the [access URL](/docs/cloud/about-cloud/access-regions-ip-addresses)) copied in dbt Cloud. To locate the Redirect URI, refer to the previous [locate the redirect URI value](#locate-the-redirect-uri-value) section. ``` CREATE OR REPLACE SECURITY INTEGRATION DBT_CLOUD @@ -43,7 +45,7 @@ CREATE OR REPLACE SECURITY INTEGRATION DBT_CLOUD ENABLED = TRUE OAUTH_CLIENT = CUSTOM OAUTH_CLIENT_TYPE = 'CONFIDENTIAL' - OAUTH_REDIRECT_URI = 'LOCATED_REDIRECT_URI' + OAUTH_REDIRECT_URI = '' OAUTH_ISSUE_REFRESH_TOKENS = TRUE OAUTH_REFRESH_TOKEN_VALIDITY = 7776000; ``` @@ -88,11 +90,11 @@ Enter the Client ID and Client Secret into dbt Cloud to complete the creation of -### Authorize Developer Credentials +### Authorize developer credentials Once Snowflake SSO is enabled, users on the project will be able to configure their credentials in their Profiles. By clicking the "Connect to Snowflake Account" button, users will be redirected to Snowflake to authorize with the configured SSO provider, then back to dbt Cloud to complete the setup process. At this point, users should now be able to use the dbt IDE with their development credentials. -### SSO OAuth Flow Diagram +### SSO OAuth flow diagram diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md index e4ff998015c..2b2575efc57 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md @@ -117,7 +117,7 @@ If the verification information looks appropriate, then you have completed the c ## Setting up RBAC Now you have completed setting up SSO with GSuite, the next steps will be to set up -[RBAC groups](/docs/cloud/manage-access/enterprise-permissions) to complete your access control configuration. 
+[RBAC groups](/docs/cloud/manage-access/about-user-access#role-based-access-control-) to complete your access control configuration. ## Troubleshooting diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-okta.md b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md index 53986513ce2..fda32f118ef 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-okta.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md @@ -190,4 +190,4 @@ configured in the steps above. ## Setting up RBAC Now you have completed setting up SSO with Okta, the next steps will be to set up -[RBAC groups](/docs/cloud/manage-access/enterprise-permissions) to complete your access control configuration. +[RBAC groups](/docs/cloud/manage-access/about-user-access#role-based-access-control-) to complete your access control configuration. diff --git a/website/docs/docs/cloud/manage-access/sso-overview.md b/website/docs/docs/cloud/manage-access/sso-overview.md index 560be72e31d..6b6527df753 100644 --- a/website/docs/docs/cloud/manage-access/sso-overview.md +++ b/website/docs/docs/cloud/manage-access/sso-overview.md @@ -43,7 +43,7 @@ Then, assign all of these (and only these) to the user license. This step will a ## SSO enforcement -* **SSO Enforcement:** If you have SSO turned on in your organization, dbt Cloud will enforce SSO-only logins for all non-admin users. If an Account Admin already has a password, they can continue logging in with a password. +* **SSO Enforcement:** If SSO is turned on in your organization, dbt Cloud will enforce SSO-only logins for all non-admin users. By default, if an Account Admin or Security Admin already has a password, they can continue logging in with a password. To restrict admins from using passwords, turn off **Allow password logins for account administrators** in the **Single sign-on** section of your organization's **Account settings**. * **SSO Re-Authentication:** dbt Cloud will prompt you to re-authenticate using your SSO provider every 24 hours to ensure high security. ### How should non-admin users log in? diff --git a/website/docs/docs/cloud/migration.md b/website/docs/docs/cloud/migration.md index 3aec1956297..2665b8f6a97 100644 --- a/website/docs/docs/cloud/migration.md +++ b/website/docs/docs/cloud/migration.md @@ -11,15 +11,22 @@ dbt Labs is in the process of rolling out a new cell-based architecture for dbt We're scheduling migrations by account. When we're ready to migrate your account, you will receive a banner or email communication with your migration date. If you have not received this communication, then you don't need to take action at this time. dbt Labs will share information about your migration with you, with appropriate advance notice, when applicable to your account. -Your account will be automatically migrated on its scheduled date. However, if you use certain features, you must take action before that date to avoid service disruptions. +Your account will be automatically migrated on or after its scheduled date. However, if you use certain features, you must take action before that date to avoid service disruptions. ## Recommended actions +:::info Rescheduling your migration + +If you're on the dbt Cloud Enterprise tier, you can postpone your account migration by up to 45 days. To reschedule your migration, navigate to **Account Settings** → **Migration guide**. + +For help, contact the dbt Support Team at [support@getdbt.com](mailto:support@getdbt.com). 
+::: + We highly recommend you take these actions: -- Ensure pending user invitations are accepted or note outstanding invitations. Pending user invitations will be voided during the migration and must be resent after it is complete. -- Commit unsaved changes in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). Unsaved changes will be lost during migration. -- Export and download [audit logs](/docs/cloud/manage-access/audit-log) older than 90 days, as they will be lost during migration. If you lose critical logs older than 90 days during the migration, you will have to work with the dbt Labs Customer Support team to recover. +- Ensure pending user invitations are accepted or note outstanding invitations. Pending user invitations might be voided during the migration. You can resend user invitations after the migration is complete. +- Commit unsaved changes in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). Unsaved changes might be lost during migration. +- Export and download [audit logs](/docs/cloud/manage-access/audit-log) older than 90 days, as they will be unavailable from dbt Cloud after the migration is complete. Logs older than 90 days that are still within the data retention period are not deleted, but you will have to work with the dbt Labs Customer Support team to recover them. ## Required actions diff --git a/website/docs/docs/cloud/secure/about-privatelink.md b/website/docs/docs/cloud/secure/about-privatelink.md index 731cef3f019..f19790fd708 100644 --- a/website/docs/docs/cloud/secure/about-privatelink.md +++ b/website/docs/docs/cloud/secure/about-privatelink.md @@ -7,10 +7,13 @@ sidebar_label: "About PrivateLink" import SetUpPages from '/snippets/_available-tiers-privatelink.md'; import PrivateLinkHostnameWarning from '/snippets/_privatelink-hostname-restriction.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; -PrivateLink enables a private connection from any dbt Cloud Multi-Tenant environment to your data platform hosted on AWS using [AWS PrivateLink](https://aws.amazon.com/privatelink/) technology. PrivateLink allows dbt Cloud customers to meet security and compliance controls as it allows connectivity between dbt Cloud and your data platform without traversing the public internet. This feature is supported in most regions across NA, Europe, and Asia, but [contact us](https://www.getdbt.com/contact/) if you have questions about availability. +PrivateLink enables a private connection from any dbt Cloud Multi-Tenant environment to your data platform hosted on a cloud provider, such as [AWS](https://aws.amazon.com/privatelink/) or [Azure](https://azure.microsoft.com/en-us/products/private-link), using that provider’s PrivateLink technology. PrivateLink helps dbt Cloud customers meet security and compliance controls because traffic between dbt Cloud and your data platform doesn't traverse the public internet. This feature is supported in most regions across NA, Europe, and Asia, but [contact us](https://www.getdbt.com/contact/) if you have questions about availability.
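As an illustration of what "without traversing the public internet" means in practice, the sketch below checks whether a data platform hostname resolves to private IP addresses, which is what you would expect from a correctly configured PrivateLink endpoint when resolving from inside the connected network. This is an assumption-laden example rather than an official dbt Labs or cloud provider utility, and the hostname is a placeholder.

```python
# Quick connectivity sanity check; the hostname below is a placeholder and the
# script is only an illustration, not an official dbt Labs tool.
import ipaddress
import socket


def resolves_privately(hostname: str, port: int = 443) -> bool:
    """Return True if every resolved address for the hostname is private."""
    infos = socket.getaddrinfo(hostname, port, proto=socket.IPPROTO_TCP)
    addresses = {info[4][0] for info in infos}
    print(f"{hostname} -> {sorted(addresses)}")
    return all(ipaddress.ip_address(addr).is_private for addr in addresses)


# Example placeholder: a Snowflake PrivateLink-style hostname
print(resolves_privately("myaccount.privatelink.snowflakecomputing.com"))
```

From a machine on the public internet, the same name would typically resolve to public addresses or fail to resolve at all, depending on how the private DNS zone is configured.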
+ + ### Cross-region PrivateLink diff --git a/website/docs/docs/cloud/secure/databricks-privatelink.md b/website/docs/docs/cloud/secure/databricks-privatelink.md index a02683e1269..d754f2b76c4 100644 --- a/website/docs/docs/cloud/secure/databricks-privatelink.md +++ b/website/docs/docs/cloud/secure/databricks-privatelink.md @@ -8,11 +8,14 @@ pagination_next: null import SetUpPages from '/snippets/_available-tiers-privatelink.md'; import PrivateLinkSLA from '/snippets/_PrivateLink-SLA.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; The following steps will walk you through the setup of a Databricks AWS PrivateLink or Azure Private Link endpoint in the dbt Cloud multi-tenant environment. + + ## Configure AWS PrivateLink 1. Locate your [Databricks instance name](https://docs.databricks.com/en/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) diff --git a/website/docs/docs/cloud/secure/postgres-privatelink.md b/website/docs/docs/cloud/secure/postgres-privatelink.md index 864cfe4acba..4d670354686 100644 --- a/website/docs/docs/cloud/secure/postgres-privatelink.md +++ b/website/docs/docs/cloud/secure/postgres-privatelink.md @@ -6,11 +6,15 @@ sidebar_label: "PrivateLink for Postgres" --- import SetUpPages from '/snippets/_available-tiers-privatelink.md'; import PrivateLinkTroubleshooting from '/snippets/_privatelink-troubleshooting.md'; +import PrivateLinkCrossZone from '/snippets/_privatelink-cross-zone-load-balancing.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; A Postgres database, hosted either in AWS or in a properly connected on-prem data center, can be accessed through a private network connection using AWS Interface-type PrivateLink. The type of Target Group connected to the Network Load Balancer (NLB) may vary based on the location and type of Postgres instance being connected, as explained in the following steps. + + ## Configuring Postgres interface-type PrivateLink ### 1. Provision AWS resources @@ -41,9 +45,16 @@ Creating an Interface VPC PrivateLink connection requires creating multiple AWS - Target Group protocol: **TCP** - **Network Load Balancer (NLB)** — Requires creating a Listener that attaches to the newly created Target Group for port `5432` + - **Scheme:** Internal + - **IP address type:** IPv4 + - **Network mapping:** Choose the VPC that the VPC Endpoint Service and NLB are being deployed in, and choose subnets from at least two Availability Zones. + - **Security Groups:** The Network Load Balancer (NLB) associated with the VPC endpoint service must either not have an associated security group, or the security group must have a rule that allows requests from the appropriate dbt Cloud **private CIDR(s)**. Note that _this is different_ than the static public IPs listed on the dbt Cloud [Access, Regions, & IP addresses](https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses) page. dbt Support can provide the correct private CIDR(s) upon request. If necessary, until you can refine the rule to the smaller CIDR provided by dbt, allow connectivity by temporarily adding an allow rule of `10.0.0.0/8`. + - **Listeners:** Create one listener per target group that maps the appropriate incoming port to the corresponding target group ([details](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/load-balancer-listeners.html)). - **VPC Endpoint Service** — Attach to the newly created NLB. 
- Acceptance required (optional) — Requires you to [accept our connection request](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests) after dbt creates the endpoint. + + ### 2. Grant dbt AWS account access to the VPC Endpoint Service On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. @@ -88,4 +99,4 @@ Once dbt Cloud support completes the configuration, you can start creating new c 4. Configure the remaining data platform details. 5. Test your connection and save it. - \ No newline at end of file + diff --git a/website/docs/docs/cloud/secure/redshift-privatelink.md b/website/docs/docs/cloud/secure/redshift-privatelink.md index a9d4332918b..75924cf76a9 100644 --- a/website/docs/docs/cloud/secure/redshift-privatelink.md +++ b/website/docs/docs/cloud/secure/redshift-privatelink.md @@ -7,6 +7,8 @@ sidebar_label: "PrivateLink for Redshift" import SetUpPages from '/snippets/_available-tiers-privatelink.md'; import PrivateLinkTroubleshooting from '/snippets/_privatelink-troubleshooting.md'; +import PrivateLinkCrossZone from '/snippets/_privatelink-cross-zone-load-balancing.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; @@ -16,6 +18,8 @@ AWS provides two different ways to create a PrivateLink VPC endpoint for a Redsh dbt Cloud supports both types of endpoints, but there are a number of [considerations](https://docs.aws.amazon.com/redshift/latest/mgmt/managing-cluster-cross-vpc.html#managing-cluster-cross-vpc-considerations) to take into account when deciding which endpoint type to use. Redshift-managed provides a far simpler setup with no additional cost, which might make it the preferred option for many, but may not be an option in all environments. Based on these criteria, you will need to determine which is the right type for your system. Follow the instructions from the section below that corresponds to your chosen endpoint type. + + :::note Redshift Serverless While Redshift Serverless does support Redshift-managed type VPC endpoints, this functionality is not currently available across AWS accounts. Due to this limitation, an Interface-type VPC endpoint service must be used for Redshift Serverless cluster PrivateLink connectivity from dbt Cloud. ::: @@ -79,9 +83,16 @@ Creating an Interface VPC PrivateLink connection requires creating multiple AWS - Target Group protocol: **TCP** - **Network Load Balancer (NLB)** — Requires creating a Listener that attaches to the newly created Target Group for port `5439` + - **Scheme:** Internal + - **IP address type:** IPv4 + - **Network mapping:** Choose the VPC that the VPC Endpoint Service and NLB are being deployed in, and choose subnets from at least two Availability Zones. + - **Security Groups:** The Network Load Balancer (NLB) associated with the VPC endpoint service must either not have an associated security group, or the security group must have a rule that allows requests from the appropriate dbt Cloud **private CIDR(s)**. Note that _this is different_ than the static public IPs listed on the dbt Cloud [Access, Regions, & IP addresses](https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses) page. dbt Support can provide the correct private CIDR(s) upon request. 
If necessary, until you can refine the rule to the smaller CIDR provided by dbt, allow connectivity by temporarily adding an allow rule of `10.0.0.0/8`. + - **Listeners:** Create one listener per target group that maps the appropriate incoming port to the corresponding target group ([details](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/load-balancer-listeners.html)). - **VPC Endpoint Service** — Attach to the newly created NLB. - Acceptance required (optional) — Requires you to [accept our connection request](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests) after dbt creates the endpoint. + + ### 2. Grant dbt AWS Account access to the VPC Endpoint Service On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. @@ -117,4 +128,4 @@ Once dbt Cloud support completes the configuration, you can start creating new c 4. Configure the remaining data platform details. 5. Test your connection and save it. - \ No newline at end of file + diff --git a/website/docs/docs/cloud/secure/snowflake-privatelink.md b/website/docs/docs/cloud/secure/snowflake-privatelink.md index c6775be2444..b943791292f 100644 --- a/website/docs/docs/cloud/secure/snowflake-privatelink.md +++ b/website/docs/docs/cloud/secure/snowflake-privatelink.md @@ -6,11 +6,14 @@ sidebar_label: "PrivateLink for Snowflake" --- import SetUpPages from '/snippets/_available-tiers-privatelink.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; The following steps walk you through the setup of a Snowflake AWS PrivateLink or Azure Private Link endpoint in a dbt Cloud multi-tenant environment. + + :::note Snowflake SSO with PrivateLink Users connecting to Snowflake using SSO over a PrivateLink connection from dbt Cloud will also require access to a PrivateLink endpoint from their local workstation. diff --git a/website/docs/docs/cloud/secure/vcs-privatelink.md b/website/docs/docs/cloud/secure/vcs-privatelink.md index 6041b1cb4ed..28b4df8f706 100644 --- a/website/docs/docs/cloud/secure/vcs-privatelink.md +++ b/website/docs/docs/cloud/secure/vcs-privatelink.md @@ -7,6 +7,7 @@ sidebar_label: "PrivateLink for VCS" import SetUpPages from '/snippets/_available-tiers-privatelink.md'; import PrivateLinkTroubleshooting from '/snippets/_privatelink-troubleshooting.md'; +import PrivateLinkCrossZone from '/snippets/_privatelink-cross-zone-load-balancing.md'; @@ -44,12 +45,15 @@ Creating an Interface VPC PrivateLink connection requires creating multiple AWS - **Scheme:** Internal - **IP address type:** IPv4 - **Network mapping:** Choose the VPC that the VPC Endpoint Service and NLB are being deployed in, and choose subnets from at least two Availability Zones. + - **Security Groups:** The Network Load Balancer (NLB) associated with the VPC Endpoint Service must either not have an associated Security Group, or the Security Group must have a rule that allows requests from the appropriate dbt Cloud **private CIDR(s)**. Note that **this is different** than the static public IPs listed on the dbt Cloud [Access, Regions, & IP addresses](https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses) page. The correct private CIDR(s) can be provided by dbt Support upon request. 
If necessary, temporarily adding an allow rule of `10.0.0.0/8` should allow connectivity until the rule can be refined to the smaller dbt provided CIDR. - **Listeners:** Create one Listener per Target Group that maps the appropriate incoming port to the corresponding Target Group ([details](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/load-balancer-listeners.html)). - **Endpoint Service** - The VPC Endpoint Service is what allows for the VPC to VPC connection, routing incoming requests to the configured load balancer. - **Load balancer type:** Network. - **Load balancer:** Attach the NLB created in the previous step. - **Acceptance required (recommended)**: When enabled, requires a new connection request to the VPC Endpoint Service to be accepted by the customer before connectivity is allowed ([details](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests)). + + ### 2. Grant dbt AWS account access to the VPC Endpoint Service Once these resources have been provisioned, access needs to be granted for the dbt Labs AWS account to create a VPC Endpoint in our VPC. On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the following IAM role in the appropriate production AWS account and save your changes ([details](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#add-remove-permissions)). diff --git a/website/docs/docs/cloud/use-dbt-assist.md b/website/docs/docs/cloud/use-dbt-assist.md deleted file mode 100644 index 4eef6c87329..00000000000 --- a/website/docs/docs/cloud/use-dbt-assist.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "Use dbt Assist" -sidebar_label: "Use dbt Assist" -description: "Use dbt Assist to generate documentation and tests from scratch, giving you the flexibility to modify or fix generated code." ---- - -# Use dbt Assist - -Use dbt Assist to generate documentation and tests from scratch, giving you the flexibility to modify or fix generated code. To access and use dbt Assist: - -1. Navigate to the dbt Cloud IDE and select a SQL model file under the **File Explorer**. - -2. In the **Console** section (under the **File Editor**), select the **dbt Assist** to view the available AI options. - -3. Select the available options: **Documentation** or **Tests** to generate the YAML config. - - To generate both for the same model, click each option separately. dbt Assist intelligently saves the YAML config in the same file. - -4. Verify the AI-generated code. You can update or fix the code as needed. - -5. Click **Save** to save the code. You should see the file changes under the **Version control** section. - - diff --git a/website/docs/docs/cloud/use-dbt-copilot.md b/website/docs/docs/cloud/use-dbt-copilot.md new file mode 100644 index 00000000000..30def967f96 --- /dev/null +++ b/website/docs/docs/cloud/use-dbt-copilot.md @@ -0,0 +1,22 @@ +--- +title: "Use dbt Copilot" +sidebar_label: "Use dbt Copilot" +description: "Use the dbt Copilot AI engine to generate documentation, tests, and semantic models from scratch, giving you the flexibility to modify or fix generated code." +--- + +# Use dbt Copilot + +Use dbt Copilot to generate documentation, tests, and semantic models from scratch, giving you the flexibility to modify or fix generated code. To access and use this AI engine: + +1. Navigate to the dbt Cloud IDE and select a SQL model file under the **File Explorer**. + +2. 
In the **Console** section (under the **File Editor**), click **dbt Copilot** to view the available AI options. + +3. Select the available options to generate the YAML config: **Generate Documentation**, **Generate Tests**, or **Generate Semantic Model**. + - To generate multiple YAML configs for the same model, click each option separately. dbt Copilot intelligently saves the YAML config in the same file. + +4. Verify the AI-generated code. You can update or fix the code as needed. + +5. Click **Save As**. You should see the file changes under the **Version control** section. + + diff --git a/website/docs/docs/collaborate/auto-exposures.md b/website/docs/docs/collaborate/auto-exposures.md index 371f6e80248..9b25a2fb305 100644 --- a/website/docs/docs/collaborate/auto-exposures.md +++ b/website/docs/docs/collaborate/auto-exposures.md @@ -7,12 +7,9 @@ pagination_next: "docs/collaborate/data-tile" image: /img/docs/cloud-integrations/auto-exposures/explorer-lineage.jpg --- -# Auto-exposures +# Auto-exposures As a data team, it’s critical that you have context into the downstream use cases and users of your data products. Auto-exposures integrates natively with Tableau (Power BI coming soon) and auto-generates downstream lineage in dbt Explorer for a richer experience. -:::info Available in beta -Auto-exposures are currently available in beta to a limited group of users and are gradually being rolled out. If you're interested in gaining access or learning more, stay tuned for updates! -::: Auto-exposures helps users understand how their models are used in downstream analytics tools to inform investments and reduce incidents — ultimately building trust and confidence in data products. It imports and auto-generates exposures based on Tableau dashboards, with user-defined curation. diff --git a/website/docs/docs/collaborate/data-tile.md b/website/docs/docs/collaborate/data-tile.md index efd6a0d59aa..1d5b26e26b7 100644 --- a/website/docs/docs/collaborate/data-tile.md +++ b/website/docs/docs/collaborate/data-tile.md @@ -6,14 +6,8 @@ description: "Embed data health tiles in your dashboards to distill trust signal image: /img/docs/collaborate/dbt-explorer/data-tile-pass.jpg --- -# Embed data health tile in dashboards - With data health tiles, stakeholders will get an at-a-glance confirmation on whether the data they’re looking at is stale or degraded. This trust signal allows teams to immediately go back into Explorer to see more details and investigate issues. -:::info Available in beta -Data health tile is currently available in open beta. -::: - The data health tile: - Distills trust signals for data consumers. @@ -98,7 +92,7 @@ Follow these steps to embed the data health tile in PowerBI: ```html Website = - "" + "" ``` @@ -122,19 +116,38 @@ Follow these steps to embed the data health tile in Tableau: 1. Create a dashboard in Tableau and connect to your database to pull in the data. -2. Ensure you've copied the iFrame snippet available in dbt Explorer's **Data health** section, under the **Embed data health into your dashboard** toggle. -3. Embed the snippet in your dashboard. +2. Ensure you've copied the URL or iFrame snippet available in dbt Explorer's **Data health** section, under the **Embed data health into your dashboard** toggle. +3. Insert a **Web Page** object. +4. Insert the URL and click **Ok**. - `