diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 193e989b875..d2bb72552bd 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,23 +1,23 @@ ## What are you changing in this pull request and why? - ## Checklist +- [ ] I have reviewed the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) so my content adheres to these guidelines. +- [ ] The topic I'm writing about is for specific dbt version(s) and I have versioned it according to the [version a whole page](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) and/or [version a block of content](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#versioning-blocks-of-content) guidelines. +- [ ] I have added checklist item(s) to this list for anything that needs to happen before this PR is merged, such as "needs technical review" or "change base branch." +- [ ] The content in this PR requires a dbt release note, so I added one to the [release notes page](https://docs.getdbt.com/docs/dbt-versions/dbt-cloud-release-notes). -- [ ] Review the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) so my content adheres to these guidelines. -- [ ] For [docs versioning](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#about-versioning), review how to [version a whole page](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) and [version a block of content](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#versioning-blocks-of-content). -- [ ] Add a checklist item for anything that needs to happen before this PR is merged, such as "needs technical review" or "change base branch." 
- -Adding or removing pages (delete if not applicable): + diff --git a/.github/workflows/preview-link.yml b/.github/workflows/preview-link.yml new file mode 100644 index 00000000000..f128f44b8cd --- /dev/null +++ b/.github/workflows/preview-link.yml @@ -0,0 +1,169 @@ +name: Vercel deployment preview link generator + +on: + pull_request: + types: [opened, synchronize] + paths: + - 'website/docs/docs/**' + - 'website/docs/best-practices/**' + - 'website/docs/guides/**' + - 'website/docs/faqs/**' + - 'website/docs/reference/**' + +permissions: + contents: write + pull-requests: write + +jobs: + update-pr-description: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install necessary tools + run: | + sudo apt-get update + sudo apt-get install -y jq curl + + - name: Generate Vercel deployment URL + id: vercel_url + run: | + # Get the branch name + BRANCH_NAME="${{ github.head_ref }}" + + # Convert to lowercase + BRANCH_NAME_LOWER=$(echo "$BRANCH_NAME" | tr '[:upper:]' '[:lower:]') + + # Replace non-alphanumeric characters with hyphens + BRANCH_NAME_SANITIZED=$(echo "$BRANCH_NAME_LOWER" | sed 's/[^a-z0-9]/-/g') + + # Construct the deployment URL + DEPLOYMENT_URL="https://docs-getdbt-com-git-${BRANCH_NAME_SANITIZED}-dbt-labs.vercel.app" + + echo "deployment_url=$DEPLOYMENT_URL" >> $GITHUB_OUTPUT + + - name: Wait for deployment to be accessible + id: wait_for_deployment + run: | + DEPLOYMENT_URL="${{ steps.vercel_url.outputs.deployment_url }}" + echo "Waiting for deployment at $DEPLOYMENT_URL to become accessible..." + + MAX_ATTEMPTS=60 # Wait up to 10 minutes + SLEEP_TIME=10 # Check every 10 seconds + ATTEMPTS=0 + + while [ $ATTEMPTS -lt $MAX_ATTEMPTS ]; do + STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$DEPLOYMENT_URL") + if [ "$STATUS_CODE" -eq 200 ]; then + echo "Deployment is accessible." + break + else + echo "Deployment not yet accessible (status code: $STATUS_CODE). Waiting..." + sleep $SLEEP_TIME + ATTEMPTS=$((ATTEMPTS + 1)) + fi + done + + if [ $ATTEMPTS -eq $MAX_ATTEMPTS ]; then + echo "Deployment did not become accessible within the expected time." + exit 1 + fi + + - name: Get changed files + id: files + run: | + CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.sha }} | grep -E '^website/docs/(docs|best-practices|guides|faqs|reference)/.*\.md$' || true) + if [ -z "$CHANGED_FILES" ]; then + echo "No documentation files were changed." + echo "changed_files=" >> $GITHUB_OUTPUT + else + CHANGED_FILES=$(echo "$CHANGED_FILES" | tr '\n' ' ') + echo "changed_files=$CHANGED_FILES" >> $GITHUB_OUTPUT + fi + + - name: Generate file preview links + id: links + run: | + DEPLOYMENT_URL="${{ steps.vercel_url.outputs.deployment_url }}" + CHANGED_FILES="${{ steps.files.outputs.changed_files }}" + + if [ -z "$CHANGED_FILES" ]; then + echo "No changed files found in the specified directories." + LINKS="No documentation files were changed." 
+ else + LINKS="" + # Convert CHANGED_FILES back to newline-separated for processing + CHANGED_FILES=$(echo "$CHANGED_FILES" | tr ' ' '\n') + for FILE in $CHANGED_FILES; do + # Remove 'website/docs/' prefix + FILE_PATH="${FILE#website/docs/}" + # Remove the .md extension + FILE_PATH="${FILE_PATH%.md}" + + # Construct the full URL + FULL_URL="$DEPLOYMENT_URL/$FILE_PATH" + LINKS="$LINKS\n- $FULL_URL" + done + fi + + # Properly set the multi-line output + echo "links<> $GITHUB_OUTPUT + echo -e "$LINKS" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + + - name: Update PR description with deployment links + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const prNumber = context.issue.number; + + // Fetch the current PR description + const { data: pullRequest } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: prNumber, + }); + + let body = pullRequest.body || ''; + + // Define the markers + const startMarker = ''; + const endMarker = ''; + + // Get the deployment URL and links from environment variables + const deploymentUrl = process.env.DEPLOYMENT_URL; + const links = process.env.LINKS; + + // Build the deployment content without leading whitespace + const deploymentContent = [ + `${startMarker}`, + '---', + '🚀 Deployment available! Here are the direct links to the updated files:', + '', + `${links}`, + '', + `${endMarker}` + ].join('\n'); + + // Remove existing deployment content between markers + const regex = new RegExp(`${startMarker}[\\s\\S]*?${endMarker}`, 'g'); + body = body.replace(regex, '').trim(); + + // Append the new deployment content + body = `${body}\n\n${deploymentContent}`; + + // Update the PR description + await github.rest.pulls.update({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: prNumber, + body: body, + }); + env: + DEPLOYMENT_URL: ${{ steps.vercel_url.outputs.deployment_url }} + LINKS: ${{ steps.links.outputs.links }} diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml new file mode 100644 index 00000000000..5feaaa12a20 --- /dev/null +++ b/.github/workflows/vale.yml @@ -0,0 +1,80 @@ +name: Vale linting + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - 'website/docs/**/*' + - 'website/blog/**/*' + - 'website/**/*' + +jobs: + vale: + name: Vale linting + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: List repository contents + run: | + pwd + ls -R + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install Vale + run: pip install vale==2.27.0 # Install a stable version of Vale + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v34 + with: + files: | + website/**/*.md + separator: ' ' + + - name: Debugging - Print changed files + if: ${{ steps.changed-files.outputs.any_changed == 'true' }} + run: | + echo "Changed files:" + echo "${{ steps.changed-files.outputs.all_changed_and_modified_files }}" + + - name: Confirm files exist + if: ${{ steps.changed-files.outputs.any_changed == 'true' }} + run: | + echo "Checking if files exist..." 
+ for file in ${{ steps.changed-files.outputs.all_changed_and_modified_files }}; do + if [ -f "$file" ]; then + echo "Found: $file" + else + echo "File not found: $file" + exit 1 + fi + done + + - name: Run vale + if: ${{ steps.changed-files.outputs.any_changed == 'true' }} + uses: errata-ai/vale-action@reviewdog + with: + token: ${{ secrets.GITHUB_TOKEN }} + reporter: github-check + files: ${{ steps.changed-files.outputs.all_changed_and_modified_files }} + separator: ' ' + version: '2.27.0' + +# - name: Post summary comment +# if: ${{ steps.changed-files.outputs.any_changed == 'true' }} +# run: | +# COMMENT="❗️Oh no, some Vale linting found issues! Please check the **Files change** tab for detailed results and make the necessary updates." +# COMMENT+=$'\n' +# COMMENT+=$'\n\n' +# COMMENT+="➡️ Link to detailed report: [Files changed](${{ github.event.pull_request.html_url }}/files)" +# gh pr comment ${{ github.event.pull_request.number }} --body "$COMMENT" +# env: +# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.hyperlint/config.yaml b/.hyperlint/config.yaml new file mode 100644 index 00000000000..03082114ae1 --- /dev/null +++ b/.hyperlint/config.yaml @@ -0,0 +1,10 @@ +content_dir: /docs +authorized_users: + - mirnawong1 + - matthewshaver + - nghi-ly + - runleonarun + - nataliefiann + +vale: + enabled: true diff --git a/.vale.ini b/.vale.ini new file mode 100644 index 00000000000..58aff923afe --- /dev/null +++ b/.vale.ini @@ -0,0 +1,7 @@ +StylesPath = styles +MinAlertLevel = warning + +Vocab = EN + +[*.md] +BasedOnStyles = custom diff --git a/contributing/adding-page-components.md b/contributing/adding-page-components.md index 68294e7d149..7a92d627995 100644 --- a/contributing/adding-page-components.md +++ b/contributing/adding-page-components.md @@ -4,7 +4,7 @@ You can use the following components to provide code snippets for each supported Identify code by labeling with the warehouse names: -```code +```sql
@@ -32,7 +32,7 @@ You can use the following components to provide code snippets in a tabbed view. Identify code and code files by labeling with the component they are describing: -```code +```sql ` tag. This allows you to share a link to a page with a pre-selected tab so that clicking on a tab creates a unique hyperlink for that tab. However, this feature doesn't provide an anchor link, which means the browser won't scroll to the tab. Additionally, you can define the search parameter name to use. If the tabs content is under a header, you can alternatively link to the header itself, instaed of the `queryString` prop. +You can use the [queryString](https://docusaurus.io/docs/next/markdown-features/tabs?current-os=ios#query-string) prop in the `` tag. This allows you to share a link to a page with a pre-selected tab so that clicking on a tab creates a unique hyperlink for that tab. However, this feature doesn't provide an anchor link, which means the browser won't scroll to the tab. Additionally, you can define the search parameter name to use. If the tabs content is under a header, you can alternatively link to the header itself, instead of the `queryString` prop. In the following example, clicking a tab adds a search parameter to the end of the URL: `?current-os=android or ?current-os=ios`. -``` +```sql Android @@ -105,3 +105,48 @@ In the following example, clicking a tab adds a search parameter to the end of t ``` + +## Markdown Links + +Refer to the Links section of the Content Style Guide to read about how you can use links in the dbt product documentation. + +## Collapsible header + + +
+

Shows and hides children elements

+
+
+ +```markdown + +
+

Shows and hides children elements

+
+
+
+``` + +## File component + +```yml + + +```yaml +password: hunter2 +``` + +``` + +## LoomVideo component + +
{``}
+ + + +## YoutubeVideo component + +
{``}
+ + + diff --git a/contributing/content-style-guide.md b/contributing/content-style-guide.md index 022efa127a5..a8520bc0e0d 100644 --- a/contributing/content-style-guide.md +++ b/contributing/content-style-guide.md @@ -624,6 +624,12 @@ When describing icons that appear on-screen, use the [_Google Material Icons_](h :white_check_mark:Click on the menu icon +#### Upload icons +If you're using icons to document things like [third-party vendors](https://docs.getdbt.com/docs/cloud-integrations/avail-sl-integrations), etc. — you need to add the icon file in the following locations to ensure the icons render correctly in light and dark mode: + +- website/static/img/icons +- website/static/img/icons/white + ### Image names Two words that are either adjectives or nouns describing the name of a file separated by an underscore `_` (known as `snake_case`). The two words can also be separated by a hyphen (`kebab-case`). diff --git a/contributing/lightbox.md b/contributing/lightbox.md index 5f35b4d9639..95feccbe779 100644 --- a/contributing/lightbox.md +++ b/contributing/lightbox.md @@ -25,4 +25,9 @@ You can use the Lightbox component to add an image or screenshot to your page. I /> ``` +Note that if you're using icons to document things like third party vendors, etc, — you need to add the icon file in the following locations to ensure the icons render correctly in light and dark mode: + +- `website/static/img/icons` +- `website/static/img/icons/white` + diff --git a/contributing/single-sourcing-content.md b/contributing/single-sourcing-content.md index 2b8a82dfb81..6dc14d760b1 100644 --- a/contributing/single-sourcing-content.md +++ b/contributing/single-sourcing-content.md @@ -90,7 +90,7 @@ This component can be added directly to a markdown file in a similar way as othe Both properties can be used together to set a range where the content should show. In the example below, this content will only show if the selected version is between **0.21** and **1.0**: ```markdown - + Versioned content here @@ -132,94 +132,6 @@ $ dbt test --models [...] --defer --state path/to/artifacts ``` -## Using global variables - ---- - -Global variables can be configured for use throughout the docs. - -Using a global variable requires two steps: - -1. Set the variable in the `website/dbt-global-variables.js` file. -2. Use the **Var** component to add the global variable to a page. - -```jsx -// The dbtCore property is the identifier for the variable, -// while the name property is the value shown on the page. - -exports.dbtVariables = { - dbtCore: { - name: "dbt Core" - } -} -``` - -```markdown -// is converted to dbt Core - -You can install on the command line by using one of these recommended methods: -``` - -### Versioning global variables - -It is possible to version global variables as well. This creates the ability to show different variations of a string based off the current version a visitor has selected. - -To extend our `dbt-global-variables.js` file above, we can add a new variable: *note - these versions are not accurate and only shown for this example.* - -```jsx -// A new variable called dbtCloud is added below -// This variable includes a versions array -// "Sinter" will replace "dbt Cloud" for versions 0.21 or lower - -exports.dbtVariables = { - dbtCore: { - name: "dbt Core" - }, - dbtCloud: { - name: "dbt Cloud", - versions: [ - { - "name": "Sinter", - "version": "0.21" - } - ] - } -} -``` - -```markdown -You can get started with by [Signing up](https://www.getdbt.com/signup/). 
-``` - -In the above example, the **dbtCloud** property has a default name of “dbt Cloud”. The naming for variables cascade down, meaning “dbt Cloud” will show for all versions, until version **0.21** or lower is selected. At that point “Sinter” will replace “dbt Cloud”. - -### Global variables properties - -**name** (required): Expects the identifier for a global variable. - -### Global variables example - -The global `` component can be used inline, for example: - -```markdown -This piece of markdown content explains why is awesome. -``` - -However, a Var component cannot start a new line of content. Fortunately, a workaround exists to use the Var component at the beginning of a line of content. - -To use the component at the beginning of a sentence, add a non-breaking space character before the component: - -```markdown -// When starting a new line with a global variable, -// a non-breaking space is required - -// Works -  is awesome! - -// Does not work - is awesome! -``` - ## Reusing content To reuse content on different pages, you can use some techniques like partial files or snippets. Partial files, a built-in Docusaurus feature, is the recommended method over snippets. diff --git a/styles/Vocab/EN/accept.txt b/styles/Vocab/EN/accept.txt new file mode 100644 index 00000000000..e673e2ef83d --- /dev/null +++ b/styles/Vocab/EN/accept.txt @@ -0,0 +1,67 @@ +dbt Cloud +dbt Core +dbt Semantic Layer +dbt Explorer +dbt +dbt-tonic +dbtonic +IDE +CLI +Config +info +docs +yaml +YAML +SQL +bash +shell +MetricFlow +jinja +jinja2 +sqlmesh +Snowflake +Databricks +Fabric +Redshift +Azure +DevOps +Athena +Amazon +UI +CSV +S3 +SCD +repo +dbt_project.yml +boolean +defaultValue= +DWH +DWUs +shoutout +ADF +BQ +gcloud +MSFT +DDL +APIs +API +SSIS +PBI +PowerBI +datetime +PySpark +:::caution +:::note +:::info +:::tip +:::warning +\<[^>]+\> +\b[A-Z]{2,}(?:/[A-Z]{2,})?\b +\w+-\w+ +\w+/\w+ +n/a +N/A +\ diff --git a/styles/custom/LatinAbbreviations.yml b/styles/custom/LatinAbbreviations.yml new file mode 100644 index 00000000000..44a3c9d6e8c --- /dev/null +++ b/styles/custom/LatinAbbreviations.yml @@ -0,0 +1,15 @@ +# LatinAbbreviations.yml +extends: substitution +message: "Avoid Latin abbreviations: '%s'. Consider using '%s' instead." +level: warning + +swap: + 'e.g.': 'for example' + 'e.g': 'for example' + 'eg': 'for example' + 'i.e.': 'that is' + 'i.e': 'that is' + 'etc.': 'and so on' + 'etc': 'and so on' + 'N.B.': 'Note' + 'NB': 'Note' diff --git a/styles/custom/Repitition.yml b/styles/custom/Repitition.yml new file mode 100644 index 00000000000..4cd620146cf --- /dev/null +++ b/styles/custom/Repitition.yml @@ -0,0 +1,6 @@ +extends: repetition +message: "'%s' is repeated!" +level: warning +alpha: true +tokens: + - '[^\s]+' diff --git a/styles/custom/SentenceCaseHeaders.yml b/styles/custom/SentenceCaseHeaders.yml new file mode 100644 index 00000000000..d1d6cd97c67 --- /dev/null +++ b/styles/custom/SentenceCaseHeaders.yml @@ -0,0 +1,34 @@ +extends: capitalization +message: "'%s' should use sentence-style capitalization. Try '%s' instead." 
+level: warning +scope: heading +match: $sentence # Enforces sentence-style capitalization +indicators: + - ":" +exceptions: + - '\bdbt\b' + - '\bdbt\s+Cloud\b' + - '\bdbt\s+Core\b' + - '\bdbt\s+Cloud\s+CLI\b' + - Snowflake + - Databricks + - Azure + - GCP + - AWS + - SQL + - CLI + - API + - YAML + - JSON + - HTML + - Redshift + - Google + - BigQuery + - SnowSQL + - Snowsight + - Snowpark + - Fabric + - Microsoft + - Postgres + - Explorer + - IDE diff --git a/styles/custom/Typos.yml b/styles/custom/Typos.yml new file mode 100644 index 00000000000..456517950a9 --- /dev/null +++ b/styles/custom/Typos.yml @@ -0,0 +1,39 @@ +extends: spelling + +message: "Oops there's a typo -- did you really mean '%s'? " +level: warning + +action: + name: suggest + params: + - spellings + +custom: true +filters: + - '\bdbt\b' + - '\bdbt\s+Cloud\b' + - '\bdbt\s+Core\b' + - '\bdbt\s+Cloud\s+CLI\b' + - '\bdbt\s+.*?\b' + - '<[^>]+>' # Ignore all HTML-like components starting with < and ending with > + - '<[^>]+>.*<\/[^>]+>' + +--- + +extends: existence + +message: "Ignore specific patterns" +level: skip +tokens: + - '\bdbt\b' + - '\bdbt\s+Cloud\b' + - '\bdbt\s+Core\b' + - '\bdbt\s+Cloud\s+CLI\b' + - '\bdbt\s+.*?\b' + - '<[^>]+>' # Ignore all HTML-like components starting with < and ending with > + - '<[^>]+>.*<\/[^>]+>' + - '\w+-\w+' + - '\w+/\w+' + - '\w+/\w+|\w+-\w+|n/a' + - 'n/a' + - 'N/A' diff --git a/styles/custom/UIElements.yml b/styles/custom/UIElements.yml new file mode 100644 index 00000000000..ff9c5f86187 --- /dev/null +++ b/styles/custom/UIElements.yml @@ -0,0 +1,19 @@ +extends: existence +message: "UI elements like '%s' should be bold." +level: warning +tokens: + # Match UI elements that are not bolded (i.e., not within **) + - '(? + +dbt Mesh is powerful because it allows teams to operate _independently_ and _collaboratively_, each team free to build on their own but contributing to a larger, shared set of data outputs. + +The flexibility of dbt Mesh means that it can support [a wide variety of patterns and designs](/best-practices/how-we-mesh/mesh-3-structures). Today, let’s dive into one pattern that is showing promise as a way to enable teams working on very different dbt deployments to work together. + +## How Hybrid Mesh enables collaboration between dbt Core and dbt Cloud teams + +**_Scenario_** — A company with a central data team uses dbt Core. The setup is working well for that team. They want to scale their impact to enable faster decision-making, organization-wide. The current dbt Core setup isn't well suited for onboarding a larger number of less-technical, nontechnical, or less-frequent contributors. + +**_The goal_** — Enable three domain teams of less-technical users to leverage and extend the central data models, with full ownership over their domain-specific dbt models. + + - **Central data team:** Data engineers comfortable using dbt Core and the command line interface (CLI), building and maintaining foundational data models for the entire organization. + + - **Domain teams:** Data analysts comfortable working in SQL but not using the CLI and prefer to start working right away without managing local dbt Core installations or updates. The team needs to build transformations specific to their business context. Some of these users may have tried dbt in the past, but they were not able to successfully onboard to the central team's setup. 
+ +**_Solution: Hybrid Mesh_** — Data teams can use dbt Mesh to connect projects *across* dbt Core and dbt Cloud, creating a workflow where everyone gets to work in their preferred environment while creating a shared lineage that allows for visibility, validation, and ownership across the data pipeline. + +Each team will fully own its dbt code, from development through deployment, using the product that is appropriate to their needs and capabilities _while sharing data products across teams using both dbt Core and dbt Cloud._ + + + +Creating a Hybrid Mesh is mostly the same as creating any other [dbt Mesh](/guides/mesh-qs?step=1) workflow — there are a few considerations but mostly _it just works_. We anticipate it will continue to see adoption as more central data teams look to onboard their downstream domain teams. + +A Hybrid Mesh can be adopted as a stable long-term pattern, or as an intermediary while you perform a [migration from dbt Core to dbt Cloud](/guides/core-cloud-2?step=1). + +## How to build a Hybrid Mesh +Enabling a Hybrid Mesh is as simple as a few additional steps to import the metadata from your Core project into dbt Cloud. Once you’ve done this, you should be able to operate your dbt Mesh like normal and all of our [standard recommendations](/best-practices/how-we-mesh/mesh-1-intro) still apply. + +### Step 1: Prepare your Core project for access through dbt Mesh + +Configure public models to serve as stable interfaces for downstream dbt Projects. + +- Decide which models from your Core project will be accessible in your Mesh. For more information on how to configure public access for those models, refer to the [model access page.](/docs/collaborate/govern/model-access) +- Optionally set up a [model contract](/docs/collaborate/govern/model-contracts) for all public models for better governance. +- Keep dbt Core and dbt Cloud projects in separate repositories to allow for a clear separation between upstream models managed by the dbt Core team and the downstream models handled by the dbt Cloud team. + +### Step 2: Mirror each "producer" Core project in dbt Cloud +This allows dbt Cloud to know about the contents and metadata of your project, which in turn allows for other projects to access its models. + +- [Create a dbt Cloud account](https://www.getdbt.com/signup/) and a dbt project for each upstream Core project. + - Note: If you have [environment variables](/docs/build/environment-variables) in your project, dbt Cloud environment variables must be prefixed with `DBT_ `(including `DBT_ENV_CUSTOM_ENV_` or `DBT_ENV_SECRET`). Follow the instructions in [this guide](https://docs.getdbt.com/guides/core-to-cloud-1?step=8#environment-variables) to convert them for dbt Cloud. +- Each upstream Core project has to have a production [environment](/docs/dbt-cloud-environments) in dbt Cloud. You need to configure credentials and environment variables in dbt Cloud just so that it will resolve relation names to the same places where your dbt Core workflows are deploying those models. +- Set up a [merge job](/docs/deploy/merge-jobs) in a production environment to run `dbt parse`. This will enable connecting downstream projects in dbt Mesh by producing the necessary [artifacts](/reference/artifacts/dbt-artifacts) for cross-project referencing. + - Note: Set up a regular job to run `dbt build` instead of using a merge job for `dbt parse`, and centralize your dbt orchestration by moving production runs to dbt Cloud. 
Check out [this guide](/guides/core-to-cloud-1?step=9) for more details on converting your production runs to dbt Cloud. +- Optional: Set up a regular job (for example, daily) to run `source freshness` and `docs generate`. This will hydrate dbt Cloud with additional metadata and enable features in [dbt Explorer](/docs/collaborate/explore-projects) that will benefit both teams, including [Column-level lineage](/docs/collaborate/column-level-lineage). + +### Step 3: Create and connect your downstream projects to your Core project using dbt Mesh +Now that dbt Cloud has the necessary information about your Core project, you can begin setting up your downstream projects, building on top of the public models from the project you brought into Cloud in [Step 2](#step-2-mirror-each-producer-core-project-in-dbt-cloud). To do this: +- Initialize each new downstream dbt Cloud project and create a [`dependencies.yml` file](/docs/collaborate/govern/project-dependencies#use-cases). +- In that `dependencies.yml` file, add the dbt project name from the `dbt_project.yml` of the upstream project(s). This sets up cross-project references between different dbt projects: + + ```yaml + # dependencies.yml file in dbt Cloud downstream project + projects: + - name: upstream_project_name + ``` +- Use [cross-project references](/reference/dbt-jinja-functions/ref#ref-project-specific-models) for public models in upstream project. Add [version](/reference/dbt-jinja-functions/ref#versioned-ref) to references of versioned models: + ```yaml + select * from {{ ref('upstream_project_name', 'monthly_revenue') }} + ``` + +And that’s all it takes! From here, the domain teams that own each dbt Project can build out their models to fit their own use cases. You can now build out your Hybrid Mesh however you want, accessing the full suite of dbt Cloud features. +- Orchestrate your Mesh to ensure timely delivery of data products and make them available to downstream consumers. +- Use [dbt Explorer](/docs/collaborate/explore-projects) to trace the lineage of your data back to its source. +- Onboard more teams and connect them to your Mesh. +- Build [semantic models](/docs/build/semantic-models) and [metrics](/docs/build/metrics-overview) into your projects to query them with the [dbt Semantic Layer](https://www.getdbt.com/product/semantic-layer). + + +## Conclusion + +In a world where organizations have complex and ever-changing data needs, there is no one-size fits all solution. Instead, data practitioners need flexible tooling that meets them where they are. The Hybrid Mesh presents a model for this approach, where teams that are comfortable and getting value out of dbt Core can collaborate frictionlessly with domain teams on dbt Cloud. diff --git a/website/blog/2024-10-04-iceberg-is-an-implementation-detail.md b/website/blog/2024-10-04-iceberg-is-an-implementation-detail.md new file mode 100644 index 00000000000..dc9b78bba8d --- /dev/null +++ b/website/blog/2024-10-04-iceberg-is-an-implementation-detail.md @@ -0,0 +1,84 @@ +--- +title: "Iceberg Is An Implementation Detail" +description: "This blog will talk about iceberg table support and why it both matters and doesn't" +slug: icebeg-is-an-implementation-detail + +authors: [amy_chen] + +tags: [table formats, iceberg] +hide_table_of_contents: false + +date: 2024-10-04 +is_featured: false +--- + +If you haven’t paid attention to the data industry news cycle, you might have missed the recent excitement centered around an open table format called Apache Iceberg™. 
It’s one of many open table formats like Delta Lake, Hudi, and Hive. These formats are changing the way data is stored and metadata accessed. They are groundbreaking in many ways. + +But I have to be honest: **I don’t care**. But not for the reasons you think. + + + +## What is Iceberg? + +To have this conversation, we need to start with the same foundational understanding of Iceberg. Apache Iceberg is a high-performance open table format developed for modern data lakes. It was designed for large-scale datasets, and within the project, there are many ways to interact with it. When people talk about Iceberg, it often means multiple components including but not limited to: + +1. Iceberg Table Format - an open-source table format with large-scale data. Tables materialized in Iceberg table format are stored on a user’s infrastructure, such as an S3 bucket. +2. Iceberg Data Catalog - an open-source metadata management system that tracks the schema, partition, and versions of Iceberg tables. +3. Iceberg REST Protocol (also called Iceberg REST API) is how engines can support and speak to other Iceberg-compatible catalogs. + +If you have been in the industry, you also know that everything I just wrote above about Iceberg could easily be replaced by `Hive,` `Hudi,` or `Delta.` This is because they were all designed to solve essentially the same problem. Ryan Blue (creator of Iceberg) and Michael Armbrust (creator of Delta Lake) recently sat down for this [fantastic chat](https://vimeo.com/1012543474) and said two points that resonated with me: + +- “We never intended for people to pay attention to this area. It’s something we wanted to fix, but people should be able to not pay attention and just work with their data. Storage systems should just work.” +- “We solve the same challenges with different approaches.” + +At the same time, the industry is converging on Apache Iceberg. [Iceberg has the highest availability of read and write support](https://medium.com/sundeck/2024-lakehouse-format-rundown-7edd75015428). + + + + +Snowflake launched Iceberg support in 2022. Databricks launched Iceberg support via Uniform last year. Microsoft announced Fabric support for Iceberg in September 2024 at Fabric Con. **Customers are demanding interoperability, and vendors are listening**. + +Why does this matter? Standardization of the industry benefits customers. When the industry standardizes - customers have the gift of flexibility. Everyone has a preferred way of working, and with standardization — they can always bring their preferred tools to their organization’s data. + +## Just another implementation detail + +I’m not saying open table formats aren't important. The metadata management and performance make them very meaningful and should be paid attention to. Our users are already excited to use it to create data lakes to save on storage costs, create more abstraction from their computing, etc. + +But when building data models or focusing on delivering business value through analytics, my primary concern is not *how* the data is stored—it's *how* I can leverage it to generate insights and drive decisions. The analytics development lifecycle is hard enough without having to take into account every detail. dbt abstracts the underlying platform and lets me focus on writing SQL and orchestrating my transformations. It’s a feature that I don’t need to think about how tables are stored or optimized—I just need to know that when I reference dim_customers or fct_sales, the correct data is there and ready to use. 
**It should just work.** + +## Sometimes the details do matter + +While table formats are an implementation detail for data transformation — Iceberg can impact dbt developers when the implementation details aren’t seamless. Currently, using Iceberg requires a significant amount of upfront configuration and integration work beyond just creating tables to get started. + +One of the biggest hurdles is managing Iceberg’s metadata layer. This metadata often needs to be synced with external catalogs, which requires careful setup and ongoing maintenance to prevent inconsistencies. Permissions and access controls add another layer of complexity—because multiple engines can access Iceberg tables, you have to ensure that all systems have the correct access to both the data files and the metadata catalog. Currently, setting up integrations between these engines is also far from seamless; while some engines natively support Iceberg, others require brittle workarounds to ensure the metadata is synced correctly. This fragmented landscape means you could land with a web of interconnected components. + +## Fixing it + +**Today, we announced official support for the Iceberg table format in dbt.** By supporting the Iceberg table format, it’s one less thing you have to worry about on your journey to adopting Iceberg. + +With support for Iceberg Table Format, it is now easier to convert your dbt models using proprietary table formats to Iceberg by updating your configuration. After you have set up your external storage for Iceberg and connected it to your platforms, you will be able to jump into your dbt model and update the configuration to look something like this: + + + +It is available on these adapters: + +- Athena +- Databricks +- Snowflake +- Spark +- Starburst/Trino +- Dremio + +As with the beauty of any open-source project, Iceberg support grew organically, so the implementations vary. However, this will change in the coming months as we converge onto one dbt standard. This way, no matter which adapter you jump into, the configuration will always be the same. + +## dbt the Abstraction Layer + +dbt is more than about abstracting away the DDL to create and manage objects. It’s also about ensuring an opinionated approach to managing and optimizing your data. That remains true for our strategy around Iceberg Support. + +In our dbt-snowflake implementation, we have already started to [enforce best practices centered around how to manage the base location](https://docs.getdbt.com/reference/resource-configs/snowflake-configs#base-location) to ensure you don’t create technical debt accidentally, ensuring your Iceberg implementation scales over time. And we aren’t done yet. + +That said, while we can create the models, there is a *lot* of initial work to get to that stage. dbt developers must still consider the implementation, like how their external volume has been set up or where dbt can access the metadata. We have to make this better. + +Given the friction of getting launched on Iceberg, over the coming months, we will enable more capabilities to empower users to adopt Iceberg. It should be easier to read from foreign Iceberg catalogs. It should be easier to mount your volume. It should be easier to manage refreshes. And you should also trust that permissions and governance are consistently enforced. + +And this work doesn’t stop at Iceberg. The framework we are building is also compatible with other table formats, ensuring that whatever table format works for you is supported on dbt. 
This way — dbt users can also stop caring about table formats. **It’s just another implementation detail.** diff --git a/website/blog/2024-10-05-snowflake-feature-store.md b/website/blog/2024-10-05-snowflake-feature-store.md new file mode 100644 index 00000000000..cf5c55be1b5 --- /dev/null +++ b/website/blog/2024-10-05-snowflake-feature-store.md @@ -0,0 +1,273 @@ +--- +title: "Snowflake feature store and dbt: A bridge between data pipelines and ML" +description: A deep-dive into the workflow steps you can take to build and deploy ML models within a single platform. +slug: snowflake-feature-store +authors: [randy_pettus, luis_leon] +tags: [snowflake ML] +hide_table_of_contents: false +date: 2024-10-08 +is_featured: true +--- + +Flying home into Detroit this past week, working on this blog post on a plane, I saw for the first time the newly connected deck of the Gordie Howe International [bridge](https://www.freep.com/story/news/local/michigan/detroit/2024/07/24/gordie-howe-bridge-deck-complete-work-moves-to-next-phase/74528258007/) spanning the Detroit River and connecting the U.S. and Canada. The image stuck out because, in one sense, a feature store is a bridge between the clean, consistent datasets and the machine learning models that rely upon this data. But, more interesting than the bridge itself is the massive process of coordination needed to build it. This construction effort — I think — can teach us more about processes and the need for feature stores in machine learning (ML). + +Think of the manufacturing materials needed as our data and the building of the bridge as the building of our ML models. There are thousands of engineers and construction workers taking materials from all over the world, pulling only the specific pieces needed for each part of the project. However, to make this project truly work at this scale, we need the warehousing and logistics to ensure that each load of concrete rebar and steel meets the standards for quality and safety needed and is available to the right people at the right time — as even a single fault can have catastrophic consequences or cause serious delays in project success. This warehouse and the associated logistics play the role of the feature store, ensuring that data is delivered consistently where and when it is needed to train and run ML models. + + + +## What is a feature? + +A feature is transformed or enriched data that serves as an input into a machine learning model to make predictions. In machine learning, a data scientist derives features from various data sources to build a model that makes predictions based on historical data. To capture the value from this model, the enterprise must operationalize the data pipeline, ensuring that the features being used in production at inference time match those being used in training and development. + +## What role does dbt play in getting data ready for ML models? + +dbt is the standard for data transformation in the enterprise. Organizations leverage dbt at scale to deliver clean and well-governed datasets wherever and whenever they are needed. Using dbt to manage the data transformation processes to cleanse and prepare datasets used in feature development will ensure consistent datasets of guaranteed data quality — meaning that feature development will be consistent and reliable. + + +## Who is going to use this and what benefits will they see? + +Snowflake and dbt are already a well-established and trusted combination for delivering data excellence across the enterprise. 
The ability to register dbt pipelines in the Snowflake Feature Store further extends this combination for ML and AI workloads, while fitting naturally into the data engineering and feature pipelines already present in dbt. + + +Some of the key benefits are: + +- **Feature collaboration** — Data scientists, data analysts, data engineers, and machine learning engineers collaborate on features used in machine learning models in both Python and SQL, enabling teams to share and reuse features. As a result, teams can improve the time to value of models while improving the understanding of their components. This is all backed by Snowflake’s role-based access control (RBAC) and governance. +- **Feature consistency** — Teams are assured that features generated for training sets and those served for model inference are consistent. This can especially be a concern for large organizations where multiple versions of the truth might persist. Much like how dbt and Snowflake help enterprises have a single source of data truth, now they can have a single source of truth for features. +- **Feature visibility and use** — The Snowflake Feature Store provides an intuitive SDK to work with ML features and their associated metadata. In addition, users can browse and search for features in the Snowflake UI, providing an easy way to identify features +- **Point-in-time correctness** — Snowflake retrieves point-in-time correct features using ASOF Joins, removing the significant complexity in generating the right feature value for a given time period whether for training or batch prediction retrieval. +- **Integration with data pipelines** — Teams that have already built data pipelines in dbt can continue to use these with the Snowflake Feature Store. No additional migration or feature re-creation is necessary as teams plug into the same pipelines. + +## Why did we integrate/build this with Snowflake? + +How does dbt help with ML workloads today? dbt plays a pivotal role in preparing data for ML models by transforming raw data into a format suitable for feature engineering. It helps orchestrate and automate these transformations, ensuring that data is clean, consistent, and ready for ML applications. The combination of Snowflake’s powerful AI Data Cloud and dbt’s transformation prowess makes it an unbeatable pair for organizations aiming to scale their ML operations efficiently. + +## Making it easier for ML/Data Engineers to both build & deploy ML data & models + +dbt is a perfect tool to promote collaboration between data engineers, ML engineers, and data scientists. dbt is designed to support collaboration and quality of data pipelines through features including version control, environments and development life cycles, as well as built-in data and pipeline testing. Leveraging dbt means that data engineers and data scientists can collaborate and develop new models and features while maintaining the rigorous governance and high quality that's needed. + +Additionally, dbt Mesh makes maintaining domain ownership extremely easy by breaking up portions of our data projects and pipelines into connected projects where critical models can be published for consumption by others with strict data contracts enforcing quality and governance. This paradigm supports rapid development as each project can be kept to a maintainable size for its contributors and developers. Contracting on published models used between these projects ensures the consistency of the integration points between them. 
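To make that contracting step concrete, here is a minimal sketch of how a producer project might publish a contracted model in its properties YAML. The model and column names below are illustrative, not taken from the example project:

```yaml
# Illustrative properties file in the producer project
models:
  - name: fct_customer_transactions   # hypothetical shared model
    access: public                    # downstream projects can ref() it across the mesh
    config:
      contract:
        enforced: true                # the build fails if the model drifts from this shape
    columns:
      - name: customer_id
        data_type: varchar
      - name: tx_amount
        data_type: number
```

With a definition like this, consumer projects depend on a stable, validated interface rather than on whatever the producer team happens to build that day.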
+ +Finally, dbt Cloud also provides [dbt Explorer](/docs/collaborate/explore-projects) — a perfect tool to catalog and share knowledge about organizational data across disparate teams. dbt Explorer provides a central place for information on data pipelines, including lineage information, data freshness, and quality. Best of all, dbt Explorer updates every time dbt jobs run, ensuring this information is always up-to-date and relevant. + +## What tech is at play? + +Here’s what you need from dbt. dbt should be used to manage data transformation pipelines and generate the datasets needed by ML engineers and data scientists maintaining the Snowflake Feature Store. dbt Cloud Enterprise users should leverage dbt Mesh to create different projects with clear owners for these different domains of data pipelines. This Mesh design will promote easier collaboration by keeping each dbt project smaller and more manageable for the people building and maintaining it. dbt also supports both SQL and Python-based transformations making it an ideal fit for AI/ML workflows, which commonly leverage both languages. + +Using dbt for the data transformation pipelines will also ensure the quality and consistency of data products, which is critical for ensuring successful AI/ML efforts. + +## Snowflake ML overview + +The Feature Store is one component of [Snowflake ML’s](https://www.snowflake.com/en/data-cloud/snowflake-ml/) integrated suite of machine learning features that powers end-to-end machine learning within a single platform. Data scientists and ML engineers leverage ready-to-use ML functions or build custom ML workflows all without any data movement or without sacrificing governance. Snowflake ML includes scalable feature engineering and model training capabilities. Meanwhile, the Feature Store and Model Registry allow teams to store and use features and models in production, providing an end-to-end suite for operating ML workloads at scale. + + +## What do you need to do to make it all work? + +dbt Cloud offers the fastest and easiest way to run dbt. It offers a Cloud-based IDE, Cloud-attached CLI, and even a low-code visual editor option (currently in beta), meaning it’s perfect for connecting users across different teams with different workflows and tooling preferences, which is very common in AI/ML workflows. This is the tool you will use to prepare and manage data for AI/ML, promote collaboration across the different teams needed for a successful AI/ML workflow, and ensure the quality and consistency of the underlying data that will be used to create features and train models. + +Organizations interested in AI/ML workflows through Snowflake should also look at the new dbt Snowflake Native App — a Snowflake Native Application that extends the functionality of dbt Cloud into Snowflake. Of particular interest is Ask dbt — a chatbot that integrates directly with Snowflake Cortex and the dbt Semantic Layer to allow natural language questions of Snowflake data. + + +## How to power ML pipelines with dbt and Snowflake’s Feature Store + +Let’s provide a brief example of what this workflow looks like in dbt and Snowflake to build and use the powerful capabilities of a Feature Store. For this example, consider that we have a data pipeline in dbt to process customer transaction data. Various data science teams in the organization need to derive features from these transactions to use in various models, including to predict fraud and perform customer segmentation and personalization. 
These different use cases all benefit from having related features, such as the count of transactions or purchased amounts over different periods of time (for example, the last day, 7 days, or 30 days) for a given customer. + +Instead of the data scientists building out their own workflows to derive these features, let’s look at the flow of using dbt to manage the feature pipeline and Snowflake’s Feature Store to solve this problem. The following subsections describe the workflow step by step. + +### Create feature tables as dbt models + +The first step consists of building out a feature table as a dbt model. Data scientists and data engineers plug in to existing dbt pipelines and derive a table that includes the underlying entity (for example, customer id, timestamp and feature values). The feature table aggregates the needed features at the appropriate timestamp for a given entity. Note that Snowflake provides various common feature and query patterns available [here](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/examples). So, in our example, we would see a given customer, timestamp, and features representing transaction counts and sums over various periods. Data scientists can use SQL or Python directly in dbt to build this table, which will push down the logic into Snowflake, allowing data scientists to use their existing skill set. + +Window aggregations play an important role in the creation of features. Because the logic for these aggregations is often complex, let’s see how Snowflake and dbt make this process easier by leveraging Don’t Repeat Yourself (DRY) principles. We’ll create a macro that will allow us to use Snowflake’s `range between` syntax in a repeatable way: + +```sql +{% macro rolling_agg(column, partition_by, order_by, interval='30 days', agg_function='sum') %} + {{ agg_function }}({{ column }}) over ( + partition by {{ partition_by }} + order by {{ order_by }} + range between interval '{{ interval }}' preceding and current row + ) +{% endmacro %} + +``` + +Now, we use this macro in our feature table to build out various aggregations of customer transactions over the last day, 7 days, and 30 days. Snowflake has just taken significant complexity away in generating appropriate feature values and dbt has just made the code even more readable and repeatable. While the following example is built in SQL, teams can also build these pipelines using Python directly. 
+ +```sql + +select + tx_datetime, + customer_id, + tx_amount, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "1 days", "sum") }} + as tx_amount_1d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "7 days", "sum") }} + as tx_amount_7d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "30 days", "sum") }} + as tx_amount_30d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "1 days", "avg") }} + as tx_amount_avg_1d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "7 days", "avg") }} + as tx_amount_avg_7d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "30 days", "avg") }} + as tx_amount_avg_30d, + {{ rolling_agg("*", "CUSTOMER_ID", "TX_DATETIME", "1 days", "count") }} + as tx_cnt_1d, + {{ rolling_agg("*", "CUSTOMER_ID", "TX_DATETIME", "7 days", "count") }} + as tx_cnt_7d, + {{ rolling_agg("*", "CUSTOMER_ID", "TX_DATETIME", "30 days", "count") }} + as tx_cnt_30d +from {{ ref("stg_transactions") }} + +``` + +### Create or connect to a Snowflake Feature Store + +Once a feature table is built in dbt, data scientists use Snowflake’s [snowflake-ml-python](https://docs.snowflake.com/en/developer-guide/snowflake-ml/snowpark-ml) package to create or connect to an existing Feature Store in Snowflake. Data scientists can do this all in Python, including in Jupyter Notebooks or directly in Snowflake using [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks). + +Let’s go ahead and create the Feature Store in Snowflake: + + +```sql +from snowflake.ml.feature_store import ( + FeatureStore, + FeatureView, + Entity, + CreationMode +) + +fs = FeatureStore( + session=session, + database=fs_db, + name=fs_schema, + default_warehouse='WH_DBT', + creation_mode=CreationMode.CREATE_IF_NOT_EXIST, +) + +``` + +### Create and register feature entities + +The next step consists of creating and registering [entities](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/entities). These represent the underlying objects that features are associated with, forming the join keys used for feature lookups. In our example, the data scientist can register various entities, including for the customer, a transaction id, or other necessary attributes. + +Let’s create some example entities. + +```python +customer = Entity(name="CUSTOMER", join_keys=["CUSTOMER_ID"]) +transaction = Entity(name="TRANSACTION", join_keys=["TRANSACTION_ID"]) +fs.register_entity(customer) +fs.register_entity(transaction) + +``` + +### Register feature tables as feature views + +After registering entities, the next step is to register a [feature view](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/feature-views). This represents a group of related features that stem from the features tables created in the dbt model. In this case, note that the feature logic, refresh, and consistency is managed by the dbt pipeline. The feature view in Snowflake enables versioning of the features while providing discoverability among teams. 
+ +```python +# Create a dataframe from our feature table produced in dbt +customers_transactions_df = session.sql(f""" + SELECT + CUSTOMER_ID, + TX_DATETIME, + TX_AMOUNT_1D, + TX_AMOUNT_7D, + TX_AMOUNT_30D, + TX_AMOUNT_AVG_1D, + TX_AMOUNT_AVG_7D, + TX_AMOUNT_AVG_30D, + TX_CNT_1D, + TX_CNT_7D, + TX_CNT_30D + FROM {fs_db}.{fs_data_schema}.ft_customer_transactions + """) + +# Create a feature view on top of these features +customer_transactions_fv = FeatureView( + name="customer_transactions_fv", + entities=[customer], + feature_df=customers_transactions_df, + timestamp_col="TX_DATETIME", + refresh_freq=None, + desc="Customer transaction features with window aggregates") + +# Register the feature view for use beyond the session +customer_transactions_fv = fs.register_feature_view( + feature_view=customer_transactions_fv, + version="1", + #overwrite=True, + block=True) + +``` + +### Search and discover features in the Snowflake UI + +Now, with features created, teams can view their features directly in the Snowflake UI, as shown below. This enables teams to easily search and browse features, all governed through Snowflake’s role-based access control (RBAC). + + + +### Generate training dataset + +Now that the feature view is created, data scientists produce a [training dataset](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/modeling#generating-tables-for-training) that uses the feature view. In our example, whether the data scientist is building a fraud or segmentation model, they will retrieve point-in-time correct features for a customer at a specific point in time using the Feature Store’s `generate_training_set` method. + +To generate the training set, we need to supply a spine dataframe, representing the entities and timestamp values that we will need to retrieve features for. The following example shows this using a few records, although teams can leverage other tables to produce this spine. + +```python +spine_df = session.create_dataframe( + [ + ('1', '3937', "2019-05-01 00:00"), + ('2', '2', "2019-05-01 00:00"), + ('3', '927', "2019-05-01 00:00"), + ], + schema=["INSTANCE_ID", "CUSTOMER_ID", "EVENT_TIMESTAMP"]) + +train_dataset = fs.generate_dataset( + name= "customers_fv", + version= "1_0", + spine_df=spine_df, + features=[customer_transactions_fv], + spine_timestamp_col= "EVENT_TIMESTAMP", + spine_label_cols = [] +) + +``` + +Now that we have produced the training dataset, let’s see what it looks like. + + + +### Train and deploy a model + +Now with this training set, data scientists can use [Snowflake Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/index) and [Snowpark ML Modeling](https://docs.snowflake.com/en/developer-guide/snowflake-ml/modeling) to use familiar Python frameworks for additional preprocessing, feature engineering, and model training all within Snowflake. The model can be registered in the Snowflake [Model Registry](https://docs.snowflake.com/en/developer-guide/snowflake-ml/model-registry/overview) for secure model management. Note that we will leave the model training for you as part of this exercise. + +### Retrieve features for predictions + +For inference, data pipelines retrieve feature values using the [retrieve_feature_values](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/modeling#retrieving-features-and-making-predictions) method. 
These retrieved values can be fed directly to a model’s predict capability in your Python session using a developed model or by invoking a model’s predict method from Snowflake’s Model Registry. For batch scoring purposes, teams can build this entire pipeline using [Snowflake ML](https://docs.snowflake.com/en/developer-guide/snowflake-ml/overview). The following code demonstrates how the features are retrieved using this method. + +```python +infernce_spine = session.create_dataframe( + [ + ('1', '3937', "2019-07-01 00:00"), + ('2', '2', "2019-07-01 00:00"), + ('3', '927', "2019-07-01 00:00"), + ], + schema=["INSTANCE_ID", "CUSTOMER_ID", "EVENT_TIMESTAMP"]) + +inference_dataset = fs.retrieve_feature_values( + spine_df=infernce_spine, + features=[customer_transactions_fv], + spine_timestamp_col="EVENT_TIMESTAMP", +) + +inference_dataset.to_pandas() + +``` + +Here’s an example view of our features produced for model inferencing. + + + +## Conclusion + +We’ve just seen how quickly and easily you can begin to develop features through dbt and leverage the Snowflake Feature Store to deliver predictive modeling as part of your data pipelines. The ability to build and deploy ML models, including integrating feature storage, data transformation, and ML logic within a single platform, simplifies the entire ML life cycle. Combining this new power with the well-established partnership of dbt and Snowflake unlocks even more potential for organizations to safely build and explore new AI/ML use cases and drive further collaboration in the organization. + +The code used in the examples above is publicly available on [GitHub](https://github.com/sfc-gh-rpettus/dbt-feature-store). Also, you can run a full example yourself in this [quickstart guide](https://quickstarts.snowflake.com/guide/getting-started-with-feature-store-and-dbt/index.html?index=..%2F..index#0) from the Snowflake docs. diff --git a/website/blog/authors.yml b/website/blog/authors.yml index 85f05a545f9..271130a477d 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -1,7 +1,7 @@ --- amy_chen: image_url: /img/blog/authors/achen.png - job_title: Product Ecosystem Manager + job_title: Product Manager links: - icon: fa-linkedin url: https://www.linkedin.com/in/yuanamychen/ @@ -386,6 +386,14 @@ lucas_bergodias: job_title: Analytics Engineer name: Lucas Bergo Dias organization: Indicium Tech +luis_leon: + image_url: /img/blog/authors/luis-leon.png + job_title: Partner Solutions Architect + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/luis-leon-03965463/ + name: Luis Leon + organization: dbt Labs matt_winkler: description: Matt is an ex-data scientist who chose to embrace the simplicity of using SQL to manage and testing data pipelines with dbt. He previously worked as a hands-on ML practitioner, and consulted with Fortune 500 clients to build and maintain ML Ops pipelines using (mostly) AWS Sagemaker. He lives in the Denver area, and you can say hello on dbt Slack or on LinkedIn. 
image_url: /img/blog/authors/matt-winkler.jpeg @@ -449,6 +457,14 @@ pedro_brito_de_sa: url: https://www.linkedin.com/in/pbritosa/ name: Pedro Brito de Sa organization: Sage +randy_pettus: + image_url: /img/blog/authors/randy-pettus.png + job_title: Senior Partner Sales Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/randypettus/ + name: Randy Pettus + organization: Snowflake rastislav_zdechovan: image_url: /img/blog/authors/rastislav-zdechovan.png job_title: Analytics Engineer diff --git a/website/blog/ctas.yml b/website/blog/ctas.yml index ac56d4cc749..1f9b13afa7b 100644 --- a/website/blog/ctas.yml +++ b/website/blog/ctas.yml @@ -25,3 +25,8 @@ subheader: Coalesce is the premiere analytics engineering conference! Sign up now for innovation, collaboration, and inspiration. Don't miss out! button_text: Register now url: https://coalesce.getdbt.com/register +- name: coalesce_2024_catchup + header: Missed Coalesce 2024? + subheader: Catch up on Coalesce 2024 and register to access a select number of on-demand sessions. + button_text: Register and watch + url: https://coalesce.getdbt.com/register/online diff --git a/website/blog/maching-learning-dbt-baton-pass.md b/website/blog/maching-learning-dbt-baton-pass.md index 2c38cfd8983..c1cb05fceab 100644 --- a/website/blog/maching-learning-dbt-baton-pass.md +++ b/website/blog/maching-learning-dbt-baton-pass.md @@ -145,20 +145,6 @@ This wouldn’t solve for the ML engineer and her desire to inject custom ML mod It may be worth having python scripts live side by side dbt jobs and configurations. I can get better lineage and have one less tool to context switch to. -#### fal - -- [fal](https://github.com/fal-ai/fal): Makes dbt and python interoperable. Read in a dbt model as a pandas dataframe using a [ref statement](https://github.com/fal-ai/fal/blob/b20874ab957f8eb0f65c56d82e6bb85c717de4c6/examples/write_jupyter_notebook.md#example-9-use-dbt-from-a-jupyter-notebook)! - -#### How would this change my story? - -**A ref statement would mean the same thing to both my ML engineer and me.** - -We would work in the same dbt project for the entire workflow, not just part of it. We would align python scripts to dbt configurations for better lineage (see below). - -![Screenshot of fal installation instructions](/img/blog/2022-02-18-machine-learning-dbt-baton-pass/fal-install-1.png) - -![Second screenshot of fal installation instructions](/img/blog/2022-02-18-machine-learning-dbt-baton-pass/fal-install-2.png) - #### What are the tradeoffs of this tool path? When things would go wrong, it’d still be a messy jumble to figure out how SQL changes inform python changes and vice versa. And I would need to care about which infrastructure my python code is running on. But my gut tells me the tradeoff would be worth it because there’d be less notebooks to schedule, and it’d be easier to align machine learning logic to dbt logic. 
diff --git a/website/blog/metadata.yml b/website/blog/metadata.yml index d0009fd62c4..8b53a7a2a04 100644 --- a/website/blog/metadata.yml +++ b/website/blog/metadata.yml @@ -2,7 +2,7 @@ featured_image: "" # This CTA lives in right sidebar on blog index -featured_cta: "coalesce_2024_signup" +featured_cta: "coalesce_2024_catchup" # Show or hide hero title, description, cta from blog index show_title: true diff --git a/website/dbt-global-variables.js b/website/dbt-global-variables.js deleted file mode 100644 index 8ee4499151e..00000000000 --- a/website/dbt-global-variables.js +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Full Documentation at: - * https://www.notion.so/dbtlabs/Versioning-on-Docusaurus-c6a4a41a66cd4ea2970854cc42cb5b70#1803b9cb666442e5ac8885cf0bba321f - * - */ - -exports.dbtVariables = { - // Example global variable with versioning - // If version 0.21 or lower is selected - // "Old Example String" will replace "Example String" - exampleString: { - name: "Example String", - versions: [ - { - "name": "Old Example String", - "version": "0.21" - } - ] - }, - dbtTheProduct: { - name: "dbt" - }, - dbtCore: { - name: "dbt Core" - }, - dbtCloud: { - name: "dbt Cloud" - }, - dbtIDE: { - name: "dbt Cloud IDE" - }, -} diff --git a/website/dbt-versions.js b/website/dbt-versions.js index 5ad3a3048c5..825af8ac6ee 100644 --- a/website/dbt-versions.js +++ b/website/dbt-versions.js @@ -1,7 +1,26 @@ +/** + * Sets the available dbt versions available in the navigation + * @type {Array.<{ + * version: string, + * EOLDate: string, + * isPrerelease: boolean, + * customDisplay: string, + * }>} + * @property {string} version The version number + * @property {string} EOLDate "End of Life" date which is used to show the EOL banner + * @property {boolean} isPrerelease Boolean used for showing the prerelease banner + * @property {string} customDisplay Allows setting a custom display name for the current version + * + * customDisplay for dbt Cloud should be a version ahead of latest dbt Core release (GA or beta). 
+ */ exports.versions = [ + { + version: "1.10", + customDisplay: "Cloud (Versionless)", + }, { version: "1.9", - isPrerelease: true, + isPrerelease: true, }, { version: "1.8", @@ -9,164 +28,51 @@ exports.versions = [ }, { version: "1.7", - EOLDate: "2024-10-30", + EOLDate: "2024-11-01", }, - { - version: "1.6", - EOLDate: "2024-07-31", - }, -] +]; +/** + * Controls doc page visibility in the sidebar based on the current version + * @type {Array.<{ + * page: string, + * lastVersion: string, + * }>} + * @property {string} page The target page to hide/show in the sidebar + * @property {string} lastVersion The last version the page is visible in the sidebar + */ exports.versionedPages = [ { - "page": "/reference/resource-configs/target_database", - "lastVersion": "1.8", - }, - { - "page": "/reference/resource-configs/target_schema", - "lastVersion": "1.8", - }, - { - "page": "reference/global-configs/indirect-selection", - "firstVersion": "1.8", - }, - { - "page": "reference/resource-configs/store_failures_as", - "firstVersion": "1.7", - }, - { - "page": "docs/build/build-metrics-intro", - "firstVersion": "1.6", - }, - { - "page": "docs/build/sl-getting-started", - "firstVersion": "1.6", - }, - { - "page": "docs/build/about-metricflow", - "firstVersion": "1.6", - }, - { - "page": "docs/build/join-logic", - "firstVersion": "1.6", - }, - { - "page": "docs/build/validation", - "firstVersion": "1.6", - }, - { - "page": "docs/build/semantic-models", - "firstVersion": "1.6", - }, - { - "page": "docs/build/group-by", - "firstVersion": "1.6", - }, - { - "page": "docs/build/entities", - "firstVersion": "1.6", - }, - { - "page": "docs/build/metrics-overview", - "firstVersion": "1.6", - }, - { - "page": "docs/build/cumulative", - "firstVersion": "1.6", - }, - { - "page": "docs/build/derived", - "firstVersion": "1.6", - }, - { - "page": "docs/build/measure-proxy", - "firstVersion": "1.6", + page: "docs/build/incremental-microbatch", + firstVersion: "1.9", }, - { - "page": "docs/build/ratio", - "firstVersion": "1.6", - }, - { - "page": "reference/commands/clone", - "firstVersion": "1.6", - }, - { - "page": "docs/collaborate/govern/project-dependencies", - "firstVersion": "1.6", - }, - { - "page": "reference/dbt-jinja-functions/thread_id", - "firstVersion": "1.6", - }, - { - "page": "reference/resource-properties/deprecation_date", - "firstVersion": "1.6", - }, - { - "page": "reference/commands/retry", - "firstVersion": "1.6", - }, - { - "page": "docs/build/groups", - "firstVersion": "1.5", - }, - { - "page": "docs/collaborate/govern/model-contracts", - "firstVersion": "1.5", - }, - { - "page": "reference/commands/show", - "firstVersion": "1.5", + { + page: "reference/resource-configs/snapshot_meta_column_names", + firstVersion: "1.9", }, { - "page": "docs/collaborate/govern/model-access", - "firstVersion": "1.5", + page: "reference/resource-configs/target_database", + lastVersion: "1.8", }, { - "page": "docs/collaborate/govern/model-versions", - "firstVersion": "1.5", + page: "reference/resource-configs/target_schema", + lastVersion: "1.8", }, { - "page": "reference/programmatic-invocations", - "firstVersion": "1.5", + page: "reference/global-configs/indirect-selection", + firstVersion: "1.8", }, - { - "page": "reference/resource-configs/contract", - "firstVersion": "1.5", - }, - { - "page": "reference/resource-configs/group", - "firstVersion": "1.5", - }, - { - "page": "reference/resource-properties/access", - "firstVersion": "1.5", - }, - { - "page": "reference/resource-properties/constraints", - 
"firstVersion": "1.5", - }, - { - "page": "reference/resource-properties/latest_version", - "firstVersion": "1.5", - }, - { - "page": "reference/resource-properties/versions", - "firstVersion": "1.5", - }, - { - "page": "reference/resource-configs/on_configuration_change", - "firstVersion": "1.6", - } -] +]; +/** + * Controls doc category visibility in the sidebar based on the current version + * @type {Array.<{ + * category: string, + * firstVersion: string, + * }>} + * @property {string} category The target category to hide/show in the sidebar + * @property {string} firstVersion The first version the category is visible in the sidebar + */ exports.versionedCategories = [ - { - "category": "Model governance", - "firstVersion": "1.5", - }, - { - "category": "Build your metrics", - "firstVersion": "1.6", - } -] + +]; diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md index 98276776019..da882dba6c5 100644 --- a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md +++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md @@ -241,7 +241,9 @@ measures: ## Reviewing our work -Our completed code will look like this, our first semantic model! +Our completed code will look like this, our first semantic model! Here are two examples showing different organizational approaches: + + @@ -288,12 +290,74 @@ semantic_models: description: The total tax paid on each order. agg: sum ``` + + + + + + +```yml +semantic_models: + - name: orders + defaults: + agg_time_dimension: ordered_at + description: | + Order fact table. This table is at the order grain with one row per order. + + model: ref('stg_orders') + + entities: + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + + dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using BigQuery + type: time + type_params: + time_granularity: day + - name: is_large_order + type: categorical + expr: case when order_total > 50 then true else false end + + measures: + - name: order_total + description: The total revenue for each order. + agg: sum + - name: order_count + description: The count of individual orders. + expr: 1 + agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum +``` + + +As you can see, the content of the semantic model is identical in both approaches. The key differences are: + +1. **File location** + - Co-located approach: `models/marts/orders.yml` + - Parallel sub-folder approach: `models/semantic_models/sem_orders.yml` + +2. **File naming** + - Co-located approach: Uses the same name as the corresponding mart (`orders.yml`) + - Parallel sub-folder approach: Prefixes the file with `sem_` (`sem_orders.yml`) + +Choose the approach that best fits your project structure and team preferences. The co-located approach is often simpler for new projects, while the parallel sub-folder approach can be clearer for migrating large existing projects to the Semantic Layer. ## Next steps Let's review the basics of semantic models: -- 🧱 Consist off **entities, dimensions, and measures**. +- 🧱 Consist of **entities, dimensions, and measures**. - 🫂 Describe the **semantics and relationships of objects** in the warehouse. 
- 1️⃣ Correspond to a **single logical model** in your dbt project. diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-7-semantic-structure.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-7-semantic-structure.md index 295d86e9c20..5bfbea82dda 100644 --- a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-7-semantic-structure.md +++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-7-semantic-structure.md @@ -20,6 +20,10 @@ The first thing you need to establish is how you’re going to consistently stru It’s not terribly difficult to shift between these (it can be done with some relatively straightforward shell scripting), and this is purely a decision based on your developers’ preference (i.e. it has no impact on execution or performance), so don’t feel locked in to either path. Just pick the one that feels right and you can always shift down the road if you change your mind. +:::tip +Make sure to save all semantic models and metrics under the directory defined in the [`model-paths`](/reference/project-configs/model-paths) (or a subdirectory of it, like `models/semantic_models/`). If you save them outside of this path, it will result in an empty `semantic_manifest.json` file, and your semantic models or metrics won't be recognized. +::: + ## Naming Next, establish your system for consistent file naming: diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-8-refactor-a-rollup.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-8-refactor-a-rollup.md index 10c69566a7e..5dbb1e0517d 100644 --- a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-8-refactor-a-rollup.md +++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-8-refactor-a-rollup.md @@ -49,26 +49,26 @@ So far we've been working in new pointing at a staging model to simplify things ```yaml semantic_models: -- name: locations - description: | - Location dimension table. The grain of the table is one row per location. - model: ref('stg_locations') - entities: - - name: location - type: primary - expr: location_id - dimensions: - - name: location_name - type: categorical - - name: date_trunc('day', opened_at) - type: time - type_params: - time_granularity: day - measures: - - name: average_tax_rate - description: Average tax rate. - expr: tax_rate - agg: avg + - name: locations + description: | + Location dimension table. The grain of the table is one row per location. + model: ref('stg_locations') + entities: + - name: location + type: primary + expr: location_id + dimensions: + - name: location_name + type: categorical + - name: date_trunc('day', opened_at) + type: time + type_params: + time_granularity: day + measures: + - name: average_tax_rate + description: Average tax rate. + expr: tax_rate + agg: avg ``` ## Semantic and logical interaction diff --git a/website/docs/best-practices/how-we-mesh/mesh-2-who-is-dbt-mesh-for.md b/website/docs/best-practices/how-we-mesh/mesh-2-who-is-dbt-mesh-for.md index b6fadc2d7a6..4c8adfa86a1 100644 --- a/website/docs/best-practices/how-we-mesh/mesh-2-who-is-dbt-mesh-for.md +++ b/website/docs/best-practices/how-we-mesh/mesh-2-who-is-dbt-mesh-for.md @@ -23,9 +23,6 @@ Is dbt Mesh a good fit in this scenario? Absolutely! There is no other way to sh - Onboarding hundreds of people and dozens of projects is full of friction! The challenges of a scaled, global organization are not to be underestimated. 
To start the migration, prioritize teams that have strong dbt familiarity and fundamentals. dbt Mesh is an advancement of core dbt deployments, so these teams are likely to have a smoother transition. Additionally, prioritize teams that manage strategic data assets that need to be shared widely. This ensures that dbt Mesh will help your teams deliver concrete value quickly. -- Bi-directional project dependencies -- currently, projects in dbt Mesh are treated like dbt resources in that they cannot depend on each other. However, many teams may want to be able to share data assets back and forth between teams. - - We've added support for [enabling bidirectional dependencies](/best-practices/how-we-mesh/mesh-3-structures#cycle-detection) across projects. If this sounds like your organization, dbt Mesh is the architecture you should pursue. ✅ diff --git a/website/docs/best-practices/how-we-mesh/mesh-3-structures.md b/website/docs/best-practices/how-we-mesh/mesh-3-structures.md index c75c566610b..38066811d8a 100644 --- a/website/docs/best-practices/how-we-mesh/mesh-3-structures.md +++ b/website/docs/best-practices/how-we-mesh/mesh-3-structures.md @@ -66,7 +66,7 @@ Since the launch of dbt Mesh, the most common pattern we've seen is one where pr Users may need to contribute models across multiple projects and this is fine. There will be some friction doing this, versus a single repo, but this is _useful_ friction, especially if upstreaming a change from a “spoke” to a “hub.” This should be treated like making an API change, one that the other team will be living with for some time to come. You should be concerned if your teammates find they need to make a coordinated change across multiple projects very frequently (every week), or as a key prerequisite for ~20%+ of their work. -### Cycle detection +### Cycle detection import CycleDetection from '/snippets/_mesh-cycle-detection.md'; diff --git a/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md b/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md index f1fb7422acf..a884de90c49 100644 --- a/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md +++ b/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md @@ -80,7 +80,7 @@ models: ## Split your projects 1. **Move your grouped models into a subfolder**. This will include any model in the selected group, it's associated YAML entry, as well as its parent or child resources as appropriate depending on where this group sits in your DAG. - 1. Note that just like in your dbt project, circular refereneces are not allowed! Project B cannot have parents and children in Project A, for example. + 1. Note that just like in your dbt project, circular references are not allowed! Project B cannot have parents and children in Project A, for example. 2. **Create a new `dbt_project.yml` file** in the subdirectory. 3. **Copy any macros** used by the resources you moved. 4. **Create a new `packages.yml` file** in your subdirectory with the packages that are used by the resources you moved. 
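To make steps 2 and 4 above concrete, here is a minimal, hypothetical sketch of what the new files in the split-out subdirectory could look like. The project name, profile, and package pin are illustrative placeholders rather than values from this guide:

```yml
# <subdirectory>/dbt_project.yml: hypothetical starter config for the split-out project
name: "finance"            # placeholder name for the new project
version: "1.0.0"
profile: "finance"         # assumes a matching profile exists in profiles.yml
model-paths: ["models"]
macro-paths: ["macros"]    # home for any macros copied over in step 3

models:
  finance:
    +materialized: table   # carry over any defaults the moved models relied on
```

```yml
# <subdirectory>/packages.yml: only the packages the moved resources actually use
packages:
  - package: dbt-labs/dbt_utils
    version: [">=1.0.0", "<2.0.0"]
```

After creating these files, running `dbt deps` and then `dbt parse` from the new project directory is a quick way to confirm that the split-out project resolves its own packages and references.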
diff --git a/website/docs/best-practices/how-we-mesh/mesh-5-faqs.md b/website/docs/best-practices/how-we-mesh/mesh-5-faqs.md index 1ae49928ae5..9f12f7d2c20 100644 --- a/website/docs/best-practices/how-we-mesh/mesh-5-faqs.md +++ b/website/docs/best-practices/how-we-mesh/mesh-5-faqs.md @@ -215,7 +215,7 @@ There’s model-level access within dbt, role-based access for users and groups First things first: access to underlying data is always defined and enforced by the underlying data platform (for example, BigQuery, Databricks, Redshift, Snowflake, Starburst, etc.) This access is managed by executing “DCL statements” (namely `grant`). dbt makes it easy to [configure `grants` on models](/reference/resource-configs/grants), which provision data access for other roles/users/groups in the data warehouse. However, dbt does _not_ automatically define or coordinate those grants unless they are configured explicitly. Refer to your organization's system for managing data warehouse permissions. -[dbt Cloud Enterprise plans](https://www.getdbt.com/pricing) support [role-based access control (RBAC)](/docs/cloud/manage-access/enterprise-permissions#how-to-set-up-rbac-groups-in-dbt-cloud) that manages granular permissions for users and user groups. You can control which users can see or edit all aspects of a dbt Cloud project. A user’s access to dbt Cloud projects also determines whether they can “explore” that project in detail. Roles, users, and groups are defined within the dbt Cloud application via the UI or by integrating with an identity provider. +[dbt Cloud Enterprise plans](https://www.getdbt.com/pricing) support [role-based access control (RBAC)](/docs/cloud/manage-access/about-user-access#role-based-access-control-) that manages granular permissions for users and user groups. You can control which users can see or edit all aspects of a dbt Cloud project. A user’s access to dbt Cloud projects also determines whether they can “explore” that project in detail. Roles, users, and groups are defined within the dbt Cloud application via the UI or by integrating with an identity provider. [Model access](/docs/collaborate/govern/model-access) defines where models can be referenced. It also informs the discoverability of those projects within dbt Explorer. Model `access` is defined in code, just like any other model configuration (`materialized`, `tags`, etc). diff --git a/website/docs/best-practices/how-we-structure/4-marts.md b/website/docs/best-practices/how-we-structure/4-marts.md index 21de31a9e0d..995dea7e96f 100644 --- a/website/docs/best-practices/how-we-structure/4-marts.md +++ b/website/docs/best-practices/how-we-structure/4-marts.md @@ -26,7 +26,8 @@ models/marts ✅ **Group by department or area of concern.** If you have fewer than 10 or so marts you may not have much need for subfolders, so as with the intermediate layer, don’t over-optimize too early. If you do find yourself needing to insert more structure and grouping though, use useful business concepts here. In our marts layer, we’re no longer worried about source-conformed data, so grouping by departments (marketing, finance, etc.) is the most common structure at this stage. -✅ **Name by entity.** Use plain English to name the file based on the concept that forms the grain of the mart `customers`, `orders`. Note that for pure marts, there should not be a time dimension (`orders_per_day`) here, that is typically best captured via metrics. 
+✅ **Name by entity.** Use plain English to name the file based on the concept that forms the grain of the mart’s `customers`, `orders`. Marts that don't include any time-based rollups (pure marts) should not have a time dimension (`orders_per_day`) here, typically best captured via metrics. + ❌ **Build the same concept differently for different teams.** `finance_orders` and `marketing_orders` is typically considered an anti-pattern. There are, as always, exceptions — a common pattern we see is that, finance may have specific needs, for example reporting revenue to the government in a way that diverges from how the company as a whole measures revenue day-to-day. Just make sure that these are clearly designed and understandable as _separate_ concepts, not departmental views on the same concept: `tax_revenue` and `revenue` not `finance_revenue` and `marketing_revenue`. diff --git a/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md b/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md index 2dca148a226..9358b507acc 100644 --- a/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md +++ b/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md @@ -102,12 +102,14 @@ We’ve focused heavily thus far on the primary area of action in our dbt projec ### Project splitting -One important, growing consideration in the analytics engineering ecosystem is how and when to split a codebase into multiple dbt projects. Our present stance on this for most projects, particularly for teams starting out, is straightforward: you should avoid it unless you have no other option or it saves you from an even more complex workaround. If you do have the need to split up your project, it’s completely possible through the use of private packages, but the added complexity and separation is, for most organizations, a hindrance not a help, at present. That said, this is very likely subject to change! [We want to create a world where it’s easy to bring lots of dbt projects together into a cohesive lineage](https://github.com/dbt-labs/dbt-core/discussions/5244). In a world where it’s simple to break up monolithic dbt projects into multiple connected projects, perhaps inside of a modern monorepo, the calculus will be different, and the below situations we recommend against may become totally viable. So watch this space! +One important, growing consideration in the analytics engineering ecosystem is how and when to split a codebase into multiple dbt projects. Currently, our advice for most teams, especially those just starting, is fairly simple: in most cases, we recommend doing so with [dbt Mesh](/best-practices/how-we-mesh/mesh-1-intro)! dbt Mesh allows organizations to handle complexity by connecting several dbt projects rather than relying on one big, monolithic project. This approach is designed to speed up development while maintaining governance. -- ❌ **Business groups or departments.** Conceptual separations within the project are not a good reason to split up your project. Splitting up, for instance, marketing and finance modeling into separate projects will not only add unnecessary complexity, but destroy the unifying effect of collaborating across your organization on cohesive definitions and business logic. -- ❌ **ML vs Reporting use cases.** Similarly to the point above, splitting a project up based on different use cases, particularly more standard BI versus ML features, is a common idea. We tend to discourage it for the time being. 
As with the previous point, a foundational goal of implementing dbt is to create a single source of truth in your organization. The features you’re providing to your data science teams should be coming from the same marts and metrics that serve reports on executive dashboards. There are a growing number of tools like [fal](https://blog.fal.ai/introducing-fal-dbt/) and [Continual.ai](http://Continual.ai) that make excellent use of this unified viewpoint. -- ✅ **Data governance.** Structural, organizational needs — such as data governance and security — are one of the few worthwhile reasons to split up a project. If, for instance, you work at a healthcare company with only a small team cleared to access raw data with PII in it, you may need to split out your staging models into their own project to preserve those policies. In that case, you would import your staging project into the project that builds on those staging models as a [private package](https://docs.getdbt.com/docs/build/packages/#private-packages). +As breaking up monolithic dbt projects into smaller, connected projects, potentially within a modern mono repo becomes easier, the scenarios we currently advise against may soon become feasible. So watch this space! + +- ✅ **Business groups or departments.** Conceptual separations within the project are the primary reason to split up your project. This allows your business domains to own their own data products and still collaborate using dbt Mesh. For more information about dbt Mesh, please refer to our [dbt Mesh FAQs](/best-practices/how-we-mesh/mesh-5-faqs). +- ✅ **Data governance.** Structural, organizational needs — such as data governance and security — are one of the few worthwhile reasons to split up a project. If, for instance, you work at a healthcare company with only a small team cleared to access raw data with PII in it, you may need to split out your staging models into their own projects to preserve those policies. In that case, you would import your staging project into the project that builds on those staging models as a [private package](https://docs.getdbt.com/docs/build/packages/#private-packages). - ✅ **Project size.** At a certain point, your project may grow to have simply too many models to present a viable development experience. If you have 1000s of models, it absolutely makes sense to find a way to split up your project. +- ❌ **ML vs Reporting use cases.** Similarly to the point above, splitting a project up based on different use cases, particularly more standard BI versus ML features, is a common idea. We tend to discourage it for the time being. As with the previous point, a foundational goal of implementing dbt is to create a single source of truth in your organization. The features you’re providing to your data science teams should be coming from the same marts and metrics that serve reports on executive dashboards. ## Final considerations diff --git a/website/docs/community/resources/getting-help.md b/website/docs/community/resources/getting-help.md index 19b7c22fbdf..e8dba3ef918 100644 --- a/website/docs/community/resources/getting-help.md +++ b/website/docs/community/resources/getting-help.md @@ -55,9 +55,9 @@ If you need dedicated support to build your dbt project, consider reaching out r If you want to receive dbt training, check out our [dbt Learn](https://learn.getdbt.com/) program. 
## dbt Cloud support

-**Note:** If you are a **dbt Cloud user** and need help with one of the following issues, please reach out to us by using the speech bubble (💬) in the dbt Cloud interface or at support@getdbt.com
+**Note:** If you are a **dbt Cloud user** and need help with one of the following issues, please reach out to us by clicking **Create a support ticket** in the dbt Cloud navigation or by emailing support@getdbt.com:

- Account setup (e.g. connection issues, repo connections)
- Billing
- Bug reports related to the web interface

-As a rule of thumb, if you are using dbt Cloud, but your problem is related to code within your dbt project, then please follow the above process rather than reaching out to support. Refer to [dbt Cloud support](/docs/dbt-support) for more information.
+As a rule of thumb, if you are using dbt Cloud, but your problem is related to code within your dbt project, then please follow the above process or check out the [FAQs](/docs/faqs) rather than reaching out to support. Refer to [dbt Cloud support](/docs/dbt-support) for more information.
diff --git a/website/docs/community/resources/speaking-at-a-meetup.md b/website/docs/community/resources/speaking-at-a-meetup.md
index cc1a424139e..4411e9f22d9 100644
--- a/website/docs/community/resources/speaking-at-a-meetup.md
+++ b/website/docs/community/resources/speaking-at-a-meetup.md
@@ -63,108 +63,106 @@ Now, it’s time to write! Rather than starting with a slide deck, open up a bla

Don’t get too hung up on a title at this stage — we’re happy to work with you on that later in the process.

-### The basic structure
+## The basic structure

Below, we’ve outlined a common structure used for meetup talks — if this is your first talk, this is a great way to get started (in fact, even experienced speakers often use a structure like this). Use this as a starting point, rather than an exact formula!

-###### 1. What is the business problem?
+1. What is the business problem?
+ Relating to a business problem helps audience members understand why you undertook a project. For example:
+ - The finance team didn’t trust our numbers
+ - We were never sure what led to an increase in customer conversion
+ - The data team couldn’t find a balance between ad hoc requests and roadmap work
+ - Our tracking across mobile and web was completely inconsistent

-Relating to a business problem helps audience members understand why you undertook a project. For example:
-- The finance team didn’t trust our numbers
-- We were never sure what led to an increase in customer conversion
-- The data team couldn’t find a balance between ad hoc requests and roadmap work
-- Our tracking across mobile and web was completely inconsistent

+2. How did this manifest?
+ Include evidence that this is a genuine problem — this helps create buy-in from the audience. Slack screenshots, quotes, charts, etc. are all good here!

-###### 2. How did this manifest?
-Include evidence that this is a genuine problem — this helps create buy-in from the audience. Slack screenshots, quotes, charts, etc. are all good here!

+3. What tactics were used to solve the problem?
+ Three feels like a good number here. Make sure to emphasize people and process solutions as well as technology solutions.

-###### 3. What tactics were used to solve the problem?
-Three feels like a good number here. Make sure to emphasize people and process solutions as well as technology solutions.

+4. What was the impact on the business problem?
+ Since you set out a problem to be solved, it’s worth revisiting it. It’s okay if you found that your project didn’t go as planned — there’s a valuable lesson in there. Again, including evidence of improvement feels valuable. -###### 4. What was the impact on the business problem? -Since you set out a problem to be solved, it’s worth revisiting it. It’s okay if you found that your project didn’t go as planned — there’s a valuable lesson in there. Again, including evidence of improvement feels valuable. - -###### 5. What other things were learned, and/or what next steps are you taking? -Summarize high level lessons that others can take-away, and potentially talk about what you’d do differently, or what you plan on doing next. +5. What other things were learned, and/or what next steps are you taking? + Summarize high level lessons that others can take-away, and potentially talk about what you’d do differently, or what you plan on doing next. ### Why does this structure work? -The above structure might seem formulaic, but we’ve seen it work a number of times. In our opinion, this structure works because: +The previous structure might seem formulaic, but we’ve seen it work a number of times. In our opinion, this structure works because: - **Your presentation has the structure of a story** — problem, journey, solution. Human beings love stories, and so the flow feels natural and easy for your audience to follow. - **It increases the target audience**. Sharing a few different tactics means that it’s more likely there will be something in your talk for different audience members. Compare that to narrowly scoping a talk on “[Writing packages when a source table may or may not exist](https://discourse.getdbt.com/t/writing-packages-when-a-source-table-may-or-may-not-exist/1487)”— it’s not going to feel relevant to most people in the room. - **It covers both theory and application.** Too much theory and you’re giving a TedTalk, too much application and you’re just giving a product demo. The best Meetup talks help people understand how you thought through a problem and why you made certain decisions so they can apply your knowledge within their unique context. -### Examples that follow this structure +## Examples that follow this structure Here's a few of our favorite talks mapped to the structure — trust us, it works! -#### Improving data reliability — Andrea Kopitz, Envoy +### Improving data reliability — Andrea Kopitz, Envoy *[Video](https://www.youtube.com/watch?v=M_cNspn2XsE), [slides](https://docs.google.com/presentation/d/1gHChax5aM3tqKkhepX7Mghmg0DTDbY5yoDBCfUR23lg/).* -###### 1. What is the business problem? +1. What is the business problem? Envoy’s financial data appeared inconsistent. -###### 2. How did this manifest? +2. How did this manifest? Respondents to the team’s data survey said they no longer trusted the data. -###### 3. What tactics were used to solve the problem? -1. Determine responsibility -2. Build more specific dbt tests -3. Track progress +3. What tactics were used to solve the problem? + 1. Determine responsibility + 2. Build more specific dbt tests + 3. Track progress -###### 4. What was the impact on the business problem? +4. What was the impact on the business problem? In their next data survey, satisfaction rating increased, and no mention of financial data accuracy. -###### 5. What other things were learned, and/or what next steps are you taking? +5. What other things were learned, and/or what next steps are you taking? 
Lesson: Send out a data survey to your company to inform your roadmap. -#### Predicting customer conversions with dbt + machine learning — Kenny Ning, Better.com - +### Predicting customer conversions with dbt + machine learning — Kenny Ning, Better.com *[Video](https://www.youtube.com/watch?v=BF7HH8JDUS0), [slides](https://docs.google.com/presentation/d/1iqVjzxxRggMnRoI40ku88miDKw795djpKV_v4bbLpPE/).* -###### 1. What is the business problem? -No one knew why conversion rates for better.com customers would improve or worsen, making it difficult to know the value of different parts of the business. +1. What is the business problem? + No one knew why conversion rates for better.com customers would improve or worsen, making it difficult to know the value of different parts of the business. -###### 2. How did this manifest? -Different parts of the business took responsibility when it improved, no one took responsibility when it worsened. +2. How did this manifest? + Different parts of the business took responsibility when it improved, no one took responsibility when it worsened. -###### 3. What tactics were used to solve the problem? -1. Use a different approach to conversion rates — kaplan-meier conversion rates -2. Sketch out an ideal ML solution and see if it theoretically solves the problem -3. Build it! (ft. demonstration of solution) +3. What tactics were used to solve the problem? + 1. Use a different approach to conversion rates — kaplan-meier conversion rates + 2. Sketch out an ideal ML solution and see if it theoretically solves the problem + 3. Build it! (ft. demonstration of solution) -###### 4. What was the impact on the business problem? -In the end — not as valuable as originally hoped (and that’s ok!). Editor note: [this article](https://better.engineering/2020-06-24-wizard-part-ii/) was a great follow up on the initial project. +4. What was the impact on the business problem? + In the end — not as valuable as originally hoped (and that’s ok!). Editor note: [this article](https://better.engineering/2020-06-24-wizard-part-ii/) was a great follow up on the initial project. -###### 5. What other things were learned, and/or what next steps are you taking? -- Focus on end-to-end solutions -- Materialize your clean dataset to improve collaboration -- Sell to the business +5. What other things were learned, and/or what next steps are you taking? + - Focus on end-to-end solutions + - Materialize your clean dataset to improve collaboration + - Sell to the business -#### Migrating 387 models from Redshift to Snowflake — Bowery Farming Data Team +### Migrating 387 models from Redshift to Snowflake — Bowery Farming Data Team *[Video](https://www.youtube.com/watch?v=VhH614WVufM), [slides](https://docs.google.com/presentation/d/1wE8NSkFPLFKGQ8fvFUUKoZFVoUhws_FhFip-9mDhoPU/).* -###### 1. What is the business problem? -A new Bowery Farming site had increased the amount of data the team were dealing with, which put a strain on their data stack. +1. What is the business problem? + A new Bowery Farming site had increased the amount of data the team were dealing with, which put a strain on their data stack. -###### 2. How did this manifest? -Charts show increased dbt run times, and increased Redshift costs. +3. How did this manifest? + Charts show increased dbt run times, and increased Redshift costs. -###### 3. What tactics were used to solve the problem? -1. Push Redshift to its limit: Leverage Athena, Redshift configurations, separate clusters, python pre-processing -2. 
Trial Snowflake for cost and performance -3. Commit to a migration with strong project management +3. What tactics were used to solve the problem? + 1. Push Redshift to its limit: Leverage Athena, Redshift configurations, separate clusters, python pre-processing + 2. Trial Snowflake for cost and performance + 3. Commit to a migration with strong project management -###### 4. What was the impact on the business problem? -Yet to be determined (at the time, they had just finished the project). But the team showed evidence that the project has been successfully completed! +4. What was the impact on the business problem? + Yet to be determined (at the time, they had just finished the project). But the team showed evidence that the project has been successfully completed! -###### 5. What other things were learned, and/or what next steps are you taking? -Other things learned: -- Differences between Redshift and Snowflake SQL syntax -- Teamwork and coordination are key to completing a migration +5. What other things were learned, and/or what next steps are you taking? + Other things learned: + - Differences between Redshift and Snowflake SQL syntax + - Teamwork and coordination are key to completing a migration ## Turn it into a presentation Now, it's time to take your idea and turn it into a presentation. @@ -210,7 +208,7 @@ For virtual events: is there a poll you can launch, or a question you can throw ## Pair it with a blog post -The hardest part of nailing a great talk is the content, so if you’ve made it this far, you’ve already done most of the work. Turning your content into a blog post is a great way to solidify your thinking, and get some extra exposure. If you’d like to be features on the [dbt Blog](https://blog.getdbt.com/), reach out to us (@Claire and @Janessa) on Slack. +The hardest part of nailing a great talk is the content, so if you’ve made it this far, you’ve already done most of the work. Turning your content into a blog post is a great way to solidify your thinking, and get some extra exposure. If you’d like to be featured on the [dbt Blog](https://blog.getdbt.com/), please email us at [community@dbtlabs.com](mailto:community@dbtlabs.com). We’ll also be adding more resources on how to write about your work soon! @@ -246,4 +244,4 @@ Do any audience members use a communication device? Can you share your slides ah ### Responding to a conference Call for Speakers -If you’re submitting a response for a Call for Speakers, and talking about dbt, we’re happy to work with you on this. Reach out to us (@Claire and @Janessa) in Slack to let us know! +If you’re submitting a response for a Call for Speakers, and talking about dbt, we’re happy to work with you on this. You may email us at [community@dbtlabs.com](mailto:community@dbtlabs.com) for more information. diff --git a/website/docs/community/spotlight/alison-stanton.md b/website/docs/community/spotlight/alison-stanton.md index fd4a1796411..be9902dcdb1 100644 --- a/website/docs/community/spotlight/alison-stanton.md +++ b/website/docs/community/spotlight/alison-stanton.md @@ -18,6 +18,7 @@ socialLinks: dateCreated: 2023-11-07 hide_table_of_contents: true communityAward: true +communityAwardYear: 2023 --- ## When did you join the dbt community and in what way has it impacted your career? 
diff --git a/website/docs/community/spotlight/bruno-de-lima.md b/website/docs/community/spotlight/bruno-de-lima.md index 3cce6135ae0..3c373db06e8 100644 --- a/website/docs/community/spotlight/bruno-de-lima.md +++ b/website/docs/community/spotlight/bruno-de-lima.md @@ -2,41 +2,39 @@ id: bruno-de-lima title: Bruno de Lima description: | - Hi all! I'm a Data Engineer, deeply fascinated by the awesomeness dbt. I love talking about dbt, creating content from daily tips to blogposts and engaging with this vibrant community! - - Started my career at the beginning of 2022 at Indicium as an Analytics Engineer, working with dbt from day 1. By 2023, my path took a global trajectory as I joined phData as a Data Engineer, expanding my experiences and forging connections beyond Brazil. While dbt is at the heart of my expertise, I've also delved into data warehouses such as Snowflake, Databricks, and BigQuery; visualization tools like Power BI and Tableau; and several minor modern data stack tools. - - I actively participate in the dbt community, having attended two dbt Meetups in Brazil organized by Indicium; writing about dbt-related topics in my Medium and LinkedIn profiles; contributing to the code; and frequently checking dbt Slack and Discourse, helping (and being helped by) other dbt practitioners. If you are a community member, you may have seen me around! -image: /img/community/spotlight/bruno-de-lima.jpg + Hey all! I was born and raised in Florianopolis, Brazil, and I'm a Senior Data Engineer at phData. I live with my fiancée and I enjoy music, photography, and powerlifting. + + I started my career in early 2022 at Indicium as an Analytics Engineer, working with dbt from day 1. By 2023, my path took a global trajectory as I joined phData as a Data Engineer, expanding my experiences and creating connections beyond Brazil. While dbt is my main expertise, because of my work in consultancy I have experience with a large range of tools, specially the ones related to Snowflake, Databricks, AWS and GCP; but I have already tried several other modern data stack tools too. + + I actively participate in the dbt community, having organized dbt Meetups in Brazil (in Floripa and São Paulo); writing about dbt-related topics in my Medium and LinkedIn profiles; contributing to the dbt Core code and to the docs; and frequently checking dbt Slack and Discourse, helping (and being helped by) other dbt practitioners. If you are a community member, you may have seen me around! +image: /img/community/spotlight/bruno-souza-de-lima-newimage.jpg pronouns: he/him location: Florianópolis, Brazil -jobTitle: Data Engineer +jobTitle: Senior Data Engineer companyName: phData -organization: "" socialLinks: - name: LinkedIn link: https://www.linkedin.com/in/brunoszdl/ - name: Medium link: https://medium.com/@bruno.szdl -dateCreated: 2023-11-05 +dateCreated: 2024-11-03 hide_table_of_contents: true communityAward: true +communityAwardYear: 2024 --- ## When did you join the dbt community and in what way has it impacted your career? -I was not truly happy with my academic life. My career took a new turn when I enrolled in the Analytics Engineer course by Indicium. That was my first contact with dbt, and I didn't realize how much it would transform my career. After that, I was hired at the company as an Analytics Engineer and worked extensively with dbt from day one. +I was not truly happy with my academic life. My career took a new turn when I enrolled in the Analytics Engineer course by Indicium. 
That was my first contact with dbt, and I didn't realize how much it would transform my career. After that, I was hired at the company as an Analytics Engineer and worked extensively with dbt from day one. It took me some time to become an active member of the dbt community. I started working with dbt at the beginning of 2022 and became more involved towards the end of that year, encouraged by Daniel Avancini. I regret not doing this earlier, because being an active community member has been a game-changer for me, as my knowledge of dbt has grown exponentially just by participating in daily discussions on Slack. I have found #advice-dbt-help and #advice-dbt-for-power-users channels particularly useful, as well as the various database-specific channels. Additionally, the #i-made-this and #i-read-this channels have allowed me to learn about the innovative things that community members are doing. Inspired by other members, especially Josh Devlin and Owen Prough, I began answering questions on Slack and Discourse. For questions I couldn't answer, I would try engaging in discussions about possible solutions or provide useful links. I also started posting dbt tips on LinkedIn to help practitioners learn about new features or to refresh their memories about existing ones. -By being more involved in the community, I felt more connected and supported. I received help from other members, and now, I could help others, too. I was happy with this arrangement, but more unexpected surprises came my way. My active participation in Slack, Discourse, and LinkedIn opened doors to new connections and career opportunities. I had the pleasure of meeting a lot of incredible people and receiving exciting job offers, including the one for working at phData. +By being more involved in the community, I felt more connected and supported. I received help from other members, and now, I could help others, too. I was happy with this arrangement, but more unexpected surprises came my way. My active participation in Slack, Discourse, and LinkedIn opened doors to new connections and career opportunities. I had the pleasure of meeting a lot of incredible people and receiving exciting job offers, including the ones for working at phData and teaching at Zach Wilson's data engineering bootcamp. Thanks to the dbt community, I went from feeling uncertain about my career prospects to having a solid career and being surrounded by incredible people. -I would like to thank the Indicium folks for opening the first door for me for this career in data, and not just for me but for lots of people in Brazil trying to migrate from different fields who would not have this opportunity otherwise. - ## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? I identify with Gwen Windflower and Joel Labes, or at least they are the kind of leader I admire. Their strong presence and continuous interaction with all types of dbt enthusiasts make everyone feel welcomed in the community. They uplift those who contribute to the community, whether it's through a LinkedIn post or answering a question, and provide constructive feedback to help them improve. And of course they show a very strong knowledge about dbt and data in general, which is reflected in their contributions. 
diff --git a/website/docs/community/spotlight/christophe-oudar.md b/website/docs/community/spotlight/christophe-oudar.md new file mode 100644 index 00000000000..2381d88a381 --- /dev/null +++ b/website/docs/community/spotlight/christophe-oudar.md @@ -0,0 +1,35 @@ +--- +id: christophe-oudar +title: Christophe Oudar +description: | + I joined the dbt Community in November 2021 after exchanging some issues in Github. I currently work as a staff engineer at a scaleup in the ad tech industry called Teads, which I joined 11 years ago as a new grad. I've been using dbt Core on BigQuery since then. I write about data engineering both on Medium and Substack. I contribute on dbt-bigquery. I wrote an article that was then featured on the Developer Blog called BigQuery ingestion-time partitioning and partition copy with dbt. +image: /img/community/spotlight/christophe-oudar.jpg +pronouns: he/him +location: Montpellier, France +jobTitle: Staff Engineer +companyName: Teads +socialLinks: + - name: X + link: https://x.com/Kayrnt + - name: LinkedIn + link: https://www.linkedin.com/in/christopheoudar/ + - name: Substack + link: https://smallbigdata.substack.com/ +dateCreated: 2024-11-08 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the community in November 2021 as a way to explore how to move our in-house data modeling layer to dbt. The transition took over a year while we ensured we could cover all our bases and add missing features to dbt-bigquery. That project was one of stepping stones that helped me to move from senior to staff level at my current job. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I identify with leaders that have strong convictions about how data engineering should move forward but remain open to innovation and ideas from everyone to bring the best to the field and make it as inclusive as possible to all cultures and profiles. I think that could mean people like Jordan Tigani or Mark Raasveldt. In the dbt community, my leadership has looked like helping people struggling and offering better ways to simplify one's day to day work when possible. + +## What have you learned from community members? What do you hope others can learn from you? + +I read a lot of articles about dbt, especially when I got started with it. It helped me a lot to build a proper Slim CI that could fit my company's ways of working. I also got to see how data pipelines were done in other companies and the pros and cons of my approaches. I hope I can share more of that knowledge for people to pick what's best for their needs. +​ diff --git a/website/docs/community/spotlight/dakota-kelley.md b/website/docs/community/spotlight/dakota-kelley.md index 85b79f0e85a..34c0b1b5887 100644 --- a/website/docs/community/spotlight/dakota-kelley.md +++ b/website/docs/community/spotlight/dakota-kelley.md @@ -16,6 +16,7 @@ socialLinks: dateCreated: 2023-11-08 hide_table_of_contents: true communityAward: true +communityAwardYear: 2023 --- ## When did you join the dbt community and in what way has it impacted your career? 
diff --git a/website/docs/community/spotlight/fabiyi-opeyemi.md b/website/docs/community/spotlight/fabiyi-opeyemi.md index 67efd90e1c5..b5b4bf8c9e0 100644 --- a/website/docs/community/spotlight/fabiyi-opeyemi.md +++ b/website/docs/community/spotlight/fabiyi-opeyemi.md @@ -2,13 +2,11 @@ id: fabiyi-opeyemi title: Opeyemi Fabiyi description: | - I'm an Analytics Engineer with Data Culture, a Data Consulting firm where I use dbt regularly to help clients build quality-tested data assets. I've also got a background in financial services and supply chain. I'm passionate about helping organizations to become data-driven and I majorly use dbt for data modeling, while the other aspect of the stack is largely dependent on the client infrastructure I'm working for, so I often say I'm tool-agnostic. 😀 - - I'm the founder of Nigeria's Young Data Professional Community. I'm also the organizer of the Lagos dbt Meetup which I started, and one of the organizers of the DataFest Africa Conference. I became an active member of the dbt Community in 2021 & spoke at Coalesce 2022. + I’m an Analytics Engineer with Data Culture, a Data Consulting firm where I use dbt regularly to help clients build quality-tested data assets. Before Data Culture, I worked at Cowrywise, one of the leading Fintech companies in Nigeria, where I was a solo data team member, and that was my first introduction to dbt and Analytics Engineering. Before that, I was doing Data Science and Analytics at Deloitte Nigeria. It’s been an exciting journey since I started using dbt and joining the community.Outside of work, I’m very passionate about Community building and Data Advocacy. I founded one of Nigeria’s most vibrant Data communities, “The Young Data Professional Community.” I’m also the Founder of the Lagos dbt Meetup and one of the organizers of the Largest Data Conference in Africa, DataFest Africa Conference. I became an active member of the dbt community in 2021 & spoke at Coalesce 2022. So when I’m not actively working I’m involved in one community activity or the other. image: /img/community/spotlight/fabiyi-opeyemi.jpg pronouns: he/him location: Lagos, Nigeria -jobTitle: Senior Analytics Engineer +jobTitle: Analytics Manager companyName: Data Culture organization: Young Data Professionals (YDP) socialLinks: @@ -16,9 +14,10 @@ socialLinks: link: https://twitter.com/Opiano_1 - name: LinkedIn link: https://www.linkedin.com/in/opeyemifabiyi/ -dateCreated: 2023-11-06 +dateCreated: 2024-11-02 hide_table_of_contents: true communityAward: true +communityAwardYear: 2024 --- ## When did you join the dbt community and in what way has it impacted your career? @@ -39,4 +38,4 @@ I've learned how to show empathy as a data professional and be a great engineer ## Anything else interesting you want to tell us? -Maybe, I will consider DevRel as a career sometime because of my innate passion and love for community and people. Several folks tell me I'm a strong DevRel talent and a valuable asset for any product-led company. If you need someone to bounce ideas off of or discuss😃 your community engagement efforts, please feel free to reach out. +Maybe I will consider DevRel as a career sometime because of my innate passion and love for community and people. Several folks tell me I’m a strong DevRel talent and a valuable asset for any product-led company. If you need someone to bounce ideas off of or discuss your community engagement efforts, please feel free to reach out. 
On a side note, it was really exciting for me to attend Coalesce 2024 in Vegas in person, which allowed me not only to learn but, most importantly, to meet amazing persons I’ve only interacted with online, like Bruno, Kuberjain, Dakota and many more; shout-out to Zenlytic and Lightdash for making that possible and, most importantly, a huge shout-out to the dbt Lab community team: Amada, Natasha and everyone on the community team for their constant supports to helping out with making the dbt Lagos (Nigeria) meetup a success. diff --git a/website/docs/community/spotlight/jenna-jordan.md b/website/docs/community/spotlight/jenna-jordan.md new file mode 100644 index 00000000000..86f19f125f8 --- /dev/null +++ b/website/docs/community/spotlight/jenna-jordan.md @@ -0,0 +1,36 @@ +--- +id: jenna-jordan +title: Jenna Jordan +description: | + I am a Senior Data Management Consultant with Analytics8, where I advise clients on dbt best practices (especially regarding dbt Mesh and the various shifts in governance and strategy that come with it). My experiences working within a dbt Mesh architecture and all of the difficulties organizations could run into with such a major paradigm shift inspired my peer exchange (role-playing/simulation game) at Coalesce 2024: "Governance co-lab: We the people, in order to govern data, do establish processes." I also experimented with bringing role-playing scenarios to data problems at the September 2024 Chicago dbt Meetup, hosted by Analytics8. I occasionally write long blog posts on my website, if you're up for the read. +image: /img/community/spotlight/jenna-jordan.jpg +pronouns: she/her +location: Asheville, USA +jobTitle: Senior Data Management Consultant +companyName: Analytics8 +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/jennajordan1/ + - name: Personal website + link: https://jennajordan.me/ +dateCreated: 2024-11-01 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +My dbt learning journey kicked off with the CoRise (now Uplimit) course Analytics Engineering with dbt, with Emily Hawkins and Jake Hannan, in February 2022 – less than a month after starting as a data engineer with the City of Boston Analytics Team. About a year later, I spearheaded the adoption of dbt at the City and got to build the project and associated architecture from scratch – which is probably the best learning experience you could ask for! I saw the value dbt could bring to improving data management processes at the City, and I knew there were other cities and local governments that could benefit from dbt as well, which motivated me to find my fellow co-speakers Ian Rose and Laurie Merrell to give a talk at Coalesce 2023 called "From Coast to Coast: Implementing dbt in the public sector." As a part of our goal to identify and cultivate a community of dbt practitioners in the public (and adjacent) sectors, we also started the dbt Community Slack channel #industry-public-sector. That experience allowed me to continue to grow my career and find my current role - as well as connect with so many amazing data folks! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +There are many leaders in the dbt community that I admire and identify with – I won’t list them all out because I will invariably miss someone (but… you probably know who you are). 
Technical prowess is always enviable, but I most admire those who bring the human element to data work: those who aren’t afraid to be their authentic selves, cultivate a practice of empathy and compassion, and are driven by curiosity and a desire to help others. I’ve never set out to be a leader, and I still don’t really consider myself to be a leader – I’m much more comfortable in the role of a librarian. I just want to help people by connecting them to the information and resources that they may need. + +## What have you learned from community members? What do you hope others can learn from you? + +Pretty much everything I’ve learned about dbt and working in a mature analytics ecosystem I’ve learned from dbt community members. The dbt Community Slack is full of useful information and advice, and has also helped me identify experts about certain topics that I can chat with to learn even more. When I find someone sharing useful information, I usually try to find and follow them on social media so I can see more of their content. If there is one piece of advice I want to share, it is this: don’t be afraid to engage. Ask for help when you need it, but also offer help freely. Engage with the community with the same respect and grace you would offer your friends and coworkers. + +## Anything else interesting you want to tell us? + +Library Science is so much more than the Dewey Decimal System (seriously, ask a librarian about Dewey for a juicy rant). RDF triples (for knowledge graphs) are queried using SPARQL (pronounced “sparkle”). An antelope can be a document. The correct way to write a date/time is ISO-8601. The oldest known table (of the spreadsheet variety) is from 5,000 years ago – record-keeping predates literature by a significant margin. Zip codes aren’t polygons – they don’t contain an area or have boundaries. Computers don’t always return 0.3 when asked to add 0.1 + 0.2. SQL was the sequel to SQUARE. Before computers, people programmed looms (weaving is binary). What? You asked!! On a more serious note – data teams: start hiring librarians. No, seriously. No degree could have prepared me better for what I do in the data field than my M.S. in Library & Information Science. I promise, you want the skillset & mindset that a librarian will bring to your team. diff --git a/website/docs/community/spotlight/josh-devlin.md b/website/docs/community/spotlight/josh-devlin.md index 6036105940b..3fddfe8957e 100644 --- a/website/docs/community/spotlight/josh-devlin.md +++ b/website/docs/community/spotlight/josh-devlin.md @@ -24,6 +24,7 @@ socialLinks: dateCreated: 2023-11-10 hide_table_of_contents: true communityAward: true +communityAwardYear: 2023 --- ## When did you join the dbt community and in what way has it impacted your career? diff --git a/website/docs/community/spotlight/karen-hsieh.md b/website/docs/community/spotlight/karen-hsieh.md index 22d6915baf7..bffbd712be4 100644 --- a/website/docs/community/spotlight/karen-hsieh.md +++ b/website/docs/community/spotlight/karen-hsieh.md @@ -25,6 +25,7 @@ socialLinks: dateCreated: 2023-11-04 hide_table_of_contents: true communityAward: true +communityAwardYear: 2023 --- ## When did you join the dbt community and in what way has it impacted your career? 
diff --git a/website/docs/community/spotlight/meagan-palmer.md b/website/docs/community/spotlight/meagan-palmer.md index ff45a3d6b7d..fffc2a6e0d6 100644 --- a/website/docs/community/spotlight/meagan-palmer.md +++ b/website/docs/community/spotlight/meagan-palmer.md @@ -3,8 +3,11 @@ id: meagan-palmer title: Meagan Palmer description: | I first started using dbt in 2016 or 2017 (I can't remember exactly). Since then, I have moved into data and analytics consulting and have dipped in and out of the dbt Community. + Late last year, I started leading dbt Cloud training courses and spending more time in the dbt Slack. + In consulting, I get to use a range of stacks. I've used dbt with Redshift, Snowflake, and Databricks in production settings with a range of loaders & reporting tools, and I've been enjoying using DuckDB for some home experimentation. + To share some of the experiences, I regularly post to LinkedIn and have recently started Analytics Engineering Today, a twice monthly newsletter about dbt in practice. image: /img/community/spotlight/Meagan-Palmer.png pronouns: she/her @@ -14,9 +17,10 @@ companyName: Altis Consulting socialLinks: - name: LinkedIn link: https://www.linkedin.com/in/meaganpalmer/ -dateCreated: 2024-07-29 +dateCreated: 2024-11-04 hide_table_of_contents: true -communityAward: false +communityAward: true +communityAwardYear: 2024 --- ## When did you join the dbt community and in what way has it impacted your career? @@ -27,9 +31,9 @@ I was fortunate that Jon Bradley at Nearmap had the vision to engage the then Fi Being in Australia, I often see replies from Jeremy Yeo to people in the dbt Slack. His clarity of communication is impressive. -For growth, I'm hoping that others can benefit from the wide range of experience I have. My newsletter, Analytics Engineering Today on LinkedIn aims to upskill the dbt Community and shed some light on some useful features that might not be well known. +For growth, I'm hoping that others can benefit from the wide range of experience I have. My LinkedIn newsletter, Analytics Engineering Today, aims to upskill the dbt Community and shed some light on some useful features that might not be well known. -I'll be at Coalesce and am doing some webinars/events later in the year. Come say hi, I love talking dbt and analytics engineering with people. +I was at Coalesce Online and am doing some webinars/events later in the year. Come say hi, I love talking dbt and analytics engineering with people. ## What have you learned from community members? What do you hope others can learn from you? diff --git a/website/docs/community/spotlight/mike-stanley.md b/website/docs/community/spotlight/mike-stanley.md new file mode 100644 index 00000000000..853b0e2f704 --- /dev/null +++ b/website/docs/community/spotlight/mike-stanley.md @@ -0,0 +1,30 @@ +--- +id: mike-stanley +title: Mike Stanley +description: | + I've split my time between financial services and the video games industry. Back when I wrote code every day, I worked in marketing analytics and marketing technology. I've been in the dbt community for about two years. I haven't authored any extensions to dbt's adapters yet but I've given feedback on proposed changes!
+image: /img/community/spotlight/mike-stanley.jpg +pronouns: he/him +location: London, United Kingdom +jobTitle: Manager, Data +companyName: Freetrade +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/mike-stanley-31616994/ +dateCreated: 2024-11-05 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I've led data teams for almost ten years now and it can be a challenge to stay current on new technology when you're spending a lot of time on leadership and management. I joined the dbt Community to learn how to get more from it, how to solve problems and use more advanced features, and to learn best practices. I find that answering questions is the way I learn best, so I started helping people! + +## Which dbt Community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I hope that we can all continue to level up our dbt skills and leave the data environments that we work in better than we found them. + +## What have you learned from community members? What do you hope others can learn from you? + +Everything! People share so much about their best practices and when and how to deviate from them, interesting extensions to dbt that they've worked on, common bugs and problems, and how to think in a "dbtish" way. I couldn't have learned any of that without the community! diff --git a/website/docs/community/spotlight/oliver-cramer.md b/website/docs/community/spotlight/oliver-cramer.md index 7e9974a8a2c..89f342bc650 100644 --- a/website/docs/community/spotlight/oliver-cramer.md +++ b/website/docs/community/spotlight/oliver-cramer.md @@ -17,6 +17,7 @@ socialLinks: dateCreated: 2023-11-02 hide_table_of_contents: true communityAward: true +communityAwardYear: 2023 --- ## When did you join the dbt community and in what way has it impacted your career? diff --git a/website/docs/community/spotlight/original-dbt-athena-maintainers.md b/website/docs/community/spotlight/original-dbt-athena-maintainers.md new file mode 100644 index 00000000000..b3728a71d63 --- /dev/null +++ b/website/docs/community/spotlight/original-dbt-athena-maintainers.md @@ -0,0 +1,44 @@ +--- +id: original-dbt-athena-maintainers +title: The Original dbt-athena Maintainers +description: | + The original dbt-athena Maintainers is a group of 5 people—Jérémy Guiselin, Mattia, Jesse Dobbelaere, Serhii Dimchenko, and Nicola Corda—who met via dbt Slack in the #db-athena channel, with the aim to make dbt-athena a production-ready adapter. + + In the first period, Winter 2022 and Spring 2023, we focused on contributing directly to the adapter, adding relevant features like Iceberg and Lake Formation support, and stabilizing some internal behaviour. + + On a second iteration, our role was triaging, providing community support, and bug fixing. We encouraged community members to make their first contributions, and helped them to merge their PRs.
+image: /img/community/spotlight/dbt-athena-groupheadshot.jpg +location: Europe +jobTitle: A group of data-engineers +companyName: Mix of companies +organization: dbt-athena (since November 2022) +socialLinks: + - name: Jérémy's LinkedIn + link: https://www.linkedin.com/in/jrmyy/ + - name: Mattia's LinkedIn + link: https://www.linkedin.com/in/mattia-sappa/ + - name: Jesse's LinkedIn + link: https://www.linkedin.com/in/dobbelaerejesse/ + - name: Serhii's LinkedIn + link: https://www.linkedin.com/in/serhii-dimchenko-075b3061/ + - name: Nicola's LinkedIn + link: https://www.linkedin.com/in/nicolacorda/ +dateCreated: 2024-11-06 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +The dbt community allowed the dbt-athena maintainers to meet each other, and share the common goal of making the dbt-athena adapter production-ready. + +## Which dbt Community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +As we grow, we are looking to embody democratic leadership. + +## What have you learned from community members? What do you hope others can learn from you? + +We learned that the power of the community was endless. People started to share best practises, and some of the best practises were incorporated directly in dbt-athena, allowing people to run the adapter smoothly in their production environment. +We reached a point where people started to ask advice for their AWS architecture, which we found pretty awesome. + diff --git a/website/docs/community/spotlight/ruth-onyekwe.md b/website/docs/community/spotlight/ruth-onyekwe.md new file mode 100644 index 00000000000..cf07e98a4f7 --- /dev/null +++ b/website/docs/community/spotlight/ruth-onyekwe.md @@ -0,0 +1,31 @@ +--- +id: ruth-onyekwe +title: Ruth Onyekwe +description: | + I've been working in the world of Data Analytics for over 5 years and have been part of the dbt community for the last 4. With a background in International Business and Digital Marketing, I experienced first hand the need for reliable data to fuel business decisions. This inspired a career move into the technology space to be able to work with the tools and the people that were facilitating this process. Today I am leading teams to deliver data modernization projects, as well as helping grow the analytics arm of my company on a day to day basis. I also have the privilege of organising the dbt Meetups in Barcelona, Spain - and am excited to continue to grow the community across Europe. +image: /img/community/spotlight/ruth-onyekwe.jpeg +pronouns: she/her +location: Madrid, Spain +jobTitle: Data Analytics Manager +companyName: Spaulding Ridge +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/ruth-onyekwe/ +dateCreated: 2024-11-07 +hide_table_of_contents: true +communityAward: true +communityAwardYear: 2024 +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community in 2021, after meeting dbt Labs reps at a conference. Through partnering with dbt Labs and learning the technology, we (Spaulding Ridge) were able to open a whole new offering in our service catalogue, and meet the growing needs of our customers. + +## Which dbt Community leader do you identify with? How are you looking to grow your leadership in the dbt community? 
+ +I identify with the transparent leaders - those willing to share their learnings, knowledge, and experiences. I want to encourage other dbt enthusiasts to stretch themselves professionally and actively participate in the analytics community. + +## What have you learned from community members? What do you hope others can learn from you? + +I've learnt that most of us working in data have experienced the same struggles, be it searching for the best testing frameworks, or deciding how to build optimised and scalable models, or searching for the answers to non-technical questions like how to best organise teams or how to communicate with business stakeholders and translate their needs - we're all faced with the same dilemmas. And the great thing I've learned being in the dbt community, is that if you're brave enough to share your stories, you'll connect with someone who has already gone through those experiences, and can help you reach a solution a lot faster than if you tried to start from scratch. + diff --git a/website/docs/community/spotlight/sam-debruyn.md b/website/docs/community/spotlight/sam-debruyn.md index 24ce7aa1b15..fac27981b50 100644 --- a/website/docs/community/spotlight/sam-debruyn.md +++ b/website/docs/community/spotlight/sam-debruyn.md @@ -19,6 +19,7 @@ socialLinks: dateCreated: 2023-11-03 hide_table_of_contents: true communityAward: true +communityAwardYear: 2023 --- ## When did you join the dbt community and in what way has it impacted your career? diff --git a/website/docs/community/spotlight/stacy-lo.md b/website/docs/community/spotlight/stacy-lo.md index 23a5491dd18..c2bf2874697 100644 --- a/website/docs/community/spotlight/stacy-lo.md +++ b/website/docs/community/spotlight/stacy-lo.md @@ -18,6 +18,7 @@ socialLinks: dateCreated: 2023-11-01 hide_table_of_contents: true communityAward: true +communityAwardYear: 2023 --- ## When did you join the dbt community and in what way has it impacted your career? diff --git a/website/docs/community/spotlight/sydney-burns.md b/website/docs/community/spotlight/sydney-burns.md index 25278ac6ecf..7b350ae7089 100644 --- a/website/docs/community/spotlight/sydney-burns.md +++ b/website/docs/community/spotlight/sydney-burns.md @@ -16,6 +16,7 @@ socialLinks: dateCreated: 2023-11-09 hide_table_of_contents: true communityAward: true +communityAwardYear: 2023 --- ## When did you join the dbt community and in what way has it impacted your career? diff --git a/website/docs/docs/build/cumulative-metrics.md b/website/docs/docs/build/cumulative-metrics.md index aa2b85aa9c8..b44918d2fbd 100644 --- a/website/docs/docs/build/cumulative-metrics.md +++ b/website/docs/docs/build/cumulative-metrics.md @@ -16,6 +16,28 @@ Note that we use the double colon (::) to indicate whether a parameter is nested ## Parameters + + +| Parameter |
Description
| Type | +|-------------|---------------------------------------------------|-----------| +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | Required string that defines the display value in downstream tools. Accepts plain text, spaces, and quotes (such as `orders_total` or `"orders_total"`). | Required | +| `type_params` | The type parameters of the metric. Supports nested parameters indicated by the double colon, such as `type_params::measure`. | Required | +| `type_params::measure` | The measure associated with the metric. Supports both shorthand (string) and object syntax. The shorthand is used if only the name is needed, while the object syntax allows specifying additional attributes. | Required | +| `measure::name` | The name of the measure being referenced. Required if using object syntax for `type_params::measure`. | Optional | +| `measure::fill_nulls_with` | Sets a value (for example, 0) to replace nulls in the metric definition. | Optional | +| `measure::join_to_timespine` | Boolean indicating if the aggregated measure should be joined to the time spine table to fill in missing dates. Default is `false`. | Optional | +| `type_params::cumulative_type_params` | Configures the attributes like `window`, `period_agg`, and `grain_to_date` for cumulative metrics. | Optional | +| `cumulative_type_params::window` | Specifies the accumulation window, such as `1 month`, `7 days`, or `1 year`. Cannot be used with `grain_to_date`. | Optional | +| `cumulative_type_params::grain_to_date` | Sets the accumulation grain, such as `month`, restarting accumulation at the beginning of each specified grain period. Cannot be used with `window`. | Optional | +| `cumulative_type_params::period_agg` | Defines how to aggregate the cumulative metric when summarizing data to a different granularity: `first`, `last`, or `average`. Defaults to `first` if `window` is not specified. | Optional | + +
+ + + | Parameter |
Description
| Type | | --------- | ----------- | ---- | | `name` | The name of the metric. | Required | @@ -23,20 +45,41 @@ Note that we use the double colon (::) to indicate whether a parameter is nested | `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | | `label` | Required string that defines the display value in downstream tools. Accepts plain text, spaces, and quotes (such as `orders_total` or `"orders_total"`). | Required | | `type_params` | The type parameters of the metric. Supports nested parameters indicated by the double colon, such as `type_params::measure`. | Required | -| `type_params::cumulative_type_params` | Allows you to add a `window`, `period_agg`, and `grain_to_date` configuration. Nested under `type_params`. | Optional | -| `cumulative_type_params::window` | The accumulation window, such as 1 month, 7 days, 1 year. This can't be used with `grain_to_date`. | Optional | -| `cumulative_type_params::grain_to_date` | Sets the accumulation grain, such as `month`, which will accumulate data for one month and then restart at the beginning of the next. This can't be used with `window`. | Optional | -| `cumulative_type_params::period_agg` | Specifies how to aggregate the cumulative metric when summarizing data to a different granularity. Can be used with grain_to_date. Options are
- `first` (Takes the first value within the period)
- `last` (Takes the last value within the period
- `average` (Calculates the average value within the period).

Defaults to `first` if no `window` is specified. | Optional | -| `type_params::measure` | A dictionary describing the measure you will use. | Required | -| `measure::name` | The measure you are referencing. | Optional | -| `measure::fill_nulls_with` | Set the value in your metric definition instead of null (such as zero). | Optional | -| `measure::join_to_timespine` | Boolean that indicates if the aggregated measure should be joined to the time spine table to fill in missing dates. Default `false`. | Optional | +| `window` | The accumulation window, such as `1 month`, `7 days`, or `1 year`. This can't be used with `grain_to_date`. | Optional | +| `grain_to_date` | Sets the accumulation grain, such as `month`, which will accumulate data for one month and then restart at the beginning of the next. This can't be used with `window`. | Optional | +| `type_params::measure` | The measure associated with the metric. Supports both shorthand (string) and object syntax. | Required | +| `measure::name` | The name of the measure being referenced. Required if using object syntax for `type_params::measure`. | Optional | +| `measure::fill_nulls_with` | Set the value in your metric definition instead of null (such as zero). | Optional | +| `measure::join_to_timespine` | Boolean that indicates if the aggregated measure should be joined to the time spine table to fill in missing dates. Default `false`. | Optional | + +
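The parameters above can be combined into a single definition. The following is a minimal sketch of a cumulative metric that uses the object syntax for `measure` together with the newer `cumulative_type_params` syntax; the metric name `orders_mtd` is an illustrative assumption, while `order_total` follows the measure used elsewhere on this page:

```yaml
metrics:
  - name: orders_mtd                    # hypothetical metric name
    label: Orders (month to date)
    description: Month-to-date order total that resets at the start of each month
    type: cumulative
    type_params:
      measure:
        name: order_total               # measure being accumulated
        fill_nulls_with: 0              # replace nulls with 0 in the result
        join_to_timespine: true         # join to the time spine to fill in missing dates
      cumulative_type_params:
        grain_to_date: month            # restart accumulation each month; can't be combined with window
        period_agg: last                # take the last value when summarizing to a coarser grain
```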
+ + + +The`type_params::measure` configuration can be written in different ways: +- Shorthand syntax — To only specify the name of the measure, use a simple string value. This is a shorthand approach when no other attributes are required. + ```yaml + type_params: + measure: revenue + ``` +- Object syntax — To add more details or attributes to the measure (such as adding a filter, handling `null` values, or specifying whether to join to a time spine), you need to use the object syntax. This allows for additional configuration beyond just the measure's name. + + ```yaml + type_params: + measure: + name: order_total + fill_nulls_with: 0 + join_to_timespine: true + ``` + ### Complete specification The following displays the complete specification for cumulative metrics, along with an example: + + ```yaml metrics: - name: The metric name # Required @@ -54,13 +97,35 @@ metrics: join_to_timespine: true/false # Boolean that indicates if the aggregated measure should be joined to the time spine table to fill in missing dates. Default `false`. # Optional ``` + + + + +```yaml +metrics: + - name: The metric name # Required + description: The metric description # Optional + type: cumulative # Required + label: The value that will be displayed in downstream tools # Required + type_params: # Required + measure: + name: The measure you are referencing # Required + fill_nulls_with: Set the value in your metric definition instead of null (such as zero) # Optional + join_to_timespine: false # Boolean that indicates if the aggregated measure should be joined to the time spine table to fill in missing dates. Default `false`. # Optional + window: 1 month # The accumulation window, such as 1 month, 7 days, 1 year. Optional. Cannot be used with grain_to_date. + grain_to_date: month # Sets the accumulation grain, such as month will accumulate data for one month, then restart at the beginning of the next. Optional. Cannot be used with window. +``` + + ## Cumulative metrics example Cumulative metrics measure data over a given window and consider the window infinite when no window parameter is passed, accumulating the data over all time. -The following example shows how to define cumulative metrics in a YAML file. In this example, we define three cumulative metrics: +The following example shows how to define cumulative metrics in a YAML file: + + - `cumulative_order_total`: Calculates the cumulative order total over all time. Uses `type params` to specify the measure `order_total` to be aggregated. @@ -68,10 +133,23 @@ The following example shows how to define cumulative metrics in a YAML file. In - `cumulative_order_total_mtd`: Calculates the month-to-date cumulative order total, respectively. Uses `cumulative_type_params` to specify a `grain_to_date` of `month`. + + + + +- `cumulative_order_total`: Calculates the cumulative order total over all time. Uses `type params` to specify the measure `order_total` to be aggregated. + +- `cumulative_order_total_l1m`: Calculates the trailing 1-month cumulative order total. Uses `type params` to specify a `window` of 1 month. + +- `cumulative_order_total_mtd`: Calculates the month-to-date cumulative order total, respectively. Uses `type params` to specify a `grain_to_date` of `month`. 
+ + + -```yaml + +```yaml metrics: - name: cumulative_order_total label: Cumulative order total (All-Time) @@ -101,8 +179,44 @@ metrics: cumulative_type_params: grain_to_date: month ``` + + + + +```yaml +metrics: + - name: cumulative_order_total + label: Cumulative order total (All-Time) + description: The cumulative value of all orders + type: cumulative + type_params: + measure: + name: order_total + + - name: cumulative_order_total_l1m + label: Cumulative order total (L1M) + description: Trailing 1-month cumulative order total + type: cumulative + type_params: + measure: + name: order_total + window: 1 month + + - name: cumulative_order_total_mtd + label: Cumulative order total (MTD) + description: The month-to-date value of all orders + type: cumulative + type_params: + measure: + name: order_total + grain_to_date: month +``` + + + + ### Granularity options Use the `period_agg` parameter with `first()`, `last()`, and `average()` functions to aggregate cumulative metrics over the requested period. This is because granularity options for cumulative metrics are different than the options for other metric types. @@ -192,6 +306,8 @@ group by + + ### Window options This section details examples of when to specify and not to specify window options. @@ -218,6 +334,8 @@ measures: We can write a cumulative metric `weekly_customers` as such: + + ``` yaml @@ -240,6 +358,31 @@ From the sample YAML example, note the following: For example, in the `weekly_customers` cumulative metric, MetricFlow takes a sliding 7-day window of relevant customers and applies a count distinct function. +If you remove `window`, the measure will accumulate over all time. + + + + + + +``` yaml +metrics: + - name: weekly_customers # Define the measure and the window. + type: cumulative + type_params: + measure: customers + window: 7 days # Setting the window to 7 days since we want to track weekly active +``` + + + +From the sample YAML example, note the following: + +* `type`: Specify cumulative to indicate the type of metric. +* `type_params`: Configure the cumulative metric by providing a `measure` and optionally add a `window` or `grain_to_date` configuration. + +For example, in the `weekly_customers` cumulative metric, MetricFlow takes a sliding 7-day window of relevant customers and applies a count distinct function. + If you remove `window`, the measure will accumulate over all time. @@ -286,7 +429,6 @@ metrics: ``` - ### Grain to date @@ -310,6 +452,8 @@ We can compare the difference between a 1-month window and a monthly grain to da + + ```yaml metrics: - name: cumulative_order_total_l1m # For this metric, we use a window of 1 month @@ -330,10 +474,33 @@ metrics: grain_to_date: month # Resets at the beginning of each month period_agg: first # Optional. Defaults to first. 
Accepted values: first|last|average ``` + + + + +```yaml +metrics: + - name: cumulative_order_total_l1m # For this metric, we use a window of 1 month + label: Cumulative order total (L1M) + description: Trailing 1-month cumulative order amount + type: cumulative + type_params: + measure: order_total + window: 1 month # Applies a sliding window of 1 month + - name: cumulative_order_total_mtd # For this metric, we use a monthly grain-to-date + label: Cumulative order total (MTD) + description: The month-to-date value of all orders + type: cumulative + type_params: + measure: order_total + grain_to_date: month # Resets at the beginning of each month +``` + Cumulative metric with grain to date: + ```yaml @@ -390,10 +557,25 @@ order by ``` + + + + + +```yaml +- name: orders_last_month_to_date + label: Orders month to date + type: cumulative + type_params: + measure: order_count + grain_to_date: month +``` + + ## SQL implementation example -To calculate the cumulative value of the metric over a given window, join the timespine table using the primary time dimension. Use the accumulation window in the join to decide which days to include in the calculation. +To calculate the cumulative value of the metric over a given window we do a time range join to a timespine table using the primary time dimension as the join key. We use the accumulation window in the join to decide whether a record should be included on a particular day. The following SQL code produced from an example cumulative metric is provided for reference: To implement cumulative metrics, refer to the SQL code example: diff --git a/website/docs/docs/build/data-tests.md b/website/docs/docs/build/data-tests.md index 59d716b4ca9..afe4719768c 100644 --- a/website/docs/docs/build/data-tests.md +++ b/website/docs/docs/build/data-tests.md @@ -66,11 +66,25 @@ having total_amount < 0 -The name of this test is the name of the file: `assert_total_payment_amount_is_positive`. Simple enough. +The name of this test is the name of the file: `assert_total_payment_amount_is_positive`. -Singular data tests are easy to write—so easy that you may find yourself writing the same basic structure over and over, only changing the name of a column or model. By that point, the test isn't so singular! In that case, we recommend... +To add a description to a singular test in your project, add a `.yml` file to your `tests` directory, for example, `tests/schema.yml` with the following content: + +```yaml +version: 2 +data_tests: + - name: assert_total_payment_amount_is_positive + description: > + Refunds have a negative amount, so the total amount should always be >= 0. + Therefore return records where total amount < 0 to make the test fail. + +``` + + + +Singular data tests are so easy that you may find yourself writing the same basic structure repeatedly, only changing the name of a column or model. By that point, the test isn't so singular! In that case, we recommend generic data tests. ## Generic data tests Certain data tests are generic: they can be reused over and over again. A generic data test is defined in a `test` block, which contains a parametrized query and accepts arguments. It might look like: @@ -304,7 +318,6 @@ data_tests: -To suppress warnings about the rename, add `TestsConfigDeprecation` to the `silence` block of the `warn_error_options` flag in `dbt_project.yml`, [as described in the Warnings documentation](https://docs.getdbt.com/reference/global-configs/warnings).
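Once a singular test pattern is generalized, the generic version is applied from a properties file instead of being rewritten for every model. A minimal sketch of what that application can look like, where `payments` is a hypothetical model name and `assert_column_is_positive` stands in for a custom generic test defined in a `test` block:

```yaml
version: 2

models:
  - name: payments                          # hypothetical model name
    columns:
      - name: total_amount
        data_tests:
          - not_null                        # built-in generic test
          - assert_column_is_positive       # hypothetical custom generic test defined in a test block
```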
diff --git a/website/docs/docs/build/dimensions.md b/website/docs/docs/build/dimensions.md index d74bc773ea9..5026f4c45cd 100644 --- a/website/docs/docs/build/dimensions.md +++ b/website/docs/docs/build/dimensions.md @@ -6,20 +6,18 @@ sidebar_label: "Dimensions" tags: [Metrics, Semantic Layer] --- -Dimensions are a way to group or filter information based on categories or time. It's like a special label that helps organize and analyze data. - -In a data platform, dimensions are part of a larger structure called a semantic model. It's created along with other elements like [entities](/docs/build/entities) and [measures](/docs/build/measures) and used to add more details to your data that can't be easily added up or combined. In SQL, dimensions are typically included in the `group by` clause of your SQL query. +Dimensions represent the non-aggregatable columns in your data set, which are the attributes, features, or characteristics that describe or categorize data. In the context of the dbt Semantic Layer, dimensions are part of a larger structure called a semantic model. They are created along with other elements like [entities](/docs/build/entities) and [measures](/docs/build/measures) and used to add more details to your data. In SQL, dimensions are typically included in the `group by` clause of your SQL query. -All dimensions require a `name`, `type` and in some cases, an `expr` parameter. The `name` for your dimension must be unique to the semantic model and can not be the same as an existing `entity` or `measure` within that same model. +All dimensions require a `name`, `type`, and can optionally include an `expr` parameter. The `name` for your Dimension must be unique within the same semantic model. | Parameter | Description | Type | | --------- | ----------- | ---- | | `name` | Refers to the name of the group that will be visible to the user in downstream tools. It can also serve as an alias if the column name or SQL query reference is different and provided in the `expr` parameter.

Dimension names should be unique within a semantic model, but they can be non-unique across different models as MetricFlow uses [joins](/docs/build/join-logic) to identify the right dimension. | Required | -| `type` | Specifies the type of group created in the semantic model. There are two types:

- **Categorical**: Group rows in a table by categories like geography, color, and so on.
- **Time**: Point to a date field in the data platform. Must be of type TIMESTAMP or equivalent in the data platform engine.
- You can also use time dimensions to specify time spans for [slowly changing dimensions](/docs/build/dimensions#scd-type-ii) tables. | Required | +| `type` | Specifies the type of group created in the semantic model. There are two types:

- **Categorical**: Describe attributes or features like geography or sales region.
- **Time**: Time-based dimensions like timestamps or dates. | Required | | `type_params` | Specific type params such as if the time is primary or used as a partition | Required | | `description` | A clear description of the dimension | Optional | | `expr` | Defines the underlying column or SQL query for a dimension. If no `expr` is specified, MetricFlow will use the column with the same name as the group. You can use the column name itself to input a SQL expression. | Optional | @@ -43,11 +41,13 @@ Refer to the following example to see how dimensions are used in a semantic mode semantic_models: - name: transactions description: A record for every transaction that takes place. Carts are considered multiple transactions for each SKU. - model: {{ ref("fact_transactions") }} + model: {{ ref('fact_transactions') }} defaults: agg_time_dimension: order_date # --- entities --- entities: + - name: transaction + type: primary ... # --- measures --- measures: @@ -56,14 +56,20 @@ semantic_models: dimensions: - name: order_date type: time - label: "Date of transaction" # Recommend adding a label to define the value displayed in downstream tools - expr: date_trunc('day', ts) - - name: is_bulk_transaction + type_params: + time_granularity: day + label: "Date of transaction" # Recommend adding a label to provide more context to users consuming the data + expr: ts + - name: is_bulk type: categorical expr: case when quantity > 10 then true else false end + - name: type + type: categorical ``` -MetricFlow requires that all dimensions have a primary entity. This is to guarantee unique dimension names. If your data source doesn't have a primary entity, you need to assign the entity a name using the `primary_entity: entity_name` key. It doesn't necessarily have to map to a column in that table and assigning the name doesn't affect query generation. +Dimensions are bound to the primary entity of the semantic model they are defined in. For example the dimension `type` is defined in a model that has `transaction` as a primary entity. `type` is scoped to the `transaction` entity, and to reference this dimension you would use the fully qualified dimension name i.e `transaction__type`. + +MetricFlow requires that all semantic models have a primary entity. This is to guarantee unique dimension names. If your data source doesn't have a primary entity, you need to assign the entity a name using the `primary_entity` key. It doesn't necessarily have to map to a column in that table and assigning the name doesn't affect query generation. We recommend making these "virtual primary entities" unique across your semantic model. An example of defining a primary entity for a data source that doesn't have a primary entity column is below: ```yaml semantic_model: @@ -93,7 +99,7 @@ This section further explains the dimension definitions, along with examples. Di ## Categorical -Categorical is used to group metrics by different categories such as product type, color, or geographical area. They can refer to existing columns in your dbt model or be calculated using a SQL expression with the `expr` parameter. An example of a category dimension is `is_bulk_transaction`, which is a group created by applying a case statement to the underlying column `quantity`. This allows users to group or filter the data based on bulk transactions. +Categorical dimensions are used to group metrics by different attributes, features, or characteristics such as product type. 
They can refer to existing columns in your dbt model or be calculated using a SQL expression with the `expr` parameter. An example of a categorical dimension is `is_bulk_transaction`, which is a group created by applying a case statement to the underlying column `quantity`. This allows users to group or filter the data based on bulk transactions. ```yaml dimensions: @@ -104,15 +110,10 @@ dimensions: ## Time -:::tip use datetime data type if using BigQuery -To use BigQuery as your data platform, time dimensions columns need to be in the datetime data type. If they are stored in another type, you can cast them to datetime using the `expr` property. Time dimensions are used to group metrics by different levels of time, such as day, week, month, quarter, and year. MetricFlow supports these granularities, which can be specified using the `time_granularity` parameter. -::: - -Time has additional parameters specified under the `type_params` section. When you query one or more metrics in MetricFlow using the CLI, the default time dimension for a single metric is the aggregation time dimension, which you can refer to as `metric_time` or use the dimensions' name. +Time has additional parameters specified under the `type_params` section. When you query one or more metrics, the default time dimension for each metric is the aggregation time dimension, which you can refer to as `metric_time` or use the dimension's name. You can use multiple time groups in separate metrics. For example, the `users_created` metric uses `created_at`, and the `users_deleted` metric uses `deleted_at`: - ```bash # dbt Cloud users dbt sl query --metrics users_created,users_deleted --group-by metric_time__year --order-by metric_time__year @@ -121,40 +122,27 @@ dbt sl query --metrics users_created,users_deleted --group-by metric_time__year mf query --metrics users_created,users_deleted --group-by metric_time__year --order-by metric_time__year ``` +You can set `is_partition` for time to define specific time spans. Additionally, use the `type_params` section to set `time_granularity` to adjust aggregation details (daily, weekly, and so on). -You can set `is_partition` for time or categorical dimensions to define specific time spans. Additionally, use the `type_params` section to set `time_granularity` to adjust aggregation detail (like daily, weekly, and so on): - - + Use `is_partition: True` to show that a dimension exists over a specific time window. For example, a date-partitioned dimensional table. When you query metrics from different tables, the dbt Semantic Layer uses this parameter to ensure that the correct dimensional values are joined to measures. -You can also use `is_partition` for [categorical](#categorical) dimensions as well. - -MetricFlow enables metric aggregation during query time. For example, you can aggregate the `messages_per_month` measure. 
If you originally had a `time_granularity` for the time dimensions `metric_time`, you can specify a yearly granularity for aggregation in your query: - -```bash -# dbt Cloud users -dbt sl query --metrics messages_per_month --group-by metric_time__year --order-by metric_time__year - -# dbt Core users -mf query --metrics messages_per_month --group-by metric_time__year --order metric_time__year -``` - ```yaml dimensions: - name: created_at type: time label: "Date of creation" - expr: date_trunc('day', ts_created) # ts_created is the underlying column name from the table - is_partition: True + expr: ts_created # ts_created is the underlying column name from the table + is_partition: True type_params: time_granularity: day - name: deleted_at type: time label: "Date of deletion" - expr: date_trunc('day', ts_deleted) # ts_deleted is the underlying column name from the table + expr: ts_deleted # ts_deleted is the underlying column name from the table is_partition: True type_params: time_granularity: day @@ -173,28 +161,83 @@ measures: -`time_granularity` specifies the smallest level of detail that a measure or metric should be reported at, such as daily, weekly, monthly, quarterly, or yearly. Different granularity options are available, and each metric must have a specified granularity. For example, a metric specified with weekly granularity couldn't be aggregated to a daily grain. + + +`time_granularity` specifies the grain of a time dimension. MetricFlow will transform the underlying column to the specified granularity. For example, if you add hourly granularity to a time dimension column, MetricFlow will run a `date_trunc` function to convert the timestamp to hourly. You can easily change the time grain at query time and aggregate it to a coarser grain, for example, from hourly to monthly. However, you can't go from a coarser grain to a finer grain (monthly to hourly). -The current options for time granularity are day, week, month, quarter, and year. +Our supported granularities are: +* nanosecond (Snowflake only) +* microsecond +* millisecond +* second +* minute +* hour +* day +* week +* month +* quarter +* year -Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the highest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. +Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the coarsest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. 
```yaml dimensions: - name: created_at type: time label: "Date of creation" - expr: date_trunc('day', ts_created) # ts_created is the underlying column name from the table + expr: ts_created # ts_created is the underlying column name from the table is_partition: True type_params: - time_granularity: day + time_granularity: hour - name: deleted_at type: time label: "Date of deletion" - expr: date_trunc('day', ts_deleted) # ts_deleted is the underlying column name from the table + expr: ts_deleted # ts_deleted is the underlying column name from the table is_partition: True type_params: - time_granularity: day + time_granularity: day + +measures: + - name: users_deleted + expr: 1 + agg: sum + agg_time_dimension: deleted_at + - name: users_created + expr: 1 + agg: sum +``` + + + + + +`time_granularity` specifies the grain of a time dimension. MetricFlow will transform the underlying column to the specified granularity. For example, if you add daily granularity to a time dimension column, MetricFlow will run a `date_trunc` function to convert the timestamp to daily. You can easily change the time grain at query time and aggregate it to a coarser grain, for example, from daily to monthly. However, you can't go from a coarser grain to a finer grain (monthly to daily). + +Our supported granularities are: +* day +* week +* month +* quarter +* year + +Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the coarsest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. + +```yaml +dimensions: + - name: created_at + type: time + label: "Date of creation" + expr: ts_created # ts_created is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day + - name: deleted_at + type: time + label: "Date of deletion" + expr: ts_deleted # ts_deleted is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day measures: - name: users_deleted @@ -206,6 +249,8 @@ measures: agg: sum ``` + + @@ -213,7 +258,7 @@ measures: ### SCD Type II :::caution -Currently, there are limitations in supporting SCDs. +Currently, semantic models with SCD Type II dimensions cannot contain measures. ::: MetricFlow supports joins against dimensions values in a semantic model built on top of a slowly changing dimension (SCD) Type II table. This is useful when you need a particular metric sliced by a group that changes over time, such as the historical trends of sales by a customer's country. @@ -246,7 +291,7 @@ Here’s an example configuration: - name: tier_start # The name of the dimension. 
type: time # The type of dimension (such as time) label: "Start date of tier" # A readable label for the dimension - expr: start_date # Expression or column name the the dimension represents + expr: start_date # Expression or column name the dimension represents type_params: # Additional parameters for the dimension type time_granularity: day # Specifies the granularity of the time dimension (such as day) validity_params: # Defines the validity window @@ -315,7 +360,7 @@ Additionally, the entity is tagged as `natural` to differentiate it from a `prim semantic_models: - name: sales_person_tiers description: SCD Type II table of tiers for salespeople - model: {{ref(sales_person_tiers)}} + model: {{ ref('sales_person_tiers') }} defaults: agg_time_dimension: tier_start @@ -357,7 +402,7 @@ semantic_models: There is a transaction, product, sales_person, and customer id for every transaction. There is only one transaction id per transaction. The `metric_time` or date is reflected in UTC. - model: {{ ref(fact_transactions) }} + model: {{ ref('fact_transactions') }} defaults: agg_time_dimension: metric_time diff --git a/website/docs/docs/build/documentation.md b/website/docs/docs/build/documentation.md index d040d3c5bef..6f7c6c27f31 100644 --- a/website/docs/docs/build/documentation.md +++ b/website/docs/docs/build/documentation.md @@ -101,7 +101,18 @@ The events in this table are recorded by [Snowplow](http://github.com/snowplow/s In the above example, a docs block named `table_events` is defined with some descriptive markdown contents. There is nothing significant about the name `table_events` — docs blocks can be named however you like, as long as the name only contains alphanumeric and underscore characters and does not start with a numeric character. ### Placement -Docs blocks should be placed in files with a `.md` file extension. By default, dbt will search in all resource paths for docs blocks (i.e. the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [macro-paths](/reference/project-configs/macro-paths) and [snapshot-paths](/reference/project-configs/snapshot-paths)) — you can adjust this behavior using the [docs-paths](/reference/project-configs/docs-paths) config. + + + +Docs blocks should be placed in files with a `.md` file extension. By default, dbt will search in all resource paths for docs blocks (for example, the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [test-paths](/reference/project-configs/test-paths), [macro-paths](/reference/project-configs/macro-paths), and [snapshot-paths](/reference/project-configs/snapshot-paths)) — you can adjust this behavior using the [docs-paths](/reference/project-configs/docs-paths) config. + + + + + +Docs blocks should be placed in files with a `.md` file extension. By default, dbt will search in all resource paths for docs blocks (for example, the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [macro-paths](/reference/project-configs/macro-paths), and [snapshot-paths](/reference/project-configs/snapshot-paths)) — you can adjust this behavior using the [docs-paths](/reference/project-configs/docs-paths) config. 
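The placement note above mentions the `docs-paths` config for narrowing where dbt searches for docs blocks. A minimal sketch of that setting in `dbt_project.yml`, assuming the project keeps its markdown documentation in a top-level `docs` directory:

```yaml
# dbt_project.yml
docs-paths: ["docs"]   # only search ./docs for docs blocks instead of all resource paths
```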
+ + ### Usage diff --git a/website/docs/docs/build/entities.md b/website/docs/docs/build/entities.md index 7b0ddfffabf..e4ed0773c3c 100644 --- a/website/docs/docs/build/entities.md +++ b/website/docs/docs/build/entities.md @@ -12,7 +12,11 @@ Within a semantic graph, the required parameters for an entity are `name` and `t Entities can be specified with a single column or multiple columns. Entities (join keys) in a semantic model are identified by their name. Each entity name must be unique within a semantic model, but it doesn't have to be unique across different semantic models. -There are four entity types: primary, foreign, unique, or natural. +There are four entity types: +- [Primary](#primary) — Has only one record for each row in the table and includes every record in the data platform. This key uniquely identifies each record in the table. +- [Unique](#unique) — Contains only one record per row in the table and allows for null values. May have a subset of records in the data warehouse. +- [Foreign](#foreign) — A field (or a set of fields) in one table that uniquely identifies a row in another table. This key establishes a link between tables. +- [Natural](#natural) — Columns or combinations of columns in a table that uniquely identify a record based on real-world data. This key is derived from actual data attributes. :::tip Use entities as dimensions You can also use entities as dimensions, which allows you to aggregate a metric to the granularity of that entity. @@ -20,12 +24,74 @@ You can also use entities as dimensions, which allows you to aggregate a metric ## Entity types -MetricFlow's join logic depends on the entity `type` you use, and it also determines how to join semantic models. Refer to [Joins](/docs/build/join-logic) for more info on how to construct joins. +MetricFlow's join logic depends on the entity `type` you use and determines how to join semantic models. Refer to [Joins](/docs/build/join-logic) for more info on how to construct joins. -* **Primary —** A primary key has **only one** record for each row in the table, and it includes every record in the data platform. -* **Unique —** A unique key contains **only one** record per row in the table, but it may have a subset of records in the data warehouse. It can also include nulls. -* **Foreign —** A foreign key can include zero, one, or multiple instances of the same record. Null values may also be present. -* **Natural —** Natural keys are columns or combinations of columns in a table that uniquely identify a record based on real-world data. For instance, in a sales_person_department dimension table, the sales_person_id can serve as a natural key. You can only use natural keys for [SCD type II dimensions](/docs/build/dimensions#scd-type-ii). +### Primary +A primary key has _only one_ record for each row in the table and includes every record in the data platform. It must contain unique values and can't contain null values. Use the primary key to ensure that each record in the table is distinct and identifiable. + + + +For example, consider a table of employees with the following columns: + +```sql +employee_id (primary key) +first_name +last_name +``` +In this case, `employee_id` is the primary key. Each `employee_id` is unique and represents one specific employee. There can be no duplicate `employee_id` and can't be null. + + + +### Unique +A unique key contains _only one_ record per row in the table but may have a subset of records in the data warehouse. 
However, unlike the primary key, a unique key allows for null values. The unique key ensures that the column's values are distinct, except for null values. + + + +For example, consider a table of students with the following columns: + +```sql +student_id (primary key) +email (unique key) +first_name +last_name +``` + +In this example, `email` is defined as a unique key. Each email address must be unique; however, multiple students can have null email addresses. This is because the unique key constraint allows for one or more null values, but non-null values must be unique. This then creates a set of records with unique emails (non-null) that could be a subset of the entire table, which includes all students. + + + +### Foreign +A foreign key is a field (or a set of fields) in one table that uniquely identifies a row in another table. The foreign key establishes a link between the data in two tables. +It can include zero, one, or multiple instances of the same record. It can also contain null values. + + + +For example, consider you have two tables, `customers` and `orders`: + +customers table: + +```sql +customer_id (primary key) +customer_name +``` + +orders table: + +```sql +order_id (primary key) +order_date +customer_id (foreign key) +``` + +In this example, the `customer_id` in the `orders` table is a foreign key that references the `customer_id` in the `customers` table. This link means each order is associated with a specific customer. However, not every order must have a customer; the `customer_id` in the orders table can be null or have the same `customer_id` for multiple orders. + + + +### Natural + +Natural keys are columns or combinations of columns in a table that uniquely identify a record based on real-world data. For instance, if you have a `sales_person_department` dimension table, the `sales_person_id` can serve as a natural key. You can only use natural keys for [SCD type II dimensions](/docs/build/dimensions#scd-type-ii). + +## Entities configuration The following is the complete spec for entities: @@ -36,12 +102,11 @@ entities: description: A description of the field or role the entity takes in this table ## Optional expr: The field that denotes that entity (transaction_id). ## Optional Defaults to name if unspecified. - ``` Here's an example of how to define entities in a semantic model: - -``` yaml + +```yaml entities: - name: transaction type: primary @@ -54,15 +119,14 @@ entities: expr: substring(id_order from 2) ``` -### Combine columns with a key +## Combine columns with a key If a table doesn't have any key (like a primary key), use _surrogate combination_ to form a key that will help you identify a record by combining two columns. This applies to any [entity type](/docs/build/entities#entity-types). For example, you can combine `date_key` and `brand_code` from the `raw_brand_target_weekly` table to form a _surrogate key_. The following example creates a surrogate key by joining `date_key` and `brand_code` using a pipe (`|`) as a separator. ```yaml + entities: - name: brand_target_key # Entity name or identified. type: foreign # This can be any entity type key. expr: date_key || '|' || brand_code # Defines the expression for linking fields to form the surrogate key. 
``` - - diff --git a/website/docs/docs/build/environment-variables.md b/website/docs/docs/build/environment-variables.md index 01601ce7eb8..b87786ac596 100644 --- a/website/docs/docs/build/environment-variables.md +++ b/website/docs/docs/build/environment-variables.md @@ -97,49 +97,56 @@ While all environment variables are encrypted at rest in dbt Cloud, dbt Cloud ha dbt Cloud has a number of pre-defined variables built in. Variables are set automatically and cannot be changed. -**dbt Cloud IDE details** +#### dbt Cloud IDE details The following environment variable is set automatically for the dbt Cloud IDE: -- `DBT_CLOUD_GIT_BRANCH`: Provides the development Git branch name in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). - - Available in dbt v 1.6 and later. +- `DBT_CLOUD_GIT_BRANCH` — Provides the development Git branch name in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). + - Available in dbt v1.6 and later. - The variable changes when the branch is changed. - Doesn't require restarting the IDE after a branch change. - Currently not available in the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation). Use case — This is useful in cases where you want to dynamically use the Git branch name as a prefix for a [development schema](/docs/build/custom-schemas) ( `{{ env_var ('DBT_CLOUD_GIT_BRANCH') }}` ). -**dbt Cloud context** +#### dbt Cloud context -The following environment variables are set automatically for deployment runs: +The following environment variables are set automatically: -- `DBT_ENV`: This key is reserved for the dbt Cloud application and will always resolve to 'prod' +- `DBT_ENV` — This key is reserved for the dbt Cloud application and will always resolve to 'prod'. For deployment runs only. +- `DBT_CLOUD_ENVIRONMENT_NAME` — The name of the dbt Cloud environment in which `dbt` is running. +- `DBT_CLOUD_ENVIRONMENT_TYPE` — The type of dbt Cloud environment in which `dbt` is running. The valid values are `development` or `deployment`. -**Run details** -- `DBT_CLOUD_PROJECT_ID`: The ID of the dbt Cloud Project for this run -- `DBT_CLOUD_JOB_ID`: The ID of the dbt Cloud Job for this run -- `DBT_CLOUD_RUN_ID`: The ID of this particular run -- `DBT_CLOUD_RUN_REASON_CATEGORY`: The "category" of the trigger for this run (one of: `scheduled`, `github_pull_request`, `gitlab_merge_request`, `azure_pull_request`, `other`) -- `DBT_CLOUD_RUN_REASON`: The specific trigger for this run (eg. `Scheduled`, `Kicked off by `, or custom via `API`) -- `DBT_CLOUD_ENVIRONMENT_ID`: The ID of the environment for this run -- `DBT_CLOUD_ACCOUNT_ID`: The ID of the dbt Cloud account for this run +#### Run details -**Git details** +- `DBT_CLOUD_PROJECT_ID` — The ID of the dbt Cloud Project for this run +- `DBT_CLOUD_JOB_ID` — The ID of the dbt Cloud Job for this run +- `DBT_CLOUD_RUN_ID` — The ID of this particular run +- `DBT_CLOUD_RUN_REASON_CATEGORY` — The "category" of the trigger for this run (one of: `scheduled`, `github_pull_request`, `gitlab_merge_request`, `azure_pull_request`, `other`) +- `DBT_CLOUD_RUN_REASON` — The specific trigger for this run (eg. 
`Scheduled`, `Kicked off by `, or custom via `API`) +- `DBT_CLOUD_ENVIRONMENT_ID` — The ID of the environment for this run +- `DBT_CLOUD_ACCOUNT_ID` — The ID of the dbt Cloud account for this run + +#### Git details _The following variables are currently only available for GitHub, GitLab, and Azure DevOps PR builds triggered via a webhook_ -- `DBT_CLOUD_PR_ID`: The Pull Request ID in the connected version control system -- `DBT_CLOUD_GIT_SHA`: The git commit SHA which is being run for this Pull Request build +- `DBT_CLOUD_PR_ID` — The Pull Request ID in the connected version control system +- `DBT_CLOUD_GIT_SHA` — The git commit SHA which is being run for this Pull Request build ### Example usage Environment variables can be used in many ways, and they give you the power and flexibility to do what you want to do more easily in dbt Cloud. -#### Clone private packages + + Now that you can set secrets as environment variables, you can pass git tokens into your package HTTPS URLs to allow for on-the-fly cloning of private repositories. Read more about enabling [private package cloning](/docs/build/packages#private-packages). -#### Dynamically set your warehouse in your Snowflake connection + + + + Environment variables make it possible to dynamically change the Snowflake virtual warehouse size depending on the job. Instead of calling the warehouse name directly in your project connection, you can reference an environment variable which will get set to a specific virtual warehouse at runtime. For example, suppose you'd like to run a full-refresh job in an XL warehouse, but your incremental job only needs to run in a medium-sized warehouse. Both jobs are configured in the same dbt Cloud environment. In your connection configuration, you can use an environment variable to set the warehouse name to `{{env_var('DBT_WAREHOUSE')}}`. Then in the job settings, you can set a different value for the `DBT_WAREHOUSE` environment variable depending on the job's workload. @@ -160,7 +167,10 @@ However, there are some limitations when using env vars with Snowflake OAuth Con Something to note, if you supply an environment variable in the account/host field, Snowflake OAuth Connection will **fail** to connect. This happens because the field doesn't pass through Jinja rendering, so dbt Cloud simply passes the literal `env_var` code into a URL string like `{{ env_var("DBT_ACCOUNT_HOST_NAME") }}.snowflakecomputing.com`, which is an invalid hostname. Use [extended attributes](/docs/deploy/deploy-environments#deployment-credentials) instead. ::: -#### Audit your run metadata + + + + Here's another motivating example that uses the dbt Cloud run ID, which is set automatically at each run. 
This additional data field can be used for auditing and debugging: ```sql @@ -186,3 +196,13 @@ select *, from users_aggregated ``` + + + + + +import SLEnvVars from '/snippets/_sl-env-vars.md'; + + + + diff --git a/website/docs/docs/build/exposures.md b/website/docs/docs/build/exposures.md index 4f027fa7d29..1a85d5fb415 100644 --- a/website/docs/docs/build/exposures.md +++ b/website/docs/docs/build/exposures.md @@ -78,4 +78,4 @@ When we generate the dbt Explorer site, you'll see the exposure appear: * [Exposure properties](/reference/exposure-properties) * [`exposure:` selection method](/reference/node-selection/methods#the-exposure-method) -* [Dashboard status tiles](/docs/deploy/dashboard-status-tiles) +* [Data health tiles](/docs/collaborate/data-tile) diff --git a/website/docs/docs/build/hooks-operations.md b/website/docs/docs/build/hooks-operations.md index 9ed20291c34..6cec2a673c0 100644 --- a/website/docs/docs/build/hooks-operations.md +++ b/website/docs/docs/build/hooks-operations.md @@ -72,6 +72,41 @@ You can use hooks to provide database-specific functionality not available out-o You can also use a [macro](/docs/build/jinja-macros#macros) to bundle up hook logic. Check out some of the examples in the reference sections for [on-run-start and on-run-end hooks](/reference/project-configs/on-run-start-on-run-end) and [pre- and post-hooks](/reference/resource-configs/pre-hook-post-hook). + + +```sql +{{ config( + pre_hook=[ + "{{ some_macro() }}" + ] +) }} +``` + + + + + +```yaml +models: + - name: + config: + pre_hook: + - "{{ some_macro() }}" +``` + + + + + +```yaml +models: + : + +pre-hook: + - "{{ some_macro() }}" +``` + + + ## About operations Operations are [macros](/docs/build/jinja-macros#macros) that you can run using the [`run-operation`](/reference/commands/run-operation) command. As such, operations aren't actually a separate resource in your dbt project — they are just a convenient way to invoke a macro without needing to run a model. diff --git a/website/docs/docs/build/incremental-microbatch.md b/website/docs/docs/build/incremental-microbatch.md new file mode 100644 index 00000000000..e1c39e6ae47 --- /dev/null +++ b/website/docs/docs/build/incremental-microbatch.md @@ -0,0 +1,309 @@ +--- +title: "About microbatch incremental models" +description: "Learn about the 'microbatch' strategy for incremental models." +id: "incremental-microbatch" +--- + +# About microbatch incremental models + +:::info Microbatch + +The `microbatch` strategy is available in beta for [dbt Cloud Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) and dbt Core v1.9. We have been developing it behind a flag to prevent unintended interactions with existing custom incremental strategies. To enable this feature, [set the environment variable](/docs/build/environment-variables#setting-and-overriding-environment-variables) `DBT_EXPERIMENTAL_MICROBATCH` to `True` in your dbt Cloud environments or wherever you're running dbt Core. + +Read and participate in the discussion: [dbt-core#10672](https://github.com/dbt-labs/dbt-core/discussions/10672) + +Refer to [Supported incremental strategies by adapter](/docs/build/incremental-strategy#supported-incremental-strategies-by-adapter) for a list of supported adapters. + +::: + +## What is "microbatch" in dbt? + +Incremental models in dbt are a [materialization](/docs/build/materializations) designed to efficiently update your data warehouse tables by only transforming and loading _new or changed data_ since the last run. 
Instead of reprocessing an entire dataset every time, incremental models process a smaller number of rows, and then append, update, or replace those rows in the existing table. This can significantly reduce the time and resources required for your data transformations. + +Microbatch incremental models make it possible to process transformations on very large time-series datasets with efficiency and resiliency. When dbt runs a microbatch model — whether for the first time, during incremental runs, or in specified backfills — it will split the processing into multiple queries (or "batches"), based on the `event_time` and `batch_size` you configure. + +Each "batch" corresponds to a single bounded time period (by default, a single day of data). Where other incremental strategies operate only on "old" and "new" data, microbatch models treat every batch as an atomic unit that can be built or replaced on its own. Each batch is independent and idempotent. This is a powerful abstraction that makes it possible for dbt to run batches separately — in the future, concurrently — and to retry them independently. + +### Example + +A `sessions` model aggregates and enriches data that comes from two other models. +- `page_views` is a large, time-series table. It contains many rows, new records almost always arrive after existing ones, and existing records rarely update. +- `customers` is a relatively small dimensional table. Customer attributes update often, and not in a time-based manner — that is, older customers are just as likely to change column values as newer customers. + +The `page_view_start` column in `page_views` is configured as that model's `event_time`. The `customers` model does not configure an `event_time`. Therefore, each batch of `sessions` will filter `page_views` to the equivalent time-bounded batch, and it will not filter `customers` (a full scan for every batch). + + + +```yaml +models: + - name: page_views + config: + event_time: page_view_start +``` + + +We run the `sessions` model on October 1, 2024, and then again on October 2. It produces the following queries: + + + + + +The `event_time` for the `sessions` model is set to `session_start`, which marks the beginning of a user’s session on the website. This setting allows dbt to combine multiple page views (each tracked by their own `page_view_start` timestamps) into a single session. This way, `session_start` differentiates the timing of individual page views from the broader timeframe of the entire user session. + + + +```sql +{{ config( + materialized='incremental', + incremental_strategy='microbatch', + event_time='session_start', + begin='2020-01-01', + batch_size='day' +) }} + +with page_views as ( + + -- this ref will be auto-filtered + select * from {{ ref('page_views') }} + +), + +customers as ( + + -- this ref won't + select * from {{ ref('customers') }} + +) + +select + page_views.id as session_id, + page_views.page_view_start as session_start, + customers.* + from page_views + left join customers + on page_views.customer_id = customers.customer_id +``` + + + + + + + + + +```sql + +with page_views as ( + + select * from ( + -- filtered on configured event_time + select * from "analytics"."page_views" + where page_view_start >= '2024-10-01 00:00:00' -- Oct 1 + and page_view_start < '2024-10-02 00:00:00' + ) + +), + +customers as ( + + select * from "analytics"."customers" + +), + +...
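-- customers is not batch-filtered here because that model doesn't configure an event_time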
+``` + + + + + + + + + +```sql + +with page_views as ( + + select * from ( + -- filtered on configured event_time + select * from "analytics"."page_views" + where page_view_start >= '2024-10-02 00:00:00' -- Oct 2 + and page_view_start < '2024-10-03 00:00:00' + ) + +), + +customers as ( + + select * from "analytics"."customers" + +), + +... +``` + + + + + + + +dbt will instruct the data platform to take the result of each batch query and insert, update, or replace the contents of the `analytics.sessions` table for the same day of data. To perform this operation, dbt will use the most efficient atomic mechanism for "full batch" replacement that is available on each data platform. + +It does not matter whether the table already contains data for that day. Given the same input data, the resulting table is the same no matter how many times a batch is reprocessed. + + + +### Relevant configs + +Several configurations are relevant to microbatch models, and some are required: + +| Config | Type | Description | Default | +|----------|------|---------------|---------| +| `event_time` | Column (required) | The column indicating "at what time did the row occur." Required for your microbatch model and any direct parents that should be filtered. | N/A | +| `begin` | Date (required) | The "beginning of time" for the microbatch model. This is the starting point for any initial or full-refresh builds. For example, a daily-grain microbatch model run on `2024-10-01` with `begin = '2023-10-01` will process 366 batches (it's a leap year!) plus the batch for "today." | N/A | +| `batch_size` | String (required) | The granularity of your batches. Supported values are `hour`, `day`, `month`, and `year` | N/A | +| `lookback` | Integer (optional) | Process X batches prior to the latest bookmark to capture late-arriving records. | `1` | + + + +As a best practice, we recommend configuring `full_refresh: False` on microbatch models so that they ignore invocations with the `--full-refresh` flag. If you need to reprocess historical data, do so with a targeted backfill that specifies explicit start and end dates. + +### Usage + +**You must write your model query to process (read and return) exactly one "batch" of data**. This is a simplifying assumption and a powerful one: +- You don’t need to think about `is_incremental` filtering +- You don't need to pick among DML strategies (upserting/merging/replacing) +- You can preview your model, and see the exact records for a given batch that will appear when that batch is processed and written to the table + +When you run a microbatch model, dbt will evaluate which batches need to be loaded, break them up into a SQL query per batch, and load each one independently. + +dbt will automatically filter upstream inputs (`source` or `ref`) that define `event_time`, based on the `lookback` and `batch_size` configs for this model. + +During standard incremental runs, dbt will process batches according to the current timestamp and the configured `lookback`, with one query per batch. + + + +**Note:** If there’s an upstream model that configures `event_time`, but you *don’t* want the reference to it to be filtered, you can specify `ref('upstream_model').render()` to opt-out of auto-filtering. This isn't generally recommended — most models that configure `event_time` are fairly large, and if the reference is not filtered, each batch will perform a full scan of this input table. 
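For instance, here's a minimal sketch of opting a single reference out of auto-filtering (the `upstream_model` name is only a placeholder):

```sql
with all_history as (

    -- .render() opts this ref out of event_time auto-filtering,
    -- so every batch reads the full upstream table
    select * from {{ ref('upstream_model').render() }}

)

select * from all_history
```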
+ +### Backfills + +Whether to fix erroneous source data or retroactively apply a change in business logic, you may need to reprocess a large amount of historical data. + +Backfilling a microbatch model is as simple as selecting it to run or build, and specifying a "start" and "end" for `event_time`. Note that `--event-time-start` and `--event-time-end` are mutually necessary, meaning that if you specify one, you must specify the other. + +As always, dbt will process the batches between the start and end as independent queries. + +```bash +dbt run --event-time-start "2024-09-01" --event-time-end "2024-09-04" +``` + + + + +### Retry + +If one or more of your batches fail, you can use `dbt retry` to reprocess _only_ the failed batches. + +![Partial retry](https://github.com/user-attachments/assets/f94c4797-dcc7-4875-9623-639f70c97b8f) + +### Timezones + +For now, dbt assumes that all values supplied are in UTC: + +- `event_time` +- `begin` +- `--event-time-start` +- `--event-time-end` + +While we may consider adding support for custom time zones in the future, we also believe that defining these values in UTC makes everyone's lives easier. + +## How `microbatch` compares to other incremental strategies? + +Most incremental models rely on the end user (you) to explicitly tell dbt what "new" means, in the context of each model, by writing a filter in an `{% if is_incremental() %}` conditional block. You are responsible for crafting this SQL in a way that queries [`{{ this }}`](/reference/dbt-jinja-functions/this) to check when the most recent record was last loaded, with an optional look-back window for late-arriving records. + +Other incremental strategies will control _how_ the data is being added into the table — whether append-only `insert`, `delete` + `insert`, `merge`, `insert overwrite`, etc — but they all have this in common. + +As an example: + +```sql +{{ + config( + materialized='incremental', + incremental_strategy='delete+insert', + unique_key='date_day' + ) +}} + +select * from {{ ref('stg_events') }} + + {% if is_incremental() %} + -- this filter will only be applied on an incremental run + -- add a lookback window of 3 days to account for late-arriving records + where date_day >= (select {{ dbt.dateadd("day", -3, "max(date_day)") }} from {{ this }}) + {% endif %} + +``` + +For this incremental model: + +- "New" records are those with a `date_day` greater than the maximum `date_day` that has previously been loaded +- The lookback window is 3 days +- When there are new records for a given `date_day`, the existing data for `date_day` is deleted and the new data is inserted + +Let’s take our same example from before, and instead use the new `microbatch` incremental strategy: + + + +```sql +{{ + config( + materialized='incremental', + incremental_strategy='microbatch', + event_time='event_occured_at', + batch_size='day', + lookback=3, + begin='2020-01-01', + full_refresh=false + ) +}} + +select * from {{ ref('stg_events') }} -- this ref will be auto-filtered +``` + + + +Where you’ve also set an `event_time` for the model’s direct parents - in this case, `stg_events`: + + + +```yaml +models: + - name: stg_events + config: + event_time: my_time_field +``` + + + +And that’s it! + +When you run the model, each batch templates a separate query. For example, if you were running the model on October 1, dbt would template separate queries for each day between September 28 and October 1, inclusive — four batches in total. 
+ +The query for `2024-10-01` would look like: + + + +```sql +select * from ( + select * from "analytics"."stg_events" + where my_time_field >= '2024-10-01 00:00:00' + and my_time_field < '2024-10-02 00:00:00' +) +``` + + + +Based on your data platform, dbt will choose the most efficient atomic mechanism to insert, update, or replace these four batches (`2024-09-28`, `2024-09-29`, `2024-09-30`, and `2024-10-01`) in the existing table. diff --git a/website/docs/docs/build/incremental-models-overview.md b/website/docs/docs/build/incremental-models-overview.md index 16c950eb331..bddc6b0a55d 100644 --- a/website/docs/docs/build/incremental-models-overview.md +++ b/website/docs/docs/build/incremental-models-overview.md @@ -42,4 +42,5 @@ Transaction management, a process used in certain data platforms, ensures that a ## Related docs - [Incremental models](/docs/build/incremental-models) to learn how to configure incremental models in dbt. - [Incremental strategies](/docs/build/incremental-strategy) to understand how dbt implements incremental models on different databases. +- [Microbatch](/docs/build/incremental-strategy) to understand a new incremental strategy intended for efficient and resilient processing of very large time-series datasets. - [Materializations best practices](/best-practices/materializations/1-guide-overview) to learn about the best practices for using materializations in dbt. diff --git a/website/docs/docs/build/incremental-models.md b/website/docs/docs/build/incremental-models.md index 2f8bbc46c3a..a56246addf3 100644 --- a/website/docs/docs/build/incremental-models.md +++ b/website/docs/docs/build/incremental-models.md @@ -94,7 +94,7 @@ Not specifying a `unique_key` will result in append-only behavior, which means d The optional `unique_key` parameter specifies a field (or combination of fields) that defines the grain of your model. That is, the field(s) identify a single unique row. You can define `unique_key` in a configuration block at the top of your model, and it can be a single column name or a list of column names. -The `unique_key` should be supplied in your model definition as a string representing a single column or a list of single-quoted column names that can be used together, for example, `['col1', 'col2', …])`. Columns used in this way should not contain any nulls, or the incremental model run may fail. Either ensure that each column has no nulls (for example with `coalesce(COLUMN_NAME, 'VALUE_IF_NULL')`), or define a single-column [surrogate key](/terms/surrogate-key) (for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)). +The `unique_key` should be supplied in your model definition as a string representing a single column or a list of single-quoted column names that can be used together, for example, `['col1', 'col2', …])`. Columns used in this way should not contain any nulls, or the incremental model may fail to match rows and generate duplicate rows. Either ensure that each column has no nulls (for example with `coalesce(COLUMN_NAME, 'VALUE_IF_NULL')`) or define a single-column [surrogate key](https://www.getdbt.com/blog/guide-to-surrogate-key) (for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)). 
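For example, here's a sketch of the surrogate-key approach; the `stg_events` ref and the `user_id`/`session_number` columns are placeholders, it assumes the `dbt_utils` package is installed, and the usual `is_incremental()` filter is omitted for brevity:

```sql
{{
    config(
        materialized='incremental',
        unique_key='event_key'
    )
}}

select
    *,
    -- collapse the null-prone column combination into a single, null-safe key
    {{ dbt_utils.generate_surrogate_key(['user_id', 'session_number']) }} as event_key
from {{ ref('stg_events') }}
```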
:::tip In cases where you need multiple columns in combination to uniquely identify each row, we recommend you pass these columns as a list (`unique_key = ['user_id', 'session_number']`), rather than a string expression (`unique_key = 'concat(user_id, session_number)'`). @@ -103,7 +103,7 @@ By using the first syntax, which is more universal, dbt can ensure that the colu When you pass a list in this way, please ensure that each column does not contain any nulls, or the incremental model run may fail. -Alternatively, you can define a single-column [surrogate key](/terms/surrogate-key), for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source). +Alternatively, you can define a single-column [surrogate key](https://www.getdbt.com/blog/guide-to-surrogate-key), for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source). ::: When you define a `unique_key`, you'll see this behavior for each row of "new" data returned by your dbt model: @@ -111,7 +111,7 @@ When you define a `unique_key`, you'll see this behavior for each row of "new" d * If the same `unique_key` is present in the "new" and "old" model data, dbt will update/replace the old row with the new row of data. The exact mechanics of how that update/replace takes place will vary depending on your database, [incremental strategy](/docs/build/incremental-strategy), and [strategy specific configs](/docs/build/incremental-strategy#strategy-specific-configs). * If the `unique_key` is _not_ present in the "old" data, dbt will insert the entire row into the table. -Please note that if there's a unique_key with more than one row in either the existing target table or the new incremental rows, the incremental model may fail depending on your database and [incremental strategy](/docs/build/incremental-strategy). If you're having issues running an incremental model, it's a good idea to double check that the unique key is truly unique in both your existing database table and your new incremental rows. You can [learn more about surrogate keys here](/terms/surrogate-key). +Please note that if there's a unique_key with more than one row in either the existing target table or the new incremental rows, the incremental model may fail depending on your database and [incremental strategy](/docs/build/incremental-strategy). If you're having issues running an incremental model, it's a good idea to double check that the unique key is truly unique in both your existing database table and your new incremental rows. You can [learn more about surrogate keys here](https://www.getdbt.com/blog/guide-to-surrogate-key). :::info While common incremental strategies, such as`delete+insert` + `merge`, might use `unique_key`, others don't. For example, the `insert_overwrite` strategy does not use `unique_key`, because it operates on partitions of data rather than individual rows. For more information, see [About incremental_strategy](/docs/build/incremental-strategy). @@ -212,11 +212,11 @@ Currently, `on_schema_change` only tracks top-level column changes. It does not ### Default behavior -This is the behavior if `on_schema_change: ignore`, which is set by default, and on older versions of dbt. +This is the behavior of `on_schema_change: ignore`, which is set by default. If you add a column to your incremental model, and execute a `dbt run`, this column will _not_ appear in your target table. 
-Similarly, if you remove a column from your incremental model, and execute a `dbt run`, this column will _not_ be removed from your target table. +If you remove a column from your incremental model and execute a `dbt run`, `dbt run` will fail. Instead, whenever the logic of your incremental changes, execute a full-refresh run of both your incremental model and any downstream models. diff --git a/website/docs/docs/build/incremental-strategy.md b/website/docs/docs/build/incremental-strategy.md index 8e86da0eba8..30de135b09b 100644 --- a/website/docs/docs/build/incremental-strategy.md +++ b/website/docs/docs/build/incremental-strategy.md @@ -10,32 +10,31 @@ There are various strategies to implement the concept of incremental materializa * The reliability of your `unique_key`. * The support of certain features in your data platform. -An optional `incremental_strategy` config is provided in some adapters that controls the code that dbt uses -to build incremental models. +An optional `incremental_strategy` config is provided in some adapters that controls the code that dbt uses to build incremental models. -### Supported incremental strategies by adapter - -Click the name of the adapter in the below table for more information about supported incremental strategies. +:::info Microbatch -The `merge` strategy is available in dbt-postgres and dbt-redshift beginning in dbt v1.6. +The [`microbatch` incremental strategy](/docs/build/incremental-microbatch) is intended for large time-series datasets. dbt will process the incremental model in multiple queries (or "batches") based on a configured `event_time` column. Depending on the volume and nature of your data, this can be more efficient and resilient than using a single query for adding new data. -| data platform adapter | `append` | `merge` | `delete+insert` | `insert_overwrite` | -|-----------------------------------------------------------------------------------------------------|:--------:|:-------:|:---------------:|:------------------:| -| [dbt-postgres](/reference/resource-configs/postgres-configs#incremental-materialization-strategies) | ✅ | ✅ | ✅ | | -| [dbt-redshift](/reference/resource-configs/redshift-configs#incremental-materialization-strategies) | ✅ | ✅ | ✅ | | -| [dbt-bigquery](/reference/resource-configs/bigquery-configs#merge-behavior-incremental-models) | | ✅ | | ✅ | -| [dbt-spark](/reference/resource-configs/spark-configs#incremental-models) | ✅ | ✅ | | ✅ | -| [dbt-databricks](/reference/resource-configs/databricks-configs#incremental-models) | ✅ | ✅ | | ✅ | -| [dbt-snowflake](/reference/resource-configs/snowflake-configs#merge-behavior-incremental-models) | ✅ | ✅ | ✅ | | -| [dbt-trino](/reference/resource-configs/trino-configs#incremental) | ✅ | ✅ | ✅ | | -| [dbt-fabric](/reference/resource-configs/fabric-configs#incremental) | ✅ | | ✅ | | +::: +### Supported incremental strategies by adapter -:::note Snowflake Configurations +This table represents the availability of each incremental strategy, based on the latest version of dbt Core and each adapter. -dbt has changed the default materialization for incremental table merges from `temporary table` to `view`. For more information about this change and instructions for setting the configuration to a temp table, please read about [Snowflake temporary tables](/reference/resource-configs/snowflake-configs#temporary-tables). +Click the name of the adapter in the below table for more information about supported incremental strategies. 
-::: +| Data platform adapter | `append` | `merge` | `delete+insert` | `insert_overwrite` | `microbatch` | +|-----------------------|:--------:|:-------:|:---------------:|:------------------:|:-------------------:| +| [dbt-postgres](/reference/resource-configs/postgres-configs#incremental-materialization-strategies) | ✅ | ✅ | ✅ | | ✅ | +| [dbt-redshift](/reference/resource-configs/redshift-configs#incremental-materialization-strategies) | ✅ | ✅ | ✅ | | | +| [dbt-bigquery](/reference/resource-configs/bigquery-configs#merge-behavior-incremental-models) | | ✅ | | ✅ | ✅ | +| [dbt-spark](/reference/resource-configs/spark-configs#incremental-models) | ✅ | ✅ | | ✅ | ✅ | +| [dbt-databricks](/reference/resource-configs/databricks-configs#incremental-models) | ✅ | ✅ | | ✅ | | +| [dbt-snowflake](/reference/resource-configs/snowflake-configs#merge-behavior-incremental-models) | ✅ | ✅ | ✅ | | ✅ | +| [dbt-trino](/reference/resource-configs/trino-configs#incremental) | ✅ | ✅ | ✅ | | | +| [dbt-fabric](/reference/resource-configs/fabric-configs#incremental) | ✅ | ✅ | ✅ | | | +| [dbt-athena](/reference/resource-configs/athena-configs#incremental-models) | ✅ | ✅ | | ✅ | | ### Configuring incremental strategy @@ -200,6 +199,7 @@ Before diving into [custom strategies](#custom-strategies), it's important to un | `delete+insert` | `get_incremental_delete_insert_sql` | | `merge` | `get_incremental_merge_sql` | | `insert_overwrite` | `get_incremental_insert_overwrite_sql` | +| `microbatch` | `get_incremental_microbatch_sql` | For example, a built-in strategy for the `append` can be defined and used with the following files: diff --git a/website/docs/docs/build/jinja-macros.md b/website/docs/docs/build/jinja-macros.md index fc4a0cad3e8..bc91e3674c9 100644 --- a/website/docs/docs/build/jinja-macros.md +++ b/website/docs/docs/build/jinja-macros.md @@ -74,7 +74,7 @@ group by 1 You can recognize Jinja based on the delimiters the language uses, which we refer to as "curlies": - **Expressions `{{ ... }}`**: Expressions are used when you want to output a string. You can use expressions to reference [variables](/reference/dbt-jinja-functions/var) and call [macros](/docs/build/jinja-macros#macros). - **Statements `{% ... %}`**: Statements don't output a string. They are used for control flow, for example, to set up `for` loops and `if` statements, to [set](https://jinja.palletsprojects.com/en/3.1.x/templates/#assignments) or [modify](https://jinja.palletsprojects.com/en/3.1.x/templates/#expression-statement) variables, or to define macros. -- **Comments `{# ... #}`**: Jinja comments are used to prevent the text within the comment from executing or outputing a string. +- **Comments `{# ... #}`**: Jinja comments are used to prevent the text within the comment from executing or outputing a string. Don't use `--` for comment. When used in a dbt model, your Jinja needs to compile to a valid query. To check what SQL your Jinja compiles to: * **Using dbt Cloud:** Click the compile button to see the compiled SQL in the Compiled SQL pane diff --git a/website/docs/docs/build/measures.md b/website/docs/docs/build/measures.md index 9458487e8d4..d60aa3f7e21 100644 --- a/website/docs/docs/build/measures.md +++ b/website/docs/docs/build/measures.md @@ -102,7 +102,7 @@ semantic_models: description: A record of every transaction that takes place. Carts are considered multiple transactions for each SKU. 
model: ref('schema.transactions') defaults: - agg_time_dimensions: metric_time + agg_time_dimension: transaction_date # --- entities --- entities: @@ -167,7 +167,7 @@ semantic_models: # --- dimensions --- dimensions: - - name: metric_time + - name: transaction_date type: time expr: date_trunc('day', ts) # expr refers to underlying column ts type_params: @@ -200,19 +200,19 @@ Parameters under the `non_additive_dimension` will specify dimensions that the m ```yaml semantic_models: - - name: subscription_id + - name: subscriptions description: A subscription table with one row per date for each active user and their subscription plans. model: ref('your_schema.subscription_table') defaults: - agg_time_dimension: metric_time + agg_time_dimension: subscription_date entities: - name: user_id type: foreign - primary_entity: subscription_table + primary_entity: subscription dimensions: - - name: metric_time + - name: subscription_date type: time expr: date_transaction type_params: @@ -224,21 +224,21 @@ semantic_models: expr: user_id agg: count_distinct non_additive_dimension: - name: metric_time + name: subscription_date window_choice: max - name: mrr description: Aggregate by summing all users' active subscription plans expr: subscription_value agg: sum non_additive_dimension: - name: metric_time + name: subscription_date window_choice: max - - name: mrr + - name: user_mrr description: Group by user_id to achieve each user's MRR expr: subscription_value agg: sum non_additive_dimension: - name: metric_time + name: subscription_date window_choice: max window_groupings: - user_id @@ -255,15 +255,15 @@ We can query the semi-additive metrics using the following syntax: For dbt Cloud: ```bash -dbt sl query --metrics mrr_by_end_of_month --group-by metric_time__month --order metric_time__month -dbt sl query --metrics mrr_by_end_of_month --group-by metric_time__week --order metric_time__week +dbt sl query --metrics mrr_by_end_of_month --group-by subscription__subscription_date__month --order subscription__subscription_date__month +dbt sl query --metrics mrr_by_end_of_month --group-by subscription__subscription_date__week --order subscription__subscription_date__week ``` For dbt Core: ```bash -mf query --metrics mrr_by_end_of_month --group-by metric_time__month --order metric_time__month -mf query --metrics mrr_by_end_of_month --group-by metric_time__week --order metric_time__week +mf query --metrics mrr_by_end_of_month --group-by subscription__subscription_date__month --order subscription__subscription_date__month +mf query --metrics mrr_by_end_of_month --group-by subscription__subscription_date__week --order subscription__subscription_date__week ``` import SetUpPages from '/snippets/_metrics-dependencies.md'; diff --git a/website/docs/docs/build/metricflow-commands.md b/website/docs/docs/build/metricflow-commands.md index 405f9b08ca4..2da5618b86f 100644 --- a/website/docs/docs/build/metricflow-commands.md +++ b/website/docs/docs/build/metricflow-commands.md @@ -8,7 +8,7 @@ tags: [Metrics, Semantic Layer] Once you define metrics in your dbt project, you can query metrics, dimensions, and dimension values, and validate your configs using the MetricFlow commands. -MetricFlow allows you to define and query metrics in your dbt project in the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation), [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud), or [dbt Core](/docs/core/installation-overview). 
To experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and dynamically query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. +MetricFlow allows you to define and query metrics in your dbt project in the [dbt Cloud](/docs/cloud/about-develop-dbt) or [dbt Core](/docs/core/installation-overview). To experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and dynamically query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. MetricFlow is compatible with Python versions 3.8, 3.9, 3.10, and 3.11. @@ -18,70 +18,72 @@ MetricFlow is a dbt package that allows you to define and query metrics in your Using MetricFlow with dbt Cloud means you won't need to manage versioning — your dbt Cloud account will automatically manage the versioning. -**dbt Cloud jobs** — MetricFlow commands aren't supported in dbt Cloud jobs yet. However, you can add MetricFlow validations with your git provider (such as GitHub Actions) by installing MetricFlow (`python -m pip install metricflow`). This allows you to run MetricFlow commands as part of your continuous integration checks on PRs. +dbt Cloud jobs support the `dbt sl validate` command to [automatically test your semantic nodes](/docs/deploy/ci-jobs#semantic-validations-in-ci). You can also add MetricFlow validations with your git provider (such as GitHub Actions) by installing MetricFlow (`python -m pip install metricflow`). This allows you to run MetricFlow commands as part of your continuous integration checks on PRs. - + -- MetricFlow [commands](#metricflow-commands) are embedded in the dbt Cloud CLI. This means you can immediately run them once you install the dbt Cloud CLI and don't need to install MetricFlow separately. -- You don't need to manage versioning — your dbt Cloud account will automatically manage the versioning for you. +In dbt Cloud, run MetricFlow commands directly in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or in the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation). - - - - -:::info -You can create metrics using MetricFlow in the dbt Cloud IDE. However, support for running MetricFlow commands in the IDE will be available soon. -::: +For dbt Cloud CLI users, MetricFlow commands are embedded in the dbt Cloud CLI, which means you can immediately run them once you install the dbt Cloud CLI and don't need to install MetricFlow separately. You don't need to manage versioning because your dbt Cloud account will automatically manage the versioning for you. - - -:::tip Use dbt Cloud CLI for semantic layer development - -You can use the dbt Cloud CLI for the experience in defining and querying metrics in your dbt project. - -A benefit to using the dbt Cloud is that you won't need to manage versioning — your dbt Cloud account will automatically manage the versioning. -::: + You can install [MetricFlow](https://github.com/dbt-labs/metricflow#getting-started) from [PyPI](https://pypi.org/project/dbt-metricflow/). You need to use `pip` to install MetricFlow on Windows or Linux operating systems: + + 1. Create or activate your virtual environment `python -m venv venv` 2. Run `pip install dbt-metricflow` * You can install MetricFlow using PyPI as an extension of your dbt adapter in the command line. 
To install the adapter, run `python -m pip install "dbt-metricflow[your_adapter_name]"` and add the adapter name at the end of the command. For example, for a Snowflake adapter run `python -m pip install "dbt-metricflow[snowflake]"` -**Note**, you'll need to manage versioning between dbt Core, your adapter, and MetricFlow. + - + + +1. Create or activate your virtual environment `python -m venv venv` +2. Run `pip install dbt-metricflow` + * You can install MetricFlow using PyPI as an extension of your dbt adapter in the command line. To install the adapter, run `python -m pip install "dbt-metricflow[adapter_package_name]"` and add the adapter name at the end of the command. For example, for a Snowflake adapter run `python -m pip install "dbt-metricflow[dbt-snowflake]"` - + + +**Note**, you'll need to manage versioning between dbt Core, your adapter, and MetricFlow. Something to note, MetricFlow `mf` commands return an error if you have a Metafont latex package installed. To run `mf` commands, uninstall the package. + + + ## MetricFlow commands MetricFlow provides the following commands to retrieve metadata and query metrics. - + -You can use the `dbt sl` prefix before the command name to execute them in the dbt Cloud CLI. For example, to list all metrics, run `dbt sl list metrics`. For a complete list of the MetricFlow commands and flags, run the `dbt sl --help` command in your terminal. +You can use the `dbt sl` prefix before the command name to execute them in the dbt Cloud IDE or dbt Cloud CLI. For example, to list all metrics, run `dbt sl list metrics`. -- [`list`](#list) — Retrieves metadata values. -- [`list metrics`](#list-metrics) — Lists metrics with dimensions. -- [`list dimensions`](#list) — Lists unique dimensions for metrics. -- [`list dimension-values`](#list-dimension-values) — List dimensions with metrics. -- [`list entities`](#list-entities) — Lists all unique entities. -- [`list saved-queries`](#list-saved-queries) — Lists available saved queries. Use the `--show-exports` flag to display each export listed under a saved query. -- [`query`](#query) — Query metrics, saved queries, and dimensions you want to see in the command line interface. Refer to [query examples](#query-examples) to help you get started. -- [`export`](#export) — Runs exports for a singular saved query for testing and generating exports in your development environment. You can also use the `--select` flag to specify particular exports from a saved query. -- [`export-all`](#export-all) — Runs exports for multiple saved queries at once, saving time and effort. +dbt Cloud CLI users can run `dbt sl --help` in the terminal for a complete list of the MetricFlow commands and flags. + +The following table lists the commands compatible with the dbt Cloud IDE and dbt Cloud CLI: + +|
Command | Description
| dbt Cloud IDE | dbt Cloud CLI | +|---------|-------------|---------------|---------------| +| [`list metrics`](#list-metrics) | Lists metrics with dimensions. | ✅ | ✅ | +| [`list dimensions`](#list) | Lists unique dimensions for metrics. | ✅ | ✅ | +| [`list dimension-values`](#list-dimension-values) | List dimensions with metrics. | ✅ | ✅ | +| [`list entities`](#list-entities) | Lists all unique entities. | ✅ | ✅ | +| [`list saved-queries`](#list-saved-queries) | Lists available saved queries. Use the `--show-exports` flag to display each export listed under a saved query or `--show-parameters` to show the full query parameters each saved query uses. | ✅ | ✅ | +| [`query`](#query) | Query metrics, saved queries, and dimensions you want to see in the command line interface. Refer to [query examples](#query-examples) to help you get started. | ✅ | ✅ | +| [`validate`](#validate) | Validates semantic model configurations. | ✅ | ✅ | +| [`export`](#export) | Runs exports for a singular saved query for testing and generating exports in your development environment. You can also use the `--select` flag to specify particular exports from a saved query. | ❌ | ✅ | +| [`export-all`](#export-all) | Runs exports for multiple saved queries at once, saving time and effort. | ❌ | ✅ | @@ -100,11 +102,10 @@ Check out the following video for a short video demo of how to query or preview
- + Use the `mf` prefix before the command name to execute them in dbt Core. For example, to list all metrics, run `mf list metrics`. -- [`list`](#list) — Retrieves metadata values. - [`list metrics`](#list-metrics) — Lists metrics with dimensions. - [`list dimensions`](#list) — Lists unique dimensions for metrics. - [`list dimension-values`](#list-dimension-values) — List dimensions with metrics. @@ -117,17 +118,7 @@ Use the `mf` prefix before the command name to execute them in dbt Core. For exa
-### List - -This command retrieves metadata values related to [Metrics](/docs/build/metrics-overview), [Dimensions](/docs/build/dimensions), and [Entities](/docs/build/entities) values. - - ### List metrics - -```bash -dbt sl list # In dbt Cloud -mf list # In dbt Core -``` This command lists the metrics with their available dimensions: ```bash @@ -218,30 +209,28 @@ The list of available saved queries: - Export(new_customer_orders, alias=orders, schemas=customer_schema, exportAs=TABLE) ``` -### Validate-configs +### Validate The following command performs validations against the defined semantic model configurations. -Note, in dbt Cloud you don't need to validate the Semantic Layer config separately. Running a dbt command (such as `dbt parse`, `dbt build`, `dbt compile`, `dbt run`) automatically checks it. - ```bash - -mf validate-configs # In dbt Core +dbt sl validate # For dbt Cloud users +mf validate-configs # For dbt Core users Options: - --dw-timeout INTEGER Optional timeout for data warehouse + --timeout # dbt Cloud only + Optional timeout for data warehouse validation in dbt Cloud. + --dw-timeout INTEGER # dbt Core only + Optional timeout for data warehouse validation steps. Default None. - --skip-dw If specified, skips the data warehouse - validations - --show-all If specified, prints warnings and future- - errors - --verbose-issues If specified, prints any extra details - issues might have - --semantic-validation-workers INTEGER - Optional. Uses the number of workers - specified to run the semantic validations. - Should only be used for exceptionally large - configs + --skip-dw # dbt Core only + Skips the data warehouse validations. + --show-all # dbt Core only + Prints warnings and future errors. + --verbose-issues # dbt Core only + Prints extra details about issues. + --semantic-validation-workers INTEGER # dbt Core only + Uses specified number of workers for large configs. --help Show this message and exit. ``` @@ -270,7 +259,7 @@ Create a new query with MetricFlow and execute it against your data platform. Th ```bash dbt sl query --metrics --group-by # In dbt Cloud -dbt sl query --saved-query # In dbt Cloud CLI +dbt sl query --saved-query # In dbt Cloud mf query --metrics --group-by # In dbt Core @@ -362,13 +351,13 @@ mf query --metrics order_total,users_active --group-by metric_time # In dbt Core -You can include multiple dimensions in a query. For example, you can group by the `is_food_order` dimension to confirm if orders were for food or not. +You can include multiple dimensions in a query. For example, you can group by the `is_food_order` dimension to confirm if orders were for food or not. Note that when you query a dimension, you need to specify the primary entity for that dimension. In the following example, the primary entity is `order_id`. **Query** ```bash -dbt sl query --metrics order_total --group-by metric_time,is_food_order # In dbt Cloud +dbt sl query --metrics order_total --group-by order_id__is_food_order # In dbt Cloud -mf query --metrics order_total --group-by metric_time,is_food_order # In dbt Core +mf query --metrics order_total --group-by order_id__is_food_order # In dbt Core ``` **Result** @@ -392,13 +381,15 @@ mf query --metrics order_total --group-by metric_time,is_food_order # In dbt Cor You can add order and limit functions to filter and present the data in a readable format. The following query limits the data set to 10 records and orders them by `metric_time`, descending. Note that using the `-` prefix will sort the query in descending order. 
Without the `-` prefix sorts the query in ascending order. + Note that when you query a dimension, you need to specify the primary entity for that dimension. In the following example, the primary entity is `order_id`. + **Query** ```bash # In dbt Cloud -dbt sl query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order-by -metric_time +dbt sl query --metrics order_total --group-by order_id__is_food_order --limit 10 --order-by -metric_time # In dbt Core -mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order-by -metric_time +mf query --metrics order_total --group-by order_id__is_food_order --limit 10 --order-by -metric_time ``` **Result** @@ -418,15 +409,15 @@ mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 - -You can further filter the data set by adding a `where` clause to your query. The following example shows you how to query the `order_total` metric, grouped by `metric_time` with multiple where statements (orders that are food orders and orders from the week starting on or after Feb 1st, 2024): +You can further filter the data set by adding a `where` clause to your query. The following example shows you how to query the `order_total` metric, grouped by `is_food_order` with multiple where statements (orders that are food orders and orders from the week starting on or after Feb 1st, 2024). Note that when you query a dimension, you need to specify the primary entity for that dimension. In the following example, the primary entity is `order_id`. **Query** ```bash # In dbt Cloud -dbt sl query --metrics order_total --group-by metric_time --where "{{ Dimension('order_id__is_food_order') }} = True and metric_time__week >= '2024-02-01'" +dbt sl query --metrics order_total --group-by order_id__is_food_order --where "{{ Dimension('order_id__is_food_order') }} = True and metric_time__week >= '2024-02-01'" # In dbt Core -mf query --metrics order_total --group-by metric_time --where "{{ Dimension('order_id__is_food_order') }} = True and metric_time__week >= '2024-02-01'" +mf query --metrics order_total --group-by order_id__is_food_order --where "{{ Dimension('order_id__is_food_order') }} = True and metric_time__week >= '2024-02-01'" ``` **Result** @@ -452,16 +443,16 @@ mf query --metrics order_total --group-by metric_time --where "{{ Dimension('ord To filter by time, there are dedicated start and end time options. Using these options to filter by time allows MetricFlow to further optimize query performance by pushing down the where filter when appropriate. - + Note that when you query a dimension, you need to specify the primary entity for that dimension. In the following example, the primary entity is `order_id`. **Query** ```bash # In dbt Core -mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order-by -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' +mf query --metrics order_total --group-by order_id__is_food_order --limit 10 --order-by -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' ``` **Result** @@ -505,8 +496,6 @@ The following tabs present additional query examples, like exporting to a CSV. S - - Add `--compile` (or `--explain` for dbt Core users) to your query to view the SQL generated by MetricFlow. 
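For instance, a minimal sketch reusing the `order_total` metric from the earlier examples:

```bash
# dbt Cloud: return the generated SQL instead of query results
dbt sl query --metrics order_total --group-by metric_time --compile

# dbt Core equivalent
mf query --metrics order_total --group-by metric_time --explain
```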
@@ -525,24 +514,24 @@ mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 - ```bash ✔ Success 🦄 - query completed after 0.28 seconds 🔎 SQL (remove --compile to see data or add --show-dataflow-plan to see the generated dataflow plan): -SELECT +select metric_time , is_food_order - , SUM(order_cost) AS order_total -FROM ( - SELECT - cast(ordered_at as date) AS metric_time + , sum(order_cost) as order_total +from ( + select + cast(ordered_at as date) as metric_time , is_food_order , order_cost - FROM ANALYTICS.js_dbt_sl_demo.orders orders_src_1 - WHERE cast(ordered_at as date) BETWEEN CAST('2017-08-22' AS TIMESTAMP) AND CAST('2017-08-27' AS TIMESTAMP) + from analytics.js_dbt_sl_demo.orders orders_src_1 + where cast(ordered_at as date) between cast('2017-08-22' as timestamp) and cast('2017-08-27' as timestamp) ) subq_3 -WHERE is_food_order = True -GROUP BY +where is_food_order = True +group by metric_time , is_food_order -ORDER BY metric_time DESC -LIMIT 10 +order by metric_time desc +limit 10 ``` diff --git a/website/docs/docs/build/metricflow-time-spine.md b/website/docs/docs/build/metricflow-time-spine.md index 997d85e38a8..5f16af38023 100644 --- a/website/docs/docs/build/metricflow-time-spine.md +++ b/website/docs/docs/build/metricflow-time-spine.md @@ -1,18 +1,128 @@ --- title: MetricFlow time spine id: metricflow-time-spine -description: "MetricFlow expects a default timespine table called metricflow_time_spine" +description: "MetricFlow expects a default time spine table called metricflow_time_spine" sidebar_label: "MetricFlow time spine" tags: [Metrics, Semantic Layer] --- + -MetricFlow uses a timespine table to construct cumulative metrics. By default, MetricFlow expects the timespine table to be named `metricflow_time_spine` and doesn't support using a different name. + -To create this table, you need to create a model in your dbt project called `metricflow_time_spine` and add the following code: +It's common in analytics engineering to have a date dimension or "time spine" table as a base table for different types of time-based joins and aggregations. The structure of this table is typically a base column of daily or hourly dates, with additional columns for other time grains, like fiscal quarters, defined based on the base column. You can join other tables to the time spine on the base column to calculate metrics like revenue at a point in time, or to aggregate to a specific time grain. - +MetricFlow requires you to define at least one dbt model which provides a time-spine, and then specify (in YAML) the columns to be used for time-based joins. MetricFlow will join against the time-spine model for the following types of metrics and dimensions: + +- [Cumulative metrics](/docs/build/cumulative) +- [Metric offsets](/docs/build/derived#derived-metric-offset) +- [Conversion metrics](/docs/build/conversion) +- [Slowly Changing Dimensions](/docs/build/dimensions#scd-type-ii) +- [Metrics](/docs/build/metrics-overview) with the `join_to_timespine` configuration set to true + +To see the generated SQL for the metric and dimension types that use time spine joins, refer to the respective documentation or add the `compile=True` flag when querying the Semantic Layer to return the compiled SQL. + +## Configuring time spine in YAML + + Time spine models are normal dbt models with extra configurations that tell dbt and MetricFlow how to use specific columns by defining their properties. 
Add the [`models` key](/reference/model-properties) for the time spine in your `models/` directory. If your project already includes a calendar table or date dimension, you can configure that table as a time spine. Otherwise, review the [example time-spine tables](#example-time-spine-tables) to create one. + + Some things to note when configuring time spine models: + +- Add the configurations under the `time_spine` key for that [model's properties](/reference/model-properties), just as you would add a description or tests. +- You only need to configure time-spine models that the Semantic Layer should recognize. +- At a minimum, define a time-spine table for a daily grain. +- You can optionally define additional time-spine tables for different granularities, like hourly. Review the [granularity considerations](#granularity-considerations) when deciding which tables to create. +- If you're looking to specify the grain of a time dimension so that MetricFlow can transform the underlying column to the required granularity, refer to the [Time granularity documentation](/docs/build/dimensions?dimension=time_gran) + +:::tip +If you previously used a model called `metricflow_time_spine`, you no longer need to create this specific model. You can now configure MetricFlow to use any date dimension or time spine table already in your project by updating the `model` setting in the Semantic Layer. + +If you don’t have a date dimension table, you can still create one by using the code snippet in the [next section](#creating-a-time-spine-table) to build your time spine model. +::: + +### Creating a time spine table + +MetricFlow supports granularities ranging from milliseconds to years. Refer to the [Dimensions page](/docs/build/dimensions?dimension=time_gran#time) (time_granularity tab) to find the full list of supported granularities. + +To create a time spine table from scratch, you can do so by adding the following code to your dbt project. +This example creates a time spine at an hourly grain and a daily grain: `time_spine_hourly` and `time_spine_daily`. + + + + +```yaml +[models:](/reference/model-properties) +# Hourly time spine + - name: time_spine_hourly + description: my favorite time spine + time_spine: + standard_granularity_column: date_hour # column for the standard grain of your table, must be date time type. + custom_granularities: + - name: fiscal_year + column_name: fiscal_year_column + columns: + - name: date_hour + granularity: hour # set granularity at column-level for standard_granularity_column + +# Daily time spine + - name: time_spine_daily + time_spine: + standard_granularity_column: date_day # column for the standard grain of your table + columns: + - name: date_day + granularity: day # set granularity at column-level for standard_granularity_column +``` + + + + + + + +- This example configuration shows a time spine model called `time_spine_hourly` and `time_spine_daily`. It sets the time spine configurations under the `time_spine` key. +- The `standard_granularity_column` is the column that maps to one of our [standard granularities](/docs/build/dimensions?dimension=time_gran). This column must be set under the `columns` key and should have a grain that is finer or equal to any custom granularity columns defined in the same model. + - It needs to reference a column defined under the `columns` key, in this case, `date_hour` and `date_day`, respectively. + - It sets the granularity at the column-level using the `granularity` key, in this case, `hour` and `day`, respectively. 
+- MetricFlow will use the `standard_granularity_column` as the join key when joining the time spine table to another source table. +- [The `custom_granularities` field](#custom-calendar), (available in Versionless and dbt v1.9 and higher) lets you specify non-standard time periods like `fiscal_year` or `retail_month` that your organization may use. - +For an example project, refer to our [Jaffle shop](https://github.com/dbt-labs/jaffle-sl-template/blob/main/models/marts/_models.yml) example. + +### Considerations when choosing which granularities to create{#granularity-considerations} + +- MetricFlow will use the time spine with the largest compatible granularity for a given query to ensure the most efficient query possible. For example, if you have a time spine at a monthly grain, and query a dimension at a monthly grain, MetricFlow will use the monthly time spine. If you only have a daily time spine, MetricFlow will use the daily time spine and date_trunc to month. +- You can add a time spine for each granularity you intend to use if query efficiency is more important to you than configuration time, or storage constraints. For most engines, the query performance difference should be minimal and transforming your time spine to a coarser grain at query time shouldn't add significant overhead to your queries. +- We recommend having a time spine at the finest grain used in any of your dimensions to avoid unexpected errors. For example, if you have dimensions at an hourly grain, you should have a time spine at an hourly grain. + +## Example time spine tables + +### Daily + + ```sql {{ @@ -24,10 +134,10 @@ To create this table, you need to create a model in your dbt project called `met with days as ( {{ - dbt_utils.date_spine( + dbt.date_spine( 'day', "to_date('01/01/2000','mm/dd/yyyy')", - "to_date('01/01/2027','mm/dd/yyyy')" + "to_date('01/01/2025','mm/dd/yyyy')" ) }} @@ -39,11 +149,47 @@ final as ( ) select * from final +where date_day > dateadd(year, -4, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) ``` - +### Daily (BigQuery) + +Use this model if you're using BigQuery. BigQuery supports `DATE()` instead of `TO_DATE()`: - + + +```sql + +{{config(materialized='table')}} +with days as ( + {{dbt.date_spine( + 'day', + "DATE(2000,01,01)", + "DATE(2025,01,01)" + ) + }} +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * +from final +-- filter the time spine to a specific range +where date_day > dateadd(year, -4, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) +``` + + + + + +### Hourly + + ```sql {{ @@ -52,43 +198,64 @@ select * from final ) }} -with days as ( +with hours as ( {{ dbt.date_spine( - 'day', + 'hour', "to_date('01/01/2000','mm/dd/yyyy')", - "to_date('01/01/2027','mm/dd/yyyy')" + "to_date('01/01/2025','mm/dd/yyyy')" ) }} ), final as ( - select cast(date_day as date) as date_day - from days + select cast(date_hour as timestamp) as date_hour + from hours ) select * from final +-- filter the time spine to a specific range +where date_day > dateadd(year, -4, current_timestamp()) +and date_hour < dateadd(day, 30, current_timestamp()) ``` + + + - + + + + +MetricFlow uses a time spine table to construct cumulative metrics. By default, MetricFlow expects the time spine table to be named `metricflow_time_spine` and doesn't support using a different name. For supported granularities, refer to the [dimensions](/docs/build/dimensions?dimension=time_gran#time) page. 
+ +To create this table, you need to create a model in your dbt project called `metricflow_time_spine` and add the following code: + +### Daily + + - ```sql --- filename: metricflow_time_spine.sql --- BigQuery supports DATE() instead of TO_DATE(). Use this model if you're using BigQuery -{{config(materialized='table')}} -with days as ( - {{dbt_utils.date_spine( - 'day', - "DATE(2000,01,01)", - "DATE(2030,01,01)" +{{ + config( + materialized = 'table', ) +}} + +with days as ( + + {{ + dbt.date_spine( + 'day', + "to_date('01/01/2000','mm/dd/yyyy')", + "to_date('01/01/2025','mm/dd/yyyy')" + ) }} + ), final as ( @@ -96,23 +263,26 @@ final as ( from days ) -select * -from final +select * from final +where date_day > dateadd(year, -4, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) ``` - + + +### Daily (BigQuery) - +Use this model if you're using BigQuery. BigQuery supports `DATE()` instead of `TO_DATE()`: + + ```sql --- filename: metricflow_time_spine.sql --- BigQuery supports DATE() instead of TO_DATE(). Use this model if you're using BigQuery {{config(materialized='table')}} with days as ( {{dbt.date_spine( 'day', "DATE(2000,01,01)", - "DATE(2030,01,01)" + "DATE(2025,01,01)" ) }} ), @@ -124,8 +294,71 @@ final as ( select * from final +-- filter the time spine to a specific range +where date_day > dateadd(year, -4, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) ``` + + +You only need to include the `date_day` column in the table. MetricFlow can handle broader levels of detail, but finer grains are only supported in versions 1.9 and higher. + + + + +## Custom calendar + + + +The ability to configure custom calendars, such as a fiscal calendar, is available in [dbt Cloud Versionless](/docs/dbt-versions/versionless-cloud) or dbt Core [v1.9 and higher](/docs/dbt-versions/core). + +To access this feature, [upgrade to Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) or your dbt Core version to v1.9 or higher. + -You only need to include the `date_day` column in the table. MetricFlow can handle broader levels of detail, but it doesn't currently support finer grains. + + +Custom date transformations can be complex, and organizations often have unique needs that can’t be easily generalized. Creating a custom calendar model allows you to define these transformations in SQL, offering more flexibility than native transformations in MetricFlow. This approach lets you map custom columns back to MetricFlow granularities, ensuring consistency while giving you control over the transformations. + +For example, if you use a custom calendar in your organization, such as a fiscal calendar, you can configure it in MetricFlow using its date and time operations. + +- This is useful for calculating metrics based on a custom calendar, such as fiscal quarters or weeks. +- Use the `custom_granularities` key to define a non-standard time period for querying data, such as a `retail_month` or `fiscal_week`, instead of standard options like `day`, `month`, or `year`. +- This feature provides more control over how time-based metrics are calculated. + + + +When working with custom calendars in MetricFlow, it's important to ensure: + +- Consistent data types — Both your dimension column and the time spine column should use the same data type to allow accurate comparisons. Functions like `DATE_TRUNC` don't change the data type of the input in some databases (like Snowflake). Using different data types can lead to mismatches and inaccurate results. 
+ + We recommend using `DATETIME` or `TIMESTAMP` data types for your time dimensions and time spine, as they support all granularities. The `DATE` data type may not support smaller granularities like hours or minutes. + +- Time zones — MetricFlow currently doesn't perform any timezone manipulation. When working with timezone-aware data, inconsistent time zones may lead to unexpected results during aggregations and comparisons. + +For example, if your time spine column is `TIMESTAMP` type and your dimension column is `DATE` type, comparisons between these columns might not work as intended. To fix this, convert your `DATE` column to `TIMESTAMP`, or make sure both columns are the same data type. + + + +### Add custom granularities + +To add custom granularities, the Semantic Layer supports custom calendar configurations that allow users to query data using non-standard time periods like `fiscal_year` or `retail_month`. You can define these custom granularities (all lowercased) by modifying your model's YAML configuration like this: + + + +```yaml +models: + - name: my_time_spine + description: my favorite time spine + time_spine: + standard_granularity_column: date_day + custom_granularities: + - name: fiscal_year + column_name: fiscal_year_column +``` + + +#### Coming soon +Note that features like calculating offsets and period-over-period will be supported soon! + + diff --git a/website/docs/docs/build/metrics-overview.md b/website/docs/docs/build/metrics-overview.md index a96c22be883..7021a6d7330 100644 --- a/website/docs/docs/build/metrics-overview.md +++ b/website/docs/docs/build/metrics-overview.md @@ -9,7 +9,7 @@ pagination_next: "docs/build/cumulative" Once you've created your semantic models, it's time to start adding metrics. Metrics can be defined in the same YAML files as your semantic models, or split into separate YAML files into any other subdirectories (provided that these subdirectories are also within the same dbt project repo). -The keys for metrics definitions are: +This article explains the different supported metric types you can add to your dbt project. The keys for metrics definitions are: @@ -27,6 +27,8 @@ The keys for metrics definitions are: Here's a complete example of the metrics spec configuration: + + ```yaml metrics: - name: metric name ## Required @@ -42,6 +44,8 @@ metrics: {{ Dimension('entity__name') }} > 0 and {{ Dimension(' entity__another_name') }} is not null and {{ Metric('metric_name', group_by=['entity_name']) }} > 5 ``` + + @@ -61,6 +65,8 @@ metrics: Here's a complete example of the metrics spec configuration: + + ```yaml metrics: - name: metric name ## Required @@ -76,19 +82,60 @@ metrics: {{ Dimension('entity__name') }} > 0 and {{ Dimension(' entity__another_name') }} is not null and {{ Metric('metric_name', group_by=['entity_name']) }} > 5 ``` + -This page explains the different supported metric types you can add to your dbt project. - import SLCourses from '/snippets/_sl-course.md'; -### Conversion metrics +## Default granularity for metrics + + +Default time granularity for metrics is useful if your time dimension has a very fine grain, like second or hour, but you typically query metrics rolled up at a coarser grain. + +To set the default time granularity for metrics, you need to be on dbt Cloud Versionless or dbt v1.9 and higher. + + + + + +It's possible to define a default time granularity for metrics if it's different from the granularity of the default aggregation time dimensions (`metric_time`). 
This is useful if your time dimension has a very fine grain, like second or hour, but you typically query metrics rolled up at a coarser grain. + +The granularity can be set using the `time_granularity` parameter on the metric, and defaults to `day`. If day is not available because the dimension is defined at a coarser granularity, it will default to the defined granularity for the dimension. + +### Example +You have a semantic model called `orders` with a time dimension called `order_time`. You want the `orders` metric to roll up to `monthly` by default; however, you want the option to look at these metrics hourly. You can set the `time_granularity` parameter on the `order_time` dimension to `hour`, and then set the `time_granularity` parameter in the metric to `month`. +```yaml +semantic_models: + ... + dimensions: + - name: order_time + type: time + type_params: + time_granularity: hour + measures: + - name: orders + expr: 1 + agg: sum + metrics: + - name: orders + type: simple + label: Count of Orders + type_params: + measure: + name: orders + time_granularity: month -- Optional, defaults to day +``` + + +## Conversion metrics [Conversion metrics](/docs/build/conversion) help you track when a base event and a subsequent conversion event occur for an entity within a set time period. + + ```yaml metrics: - name: The metric name @@ -112,11 +159,14 @@ metrics: - base_property: DIMENSION or ENTITY conversion_property: DIMENSION or ENTITY ``` + -### Cumulative metrics +## Cumulative metrics [Cumulative metrics](/docs/build/cumulative) aggregate a measure over a given window. If no window is specified, the window will accumulate the measure over all of the recorded time period. Note that you will need to create the [time spine model](/docs/build/metricflow-time-spine) before you add cumulative metrics. + + ```yaml # Cumulative metrics aggregate a measure over a given window. The window is considered infinite if no window parameter is passed (accumulate the measure over all of time) metrics: @@ -130,11 +180,14 @@ metrics: join_to_timespine: true window: 7 days ``` + -### Derived metrics +## Derived metrics [Derived metrics](/docs/build/derived) are defined as an expression of other metrics. Derived metrics allow you to do calculations on top of metrics. + + ```yaml metrics: - name: order_gross_profit @@ -149,6 +202,8 @@ metrics: - name: order_cost alias: cost ``` + + -### Ratio metrics +## Ratio metrics [Ratio metrics](/docs/build/ratio) involve a numerator metric and a denominator metric. A `filter` string can be applied to both the numerator and denominator or separately to the numerator or denominator. + + ```yaml metrics: - name: cancellation_rate @@ -191,8 +248,9 @@ metrics: filter: | {{ Dimension('customer__country') }} = 'MX' ``` + -### Simple metrics +## Simple metrics [Simple metrics](/docs/build/simple) point directly to a measure. You may think of it as a function that takes only one measure as the input. @@ -200,6 +258,8 @@ metrics: **Note:** If you've already defined the measure using the `create_metric: True` parameter, you don't need to create simple metrics. However, if you would like to include a constraint on top of the measure, you will need to create a simple type metric. + + ```yaml metrics: - name: cancellations @@ -214,6 +274,7 @@ metrics: {{ Dimension('order__value')}} > 100 and {{Dimension('user__acquisition')}} is not null join_to_timespine: true ``` + ## Filters @@ -221,6 +282,10 @@ A filter is configured using Jinja templating. 
Use the following syntax to refer Refer to [Metrics as dimensions](/docs/build/ref-metrics-in-filters) for details on how to use metrics as dimensions with metric filters: + + + + ```yaml filter: | {{ Entity('entity_name') }} @@ -232,10 +297,40 @@ filter: | {{ TimeDimension('time_dimension', 'granularity') }} filter: | - {{ Metric('metric_name', group_by=['entity_name']) }} # Available in v1.8 or with [versionless (/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) dbt Cloud] + {{ Metric('metric_name', group_by=['entity_name']) }} + +``` + + + + + + + + +```yaml +filter: | + {{ Entity('entity_name') }} + +filter: | + {{ Dimension('primary_entity__dimension_name') }} + +filter: | + {{ TimeDimension('time_dimension', 'granularity') }} + +``` + + + +For example, if you want to filter for the order date dimension grouped by month, use the following syntax: + +```yaml +filter: | + {{ TimeDimension('order_date', 'month') }} + ``` -### Further configuration +## Further configuration You can set more metadata for your metrics, which can be used by other tools later on. The way this metadata is used will vary based on the specific integration partner diff --git a/website/docs/docs/build/packages.md b/website/docs/docs/build/packages.md index 0b69d10cee6..49cd7e00b1c 100644 --- a/website/docs/docs/build/packages.md +++ b/website/docs/docs/build/packages.md @@ -20,9 +20,10 @@ In dbt, libraries like these are called _packages_. dbt's packages are so powerf * Models to understand [Redshift](https://hub.getdbt.com/dbt-labs/redshift/latest/) privileges. * Macros to work with data loaded by [Stitch](https://hub.getdbt.com/dbt-labs/stitch_utils/latest/). -dbt _packages_ are in fact standalone dbt projects, with models and macros that tackle a specific problem area. As a dbt user, by adding a package to your project, the package's models and macros will become part of your own project. This means: +dbt _packages_ are in fact standalone dbt projects, with models, macros, and other resources that tackle a specific problem area. As a dbt user, by adding a package to your project, all of the package's resources will become part of your own project. This means: * Models in the package will be materialized when you `dbt run`. * You can use `ref` in your own models to refer to models from the package. +* You can use `source` to refer to sources in the package. * You can use macros in the package in your own project. * It's important to note that defining and installing dbt packages is different from [defining and installing Python packages](/docs/build/python-models#using-pypi-packages) @@ -82,11 +83,7 @@ packages: version: [">=0.7.0", "<0.8.0"] ``` - - -Beginning in v1.7, `dbt deps` "pins" each package by default. See ["Pinning packages"](#pinning-packages) for details. - - +`dbt deps` "pins" each package by default. See ["Pinning packages"](#pinning-packages) for details. Where possible, we recommend installing packages via dbt Hub, since this allows dbt to handle duplicate dependencies. This is helpful in situations such as: * Your project uses both the dbt-utils and Snowplow packages, and the Snowplow package _also_ uses the dbt-utils package. @@ -145,18 +142,8 @@ packages: revision: 4e28d6da126e2940d17f697de783a717f2503188 ``` - - -We **strongly recommend** ["pinning" your packages](#pinning-packages) to a specific release by specifying a release name. - - - - - By default, `dbt deps` "pins" each package. See ["Pinning packages"](#pinning-packages) for details. 
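To tie the Hub and git installation styles together with the pinning guidance above, a `packages.yml` might look like the following minimal sketch. The git URL, package name, and tag are placeholders, not real packages:

```yaml
packages:
  # Hub package with a version range; `dbt deps` resolves the newest release
  # in the range and records the result in package-lock.yml
  - package: dbt-labs/dbt_utils
    version: [">=0.7.0", "<0.8.0"]

  # Git package pinned to a tagged release (placeholder URL and tag)
  - git: "https://github.com/your-org/your-package.git"
    revision: 1.0.0
```

Because both entries are pinned (by range or by tag), subsequent runs of `dbt deps` won't pull in unreviewed changes unless you run `dbt deps --upgrade`.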
- - ### Internally hosted tarball URL Some organizations have security requirements to pull resources only from internal services. To address the need to install packages from hosted environments such as Artifactory or cloud storage buckets, dbt Core enables you to install packages from internally-hosted tarball URLs. @@ -318,18 +305,6 @@ When you remove a package from your `packages.yml` file, it isn't automatically ### Pinning packages - - -We **strongly recommend** "pinning" your package to a specific release by specifying a tagged release name or a specific commit hash. - -If you do not provide a revision, or if you use the main branch, then any updates to the package will be incorporated into your project the next time you run `dbt deps`. While we generally try to avoid making breaking changes to these packages, they are sometimes unavoidable. Pinning a package revision helps prevent your code from changing without your explicit approval. - -To find the latest release for a package, navigate to the `Releases` tab in the relevant GitHub repository. For example, you can find all of the releases for the dbt-utils package [here](https://github.com/dbt-labs/dbt-utils/releases). - - - - - Beginning with v1.7, running [`dbt deps`](/reference/commands/deps) "pins" each package by creating or updating the `package-lock.yml` file in the _project_root_ where `packages.yml` is recorded. - The `package-lock.yml` file contains a record of all packages installed. @@ -337,8 +312,6 @@ Beginning with v1.7, running [`dbt deps`](/reference/commands/deps) "pins" each For example, if you use a branch name, the `package-lock.yml` file pins to the head commit. If you use a version range, it pins to the latest release. In either case, subsequent commits or versions will **not** be installed. To get new commits or versions, run `dbt deps --upgrade` or add `package-lock.yml` to your .gitignore file. - - As of v0.14.0, dbt will warn you if you install a package using the `git` syntax without specifying a revision (see below). ### Configuring packages diff --git a/website/docs/docs/build/ratio-metrics.md b/website/docs/docs/build/ratio-metrics.md index cc1d13b7835..fdaeb878450 100644 --- a/website/docs/docs/build/ratio-metrics.md +++ b/website/docs/docs/build/ratio-metrics.md @@ -24,6 +24,8 @@ Ratio allows you to create a ratio between two metrics. You simply specify a num The following displays the complete specification for ratio metrics, along with an example. + + ```yaml metrics: - name: The metric name # Required @@ -40,11 +42,19 @@ metrics: filter: Filter for the denominator # Optional alias: Alias for the denominator # Optional ``` + For advanced data modeling, you can use `fill_nulls_with` and `join_to_timespine` to [set null metric values to zero](/docs/build/fill-nulls-advanced), ensuring numeric values for every data row. ## Ratio metrics example +These examples demonstrate how to create ratio metrics in your model. They cover basic and advanced use cases, including applying filters to the numerator and denominator metrics. + +#### Example 1 +This example is a basic ratio metric that calculates the ratio of food orders to total orders: + + + ```yaml metrics: - name: food_order_pct @@ -55,6 +65,30 @@ metrics: numerator: food_orders denominator: orders ``` + + +#### Example 2 +This example is a ratio metric that calculates the ratio of food orders to total orders, with a filter and alias applied to the numerator. 
Note that in order to add these attributes, you'll need to use an explicit key for the name attribute too. + + + +```yaml +metrics: + - name: food_order_pct + description: "The food order count as a ratio of the total order count, filtered by location" + label: Food order ratio by location + type: ratio + type_params: + numerator: + name: food_orders + filter: location = 'New York' + alias: ny_food_orders + denominator: + name: orders + filter: location = 'New York' + alias: ny_orders +``` + ## Ratio metrics using different semantic models @@ -109,6 +143,8 @@ on Users can define constraints on input metrics for a ratio metric by applying a filter directly to the input metric, like so: + + ```yaml metrics: - name: frequent_purchaser_ratio @@ -123,6 +159,7 @@ metrics: denominator: name: distinct_purchasers ``` + Note the `filter` and `alias` parameters for the metric referenced in the numerator. - Use the `filter` parameter to apply a filter to the metric it's attached to. diff --git a/website/docs/docs/build/saved-queries.md b/website/docs/docs/build/saved-queries.md index e9beffca15f..ed56d13dcc9 100644 --- a/website/docs/docs/build/saved-queries.md +++ b/website/docs/docs/build/saved-queries.md @@ -102,24 +102,7 @@ saved_queries: ``` -Note, you can set `export_as` to both the saved query and the exports [config](/reference/resource-properties/config), with the exports config value taking precedence. If a key isn't set in the exports config, it will inherit the saved query config value. - -#### Project-level saved queries - -To enable saved queries at the project level, you can set the `saved-queries` configuration in the [`dbt_project.yml` file](/reference/dbt_project.yml). This saves you time in configuring saved queries in each file: - - - -```yaml -saved-queries: - my_saved_query: - config: - +cache: - enabled: true -``` - - -For more information on `dbt_project.yml` and config naming conventions, see the [dbt_project.yml reference page](/reference/dbt_project.yml#naming-convention). +Note, that you can set `export_as` to both the saved query and the exports [config](/reference/resource-properties/config), with the exports config value taking precedence. If a key isn't set in the exports config, it will inherit the saved query config value. #### Where clause @@ -171,6 +154,22 @@ saved_queries: +#### Project-level saved queries + +To enable saved queries at the project level, you can set the `saved-queries` configuration in the [`dbt_project.yml` file](/reference/dbt_project.yml). This saves you time in configuring saved queries in each file: + + + +```yaml +saved-queries: + my_saved_query: + +cache: + enabled: true +``` + + +For more information on `dbt_project.yml` and config naming conventions, see the [dbt_project.yml reference page](/reference/dbt_project.yml#naming-convention). + To build `saved_queries`, use the [`--resource-type` flag](/reference/global-configs/resource-type) and run the command `dbt build --resource-type saved_query`. 
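As a concrete illustration of the command above, here is a minimal sketch. The saved query name `order_metrics` is a placeholder for one defined in your project, and the selector form assumes your saved query is addressable by name:

```bash
# Build every saved query defined in the project
dbt build --resource-type saved_query

# Build a single saved query by name (placeholder name)
dbt build --resource-type saved_query --select order_metrics
```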
## Configure exports diff --git a/website/docs/docs/build/semantic-models.md b/website/docs/docs/build/semantic-models.md index e136b2a064d..609d7f1ff8d 100644 --- a/website/docs/docs/build/semantic-models.md +++ b/website/docs/docs/build/semantic-models.md @@ -119,8 +119,6 @@ semantic_models: type: categorical ``` - - Semantic models support [`meta`](/reference/resource-configs/meta), [`group`](/reference/resource-configs/group), and [`enabled`](/reference/resource-configs/enabled) [`config`](/reference/resource-properties/config) property in either the schema file or at the project level: - Semantic model config in `models/semantic.yml`: @@ -148,8 +146,6 @@ Semantic models support [`meta`](/reference/resource-configs/meta), [`group`](/r For more information on `dbt_project.yml` and config naming conventions, see the [dbt_project.yml reference page](/reference/dbt_project.yml#naming-convention). - - ### Name Define the name of the semantic model. You must define a unique name for the semantic model. The semantic graph will use this name to identify the model, and you can update it at any time. Avoid using double underscores (\_\_) in the name as they're not supported. @@ -227,20 +223,20 @@ You can refer to entities (join keys) in a semantic model using the `name` param ### Dimensions -[Dimensions](/docs/build/dimensions) are different ways to organize or look at data. For example, you might group data by things like region, country, or what job someone has. However, trying to set up a system that covers every possible way to group data can be time-consuming and prone to errors. +[Dimensions](/docs/build/dimensions) are different ways to organize or look at data. They are effectively the group by parameters for metrics. For example, you might group data by things like region, country, or job title. -Instead of trying to figure out all the possible groupings ahead of time, MetricFlow lets you ask for the data you need and sorts out how to group it dynamically. You tell it what groupings (dimensions parameters) you're interested in by giving it a `name` (either a column or SQL expression like "country" or "user role") and the `type` of grouping it is (`categorical` or `time`). Categorical groups are for things you can't measure in numbers, while time groups represent dates. +MetricFlow takes a dynamic approach when making dimensions available for metrics. Instead of trying to figure out all the possible groupings ahead of time, MetricFlow lets you ask for the dimensions you need and constructs any joins necessary to reach the requested dimensions at query time. The advantage of this approach is that you don't need to set up a system that pre-materializes every possible way to group data, which can be time-consuming and prone to errors. Instead, you define the dimensions (group by parameters) you're interested in within the semantic model, and they will automatically be made available for valid metrics. -- Dimensions are identified using the name parameter, just like identifiers. -- The naming of groups must be unique within a semantic model, but not across semantic models since MetricFlow, uses entities to determine the appropriate groups. -- MetricFlow requires all dimensions to be tied to a primary entity. +Dimensions have the following characteristics: + +- There are two types of dimensions: categorical and time. Categorical dimensions are for things you can't measure in numbers, while time dimensions represent dates and timestamps. 
+- Dimensions are bound to the primary entity of the semantic model in which they are defined. For example, if a dimension called `full_name` is defined in a model with `user` as a primary entity, then `full_name` is scoped to the `user` entity. To reference this dimension, you would use the fully qualified dimension name `user__full_name`. +- The naming of dimensions must be unique in each semantic model with the same primary entity. Dimension names can be repeated if defined in semantic models with a different primary entity. -While there's technically no limit to the number of dimensions in a semantic model, it's important to ensure the model remains effective and efficient for its intended purpose. :::info For time groups For semantic models with a measure, you must have a [primary time group](/docs/build/dimensions#time). - ::: ### Measures diff --git a/website/docs/docs/build/simple.md b/website/docs/docs/build/simple.md index a5294c5eeb8..f57d498d290 100644 --- a/website/docs/docs/build/simple.md +++ b/website/docs/docs/build/simple.md @@ -11,7 +11,7 @@ Simple metrics are metrics that directly reference a single measure, without any The parameters, description, and type for simple metrics are: - :::tip +:::tip Note that we use the double colon (::) to indicate whether a parameter is nested within another parameter. So for example, `query_params::metrics` means the `metrics` parameter is nested under `query_params`. ::: diff --git a/website/docs/docs/build/snapshots.md b/website/docs/docs/build/snapshots.md index 82b5104fcef..f5321aa626a 100644 --- a/website/docs/docs/build/snapshots.md +++ b/website/docs/docs/build/snapshots.md @@ -18,215 +18,205 @@ Snapshots implement [type-2 Slowly Changing Dimensions](https://en.wikipedia.org | id | status | updated_at | | -- | ------ | ---------- | -| 1 | pending | 2019-01-01 | +| 1 | pending | 2024-01-01 | Now, imagine that the order goes from "pending" to "shipped". That same record will now look like: | id | status | updated_at | | -- | ------ | ---------- | -| 1 | shipped | 2019-01-02 | +| 1 | shipped | 2024-01-02 | This order is now in the "shipped" state, but we've lost the information about when the order was last in the "pending" state. This makes it difficult (or impossible) to analyze how long it took for an order to ship. dbt can "snapshot" these changes to help you understand how values in a row change over time. Here's an example of a snapshot table for the previous example: | id | status | updated_at | dbt_valid_from | dbt_valid_to | | -- | ------ | ---------- | -------------- | ------------ | -| 1 | pending | 2019-01-01 | 2019-01-01 | 2019-01-02 | -| 1 | shipped | 2019-01-02 | 2019-01-02 | `null` | +| 1 | pending | 2024-01-01 | 2024-01-01 | 2024-01-02 | +| 1 | shipped | 2024-01-02 | 2024-01-02 | `null` | -In dbt, snapshots are `select` statements, defined within a snapshot block in a `.sql` file (typically in your `snapshots` directory). You'll also need to configure your snapshot to tell dbt how to detect record changes. - +## Configuring snapshots - +:::info Previewing or compiling snapshots in IDE not supported -```sql -{% snapshot orders_snapshot %} - -{{ - config( - target_database='analytics', - target_schema='snapshots', - unique_key='id', +It is not possible to "preview data" or "compile sql" for snapshots in dbt Cloud. Instead, [run the `dbt snapshot` command](#how-snapshots-work) in the IDE. 
- strategy='timestamp', - updated_at='updated_at', - ) -}} +::: -select * from {{ source('jaffle_shop', 'orders') }} + -{% endsnapshot %} -``` +- To configure snapshots in versions 1.8 and earlier, refer to [Configure snapshots in versions 1.8 and earlier](#configure-snapshots-in-versions-18-and-earlier). These versions use an older syntax where snapshots are defined within a snapshot block in a `.sql` file, typically located in your `snapshots` directory. +- Note that defining multiple resources in a single file can significantly slow down parsing and compilation. For faster and more efficient management, consider the updated snapshot YAML syntax, [available in Versionless](/docs/dbt-versions/versionless-cloud) or [dbt Core v1.9 and later](/docs/dbt-versions/core). - - +Configure your snapshots in YAML files to tell dbt how to detect record changes. Define snapshots configurations in YAML files, alongside your models, for a cleaner, faster, and more consistent set up. + + + +```yaml +snapshots: + - name: string + relation: relation # source('my_source', 'my_table') or ref('my_model') + config: + [database](/reference/resource-configs/database): string + [schema](/reference/resource-configs/schema): string + [alias](/reference/resource-configs/alias): string + [strategy](/reference/resource-configs/strategy): timestamp | check + [unique_key](/reference/resource-configs/unique_key): column_name_or_expression + [check_cols](/reference/resource-configs/check_cols): [column_name] | all + [updated_at](/reference/resource-configs/updated_at): column_name + [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes): true | false + [snapshot_meta_column_names](/reference/resource-configs/snapshot_meta_column_names): dictionary -```sql -{% snapshot orders_snapshot %} - -{{ - config( - unique_key='id', - schema='snapshots', - strategy='timestamp', - updated_at='updated_at', - ) -}} - -select * from {{ source('jaffle_shop', 'orders') }} - -{% endsnapshot %} ``` - - -:::info Preview or Compile Snapshots in IDE - -It is not possible to "preview data" or "compile sql" for snapshots in dbt Cloud. Instead, run the `dbt snapshot` command in the IDE by completing the following steps. - -::: - -When you run the [`dbt snapshot` command](/reference/commands/snapshot): -* **On the first run:** dbt will create the initial snapshot table — this will be the result set of your `select` statement, with additional columns including `dbt_valid_from` and `dbt_valid_to`. All records will have a `dbt_valid_to = null`. -* **On subsequent runs:** dbt will check which records have changed or if any new records have been created: - - The `dbt_valid_to` column will be updated for any existing records that have changed - - The updated record and any new records will be inserted into the snapshot table. These records will now have `dbt_valid_to = null` +The following table outlines the configurations available for snapshots: -Snapshots can be referenced in downstream models the same way as referencing models — by using the [ref](/reference/dbt-jinja-functions/ref) function. +| Config | Description | Required? 
| Example | +| ------ | ----------- | --------- | ------- | +| [database](/reference/resource-configs/database) | Specify a custom database for the snapshot | No | analytics | +| [schema](/reference/resource-configs/schema) | Specify a custom schema for the snapshot | No | snapshots | +| [alias](/reference/resource-configs/alias) | Specify an alias for the snapshot | No | your_custom_snapshot | +| [strategy](/reference/resource-configs/strategy) | The snapshot strategy to use. Valid values: `timestamp` or `check` | Yes | timestamp | +| [unique_key](/reference/resource-configs/unique_key) | A column or expression for the record | Yes | id | +| [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | +| [updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | +| [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source and set `dbt_valid_to` to current time if the record no longer exists | No | True | +| [snapshot_meta_column_names](/reference/resource-configs/snapshot_meta_column_names) | Customize the names of the snapshot meta fields | No | dictionary | -## Example +- In versions prior to v1.9, the `target_schema` (required) and `target_database` (optional) configurations defined a single schema or database to build a snapshot across users and environment. This created problems when testing or developing a snapshot, as there was no clear separation between development and production environments. In v1.9, `target_schema` became optional, allowing snapshots to be environment-aware. By default, without `target_schema` or `target_database` defined, snapshots now use the `generate_schema_name` or `generate_database_name` macros to determine where to build. Developers can still set a custom location with [`schema`](/reference/resource-configs/schema) and [`database`](/reference/resource-configs/database) configs, consistent with other resource types. +- A number of other configurations are also supported (for example, `tags` and `post-hook`). For the complete list, refer to [Snapshot configurations](/reference/snapshot-configs). +- You can configure snapshots from both the `dbt_project.yml` file and a `config` block. For more information, refer to the [configuration docs](/reference/snapshot-configs). -To add a snapshot to your project: -1. Create a file in your `snapshots` directory with a `.sql` file extension, e.g. `snapshots/orders.sql` -2. Use a `snapshot` block to define the start and end of a snapshot: +### Add a snapshot to your project - +To add a snapshot to your project follow these steps. For users on versions 1.8 and earlier, refer to [Configure snapshots in versions 1.8 and earlier](#configure-snapshots-in-versions-18-and-earlier). -```sql -{% snapshot orders_snapshot %} +1. Create a YAML file in your `snapshots` directory: `snapshots/orders_snapshot.yml` and add your configuration details. You can also configure your snapshot from your `dbt_project.yml` file ([docs](/reference/snapshot-configs)). -{% endsnapshot %} -``` + - + ```yaml + snapshots: + - name: orders_snapshot + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + database: analytics + unique_key: id + strategy: timestamp + updated_at: updated_at -3. 
Write a `select` statement within the snapshot block (tips for writing a good snapshot query are below). This select statement defines the results that you want to snapshot over time. You can use `sources` and `refs` here. + ``` + - +2. Since snapshots focus on configuration, the transformation logic is minimal. Typically, you'd select all data from the source. If you need to apply transformations (like filters, deduplication), it's best practice to define an ephemeral model and reference it in your snapshot configuration. -```sql -{% snapshot orders_snapshot %} + -select * from {{ source('jaffle_shop', 'orders') }} + ```yaml + {{ config(materialized='ephemeral') }} -{% endsnapshot %} -``` + select * from {{ source('jaffle_shop', 'orders') }} + ``` + - +3. Check whether the result set of your query includes a reliable timestamp column that indicates when a record was last updated. For our example, the `updated_at` column reliably indicates record changes, so we can use the `timestamp` strategy. If your query result set does not have a reliable timestamp, you'll need to instead use the `check` strategy — more details on this below. -4. Check whether the result set of your query includes a reliable timestamp column that indicates when a record was last updated. For our example, the `updated_at` column reliably indicates record changes, so we can use the `timestamp` strategy. If your query result set does not have a reliable timestamp, you'll need to instead use the `check` strategy — more details on this below. +4. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example, a new table will be created at `analytics.snapshots.orders_snapshot`. The [`schema`](/reference/resource-configs/schema) config will utilize the `generate_schema_name` macro. -5. Add configurations to your snapshot using a `config` block (more details below). You can also configure your snapshot from your `dbt_project.yml` file ([docs](/reference/snapshot-configs)). + ``` + $ dbt snapshot + Running with dbt=1.9.0 - + 15:07:36 | Concurrency: 8 threads (target='dev') + 15:07:36 | + 15:07:36 | 1 of 1 START snapshot snapshots.orders_snapshot...... [RUN] + 15:07:36 | 1 of 1 OK snapshot snapshots.orders_snapshot..........[SELECT 3 in 1.82s] + 15:07:36 | + 15:07:36 | Finished running 1 snapshots in 0.68s. - + Completed successfully -```sql -{% snapshot orders_snapshot %} + Done. PASS=2 ERROR=0 SKIP=0 TOTAL=1 + ``` -{{ - config( - target_database='analytics', - target_schema='snapshots', - unique_key='id', +5. Inspect the results by selecting from the table dbt created (`analytics.snapshots.orders_snapshot`). After the first run, you should see the results of your query, plus the [snapshot meta fields](#snapshot-meta-fields) as described later on. - strategy='timestamp', - updated_at='updated_at', - ) -}} +6. Run the `dbt snapshot` command again and inspect the results. If any records have been updated, the snapshot should reflect this. -select * from {{ source('jaffle_shop', 'orders') }} +7. Select from the `snapshot` in downstream models using the `ref` function. -{% endsnapshot %} -``` + - + ```sql + select * from {{ ref('orders_snapshot') }} + ``` + -6. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example a new table will be created at `analytics.snapshots.orders_snapshot`. You can change the `target_database` configuration, the `target_schema` configuration and the name of the snapshot (as defined in `{% snapshot .. %}`) will change how dbt names this table. +8. 
Snapshots are only useful if you run them frequently — schedule the `dbt snapshot` command to run regularly. - +### Configuration best practices - + -```sql -{% snapshot orders_snapshot %} +This strategy handles column additions and deletions better than the `check` strategy. -{{ - config( - schema='snapshots', - unique_key='id', - strategy='timestamp', - updated_at='updated_at', - ) -}} + -select * from {{ source('jaffle_shop', 'orders') }} + -{% endsnapshot %} -``` +The unique key is used by dbt to match rows up, so it's extremely important to make sure this key is actually unique! If you're snapshotting a source, I'd recommend adding a uniqueness test to your source ([example](https://github.com/dbt-labs/jaffle_shop/blob/8e7c853c858018180bef1756ec93e193d9958c5b/models/staging/schema.yml#L26)). + - + -6. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example, a new table will be created at `analytics.snapshots.orders_snapshot`. The [`schema`](/reference/resource-configs/schema) config will utilize the `generate_schema_name` macro. + +Snapshots cannot be rebuilt. As such, it's a good idea to put snapshots in a separate schema so end users know they are special. From there, you may want to set different privileges on your snapshots compared to your models, and even run them as a different user (or role, depending on your warehouse) to make it very difficult to drop a snapshot unless you really want to. + + -``` -$ dbt snapshot -Running with dbt=1.8.0 + -15:07:36 | Concurrency: 8 threads (target='dev') -15:07:36 | -15:07:36 | 1 of 1 START snapshot snapshots.orders_snapshot...... [RUN] -15:07:36 | 1 of 1 OK snapshot snapshots.orders_snapshot..........[SELECT 3 in 1.82s] -15:07:36 | -15:07:36 | Finished running 1 snapshots in 0.68s. + -Completed successfully +Snapshots can't be rebuilt. Because of this, it's a good idea to put snapshots in a separate schema so end users know they're special. From there, you may want to set different privileges on your snapshots compared to your models, and even run them as a different user (or role, depending on your warehouse) to make it very difficult to drop a snapshot unless you really want to. -Done. PASS=2 ERROR=0 SKIP=0 TOTAL=1 -``` + -7. Inspect the results by selecting from the table dbt created. After the first run, you should see the results of your query, plus the [snapshot meta fields](#snapshot-meta-fields) as described below. + -8. Run the `snapshot` command again, and inspect the results. If any records have been updated, the snapshot should reflect this. + If you need to clean or transform your data before snapshotting, create an ephemeral model or a staging model that applies the necessary transformations. Then, reference this model in your snapshot configuration. This approach keeps your snapshot definitions clean and allows you to test and run transformations separately. -9. Select from the `snapshot` in downstream models using the `ref` function. + + - +### How snapshots work -```sql -select * from {{ ref('orders_snapshot') }} -``` - - +When you run the [`dbt snapshot` command](/reference/commands/snapshot): +* **On the first run:** dbt will create the initial snapshot table — this will be the result set of your `select` statement, with additional columns including `dbt_valid_from` and `dbt_valid_to`. All records will have a `dbt_valid_to = null`. 
+* **On subsequent runs:** dbt will check which records have changed or if any new records have been created: + - The `dbt_valid_to` column will be updated for any existing records that have changed + - The updated record and any new records will be inserted into the snapshot table. These records will now have `dbt_valid_to = null` -10. Schedule the `snapshot` command to run regularly — snapshots are only useful if you run them frequently. +Note, these column names can be customized to your team or organizational conventions using the [snapshot_meta_column_names](#snapshot-meta-fields) config. +Snapshots can be referenced in downstream models the same way as referencing models — by using the [ref](/reference/dbt-jinja-functions/ref) function. ## Detecting row changes -Snapshot "strategies" define how dbt knows if a row has changed. There are two strategies built-in to dbt — `timestamp` and `check`. +Snapshot "strategies" define how dbt knows if a row has changed. There are two strategies built-in to dbt: +- [Timestamp](#timestamp-strategy-recommended) — Uses an `updated_at` column to determine if a row has changed. +- [Check](#check-strategy) — Compares a list of columns between their current and historical values to determine if a row has changed. ### Timestamp strategy (recommended) The `timestamp` strategy uses an `updated_at` field to determine if a row has changed. If the configured `updated_at` column for a row is more recent than the last time the snapshot ran, then dbt will invalidate the old record and record the new one. If the timestamps are unchanged, then dbt will not take any action. @@ -266,27 +256,19 @@ The `timestamp` strategy requires the following configurations: - - -```sql -{% snapshot orders_snapshot_timestamp %} - - {{ - config( - schema='snapshots', - strategy='timestamp', - unique_key='id', - updated_at='updated_at', - ) - }} - - select * from {{ source('jaffle_shop', 'orders') }} - -{% endsnapshot %} + + +```yaml +snapshots: + - name: orders_snapshot_timestamp + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + unique_key: id + strategy: timestamp + updated_at: updated_at ``` - - ### Check strategy @@ -298,15 +280,12 @@ The `check` strategy requires the following configurations: | ------ | ----------- | ------- | | check_cols | A list of columns to check for changes, or `all` to check all columns | `["name", "email"]` | - - :::caution check_cols = 'all' The `check` snapshot strategy can be configured to track changes to _all_ columns by supplying `check_cols = 'all'`. It is better to explicitly enumerate the columns that you want to check. Consider using a to condense many columns into a single column. 
::: - **Example Usage** @@ -336,23 +315,19 @@ The `check` snapshot strategy can be configured to track changes to _all_ column - - -```sql -{% snapshot orders_snapshot_check %} - - {{ - config( - schema='snapshots', - strategy='check', - unique_key='id', - check_cols=['status', 'is_cancelled'], - ) - }} - - select * from {{ source('jaffle_shop', 'orders') }} - -{% endsnapshot %} + + +```yaml +snapshots: + - name: orders_snapshot_check + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + unique_key: id + strategy: check + check_cols: + - status + - is_cancelled ``` @@ -397,112 +372,42 @@ For this configuration to work with the `timestamp` strategy, the configured `up - - -```sql -{% snapshot orders_snapshot_hard_delete %} - - {{ - config( - schema='snapshots', - strategy='timestamp', - unique_key='id', - updated_at='updated_at', - invalidate_hard_deletes=True, - ) - }} - - select * from {{ source('jaffle_shop', 'orders') }} - -{% endsnapshot %} + + +```yaml +snapshots: + - name: orders_snapshot_hard_delete + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + unique_key: id + strategy: timestamp + updated_at: updated_at + invalidate_hard_deletes: true ``` -## Configuring snapshots -### Snapshot configurations -There are a number of snapshot-specific configurations: - - - -| Config | Description | Required? | Example | -| ------ | ----------- | --------- | ------- | -| [target_database](/reference/resource-configs/target_database) | The database that dbt should render the snapshot table into | No | analytics | -| [target_schema](/reference/resource-configs/target_schema) | The schema that dbt should render the snapshot table into | Yes | snapshots | -| [strategy](/reference/resource-configs/strategy) | The snapshot strategy to use. One of `timestamp` or `check` | Yes | timestamp | -| [unique_key](/reference/resource-configs/unique_key) | A column or expression for the record | Yes | id | -| [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | -| [updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | -| [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source, and set `dbt_valid_to` current time if no longer exists | No | True | - -A number of other configurations are also supported (e.g. `tags` and `post-hook`), check out the full list [here](/reference/snapshot-configs). - -Snapshots can be configured from both your `dbt_project.yml` file and a `config` block, check out the [configuration docs](/reference/snapshot-configs) for more information. - -Note: BigQuery users can use `target_project` and `target_dataset` as aliases for `target_database` and `target_schema`, respectively. - - - - - -| Config | Description | Required? | Example | -| ------ | ----------- | --------- | ------- | -| [database](/reference/resource-configs/database) | Specify a custom database for the snapshot | No | analytics | -| [schema](/reference/resource-configs/schema) | Specify a custom schema for the snapshot | No | snapshots | -| [alias](/reference/resource-configs/alias) | Specify an alias for the snapshot | No | your_custom_snapshot | -| [strategy](/reference/resource-configs/strategy) | The snapshot strategy to use. 
Valid values: `timestamp` or `check` | Yes | timestamp | -| [unique_key](/reference/resource-configs/unique_key) | A column or expression for the record | Yes | id | -| [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | -| [updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | -| [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source and set `dbt_valid_to` to current time if the record no longer exists | No | True | - -In versions prior to v1.9, the `target_schema` (required) and `target_database` (optional) configurations defined a single schema or database to build a snapshot into across users and environments. This created problems when testing or developing a snapshot, as there was no clear separation between development and production environments. In v1.9, support was added for environment-aware snapshots by making `target_schema` optional. Snapshots, by default with no `target_schema` or `target_database` config defined, now resolve the schema or database to build the snapshot into using the `generate_schema_name` or `generate_database_name` macros. Developers can optionally define a custom location for snapshots to build to with the [`schema`](/reference/resource-configs/schema) and [`database`](/reference/resource-configs/database) configs, as is consistent with other resource types. - -A number of other configurations are also supported (for example, `tags` and `post-hook`). For the complete list, refer to [Snapshot configurations](/reference/snapshot-configs). - -You can configure snapshots from both the `dbt_project.yml` file and a `config` block. For more information, refer to the [configuration docs](/reference/snapshot-configs). - - - -### Configuration best practices -#### Use the `timestamp` strategy where possible -This strategy handles column additions and deletions better than the `check` strategy. - -#### Ensure your unique key is really unique -The unique key is used by dbt to match rows up, so it's extremely important to make sure this key is actually unique! If you're snapshotting a source, I'd recommend adding a uniqueness test to your source ([example](https://github.com/dbt-labs/jaffle_shop/blob/8e7c853c858018180bef1756ec93e193d9958c5b/models/staging/schema.yml#L26)). - - - -#### Use a `target_schema` that is separate to your analytics schema -Snapshots cannot be rebuilt. As such, it's a good idea to put snapshots in a separate schema so end users know they are special. From there, you may want to set different privileges on your snapshots compared to your models, and even run them as a different user (or role, depending on your warehouse) to make it very difficult to drop a snapshot unless you really want to. - - - - - -#### Use a schema that is separate to your models' schema -Snapshots can't be rebuilt. Because of this, it's a good idea to put snapshots in a separate schema so end users know they're special. From there, you may want to set different privileges on your snapshots compared to your models, and even run them as a different user (or role, depending on your warehouse) to make it very difficult to drop a snapshot unless you really want to. - - - ## Snapshot query best practices -#### Snapshot source data. 
-Your models should then select from these snapshots, treating them like regular data sources. As much as possible, snapshot your source data in its raw form and use downstream models to clean up the data +This section outlines some best practices for writing snapshot queries: + +- #### Snapshot source data + Your models should then select from these snapshots, treating them like regular data sources. As much as possible, snapshot your source data in its raw form and use downstream models to clean up the data -#### Use the `source` function in your query. -This helps when understanding data lineage in your project. +- #### Use the `source` function in your query + This helps when understanding data lineage in your project. -#### Include as many columns as possible. -In fact, go for `select *` if performance permits! Even if a column doesn't feel useful at the moment, it might be better to snapshot it in case it becomes useful – after all, you won't be able to recreate the column later. +- #### Include as many columns as possible + In fact, go for `select *` if performance permits! Even if a column doesn't feel useful at the moment, it might be better to snapshot it in case it becomes useful – after all, you won't be able to recreate the column later. -#### Avoid joins in your snapshot query. -Joins can make it difficult to build a reliable `updated_at` timestamp. Instead, snapshot the two tables separately, and join them in downstream models. +- #### Avoid joins in your snapshot query + Joins can make it difficult to build a reliable `updated_at` timestamp. Instead, snapshot the two tables separately, and join them in downstream models. -#### Limit the amount of transformation in your query. -If you apply business logic in a snapshot query, and this logic changes in the future, it can be impossible (or, at least, very difficult) to apply the change in logic to your snapshots. +- #### Limit the amount of transformation in your query + If you apply business logic in a snapshot query, and this logic changes in the future, it can be impossible (or, at least, very difficult) to apply the change in logic to your snapshots. Basically – keep your query as simple as possible! Some reasonable exceptions to these recommendations include: * Selecting specific columns if the table is wide. @@ -512,6 +417,8 @@ Basically – keep your query as simple as possible! Some reasonable exceptions Snapshot tables will be created as a clone of your source dataset, plus some additional meta-fields*. +Starting in 1.9 or with [dbt Cloud Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless), these column names can be customized to your team or organizational conventions via the [`snapshot_meta_column_names`](/reference/resource-configs/snapshot_meta_column_names) config. + | Field | Meaning | Usage | | -------------- | ------- | ----- | | dbt_valid_from | The timestamp when this snapshot row was first inserted | This column can be used to order the different "versions" of a record. | @@ -526,30 +433,30 @@ For the `timestamp` strategy, the configured `updated_at` column is used to popu
Details for the timestamp strategy -Snapshot query results at `2019-01-01 11:00` +Snapshot query results at `2024-01-01 11:00` | id | status | updated_at | | -- | ------- | ---------------- | -| 1 | pending | 2019-01-01 10:47 | +| 1 | pending | 2024-01-01 10:47 | Snapshot results (note that `11:00` is not used anywhere): | id | status | updated_at | dbt_valid_from | dbt_valid_to | dbt_updated_at | | -- | ------- | ---------------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 10:47 | 2019-01-01 10:47 | | 2019-01-01 10:47 | +| 1 | pending | 2024-01-01 10:47 | 2024-01-01 10:47 | | 2024-01-01 10:47 | -Query results at `2019-01-01 11:30`: +Query results at `2024-01-01 11:30`: | id | status | updated_at | | -- | ------- | ---------------- | -| 1 | shipped | 2019-01-01 11:05 | +| 1 | shipped | 2024-01-01 11:05 | Snapshot results (note that `11:30` is not used anywhere): | id | status | updated_at | dbt_valid_from | dbt_valid_to | dbt_updated_at | | -- | ------- | ---------------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 10:47 | 2019-01-01 10:47 | 2019-01-01 11:05 | 2019-01-01 10:47 | -| 1 | shipped | 2019-01-01 11:05 | 2019-01-01 11:05 | | 2019-01-01 11:05 | +| 1 | pending | 2024-01-01 10:47 | 2024-01-01 10:47 | 2024-01-01 11:05 | 2024-01-01 10:47 | +| 1 | shipped | 2024-01-01 11:05 | 2024-01-01 11:05 | | 2024-01-01 11:05 |
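Putting the meta fields shown above to work, a downstream query can reconstruct how a record looked at a given point in time by filtering on the validity window. This is a minimal sketch, assuming a snapshot named `orders_snapshot` and timestamp literals your warehouse can compare implicitly:

```sql
-- Orders as they looked at 2024-01-01 11:15, based on the snapshot validity window
select *
from {{ ref('orders_snapshot') }}
where dbt_valid_from <= '2024-01-01 11:15'
  and (dbt_valid_to > '2024-01-01 11:15' or dbt_valid_to is null)
```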
@@ -560,7 +467,7 @@ For the `check` strategy, the current timestamp is used to populate each column.
Details for the check strategy -Snapshot query results at `2019-01-01 11:00` +Snapshot query results at `2024-01-01 11:00` | id | status | | -- | ------- | @@ -570,9 +477,9 @@ Snapshot results: | id | status | dbt_valid_from | dbt_valid_to | dbt_updated_at | | -- | ------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 11:00 | | 2019-01-01 11:00 | +| 1 | pending | 2024-01-01 11:00 | | 2024-01-01 11:00 | -Query results at `2019-01-01 11:30`: +Query results at `2024-01-01 11:30`: | id | status | | -- | ------- | @@ -582,11 +489,191 @@ Snapshot results: | id | status | dbt_valid_from | dbt_valid_to | dbt_updated_at | | --- | ------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 11:00 | 2019-01-01 11:30 | 2019-01-01 11:00 | -| 1 | shipped | 2019-01-01 11:30 | | 2019-01-01 11:30 | +| 1 | pending | 2024-01-01 11:00 | 2024-01-01 11:30 | 2024-01-01 11:00 | +| 1 | shipped | 2024-01-01 11:30 | | 2024-01-01 11:30 |
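A common pattern downstream of either strategy is selecting only the current version of each record, which you can do by filtering on `dbt_valid_to`. A minimal sketch, again assuming a snapshot named `orders_snapshot`:

```sql
-- Current version of every record: rows that haven't been invalidated yet
select *
from {{ ref('orders_snapshot') }}
where dbt_valid_to is null
```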
+## Configure snapshots in versions 1.8 and earlier + + + +This section is for users on dbt versions 1.8 and earlier. To configure snapshots in versions 1.9 and later, refer to [Configuring snapshots](#configuring-snapshots). The latest versions use an updated snapshot configuration syntax that optimizes performance. + + + + + +- In dbt versions 1.8 and earlier, snapshots are `select` statements, defined within a snapshot block in a `.sql` file (typically in your `snapshots` directory). You'll also need to configure your snapshot to tell dbt how to detect record changes. +- The earlier dbt versions use an older syntax that allows for defining multiple resources in a single file. This syntax can significantly slow down parsing and compilation. +- For faster and more efficient management, consider[ upgrading to Versionless](/docs/dbt-versions/versionless-cloud) or the [latest version of dbt Core](/docs/dbt-versions/core), which introduces an updated snapshot configuration syntax that optimizes performance. + +The following example shows how to configure a snapshot: + + + +```sql +{% snapshot orders_snapshot %} + +{{ + config( + target_database='analytics', + target_schema='snapshots', + unique_key='id', + + strategy='timestamp', + updated_at='updated_at', + ) +}} + +select * from {{ source('jaffle_shop', 'orders') }} + +{% endsnapshot %} +``` + + + +The following table outlines the configurations available for snapshots in versions 1.8 and earlier: + +| Config | Description | Required? | Example | +| ------ | ----------- | --------- | ------- | +| [target_database](/reference/resource-configs/target_database) | The database that dbt should render the snapshot table into | No | analytics | +| [target_schema](/reference/resource-configs/target_schema) | The schema that dbt should render the snapshot table into | Yes | snapshots | +| [strategy](/reference/resource-configs/strategy) | The snapshot strategy to use. One of `timestamp` or `check` | Yes | timestamp | +| [unique_key](/reference/resource-configs/unique_key) | A column or expression for the record | Yes | id | +| [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | +| [updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | +| [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source, and set `dbt_valid_to` current time if no longer exists | No | True | + +- A number of other configurations are also supported (e.g. `tags` and `post-hook`), check out the full list [here](/reference/snapshot-configs). +- Snapshots can be configured from both your `dbt_project.yml` file and a `config` block, check out the [configuration docs](/reference/snapshot-configs) for more information. +- Note: BigQuery users can use `target_project` and `target_dataset` as aliases for `target_database` and `target_schema`, respectively. + +### Configuration example + +To add a snapshot to your project: + +1. Create a file in your `snapshots` directory with a `.sql` file extension, e.g. `snapshots/orders.sql` +2. Use a `snapshot` block to define the start and end of a snapshot: + + + +```sql +{% snapshot orders_snapshot %} + +{% endsnapshot %} +``` + + + +3. Write a `select` statement within the snapshot block (tips for writing a good snapshot query are below). 
This select statement defines the results that you want to snapshot over time. You can use `sources` and `refs` here. + + + +```sql +{% snapshot orders_snapshot %} + +select * from {{ source('jaffle_shop', 'orders') }} + +{% endsnapshot %} +``` + + + +4. Check whether the result set of your query includes a reliable timestamp column that indicates when a record was last updated. For our example, the `updated_at` column reliably indicates record changes, so we can use the `timestamp` strategy. If your query result set does not have a reliable timestamp, you'll need to instead use the `check` strategy — more details on this below. + +5. Add configurations to your snapshot using a `config` block (more details below). You can also configure your snapshot from your `dbt_project.yml` file ([docs](/reference/snapshot-configs)). + + + + + +```sql +{% snapshot orders_snapshot %} + +{{ + config( + target_database='analytics', + target_schema='snapshots', + unique_key='id', + + strategy='timestamp', + updated_at='updated_at', + ) +}} + +select * from {{ source('jaffle_shop', 'orders') }} + +{% endsnapshot %} +``` + + + +6. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example a new table will be created at `analytics.snapshots.orders_snapshot`. You can change the `target_database` configuration, the `target_schema` configuration and the name of the snapshot (as defined in `{% snapshot .. %}`) will change how dbt names this table. + + + + + + + +```sql +{% snapshot orders_snapshot %} + +{{ + config( + schema='snapshots', + unique_key='id', + strategy='timestamp', + updated_at='updated_at', + ) +}} + +select * from {{ source('jaffle_shop', 'orders') }} + +{% endsnapshot %} +``` + + + +6. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example, a new table will be created at `analytics.snapshots.orders_snapshot`. The [`schema`](/reference/resource-configs/schema) config will utilize the `generate_schema_name` macro. + + + +``` +$ dbt snapshot +Running with dbt=1.8.0 + +15:07:36 | Concurrency: 8 threads (target='dev') +15:07:36 | +15:07:36 | 1 of 1 START snapshot snapshots.orders_snapshot...... [RUN] +15:07:36 | 1 of 1 OK snapshot snapshots.orders_snapshot..........[SELECT 3 in 1.82s] +15:07:36 | +15:07:36 | Finished running 1 snapshots in 0.68s. + +Completed successfully + +Done. PASS=2 ERROR=0 SKIP=0 TOTAL=1 +``` + +7. Inspect the results by selecting from the table dbt created. After the first run, you should see the results of your query, plus the [snapshot meta fields](#snapshot-meta-fields) as described earlier. + +8. Run the `dbt snapshot` command again, and inspect the results. If any records have been updated, the snapshot should reflect this. + +9. Select from the `snapshot` in downstream models using the `ref` function. + + + +```sql +select * from {{ ref('orders_snapshot') }} +``` + + + +10. Snapshots are only useful if you run them frequently — schedule the `snapshot` command to run regularly. + + + ## FAQs diff --git a/website/docs/docs/build/sources.md b/website/docs/docs/build/sources.md index 1594dd72dba..4926601f3b2 100644 --- a/website/docs/docs/build/sources.md +++ b/website/docs/docs/build/sources.md @@ -44,7 +44,7 @@ sources: *By default, `schema` will be the same as `name`. Add `schema` only if you want to use a source name that differs from the existing schema. -If you're not already familiar with these files, be sure to check out [the documentation on schema.yml files](/reference/configs-and-properties) before proceeding. 
+If you're not already familiar with these files, be sure to check out [the documentation on properties.yml files](/reference/configs-and-properties) before proceeding. ### Selecting from a source diff --git a/website/docs/docs/build/sql-models.md b/website/docs/docs/build/sql-models.md index a019508d370..bbf07986222 100644 --- a/website/docs/docs/build/sql-models.md +++ b/website/docs/docs/build/sql-models.md @@ -266,7 +266,7 @@ You can also document and test models — skip ahead to the section on [test - + diff --git a/website/docs/docs/build/unit-tests.md b/website/docs/docs/build/unit-tests.md index 55b35721298..1d7143d7476 100644 --- a/website/docs/docs/build/unit-tests.md +++ b/website/docs/docs/build/unit-tests.md @@ -22,7 +22,8 @@ With dbt Core v1.8 and dbt Cloud environments that have gone versionless by sele - We currently only support unit testing SQL models. - We currently only support adding unit tests to models in your _current_ project. -- We currently *don't* support unit testing models that use recursive SQL. +- We currently _don't_ support unit testing models that use the [`materialized view`](/docs/build/materializations#materialized-view) materialization. +- We currently _don't_ support unit testing models that use recursive SQL. - You must specify all fields in a BigQuery STRUCT in a unit test. You cannot use only a subset of fields in a STRUCT. - If your model has multiple versions, by default the unit test will run on *all* versions of your model. Read [unit testing versioned models](/reference/resource-properties/unit-testing-versions) for more information. - Unit tests must be defined in a YML file in your `models/` directory. diff --git a/website/docs/docs/build/validation.md b/website/docs/docs/build/validation.md index 6ca5a680895..89297726a44 100644 --- a/website/docs/docs/build/validation.md +++ b/website/docs/docs/build/validation.md @@ -14,11 +14,10 @@ The code that handles validation [can be found here](https://github.com/dbt-labs ## Validations command -You can run validations against the defined semantic model configurations from the command line with the following [MetricFlow commands](/docs/build/metricflow-commands): - -Note, in dbt Cloud you don't need to validate the Semantic Layer config separately. Running a dbt command (such as `dbt parse`, `dbt build`, `dbt compile`, or `dbt run`) automatically checks it. +You can run validations from dbt Cloud or the command line with the following [MetricFlow commands](/docs/build/metricflow-commands). In dbt Cloud, you need developer credentials to run `dbt sl validate-configs` in the IDE or CLI, and deployment credentials to run it in CI. ```bash +dbt sl validate # dbt Cloud users mf validate-configs # dbt Core users ``` diff --git a/website/docs/docs/cloud-integrations/about-snowflake-native-app.md b/website/docs/docs/cloud-integrations/about-snowflake-native-app.md index fa1c54ca5a6..86ee6a7d630 100644 --- a/website/docs/docs/cloud-integrations/about-snowflake-native-app.md +++ b/website/docs/docs/cloud-integrations/about-snowflake-native-app.md @@ -34,12 +34,17 @@ The following diagram provides an illustration of the architecture: ## Access -You can log in to the dbt Snowflake Native App using your regular Snowflake login authentication method. During this [Preview](/docs/dbt-versions/product-lifecycles#dbt-cloud), you do not need dbt Cloud credentials (a dbt Cloud seat) to access the application but this is subject to change. 
+ +Log in to the dbt Snowflake Native App using your regular Snowflake login authentication method. The Snowflake user must have a corresponding dbt Cloud user with a _[developer license](/docs/cloud/manage-access/seats-and-users)_. Previously, this wasn't a requirement during the feature [Preview](/docs/dbt-versions/product-lifecycles#dbt-cloud). + +If your Snowflake Native App is already configured, you will be prompted to [link credentials](#link-credentials) the next time you access dbt Cloud from the app. This is a one-time process. If you don't have a dbt Cloud account associated with a developer license, you will be denied access to the dbt Cloud environment and will need an admin to assist. + +_Users with IT or read-only licenses will be denied access to dbt Cloud via the Snowflake Native App._ App users are able to access all information that's available to the API service token. ## Procurement -The dbt Snowflake Native App is available on the [Snowflake Marketplace](https://app.snowflake.com/marketplace/listing/GZTYZSRT2R3). With the purchase of the listing, users will have access to the Native App and a dbt Cloud account that's on the Enterprise plan. +The dbt Snowflake Native App is available on the [Snowflake Marketplace](https://app.snowflake.com/marketplace/listing/GZTYZSRT2R3). Purchasing it includes access to the Native App and a dbt Cloud account that's on the Enterprise plan. Existing dbt Cloud Enterprise customers can also access it. If interested, contact your Enterprise account manager. If you're interested, please [contact us](matilto:sales_snowflake_marketplace@dbtlabs.com) for more information. @@ -47,4 +52,18 @@ If you're interested, please [contact us](matilto:sales_snowflake_marketplace@db If you have any questions about the dbt Snowflake Native App, you may [contact our Support team](mailto:dbt-snowflake-marketplace@dbtlabs.com) for help. Please provide information about your installation of the Native App, including your dbt Cloud account ID and Snowflake account identifier. ## Limitations -- The Native app does not support dbt Cloud accounts with [IP Restrictions](/docs/cloud/secure/ip-restrictions] enabled. +- The Native app does not support dbt Cloud accounts with [IP Restrictions](/docs/cloud/secure/ip-restrictions) enabled. + +## Link credentials + +During the early stages of the Snowflake Native App preview, users were only required to exist in the Snowflake platform and could access dbt Cloud via the app without having a corresponding user. This is no longer the case, and every Snowflake user must also have dbt Cloud account access with a [developer license](/docs/cloud/manage-access/seats-and-users). + +For existing accounts with the Snowflake Native App configured, users will be prompted to authenticate with dbt Cloud the next time they log in. This is a one-time process if they have a user in dbt Cloud. If they don’t have a dbt Cloud user, they will be denied access, and an admin will need to [create one](/docs/cloud/manage-access/invite-users). + +1. When you attempt to access the dbt Cloud platform from the Snowflake Native App, you will be prompted to link your account. + + + +2. Click **Link account** and you will be prompted for your dbt Cloud credentials. 
+ + diff --git a/website/docs/docs/cloud-integrations/avail-sl-integrations.md b/website/docs/docs/cloud-integrations/avail-sl-integrations.md index eea93c92b93..acc36623ab5 100644 --- a/website/docs/docs/cloud-integrations/avail-sl-integrations.md +++ b/website/docs/docs/cloud-integrations/avail-sl-integrations.md @@ -20,7 +20,7 @@ import AvailIntegrations from '/snippets/_sl-partner-links.md'; ### Custom integration - [Exports](/docs/use-dbt-semantic-layer/exports) enable custom integration with additional tools that don't natively connect with the dbt Semantic Layer, such as PowerBI. -- Develop custom integrations using different languages and tools, supported through JDBC, ADBC, and GraphQL APIs. For more info, check out [our examples on GitHub](https://github.com/dbt-labs/example-semantic-layer-clients/). +- [Consume metrics](/docs/use-dbt-semantic-layer/consume-metrics) and develop custom integrations using different languages and tools, supported through [JDBC](/docs/dbt-cloud-apis/sl-jdbc), ADBC, and [GraphQL](/docs/dbt-cloud-apis/sl-graphql) APIs, and [Python SDK library](/docs/dbt-cloud-apis/sl-python). For more info, check out [our examples on GitHub](https://github.com/dbt-labs/example-semantic-layer-clients/). - Connect to any tool that supports SQL queries. These tools must meet one of the two criteria: - Offers a generic JDBC driver option (such as DataGrip) or - Is compatible Arrow Flight SQL JDBC driver version 12.0.0 or higher. @@ -29,7 +29,7 @@ import AvailIntegrations from '/snippets/_sl-partner-links.md'; - {frontMatter.meta.api_name} to learn how to integrate and query your metrics in downstream tools. - [dbt Semantic Layer API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) -- [Hex dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex. +- [Hex dbt Semantic Layer cells](https://learn.hex.tech/docs/explore-data/cells/data-cells/dbt-metrics-cells) to set up SQL cells in Hex. - [Resolve 'Failed APN'](/faqs/Troubleshooting/sl-alpn-error) error when connecting to the dbt Semantic Layer. - [dbt Semantic Layer on-demand course](https://learn.getdbt.com/courses/semantic-layer) - [dbt Semantic Layer FAQs](/docs/use-dbt-semantic-layer/sl-faqs) diff --git a/website/docs/docs/cloud-integrations/configure-auto-exposures.md b/website/docs/docs/cloud-integrations/configure-auto-exposures.md new file mode 100644 index 00000000000..42e36e572b3 --- /dev/null +++ b/website/docs/docs/cloud-integrations/configure-auto-exposures.md @@ -0,0 +1,74 @@ +--- +title: "Auto-exposures" +id: "configure-auto-exposures" +sidebar_label: "Configure auto-exposures" +description: "Import and auto-generate exposures from dashboards and understand how models are used in downstream tools for a richer lineage." +image: /img/docs/cloud-integrations/auto-exposures/explorer-lineage2.jpg +--- + +# Configure auto-exposures + +As a data team, it’s critical that you have context into the downstream use cases and users of your data products. [Auto-exposures](/docs/collaborate/auto-exposures) integrates natively with Tableau and [auto-generates downstream lineage](/docs/collaborate/auto-exposures#view-auto-exposures-in-dbt-explorer) in dbt Explorer for a richer experience. 
+ +Auto-exposures help data teams optimize their efficiency and ensure data quality by: + +- Helping users understand how their models are used in downstream analytics tools to inform investments and reduce incidents — ultimately building trust and confidence in data products. +- Importing and auto-generating exposures based on Tableau dashboards, with user-defined curation. +- Enabling the active exposure work to run models based on when exposures are updated or need to be updated, improving timeliness and reducing costs. + +## Prerequisites + +To access the features, you should meet the following: + +1. Your environment and jobs are on [Versionless](/docs/dbt-versions/versionless-cloud) dbt. +2. You have a dbt Cloud account on the [Enterprise plan](https://www.getdbt.com/pricing/). +3. You have set up a [production](/docs/deploy/deploy-environments#set-as-production-environment) deployment environment for each project you want to explore, with at least one successful job run. +4. You have [admin permissions](/docs/cloud/manage-access/enterprise-permissions) in dbt Cloud to edit project settings or production environment settings. +5. Use Tableau as your BI tool and enable metadata permissions or work with an admin to do so. Compatible with Tableau Cloud or Tableau Server with the Metadata API enabled. + +## Set up in Tableau + +This section of the document explains the steps you need to set up the auto-exposures integration with Tableau. Once you've set this up in Tableau and dbt Cloud, you can view the [auto-exposures](/docs/collaborate/auto-exposures#view-auto-exposures-in-dbt-explorer) in dbt Explorer. + +To set up [personal access tokens (PATs)](https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm) needed for auto exposures, ask a site admin to configure it for the account. + +1. Ensure you or a site admin enables PATs for the account in Tableau. + + +2. Create a PAT that you can add to dbt Cloud to pull in Tableau metadata for auto exposures. Ensure the user creating the PAT has access to collections/folders, as the PAT only grants access matching the creator's existing privileges. + + +3. Copy the **Secret** and the **Token name** and enter them in dbt Cloud. The secret is only displayed once, so store it in a safe location (like a password manager). + + +4. Copy the **Server URL** and **Sitename**. You can find these in the URL while logged into Tableau. + + + For example, if the full URL is: `10az.online.tableau.com/#/site/dbtlabspartner/explore`: + - The **Server URL** is the first part of the URL, in this case: `10az.online.tableau.com` + - The **Sitename** is right after the `site` in the URL, in this case: `dbtlabspartner` + +5. You should now be ready to set up auto-exposures in dbt Cloud after copying the following items, which you'll need during the dbt Cloud setup: ServerURL, Sitename, Token name, and Secret. + +## Set up in dbt Cloud + +1. In dbt Cloud, navigate to the project you want to add the auto-exposures to and then select **Settings**. +2. Under the **Exposures** section, select **Add integration** to add the Tableau connection. + +3. Enter the details for the exposure connection you collected from Tableau in the [previous step](#set-up-in-tableau) and click **Continue**. Note that all fields are case-sensitive. + +4. Select the collections you want to include for auto exposures. + + dbt Cloud automatically imports and syncs any workbook within the selected collections. 
New additions to the collections will be added to the lineage in dbt Cloud during the next automatic sync (usually once per day). + +5. Click **Save**. + +dbt Cloud imports everything in the collection(s) and you can continue to view them in Explorer. For more information on how to view and use auto-exposures, refer to [View auto-exposures from dbt Explorer](/docs/collaborate/auto-exposures) page. + + + +## Refresh auto-exposures in jobs + +:::info Coming soon +Soon, you’ll also be able to use auto-exposures to trigger the refresh of the data used in your Tableau dashboards from within dbt Cloud. Stay tuned for more on this soon! +::: diff --git a/website/docs/docs/cloud-integrations/overview.md b/website/docs/docs/cloud-integrations/overview.md index e0adfe72303..8334632a7f8 100644 --- a/website/docs/docs/cloud-integrations/overview.md +++ b/website/docs/docs/cloud-integrations/overview.md @@ -2,15 +2,21 @@ title: "About dbt Cloud integrations" sidebar_label: "About dbt Cloud integrations" pagination_prev: null -pagination_next: "docs/cloud-integrations/snowflake-native-app" +pagination_next: "docs/cloud-integrations/configure-auto-exposures" +hide_table_of_contents: true --- Many data applications integrate with dbt Cloud, enabling you to leverage the power of dbt for a variety of use cases and workflows. - ## Integrations with dbt -
diff --git a/website/docs/docs/cloud-integrations/semantic-layer/excel.md b/website/docs/docs/cloud-integrations/semantic-layer/excel.md index 6a70217e0f7..c80040dce01 100644 --- a/website/docs/docs/cloud-integrations/semantic-layer/excel.md +++ b/website/docs/docs/cloud-integrations/semantic-layer/excel.md @@ -6,8 +6,6 @@ tags: [Semantic Layer] sidebar_label: "Microsoft Excel" --- -# Microsoft Excel - The dbt Semantic Layer offers a seamless integration with Excel Online and Desktop through a custom menu. This add-on allows you to build dbt Semantic Layer queries and return data on your metrics directly within Excel. ## Prerequisites @@ -18,14 +16,15 @@ The dbt Semantic Layer offers a seamless integration with Excel Online and Deskt - You must have a dbt Cloud Team or Enterprise [account](https://www.getdbt.com/pricing). Suitable for both Multi-tenant and Single-tenant deployment. - Single-tenant accounts should contact their account representative for necessary setup and enablement. -import SLCourses from '/snippets/_sl-course.md'; +:::tip - +📹 For on-demand video learning, explore the [Querying the Semantic Layer with Excel](https://learn.getdbt.com/courses/querying-the-semantic-layer-with-excel) course to learn how to query metrics with Excel. +::: ## Installing the add-on -The dbt Semantic Layer Microsoft Excel integration is available to download directly on [Microsoft AppSource](https://appsource.microsoft.com/en-us/marketplace/apps?product=office). You can choose to download this add in for both [Excel Desktop](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationId=4132ecd1-425d-982d-efb4-de94ebc83f26) and [Excel Online](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationid=4132ecd1-425d-982d-efb4-de94ebc83f26&isWac=True) +The dbt Semantic Layer Microsoft Excel integration is available to download directly on [Microsoft AppSource](https://appsource.microsoft.com/en-us/product/office/WA200007100?tab=Overview). You can choose to download this add-on in for both [Excel Desktop](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationId=4132ecd1-425d-982d-efb4-de94ebc83f26) and [Excel Online](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationid=4132ecd1-425d-982d-efb4-de94ebc83f26&isWac=True) 1. In Excel, authenticate with your host, dbt Cloud environment ID, and service token. - Access your Environment ID, Host, and URLs in your dbt Cloud Semantic Layer settings. Generate a service token in the Semantic Layer settings or API tokens settings @@ -37,11 +36,11 @@ import Tools from '/snippets/_sl-excel-gsheets.md'; - ## FAQs diff --git a/website/docs/docs/cloud-integrations/semantic-layer/gsheets.md b/website/docs/docs/cloud-integrations/semantic-layer/gsheets.md index b3931f0f528..f215bee9671 100644 --- a/website/docs/docs/cloud-integrations/semantic-layer/gsheets.md +++ b/website/docs/docs/cloud-integrations/semantic-layer/gsheets.md @@ -40,13 +40,15 @@ import Tools from '/snippets/_sl-excel-gsheets.md'; type="Google Sheets" bullet_1="The custom menu operation has a timeout limit of six (6) minutes." bullet_2="If you're using this extension, make sure you're signed into Chrome with the same Google profile you used to set up the Add-On. Log in with one Google profile at a time as using multiple Google profiles at once might cause issues." 
-queryBuilder="/img/docs/dbt-cloud/semantic-layer/gsheets-query-builder.jpg" +queryBuilder="/img/docs/dbt-cloud/semantic-layer/query-builder.png" +PrivateSelections="You can also make these selections private or public. Public selections mean your inputs are available in the menu to everyone on the sheet. +Private selections mean your inputs are only visible to you. Note that anyone added to the sheet can still see the data from these private selections, but they won't be able to interact with the selection in the menu or benefit from the automatic refresh." /> - + **Limited use policy disclosure** diff --git a/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md b/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md index 4f370de6d4d..49e6f90e41f 100644 --- a/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md +++ b/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md @@ -144,5 +144,10 @@ Check that the SL user has been granted access to the `dbt_sl_llm` schema and ma -If there's been an update to the dbt Cloud account ID, access URL, or API service token, you need to update the configuration for the dbt Snowflake Native App. In Snowflake, navigate to the app's configuration page and delete the existing configurations. Add the new configuration and then run `CALL app_public.restart_ap ();` in the application database in Snowsight. +If there's been an update to the dbt Cloud account ID, access URL, or API service token, you need to update the configuration for the dbt Snowflake Native App. In Snowflake, navigate to the app's configuration page and delete the existing configurations. Add the new configuration and then run `CALL app_public.restart_app();` in the application database in Snowsight. + + + + +[Environment variables](/docs/build/environment-variables), like `{{env_var('DBT_WAREHOUSE') }}` aren’t supported in the dbt Semantic Layer yet. To use the 'Ask dbt' feature, you must use the actual credentials instead. diff --git a/website/docs/docs/cloud/about-cloud-develop-defer.md b/website/docs/docs/cloud/about-cloud-develop-defer.md index 4e2f70b7b82..fc55edf8a38 100644 --- a/website/docs/docs/cloud/about-cloud-develop-defer.md +++ b/website/docs/docs/cloud/about-cloud-develop-defer.md @@ -50,8 +50,11 @@ The dbt Cloud CLI offers additional flexibility by letting you choose the source - ```yml -defer-env-id: '123456' +```yml +context: + active-host: ... + active-project: ... + defer-env-id: '123456' ``` @@ -60,7 +63,7 @@ defer-env-id: '123456' ```yml -dbt_cloud: +dbt-cloud: defer-env-id: '123456' ``` diff --git a/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md index 059f9e6ff7b..6366c06b99b 100644 --- a/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md +++ b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md @@ -25,7 +25,7 @@ dbt Cloud's [flexible plans](https://www.getdbt.com/pricing/) and features make diff --git a/website/docs/docs/cloud/about-cloud/architecture.md b/website/docs/docs/cloud/about-cloud/architecture.md index 4b7a1b945b7..0f9f544d697 100644 --- a/website/docs/docs/cloud/about-cloud/architecture.md +++ b/website/docs/docs/cloud/about-cloud/architecture.md @@ -49,7 +49,7 @@ The git repo information is stored on dbt Cloud servers to make it accessible du ### Authentication services -The default settings of dbt Cloud enable local users with credentials stored in dbt Cloud. 
Still, integrations with various authentication services are offered as an alternative, including [single sign-on services](/docs/cloud/manage-access/sso-overview). Access to features can be granted/restricted by role using [RBAC](/docs/cloud/manage-access/enterprise-permissions). +The default settings of dbt Cloud enable local users with credentials stored in dbt Cloud. Still, integrations with various authentication services are offered as an alternative, including [single sign-on services](/docs/cloud/manage-access/sso-overview). Access to features can be granted/restricted by role using [RBAC](/docs/cloud/manage-access/about-user-access#role-based-access-control-). SSO features are essential because they reduce the number of credentials a user must maintain. Users sign in once and the authentication token is shared among integrated services (such as dbt Cloud). The token expires and must be refreshed at predetermined intervals, requiring the user to go through the authentication process again. If the user is disabled in the SSO provider service, their access to dbt Cloud is disabled, and they cannot override this with local auth credentials. diff --git a/website/docs/docs/cloud/about-cloud/browsers.md b/website/docs/docs/cloud/about-cloud/browsers.md index 12665bc7b72..1e26d3a6d59 100644 --- a/website/docs/docs/cloud/about-cloud/browsers.md +++ b/website/docs/docs/cloud/about-cloud/browsers.md @@ -27,4 +27,4 @@ To improve your experience using dbt Cloud, we suggest that you turn off ad bloc A session is a period of time during which you’re signed in to a dbt Cloud account from a browser. If you close your browser, it will end your session and log you out. You'll need to log in again the next time you try to access dbt Cloud. -If you've logged in using [SSO](/docs/cloud/manage-access/sso-overview) or [OAuth](/docs/cloud/git/connect-github#personally-authenticate-with-github), you can customize your maximum session duration, which might vary depending on your identity provider (IdP). +If you've logged in using [SSO](/docs/cloud/manage-access/sso-overview), you can customize your maximum session duration, which might vary depending on your identity provider (IdP). diff --git a/website/docs/docs/cloud/account-settings.md b/website/docs/docs/cloud/account-settings.md new file mode 100644 index 00000000000..aaad9b28e5c --- /dev/null +++ b/website/docs/docs/cloud/account-settings.md @@ -0,0 +1,50 @@ +--- +title: "Account settings in dbt Cloud" +sidebar_label: "Account settings" +description: "Learn how to enable account settings for your dbt Cloud users." +--- + +The following sections describe the different **Account settings** available from your dbt Cloud account in the sidebar (under your account name on the lower left-hand side). + + + +## Git repository caching + +At the start of every job run, dbt Cloud clones the project's Git repository so it has the latest versions of your project's code and runs `dbt deps` to install your dependencies. + +For improved reliability and performance on your job runs, you can enable dbt Cloud to keep a cache of the project's Git repository. So, if there's a third-party outage that causes the cloning operation to fail, dbt Cloud will instead use the cached copy of the repo so your jobs can continue running as scheduled. + +dbt Cloud caches your project's Git repo after each successful run and retains it for 8 days if there are no repo updates. It caches all packages regardless of installation method and does not fetch code outside of the job runs. 
+ +dbt Cloud will use the cached copy of your project's Git repo under these circumstances: + +- Outages from third-party services (for example, the [dbt package hub](https://hub.getdbt.com/)). +- Git authentication fails. +- There are syntax errors in the `packages.yml` file. You can set up and use [continuous integration (CI)](/docs/deploy/continuous-integration) to find these errors sooner. +- If a package doesn't work with the current dbt version. You can set up and use [continuous integration (CI)](/docs/deploy/continuous-integration) to identify this issue sooner. + +To use, select the **Enable repository caching** option from your account settings. + + + +## Partial parsing + +At the start of every dbt invocation, dbt reads all the files in your project, extracts information, and constructs an internal manifest containing every object (model, source, macro, and so on). Among other things, it uses the `ref()`, `source()`, and `config()` macro calls within models to set properties, infer dependencies, and construct your project's DAG. When dbt finishes parsing your project, it stores the internal manifest in a file called `partial_parse.msgpack`. + +Parsing projects can be time-consuming, especially for large projects with hundreds of models and thousands of files. To reduce the time it takes dbt to parse your project, use the partial parsing feature in dbt Cloud for your environment. When enabled, dbt Cloud uses the `partial_parse.msgpack` file to determine which files have changed (if any) since the project was last parsed, and then it parses _only_ the changed files and the files related to those changes. + +Partial parsing in dbt Cloud requires dbt version 1.4 or newer. The feature does have some known limitations. Refer to [Known limitations](/reference/parsing#known-limitations) to learn more about them. + +To use, select the **Enable partial parsing between deployment runs** option from your account settings. + + + +## Account access to Advanced CI features + +[Advanced CI](/docs/deploy/advanced-ci) features, such as [compare changes](/docs/deploy/advanced-ci#compare-changes), allow dbt Cloud account members to view details about the changes between what's in the production environment and the pull request. + +To use Advanced CI features, your dbt Cloud account must have access to them. Ask your dbt Cloud administrator to enable Advanced CI features on your account, which they can do by choosing the **Enable account access to Advanced CI** option from the account settings. + +Once enabled, the **dbt compare** option becomes available in the CI job settings for you to select. 
+ + diff --git a/website/docs/docs/cloud/configure-cloud-cli.md b/website/docs/docs/cloud/configure-cloud-cli.md index 2874e166a8f..2e0fc174517 100644 --- a/website/docs/docs/cloud/configure-cloud-cli.md +++ b/website/docs/docs/cloud/configure-cloud-cli.md @@ -52,21 +52,29 @@ Once you install the dbt Cloud CLI, you need to configure it to connect to a dbt The config file looks like this: - ```yaml - version: "1" - context: - active-project: "" - active-host: "" - defer-env-id: "" - projects: - - project-id: "" - account-host: "" - api-key: "" - - - project-id: "" - account-host: "" - api-key: "" - ``` + ```yaml + version: "1" + context: + active-project: "" + active-host: "" + defer-env-id: "" + projects: + - project-name: "" + project-id: "" + account-name: "" + account-id: "" + account-host: "" # for example, "cloud.getdbt.com" + token-name: "" + token-value: "" + + - project-name: "" + project-id: "" + account-name: "" + account-id: "" + account-host: "" # for example, "cloud.getdbt.com" + token-name: "" + token-value: "" + ``` 3. After downloading the config file and creating your directory, navigate to a dbt project in your terminal: @@ -100,7 +108,6 @@ To set environment variables in the dbt Cloud CLI for your dbt project: 2. Then select **Profile Settings**, then **Credentials**. 3. Click on your project and scroll to the **Environment Variables** section. 4. Click **Edit** on the lower right and then set the user-level environment variables. - - Note, when setting up the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), using [environment variables](/docs/build/environment-variables) like `{{env_var('DBT_WAREHOUSE')}}` is not supported. You should use the actual credentials instead. ## Use the dbt Cloud CLI @@ -188,4 +195,11 @@ move %USERPROFILE%\Downloads\dbt_cloud.yml %USERPROFILE%\.dbt\dbt_cloud.yml This command moves the `dbt_cloud.yml` from the `Downloads` folder to the `.dbt` folder. If your `dbt_cloud.yml` file is located elsewhere, adjust the path accordingly. + + + + +By default, [all artifacts](/reference/artifacts/dbt-artifacts) are downloaded when you execute dbt commands from the dbt Cloud CLI. To skip these files from being downloaded, add `--download-artifacts=false` to the command you want to run. This can help improve run-time performance but might break workflows that depend on assets like the [manifest](/reference/artifacts/manifest-json). 
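For orientation, a filled-in version of the `dbt_cloud.yml` structure shown in this file's diff might look roughly like the following. Every value is an illustrative placeholder; in practice you would use the file downloaded from your own dbt Cloud account rather than typing values by hand:

```yaml
# Hypothetical dbt_cloud.yml -- all values are illustrative placeholders
version: "1"
context:
  active-project: "270542"
  active-host: "cloud.getdbt.com"
  defer-env-id: "123456"
projects:
  - project-name: "analytics"
    project-id: "270542"
    account-name: "Example Analytics"
    account-id: "98765"
    account-host: "cloud.getdbt.com"
    token-name: "cloud-cli-token"
    token-value: "xxxxxxxxxxxxxxxx"
```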
+ + diff --git a/website/docs/docs/cloud/connect-data-platform/about-connections.md b/website/docs/docs/cloud/connect-data-platform/about-connections.md index 58e6ece30a7..89dd13808ec 100644 --- a/website/docs/docs/cloud/connect-data-platform/about-connections.md +++ b/website/docs/docs/cloud/connect-data-platform/about-connections.md @@ -8,7 +8,7 @@ pagination_prev: null --- dbt Cloud can connect with a variety of data platform providers including: - [AlloyDB](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) -- [Amazon Athena (Beta)](/docs/cloud/connect-data-platform/connect-amazon-athena) +- [Amazon Athena](/docs/cloud/connect-data-platform/connect-amazon-athena) - [Amazon Redshift](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) - [Apache Spark](/docs/cloud/connect-data-platform/connect-apache-spark) - [Azure Synapse Analytics](/docs/cloud/connect-data-platform/connect-azure-synapse-analytics) @@ -18,32 +18,35 @@ dbt Cloud can connect with a variety of data platform providers including: - [PostgreSQL](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) - [Snowflake](/docs/cloud/connect-data-platform/connect-snowflake) - [Starburst or Trino](/docs/cloud/connect-data-platform/connect-starburst-trino) +- [Teradata](/docs/cloud/connect-data-platform/connect-teradata) -You can connect to your database in dbt Cloud by clicking the gear in the top right and selecting **Account Settings**. From the Account Settings page, click **+ New Project**. +To connect to your database in dbt Cloud: - +1. Click your account name at the bottom of the left-side menu and click **Account settings** +2. Select **Projects** from the top left, and from there click **New Project** + + These connection instructions provide the basic fields required for configuring a data platform connection in dbt Cloud. For more detailed guides, which include demo project data, read our [Quickstart guides](https://docs.getdbt.com/guides) ## Connection management -:::info Connections are moving! +:::info Connection management now at account-level -Up until July 2024, connections were nested under projects. One dbt Cloud project could only have one connection, which was re-used across all its environments. Extended attributes were leveraged to switch warehouse instances depending on the environment for a given project. +Starting July 2024, connection management has moved from the project level to the account level for all users in dbt Cloud. Previously, each dbt Cloud project could only have one connection, which was used across all its environments. Extended attributes were used to switch warehouse instances depending on the environment for a given project. - + -We are rolling out an important change that moves connection management to the account level. The following connection management section describes these changes. +Connections created with APIs before this change cannot be accessed with the [latest APIs](https://docs.getdbt.com/dbt-cloud/api-v3#/operations/List%20Account%20Connections). dbt Labs recommends [recreating the connections](https://docs.getdbt.com/dbt-cloud/api-v3#/operations/Create%20Account%20Connection) with the latest APIs. -This feature is being rolled out in phases over the coming weeks. ::: Warehouse connections are an account-level resource. As such you can find them under **Accounts Settings** > **Connections**: - + -Warehouse connections can be re-used across projects. 
If multiple projects all connect to the same warehouse, you should re-use the same connection in order to streamline your management operations. Connections are assigned to a project via an [environment](/docs/dbt-cloud-environments). +Warehouse connections can be re-used across projects. If multiple projects all connect to the same warehouse, you should re-use the same connection to streamline your management operations. Connections are assigned to a project via an [environment](/docs/dbt-cloud-environments). @@ -51,13 +54,17 @@ As shown in the image, a project with 2 environments can target between 1 and 2 ### Migration from project level connections to account level connections -Rolling out account-level connections will not require any interruption of service in your current usage (IDE, CLI, jobs, etc.). +Rolling out account-level connections will not require any interruption of service in your current usage (IDE, CLI, jobs, and so on.). + +:::info Why am I prompted to configure a development environment? +If your project did not previously have a development environment, you may be redirected to the project setup page. Your project is still intact. Choose a connection for your new development environment, and you can view all your environments again. +::: However, to fully utilize the value of account-level connections, you may have to rethink how you assign and use connections across projects and environments. -Please consider all of the following actions, as the steps you take will depend on the desired outcome. +Please consider the following actions, as the steps you take will depend on the desired outcome. - The initial clean-up of your connection list - Delete unused connections with 0 environments. @@ -93,4 +100,4 @@ dbt Cloud will always connect to your data platform from the IP addresses specif Be sure to allow traffic from these IPs in your firewall, and include them in any database grants. -Allowing these IP addresses only enables the connection to your . However, you might want to send API requests from your restricted network to the dbt Cloud API. For example, you could use the API to send a POST request that [triggers a job to run](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#operation/triggerRun). Using the dbt Cloud API requires that you allow the `cloud.getdbt.com` subdomain. For more on the dbt Cloud architecture, see [Deployment architecture](/docs/cloud/about-cloud/architecture). +Allowing these IP addresses only enables the connection to your . However, you might want to send API requests from your restricted network to the dbt Cloud API. Using the dbt Cloud API requires allowing the `cloud.getdbt.com` subdomain. For more on the dbt Cloud architecture, see [Deployment architecture](/docs/cloud/about-cloud/architecture). diff --git a/website/docs/docs/cloud/connect-data-platform/connect-amazon-athena.md b/website/docs/docs/cloud/connect-data-platform/connect-amazon-athena.md index 0cb7c7b0ce3..f1009f61274 100644 --- a/website/docs/docs/cloud/connect-data-platform/connect-amazon-athena.md +++ b/website/docs/docs/cloud/connect-data-platform/connect-amazon-athena.md @@ -5,13 +5,7 @@ description: "Configure the Amazon Athena data platform connection in dbt Cloud. sidebar_label: "Connect Amazon Athena" --- -# Connect Amazon Athena - -:::note beta - -This is a beta feature with limited availability. A public preview will follow shortly, for wider early access. For more information, check out our [product lifecycle](/docs/dbt-versions/product-lifecycles#dbt-cloud) page. 
- -::: +# Connect Amazon Athena Your environment(s) must be on ["Versionless"](/docs/dbt-versions/versionless-cloud) to use the Amazon Athena connection. diff --git a/website/docs/docs/cloud/connect-data-platform/connect-teradata.md b/website/docs/docs/cloud/connect-data-platform/connect-teradata.md new file mode 100644 index 00000000000..cf41814078b --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/connect-teradata.md @@ -0,0 +1,29 @@ +--- +title: "Connect Teradata" +id: connect-teradata +description: "Configure the Teradata platform connection in dbt Cloud." +sidebar_label: "Connect Teradata" +--- + +# Connect Teradata + +Your environment(s) must be on ["Versionless"](/docs/dbt-versions/versionless-cloud) to use the Teradata connection. + +| Field | Description | Type | Required? | Example | +| ----------------------------- | --------------------------------------------------------------------------------------------- | -------------- | --------- | ------- | +| Host | Host name of your Teradata environment. | String | Required | host-name.env.clearscape.teradata.com | +| Port | The database port number. Equivalent to the Teradata JDBC Driver DBS_PORT connection parameter.| Quoted integer | Optional | 1025 | +| Retries | Number of times to retry to connect to database upon error. | Integer | optional | 10 | +| Request timeout | The waiting period between connections attempts in seconds. Default is "1" second. | Quoted integer | Optional | 3 | + + + +### Development and deployment credentials + +| Field | Description | Type | Required? | Example | +| ------------------------------|-----------------------------------------------------------------------------------------------|----------------|-----------|--------------------| +| Username | The database username. Equivalent to the Teradata JDBC Driver USER connection parameter. | String | Required | database_username | +| Password | The database password. Equivalent to the Teradata JDBC Driver PASSWORD connection parameter. | String | Required | DatabasePassword123 | +| Schema | Specifies the initial database to use after login, rather than the user's default database. | String | Required | dbtlabsdocstest | + + diff --git a/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md b/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md index 7ea6e380000..1ce9712ab91 100644 --- a/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md +++ b/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md @@ -4,6 +4,9 @@ id: connect-bigquery description: "Configure BigQuery connection." sidebar_label: "Connect BigQuery" --- + +## Authentication + ### JSON keyfile :::info Uploading a service account JSON keyfile @@ -48,3 +51,220 @@ As an end user, if your organization has set up BigQuery OAuth, you can link a p ## Configuration To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [BigQuery-specific configuration](/reference/resource-configs/bigquery-configs). + +### Optional configurations + +In BigQuery, optional configurations let you tailor settings for tasks such as query priority, dataset location, job timeout, and more. These options give you greater control over how BigQuery functions behind the scenes to meet your requirements. + +To customize your optional configurations in dbt Cloud: + +1. Click your name at the bottom left-hand side bar menu in dbt Cloud +2. Select **Your profile** from the menu +3. 
From there, click **Projects** and select your BigQuery project +4. Go to **Development Connection** and select BigQuery +5. Click **Edit** and then scroll down to **Optional settings** + + + +The following are the optional configurations you can set in dbt Cloud: + +| Configuration |
Information | Type | Example
| +|---------------------------|-----------------------------------------|---------|--------------------| +| [Priority](#priority) | Sets the priority for BigQuery jobs (either `interactive` or queued for `batch` processing) | String | `batch` or `interactive` | +| [Retries](#retries) | Specifies the number of retries for failed jobs due to temporary issues | Integer | `3` | +| [Location](#location) | Location for creating new datasets | String | `US`, `EU`, `us-west2` | +| [Maximum bytes billed](#maximum-bytes-billed) | Limits the maximum number of bytes that can be billed for a query | Integer | `1000000000` | +| [Execution project](#execution-project) | Specifies the project ID to bill for query execution | String | `my-project-id` | +| [Impersonate service account](#impersonate-service-account) | Allows users authenticated locally to access BigQuery resources under a specified service account | String | `service-account@project.iam.gserviceaccount.com` | +| [Job retry deadline seconds](#job-retry-deadline-seconds) | Sets the total number of seconds BigQuery will attempt to retry a job if it fails | Integer | `600` | +| [Job creation timeout seconds](#job-creation-timeout-seconds) | Specifies the maximum timeout for the job creation step | Integer | `120` | +| [Google cloud storage-bucket](#google-cloud-storage-bucket) | Location for storing objects in Google Cloud Storage | String | `my-bucket` | +| [Dataproc region](#dataproc-region) | Specifies the cloud region for running data processing jobs | String | `US`, `EU`, `asia-northeast1` | +| [Dataproc cluster name](#dataproc-cluster-name) | Assigns a unique identifier to a group of virtual machines in Dataproc | String | `my-cluster` | + + + + +The `priority` for the BigQuery jobs that dbt executes can be configured with the `priority` configuration in your BigQuery profile. The priority field can be set to one of `batch` or `interactive`. For more information on query priority, consult the [BigQuery documentation](https://cloud.google.com/bigquery/docs/running-queries). + + + + + +Retries in BigQuery help to ensure that jobs complete successfully by trying again after temporary failures, making your operations more robust and reliable. + + + + + +The `location` of BigQuery datasets can be set using the `location` setting in a BigQuery profile. As per the [BigQuery documentation](https://cloud.google.com/bigquery/docs/locations), `location` may be either a multi-regional location (for example, `EU`, `US`), or a regional location (like `us-west2`). + + + + + +When a `maximum_bytes_billed` value is configured for a BigQuery profile, that allows you to limit how much data your query can process. It’s a safeguard to prevent your query from accidentally processing more data than you expect, which could lead to higher costs. Queries executed by dbt will fail if they exceed the configured maximum bytes threshhold. This configuration should be supplied as an integer number of bytes. + +If your `maximum_bytes_billed` is 1000000000, you would enter that value in the `maximum_bytes_billed` field in dbt cloud. + + + + + + +By default, dbt will use the specified `project`/`database` as both: + +1. The location to materialize resources (models, seeds, snapshots, and so on), unless they specify a custom project/database config +2. The GCP project that receives the bill for query costs or slot usage + +Optionally, you may specify an execution project to bill for query execution, instead of the project/database where you materialize most resources. 
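To make the optional configuration table above more concrete, several of these settings can be supplied together, for example through the extended attributes approach shown later on this page. The following is a minimal sketch with illustrative values, using the field names from the dbt BigQuery profile:

```yaml
# Minimal sketch of optional BigQuery settings (illustrative values only)
priority: interactive
location: US
maximum_bytes_billed: 1000000000   # fail queries that would bill more than ~1 GB
execution_project: my-billing-project-id
job_retry_deadline_seconds: 600
job_creation_timeout_seconds: 120
```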
+ + + + + +This feature allows users authenticating using local OAuth to access BigQuery resources based on the permissions of a service account. + +For a general overview of this process, see the official docs for [Creating Short-lived Service Account Credentials](https://cloud.google.com/iam/docs/create-short-lived-credentials-direct). + + + + + +Job retry deadline seconds is the maximum amount of time BigQuery will spend retrying a job before it gives up. + + + + + +Job creation timeout seconds is the maximum time BigQuery will wait to start the job. If the job doesn’t start within that time, it times out. + + + +#### Run dbt python models on Google Cloud Platform + +import BigQueryDataproc from '/snippets/_bigquery-dataproc.md'; + + + + + +Everything you store in Cloud Storage must be placed inside a [bucket](https://cloud.google.com/storage/docs/buckets). Buckets help you organize your data and manage access to it. + + + + + +A designated location in the cloud where you can run your data processing jobs efficiently. This region must match the location of your BigQuery dataset if you want to use Dataproc with BigQuery to ensure data doesn't move across regions, which can be inefficient and costly. + +For more information on [Dataproc regions](https://cloud.google.com/bigquery/docs/locations), refer to the BigQuery documentation. + + + + + +A unique label you give to your group of virtual machines to help you identify and manage your data processing tasks in the cloud. When you integrate Dataproc with BigQuery, you need to provide the cluster name so BigQuery knows which specific set of resources (the cluster) to use for running the data jobs. + +Have a look at [Dataproc's document on Create a cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) for an overview on how clusters work. + + + +### Account level connections and credential management + +You can re-use connections across multiple projects with [global connections](/docs/cloud/connect-data-platform/about-connections#migration-from-project-level-connections-to-account-level-connections). Connections are attached at the environment level (formerly project level), so you can utilize multiple connections inside of a single project (to handle dev, staging, production, etc.). + +BigQuery connections in dbt Cloud currently expect the credentials to be handled at the connection level (and only BigQuery connections). This was originally designed to facilitate creating a new connection by uploading a service account keyfile. This describes how to override credentials at the environment level, via [extended attributes](/docs/dbt-cloud-environments#extended-attributes), _to allow project administrators to manage credentials independently_ of the account level connection details used for that environment. + +For a project, you will first create an environment variable to store the secret `private_key` value. Then, you will use extended attributes to override the entire service account JSON (you can't only override the secret key due to a constraint of extended attributes). + +1. 
**New environment variable** + + - Create a new _secret_ [environment variable](https://docs.getdbt.com/docs/build/environment-variables#handling-secrets) to handle the private key: `DBT_ENV_SECRET_PROJECTXXX_PRIVATE_KEY` + - Fill in the private key value according the environment + + To automate your deployment, use the following [admin API request](https://docs.getdbt.com/dbt-cloud/api-v3#/operations/Create%20Projects%20Environment%20Variables%20Bulk), with `XXXXX` your account number, `YYYYY` your project number, `ZZZZZ` your [API token](/docs/dbt-cloud-apis/authentication): + + ```shell + curl --request POST \ + --url https://cloud.getdbt.com/api/v3/accounts/XXXXX/projects/YYYYY/environment-variables/bulk/ \ + --header 'Accept: application/json' \ + --header 'Authorization: Bearer ZZZZZ' \ + --header 'Content-Type: application/json' \ + --data '{ + "env_var": [ + { + "new_name": "DBT_ENV_SECRET_PROJECTXXX_PRIVATE_KEY", + "project": "Value by default for the entire project", + "ENVIRONMENT_NAME_1": "Optional, if wanted, value for environment name 1", + "ENVIRONMENT_NAME_2": "Optional, if wanted, value for environment name 2" + } + ] + }' + ``` + +2. **Extended attributes** + + In the environment details, complete the [extended attributes](/docs/dbt-cloud-environments#extended-attributes) block with the following payload (replacing `XXX` with your corresponding information): + + ```yaml + keyfile_json: + type: service_account + project_id: xxx + private_key_id: xxx + private_key: '{{ env_var(''DBT_ENV_SECRET_PROJECTXXX_PRIVATE_KEY'') }}' + client_email: xxx + client_id: xxx + auth_uri: xxx + token_uri: xxx + auth_provider_x509_cert_url: xxx + client_x509_cert_url: xxx + ``` + + If you require [other fields](/docs/core/connect-data-platform/bigquery-setup#service-account-json) to be overridden at the environment level via extended attributes, please respect the [expected indentation](/docs/dbt-cloud-environments#only-the-top-level-keys-are-accepted-in-extended-attributes) (ordering doesn't matter): + + ```yaml + priority: interactive + keyfile_json: + type: xxx + project_id: xxx + private_key_id: xxx + private_key: '{{ env_var(''DBT_ENV_SECRET_PROJECTXXX_PRIVATE_KEY'') }}' + client_email: xxx + client_id: xxx + auth_uri: xxx + token_uri: xxx + auth_provider_x509_cert_url: xxx + client_x509_cert_url: xxx + execution_project: buck-stops-here-456 + ``` + + To automate your deployment, you first need to [create the extended attributes payload](https://docs.getdbt.com/dbt-cloud/api-v3#/operations/Create%20Extended%20Attributes) for a given project, and then [assign it](https://docs.getdbt.com/dbt-cloud/api-v3#/operations/Update%20Environment) to a specific environment. 
With `XXXXX` as your account number, `YYYYY` as your project number, and `ZZZZZ` as your [API token](/docs/dbt-cloud-apis/authentication): + + ```shell + curl --request POST \ + --url https://cloud.getdbt.com/api/v3/accounts/XXXXX/projects/YYYYY/extended-attributes/ \ + --header 'Accept: application/json' \ + --header 'Authorization: Bearer ZZZZZ' \ + --header 'Content-Type: application/json' \ + --data '{ + "id": null, + "extended_attributes": {"type":"service_account","project_id":"xxx","private_key_id":"xxx","private_key":"{{ env_var('DBT_ENV_SECRET_PROJECTXXX_PRIVATE_KEY') }}","client_email":"xxx","client_id":xxx,"auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://oauth2.googleapis.com/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_x509_cert_url":"xxx"}, + "state": 1 + }' + ``` + _Make a note of the `id` returned in the message._ It will be used in the following call. With `EEEEE` the environment id, `FFFFF` the extended attributes id: + + ```shell + curl --request POST \ + --url https://cloud.getdbt.com/api/v3/accounts/XXXXX/projects/YYYYY/environments/EEEEE/ \ + --header 'Accept: application/json' \ + --header 'Authorization: Bearer ZZZZZZ' \ + --header 'Content-Type: application/json' \ + --data '{ + "extended_attributes_id": FFFFF + }' + ``` + + + + diff --git a/website/docs/docs/cloud/dbt-assist-data.md b/website/docs/docs/cloud/dbt-assist-data.md deleted file mode 100644 index ad32c304ca8..00000000000 --- a/website/docs/docs/cloud/dbt-assist-data.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: "dbt Assist privacy and data" -sidebar_label: "dbt Assist privacy" -description: "dbt Assist’s powerful AI feature helps you deliver data that works." ---- - -# dbt Assist privacy and data - -dbt Labs is committed to protecting your privacy and data. This page provides information about how dbt Labs handles your data when you use dbt Assist. - -#### Is my data used by dbt Labs to train AI models? - -No, dbt Assist does not use client warehouse data to train any AI models. It uses API calls to an AI provider. - -#### Does dbt Labs share my personal data with third parties - -dbt Labs only shares client personal information as needed to perform the services, under client instructions, or for legal, tax, or compliance reasons. - -#### Does dbt Assist store or use personal data? - -The user clicks the AI assist button, and the user does not otherwise enter data. - -#### Does dbt Assist access my warehouse data? - -dbt Assist utilizes metadata, including column names, model SQL, the model's name, and model documentation. The row-level data from the warehouse is never used or sent to a third-party provider. Such output must be double-checked by the user for completeness and accuracy. - -#### Can dbt Assist data be deleted upon client written request? - -dbt Assist data, aside from usage data, does not persist on dbt Labs systems. Usage data is retained by dbt Labs. dbt Labs does not have possession of any personal or sensitive data. To the extent client identifies personal or sensitive information uploaded by or on behalf of client to dbt Labs systems, such data can be deleted within 30 days of written request. 
diff --git a/website/docs/docs/cloud/dbt-assist.md b/website/docs/docs/cloud/dbt-assist.md deleted file mode 100644 index eafe7d05821..00000000000 --- a/website/docs/docs/cloud/dbt-assist.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "About dbt Assist" -sidebar_label: "About dbt Assist" -description: "dbt Assist’s powerful AI co-pilot feature helps you deliver data that works." -pagination_next: "docs/cloud/enable-dbt-assist" -pagination_prev: null ---- - -# About dbt Assist - -dbt Assist is a powerful artificial intelligence (AI) co-pilot feature that helps automate development in dbt Cloud, allowing you to focus on delivering data that works. dbt Assist’s AI co-pilot generates [documentation](/docs/build/documentation) and [tests](/docs/build/data-tests) for your dbt SQL models directly in the dbt Cloud IDE, with a click of a button, and helps you accomplish more in less time. - -:::tip Beta feature -dbt Assist is an AI tool meant to _help_ developers generate documentation and tests in dbt Cloud. It's available in beta, in the dbt Cloud IDE only. - -To use dbt Assist, you must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing) and agree to use dbt Labs' OpenAI key. [Register your interest](https://docs.google.com/forms/d/e/1FAIpQLScPjRGyrtgfmdY919Pf3kgqI5E95xxPXz-8JoVruw-L9jVtxg/viewform) to join the private beta or reach out to your account team to begin this process. -::: - - - -## Feedback - -Please note: Always review AI-generated code and content as it may produce incorrect results. dbt Assist features and/or functionality may be added or eliminated as part of the beta trial. - -To give feedback, please reach out to your dbt Labs account team. We appreciate your feedback and suggestions as we improve dbt Assist. diff --git a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md index 438cb8c7981..c9d2cbbad30 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md @@ -10,11 +10,10 @@ pagination_prev: null The dbt Cloud integrated development environment (IDE) is a single web-based interface for building, testing, running, and version-controlling dbt projects. It compiles dbt code into SQL and executes it directly on your database. -The dbt Cloud IDE offers several [keyboard shortcuts](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) and [editing features](/docs/cloud/dbt-cloud-ide/ide-user-interface#editing-features) for faster and efficient development and governance: The dbt Cloud IDE offers several [keyboard shortcuts](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) and [editing features](/docs/cloud/dbt-cloud-ide/ide-user-interface#editing-features) for faster and efficient development and governance: - Syntax highlighting for SQL — Makes it easy to distinguish different parts of your code, reducing syntax errors and enhancing readability. -- AI co-pilot — Use [dbt Assist](/docs/cloud/dbt-assist), a powerful AI co-pilot feature, to generate documentation and tests for your dbt SQL models. +- AI copilot — Use [dbt Copilot](/docs/cloud/dbt-copilot), a powerful AI engine that can generate documentation, tests, and semantic models for your dbt SQL models. - Auto-completion — Suggests table names, arguments, and column names as you type, saving time and reducing typos. - Code [formatting and linting](/docs/cloud/dbt-cloud-ide/lint-format) — Helps standardize and fix your SQL code effortlessly. 
- Navigation tools — Easily move around your code, jump to specific lines, find and replace text, and navigate between project files. @@ -54,9 +53,9 @@ To understand how to navigate the IDE and its user interface elements, refer to | Feature | Description | |---|---| | [**Keyboard shortcuts**](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) | You can access a variety of [commands and actions](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) in the IDE by choosing the appropriate keyboard shortcut. Use the shortcuts for common tasks like building modified models or resuming builds from the last failure. | -| **IDE version control** | The IDE version control section and git button allow you to apply the concept of [version control](/docs/collaborate/git/version-control-basics) to your project directly into the IDE.

- Create or change branches, execute git commands using the git button.
- Commit or revert individual files by right-clicking the edited file
- [Resolve merge conflicts](/docs/collaborate/git/merge-conflicts)
- Link to the repo directly by clicking the branch name
- Edit, format, or lint files and execute dbt commands in your primary protected branch, and commit to a new branch.
- Use Git diff view to view what has been changed in a file before you make a pull request.
- From dbt version 1.6 and higher, use the **Prune branches** [button](/docs/cloud/dbt-cloud-ide/ide-user-interface#prune-branches-modal) to delete local branches that have been deleted from the remote repository, keeping your branch management tidy. | +| **IDE version control** | The IDE version control section and git button allow you to apply the concept of [version control](/docs/collaborate/git/version-control-basics) to your project directly into the IDE.

- Create or change branches and execute git commands using the git button.
- Commit or revert individual files by right-clicking the edited file.
- [Resolve merge conflicts](/docs/collaborate/git/merge-conflicts)
- Link to the repo directly by clicking the branch name.
- Edit, format, or lint files and execute dbt commands in your primary protected branch, and commit to a new branch.
- Use Git diff view to view what has been changed in a file before you make a pull request.
- Use the **Prune branches** [button](/docs/cloud/dbt-cloud-ide/ide-user-interface#prune-branches-modal) (dbt v1.6 and higher) to delete local branches that have been deleted from the remote repository, keeping your branch management tidy.
- Sign your [git commits](/docs/cloud/dbt-cloud-ide/git-commit-signing) to mark them as 'Verified'. | | **Preview and Compile button** | You can [compile or preview](/docs/cloud/dbt-cloud-ide/ide-user-interface#console-section) code, a snippet of dbt code, or one of your dbt models after editing and saving. | -| [**dbt Assist**](/docs/cloud/dbt-assist) | A powerful AI co-pilot feature that generates documentation and tests for your dbt SQL models. Available for dbt Cloud Enterprise plans. | +| [**dbt Copilot**](/docs/cloud/dbt-copilot) | A powerful AI engine that can generate documentation, tests, and semantic models for your dbt SQL models. Available for dbt Cloud Enterprise plans. | | **Build, test, and run button** | Build, test, and run your project with a button click or by using the Cloud IDE command bar. | **Command bar** | You can enter and run commands from the command bar at the bottom of the IDE. Use the [rich model selection syntax](/reference/node-selection/syntax) to execute [dbt commands](/reference/dbt-commands) directly within dbt Cloud. You can also view the history, status, and logs of previous runs by clicking History on the left of the bar. | **Drag and drop** | Drag and drop files located in the file explorer, and use the file breadcrumb on the top of the IDE for quick, linear navigation. Access adjacent files in the same file by right-clicking on the breadcrumb file. @@ -102,6 +101,7 @@ Nice job, you're ready to start developing and building models 🎉! ### Considerations - To improve your experience using dbt Cloud, we suggest that you turn off ad blockers. This is because some project file names, such as `google_adwords.sql`, might resemble ad traffic and trigger ad blockers. - To preserve performance, there's a file size limitation for repositories over 6 GB. If you have a repo over 6 GB, please contact [dbt Support](mailto:support@getdbt.com) before running dbt Cloud. +- The IDE's idle session timeout is one hour. - ### Start-up process @@ -128,8 +128,9 @@ Nice job, you're ready to start developing and building models 🎉! - If a model or test fails, dbt Cloud makes it easy for you to view and download the run logs for your dbt invocations to fix the issue. - Use dbt's [rich model selection syntax](/reference/node-selection/syntax) to [run dbt commands](/reference/dbt-commands) directly within dbt Cloud. - Starting from dbt v1.6, leverage [environments variables](/docs/build/environment-variables#special-environment-variables) to dynamically use the Git branch name. For example, using the branch name as a prefix for a development schema. + - Run [MetricFlow commands](/docs/build/metricflow-commands) to create and manage metrics in your project with the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl). -- **Generate your YAML configurations with dbt Assist** — [dbt Assist](/docs/cloud/dbt-assist) is a powerful artificial intelligence (AI) co-pilot feature that helps automate development in dbt Cloud. It generates documentation and tests for your dbt SQL models directly in the dbt Cloud IDE, with a click of a button, and helps you accomplish more in less time. Available for dbt Cloud Enterprise plans. +- **Generate your YAML configurations with dbt Copilot** — [dbt Copilot](/docs/cloud/dbt-copilot) is a powerful artificial intelligence (AI) feature that helps automate development in dbt Cloud. 
It can generate documentation, tests, and semantic models for your dbt SQL models directly in the dbt Cloud IDE, with a click of a button, and helps you accomplish more in less time. Available for dbt Cloud Enterprise plans. - **Build and view your project's docs** — The dbt Cloud IDE makes it possible to [build and view](/docs/collaborate/build-and-view-your-docs) documentation for your dbt project while your code is still in development. With this workflow, you can inspect and verify what your project's generated documentation will look like before your changes are released to production. diff --git a/website/docs/docs/cloud/dbt-cloud-ide/git-commit-signing.md b/website/docs/docs/cloud/dbt-cloud-ide/git-commit-signing.md new file mode 100644 index 00000000000..afaa0751669 --- /dev/null +++ b/website/docs/docs/cloud/dbt-cloud-ide/git-commit-signing.md @@ -0,0 +1,80 @@ +--- +title: "Git commit signing" +description: "Learn how to sign your Git commits when using the IDE for development." +sidebar_label: Git commit signing +--- + +# Git commit signing + +To prevent impersonation and enhance security, you can sign your Git commits before pushing them to your repository. Using your signature, a Git provider can cryptographically verify a commit and mark it as "verified", providing increased confidence about its origin. + +You can configure dbt Cloud to sign your Git commits when using the IDE for development. To set up, enable the feature in dbt Cloud, follow the flow to generate a keypair, and upload the public key to your Git provider to use for signature verification. + + +## Prerequisites + +- GitHub or GitLab is your Git provider. Currently, Azure DevOps is not supported. +- You have a dbt Cloud account on the [Enterprise plan](https://www.getdbt.com/pricing/). + +## Generate GPG keypair in dbt Cloud + +To generate a GPG keypair in dbt Cloud, follow these steps: +1. Go to your **Personal profile** page in dbt Cloud. +2. Navigate to **Signed Commits** section. +3. Enable the **Sign commits originating from this user** toggle. +4. This will generate a GPG keypair. The private key will be used to sign all future Git commits. The public key will be displayed, allowing you to upload it to your Git provider. + + + +## Upload public key to Git provider + +To upload the public key to your Git provider, follow the detailed documentation provided by the supported Git provider: + +- [GitHub instructions](https://docs.github.com/en/authentication/managing-commit-signature-verification/adding-a-gpg-key-to-your-github-account) +- [GitLab instructions](https://docs.gitlab.com/ee/user/project/repository/signed_commits/gpg.html) + +Once you have uploaded the public key to your Git provider, your Git commits will be marked as "Verified" after you push the changes to the repository. + + + +## Considerations + +- The GPG keypair is tied to the user, not a specific account. There is a 1:1 relationship between the user and keypair. The same key will be used for signing commits on any accounts the user is a member of. +- The GPG keypair generated in dbt Cloud is linked to the email address associated with your account at the time of keypair creation. This email identifies the author of signed commits. +- For your Git commits to be marked as "verified", your dbt Cloud email address must be a verified email address with your Git provider. The Git provider (such as, GitHub, GitLab) checks that the commit's signed email matches a verified email in your Git provider account. 
If they don’t match, the commit won't be marked as "verified." +- Keep your dbt Cloud email and Git provider's verified email in sync to avoid verification issues. If you change your dbt Cloud email address: + - Generate a new GPG keypair with the updated email, following the [steps mentioned earlier](/docs/cloud/dbt-cloud-ide/git-commit-signing#generate-gpg-keypair-in-dbt-cloud). + - Add and verify the new email in your Git provider. + + + +## FAQs + + + + + +If you delete your GPG keypair in dbt Cloud, your Git commits will no longer be signed. You can generate a new GPG keypair by following the [steps mentioned earlier](/docs/cloud/dbt-cloud-ide/git-commit-signing#generate-gpg-keypair-in-dbt-cloud). + + + + +GitHub and GitLab support commit signing, while Azure DevOps does not. Commit signing is a [git feature](https://git-scm.com/book/ms/v2/Git-Tools-Signing-Your-Work), and is independent of any specific provider. However, not all providers support the upload of public keys, or the display of verification badges on commits. + + + + + +If your Git Provider does not explicitly support the uploading of public GPG keys, then +commits will still be signed using the private key, but no verification information will +be displayed by the provider. + + + + + +If your Git provider is configured to enforce commit verification, then unsigned commits +will be rejected. To avoid this, ensure that you have followed all previous steps to generate +a keypair, and uploaded the public key to the provider. + + diff --git a/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md index 8d80483485c..36c6cc898dc 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md @@ -35,7 +35,7 @@ The IDE streamlines your workflow, and features a popular user interface layout * Added (A) — The IDE detects added files * Deleted (D) — The IDE detects deleted files. - + 5. **Command bar —** The Command bar, located in the lower left of the IDE, is used to invoke [dbt commands](/reference/dbt-commands). When a command is invoked, the associated logs are shown in the Invocation History Drawer. @@ -107,15 +107,19 @@ Starting from dbt v1.6 or higher, when you save changes to a model, you can comp 3. **Build button —** The build button allows users to quickly access dbt commands related to the active model in the File Editor. The available commands include dbt build, dbt test, and dbt run, with options to include only the current resource, the resource and its upstream dependencies, the resource, and its downstream dependencies, or the resource with all dependencies. This menu is available for all executable nodes. -4. **Format button —** The editor has a **Format** button that can reformat the contents of your files. For SQL files, it uses either `sqlfmt` or `sqlfluff`, and for Python files, it uses `black`. +4. **Lint button** — The **Lint** button runs the [linter](/docs/cloud/dbt-cloud-ide/lint-format) on the active file in the File Editor. The linter checks for syntax errors and style issues in your code and displays the results in the **Code quality** tab. -5. **Results tab —** The Results console tab displays the most recent Preview results in tabular format. +5. **dbt Copilot** — [dbt Copilot](/docs/cloud/dbt-copilot) is a powerful artificial intelligence engine that can generate documentation, tests, and semantic models for you. 
dbt Copilot is available in the IDE for Enterprise plans. + +6. **Results tab —** The Results console tab displays the most recent Preview results in tabular format. -6. **Compiled Code tab —** The Compile button triggers a compile invocation that generates compiled code, which is displayed in the Compiled Code tab. +7. **Code quality tab** — The Code Quality tab displays the results of the linter on the active file in the File Editor. It allows you to view code errors, provides code quality visibility and management, and displays the SQLFluff version used. + +8. **Compiled Code tab —** The Compile generates the compiled code when the Compile button is executed. The Compiled Code tab displays the compiled SQL code for the active file in the File Editor. -7. **Lineage tab —** The Lineage tab in the File Editor displays the active model's lineage or . By default, it shows two degrees of lineage in both directions (`2+model_name+2`), however, you can change it to +model+ (full DAG). +9. **Lineage tab —** The Lineage tab in the File Editor displays the active model's lineage or . By default, it shows two degrees of lineage in both directions (`2+model_name+2`), however, you can change it to +model+ (full DAG). To use the lineage: - Double-click a node in the DAG to open that file in a new tab - Expand or shrink the DAG using node selection syntax. - Note, the `--exclude` flag isn't supported. @@ -158,11 +162,11 @@ Use menus and modals to interact with IDE and access useful options to help your - #### File Search You can easily search for and navigate between files using the File Navigation menu, which can be accessed by pressing Command-O or Control-O or clicking on the 🔍 icon in the File Explorer. - + - #### Global Command Palette The Global Command Palette provides helpful shortcuts to interact with the IDE, such as git actions, specialized dbt commands, and compile, and preview actions, among others. To open the menu, use Command-P or Control-P. - + - #### IDE Status modal The IDE Status modal shows the current error message and debug logs for the server. This also contains an option to restart the IDE. Open this by clicking on the IDE Status button. @@ -193,7 +197,7 @@ Use menus and modals to interact with IDE and access useful options to help your * Toggling between dark or light mode for a better viewing experience * Restarting the IDE - * Fully recloning your repository to refresh your git state and view status details + * Rollback your repo to remote, to refresh your git state and view status details * Viewing status details, including the IDE Status modal. - + diff --git a/website/docs/docs/cloud/dbt-copilot-data.md b/website/docs/docs/cloud/dbt-copilot-data.md new file mode 100644 index 00000000000..b55681542e3 --- /dev/null +++ b/website/docs/docs/cloud/dbt-copilot-data.md @@ -0,0 +1,29 @@ +--- +title: "dbt Copilot privacy and data" +sidebar_label: "dbt Copilot privacy" +description: "dbt Copilot is a powerful AI engine to help you deliver data that works." +--- + +# dbt Copilot privacy and data + +dbt Labs is committed to protecting your privacy and data. This page provides information about how the dbt Copilot AI engine handles your data. + +#### Is my data used by dbt Labs to train AI models? + +No, dbt Copilot does not use client warehouse data to train any AI models. It uses API calls to an AI provider. 
+ +#### Does dbt Labs share my personal data with third parties + +dbt Labs only shares client personal information as needed to perform the services, under client instructions, or for legal, tax, or compliance reasons. + +#### Does dbt Copilot store or use personal data? + +The user clicks the dbt Copilot button, and the user does not otherwise enter data. + +#### Does dbt Copilot access my warehouse data? + +dbt Copilot utilizes metadata, including column names, model SQL, the model's name, and model documentation. The row-level data from the warehouse is never used or sent to a third-party provider. Such output must be double-checked by the user for completeness and accuracy. + +#### Can dbt Copilot data be deleted upon client written request? + +The data from using dbt Copilot, aside from usage data, _doesn't_ persist on dbt Labs systems. Usage data is retained by dbt Labs. dbt Labs doesn't have possession of any personal or sensitive data. To the extent client identifies personal or sensitive information uploaded by or on behalf of client to dbt Labs systems, such data can be deleted within 30 days of written request. diff --git a/website/docs/docs/cloud/dbt-copilot.md b/website/docs/docs/cloud/dbt-copilot.md new file mode 100644 index 00000000000..403df86a089 --- /dev/null +++ b/website/docs/docs/cloud/dbt-copilot.md @@ -0,0 +1,25 @@ +--- +title: "About dbt Copilot" +sidebar_label: "About dbt Copilot" +description: "dbt Copilot is a powerful AI engine designed to accelerate your analytics workflows throughout your entire ADLC." +pagination_next: "docs/cloud/enable-dbt-copilot" +pagination_prev: null +--- + +# About dbt Copilot + +dbt Copilot is a powerful artificial intelligence (AI) engine that's fully integrated into your dbt Cloud experience and designed to accelerate your analytics workflows. dbt Copilot embeds AI-driven assistance across every stage of the analytics development life cycle (ADLC), empowering data practitioners to deliver data products faster, improve data quality, and enhance data accessibility. With automatic code generation, you can let the AI engine generate the [documentation](/docs/build/documentation), [tests](/docs/build/data-tests), and [semantic models](/docs/build/semantic-models) for you. + +:::tip Beta feature +dbt Copilot is designed to _help_ developers generate documentation, tests, and semantic models in dbt Cloud. It's available in beta, in the dbt Cloud IDE only. + +To use dbt Copilot, you must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing) and either agree to use dbt Labs' OpenAI key or provide your own Open AI API key. [Register here](https://docs.google.com/forms/d/e/1FAIpQLScPjRGyrtgfmdY919Pf3kgqI5E95xxPXz-8JoVruw-L9jVtxg/viewform) or reach out to the Account Team if you're interested in joining the private beta. +::: + + + +## Feedback + +Please note: Always review AI-generated code and content as it may produce incorrect results. The features and/or functionality of dbt Copilot may be added or eliminated as part of the beta trial. + +To give feedback, please contact your dbt Labs account team. We appreciate your feedback and suggestions as we improve dbt Copilot. 
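To make the automatic generation described in the About dbt Copilot page above more concrete, the following is a small, hypothetical sketch of the kind of YAML such a feature can draft for a model — the `stg_customers` model, its column, and the wording are invented for illustration, and generated output should always be reviewed before it's committed:

```yaml
# Hypothetical illustration of AI-drafted properties for a model named stg_customers.
# Review and edit generated documentation and tests before committing them.
version: 2

models:
  - name: stg_customers
    description: "One record per customer, staged from the raw customers source."
    columns:
      - name: customer_id
        description: "Primary key for a customer."
        data_tests:
          - unique
          - not_null
```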
diff --git a/website/docs/docs/cloud/enable-dbt-assist.md b/website/docs/docs/cloud/enable-dbt-assist.md deleted file mode 100644 index 9432f858001..00000000000 --- a/website/docs/docs/cloud/enable-dbt-assist.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -title: "Enable dbt Assist" -sidebar_label: "Enable dbt Assist" -description: "Enable dbt Assist in dbt Cloud and leverage AI to speed up your development." ---- - -# Enable dbt Assist - -This page explains how to enable dbt Assist in dbt Cloud to leverage AI to speed up your development and allow you to focus on delivering quality data. - -## Prerequisites - -- Available in the dbt Cloud IDE only. -- Must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing). -- Development environment be ["Versionless"](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless). -- Current dbt Assist deployments use a central OpenAI API key managed by dbt Labs. In the future, you may provide your own key for Azure OpenAI or OpenAI. -- Accept and sign legal agreements. Reach out to your account team to begin this process. - -## Enable dbt Assist - -dbt Assist will only be available at an account level after your organization has signed the legal requirements. It will be disabled by default. Your dbt Cloud Admin(s) will enable it by following these steps: - -1. Navigate to **Account Settings** in the navigation menu. - -2. Under **Settings**, confirm the account you're enabling. - -3. Click **Edit** in the top right corner. - -4. To turn on dbt Assist, toggle the **Enable account access to AI-powered features** switch to the right. The toggle will slide to the right side, activating dbt Assist. - -5. Click **Save** and you should now have dbt Assist AI enabled to use. - -Note: To disable (only after enabled), repeat steps 1 to 3, toggle off in step 4, and repeat step 5. - - diff --git a/website/docs/docs/cloud/enable-dbt-copilot.md b/website/docs/docs/cloud/enable-dbt-copilot.md new file mode 100644 index 00000000000..07a9f6294da --- /dev/null +++ b/website/docs/docs/cloud/enable-dbt-copilot.md @@ -0,0 +1,51 @@ +--- +title: "Enable dbt Copilot" +sidebar_label: "Enable dbt Copilot" +description: "Enable the dbt Copilot AI engine in dbt Cloud to speed up your development." +--- + +# Enable dbt Copilot + +This page explains how to enable the dbt Copilot engine in dbt Cloud, leveraging AI to speed up your development and allowing you to focus on delivering quality data. + +## Prerequisites + +- Available in the dbt Cloud IDE only. +- Must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing). +- Development environment has been upgraded to ["Versionless"](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless). +- By default, dbt Copilot deployments use a central OpenAI API key managed by dbt Labs. Alternatively, you can [provide your own OpenAI API key](#bringing-your-own-openai-api-key-byok). +- Accept and sign legal agreements. Reach out to your Account team to begin this process. + +## Enable dbt Copilot + +dbt Copilot is only available to your account after your organization has signed the required legal documents. It's disabled by default. A dbt Cloud admin can enable it by following these steps: + +1. Navigate to **Account settings** in the navigation menu. + +2. Under **Settings**, confirm the account you're enabling. + +3. Click **Edit** in the top right corner. + +4. Enable the **Enable account access to AI-powered features** option. + +5. Click **Save**. 
You should now have the dbt Copilot AI engine enabled for use. + +Note: To disable (only after enabled), repeat steps 1 to 3, toggle off in step 4, and repeat step 5. + + + +### Bringing your own OpenAI API key (BYOK) + +Once AI features have been enabled, you can provide your organization's OpenAI API key. dbt Cloud will then leverage your OpenAI account and terms to power dbt CoPilot. This will incur billing charges to your organization from OpenAI for requests made by dbt CoPilot. + +Note that Azure OpenAI is not currently supported, but will be in the future. + +A dbt Cloud admin can provide their API key by following these steps: + +1. Navigate to **Account settings** in the side menu. + +2. Find the **Settings** section and click on **Integrations**. + +3. Scroll to **AI** and select the toggle for **OpenAI** + +4. Enter your API key and click **Save**. \ No newline at end of file diff --git a/website/docs/docs/cloud/git/authenticate-azure.md b/website/docs/docs/cloud/git/authenticate-azure.md index 42028bf993b..5278c134f72 100644 --- a/website/docs/docs/cloud/git/authenticate-azure.md +++ b/website/docs/docs/cloud/git/authenticate-azure.md @@ -13,9 +13,9 @@ If you use the dbt Cloud IDE or dbt Cloud CLI to collaborate on your team's Azur Connect your dbt Cloud profile to Azure DevOps using OAuth: -1. Click the gear icon at the top right and select **Profile settings**. -2. Click **Linked Accounts**. -3. Next to Azure DevOps, click **Link**. +1. Click your account name at the bottom of the left-side menu and click **Account settings** +2. Scroll down to **Your profile** and select **Personal profile**. +3. Go to the **Linked accounts** section in the middle of the page. 4. Once you're redirected to Azure DevOps, sign into your account. diff --git a/website/docs/docs/cloud/git/connect-github.md b/website/docs/docs/cloud/git/connect-github.md index ff0f2fff18f..e2bf459275e 100644 --- a/website/docs/docs/cloud/git/connect-github.md +++ b/website/docs/docs/cloud/git/connect-github.md @@ -7,7 +7,6 @@ sidebar_label: "Connect to GitHub" Connecting your GitHub account to dbt Cloud provides convenience and another layer of security to dbt Cloud: -- Log into dbt Cloud using OAuth through GitHub. - Import new GitHub repositories with a couple clicks during dbt Cloud project setup. - Clone repos using HTTPS rather than SSH. - Trigger [Continuous integration](/docs/deploy/continuous-integration)(CI) builds when pull requests are opened in GitHub. @@ -18,7 +17,7 @@ Connecting your GitHub account to dbt Cloud provides convenience and another lay * **Note** — [Single tenant](/docs/cloud/about-cloud/tenancy#single-tenant) accounts offer enhanced connection options for integrating with an On-Premises GitHub deployment setup using the native integration. This integration allows you to use all the features of the integration, such as triggering CI builds. The dbt Labs infrastructure team will coordinate with you to ensure any additional networking configuration requirements are met and completed. To discuss details, contact dbt Labs support or your dbt Cloud account team. - You _must_ be a **GitHub organization owner** in order to [install the dbt Cloud application](/docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) in your GitHub organization. To learn about GitHub organization roles, see the [GitHub documentation](https://docs.github.com/en/organizations/managing-peoples-access-to-your-organization-with-roles/roles-in-an-organization). 
- The GitHub organization owner requires [_Owner_](/docs/cloud/manage-access/self-service-permissions) or [_Account Admin_](/docs/cloud/manage-access/enterprise-permissions) permissions when they log into dbt Cloud to integrate with a GitHub environment using organizations. -- You may need to temporarily provide an extra dbt Cloud user account with _Owner_ or _Account Admin_ [permissions](/docs/cloud/manage-access/self-service-permissions) for your GitHub organization owner until they complete the installation. +- You may need to temporarily provide an extra dbt Cloud user account with _Owner_ or _Account Admin_ [permissions](/docs/cloud/manage-access/enterprise-permissions) for your GitHub organization owner until they complete the installation. ## Installing dbt Cloud in your GitHub account @@ -48,19 +47,22 @@ To connect your dbt Cloud account to your GitHub account: - Read and write access to Workflows 6. Once you grant access to the app, you will be redirected back to dbt Cloud and shown a linked account success state. You are now personally authenticated. -7. Ask your team members to [personally authenticate](/docs/cloud/git/connect-github#personally-authenticate-with-github) by connecting their GitHub profiles. +7. Ask your team members to individually authenticate by connecting their [personal GitHub profiles](#authenticate-your-personal-github-account). ## Limiting repository access in GitHub If you are your GitHub organization owner, you can also configure the dbt Cloud GitHub application to have access to only select repositories. This configuration must be done in GitHub, but we provide an easy link in dbt Cloud to start this process. -## Personally authenticate with GitHub +## Authenticate your personal GitHub account + +After the dbt Cloud administrator [sets up a connection](/docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) to your organization's GitHub account, you need to authenticate using your personal account. You must connect your personal GitHub profile to dbt Cloud to use the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) and [CLI](/docs/cloud/cloud-cli-installation) and verify your read and write access to the repository. + +:::info GitHub profile connection -Once the dbt Cloud admin has [set up a connection](/docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) to your organization GitHub account, you need to personally authenticate, which improves the security of dbt Cloud by enabling you to log in using OAuth through GitHub. -:::infoGitHub profile connection - dbt Cloud developers on the [Enterprise plan](https://www.getdbt.com/pricing/) must each connect their GitHub profiles to dbt Cloud. This is because the dbt Cloud IDE verifies every developer's read / write access for the dbt repo. - dbt Cloud developers on the [Team plan](https://www.getdbt.com/pricing/) don't need to each connect their profiles to GitHub, however, it's still recommended to do so. + ::: To connect a personal GitHub account: @@ -74,7 +76,7 @@ To connect a personal GitHub account: 4. Once you approve authorization, you will be redirected to dbt Cloud, and you should now see your connected account. -The next time you log into dbt Cloud, you will be able to do so via OAuth through GitHub, and if you're on the Enterprise plan, you're ready to use the dbt Cloud IDE or dbt Cloud CLI. +You can now use the dbt Cloud IDE or dbt Cloud CLI. 
## FAQs diff --git a/website/docs/docs/cloud/git/setup-azure.md b/website/docs/docs/cloud/git/setup-azure.md index 6fdb2517f1a..273660ba3dd 100644 --- a/website/docs/docs/cloud/git/setup-azure.md +++ b/website/docs/docs/cloud/git/setup-azure.md @@ -17,7 +17,7 @@ To use our native integration with Azure DevOps in dbt Cloud, an account admin n 4. [Connect Azure DevOps to your new app](#connect-azure-devops-to-your-new-app). 5. [Add your Entra ID app to dbt Cloud](#add-your-azure-ad-app-to-dbt-cloud). -Once the Microsoft Entra ID app is added to dbt Cloud, an account admin must also [connect a service user](#connecting-a-service-user) via OAuth, which will be used to power headless actions in dbt Cloud such as deployment runs and CI. +Once the Microsoft Entra ID app is added to dbt Cloud, an account admin must also [connect a service user](/docs/cloud/git/setup-azure#connect-a-service-user) via OAuth, which will be used to power headless actions in dbt Cloud such as deployment runs and CI. Once the Microsoft Entra ID app is added to dbt Cloud and the service user is connected, then dbt Cloud developers can personally authenticate in dbt Cloud from Azure DevOps. For more on this, see [Authenticate with Azure DevOps](/docs/cloud/git/authenticate-azure). @@ -89,7 +89,7 @@ An Azure admin will need one of the following permissions in both the Microsoft - Azure Service Administrator - Azure Co-administrator -If your Azure DevOps account is connected to Entra ID, then you can proceed to [Connecting a service user](#connecting-a-service-user). However, if you're just getting set up, connect Azure DevOps to the Microsoft Entra ID app you just created: +If your Azure DevOps account is connected to Entra ID, then you can proceed to [Connect a service user](#connect-a-service-user). However, if you're just getting set up, connect Azure DevOps to the Microsoft Entra ID app you just created: 1. From your Azure DevOps account, select **Organization settings** in the bottom left. 2. Navigate to Microsoft Entra ID. @@ -373,6 +373,16 @@ A dbt Cloud account admin with access to the service user's Azure DevOps account Once connected, dbt Cloud displays the email address of the service user so you know which user's permissions are enabling headless actions in deployment environments. To change which account is connected, disconnect the profile in dbt Cloud, sign into the alternative Azure DevOps service account, and re-link the account in dbt Cloud. +### Using Azure AD for SSO with dbt Cloud and Microsoft tools + +If you're using Azure AD for SSO with dbt Cloud and Microsoft tools, the SSO flow may sometimes direct your account admin to their personal user account instead of the service user. If this happens, follow these steps to resolve it: + +1. Sign in to the service user's Azure DevOps account (ensure they are also connected to dbt Cloud through SSO). +2. When connected to dbt Cloud, sign out of Azure AD through the [Azure portal](https://portal.azure.com/). +3. Disconnect the service user in dbt Cloud, and follow the steps to set it up again. +4. You should then be prompted to enter service user credentials. + + :::info Personal Access Tokens (PATs) dbt Cloud leverages the service user to generate temporary access tokens called [PATs](https://learn.microsoft.com/en-us/azure/devops/organizations/accounts/use-personal-access-tokens-to-authenticate?toc=%2Fazure%2Fdevops%2Fmarketplace-extensibility%2Ftoc.json&view=azure-devops&tabs=Windows). 
diff --git a/website/docs/docs/cloud/manage-access/about-access.md b/website/docs/docs/cloud/manage-access/about-access.md index 64826531245..b9d23b28add 100644 --- a/website/docs/docs/cloud/manage-access/about-access.md +++ b/website/docs/docs/cloud/manage-access/about-access.md @@ -8,142 +8,211 @@ pagination_prev: null :::info "User access" is not "Model access" -**User groups and access** and **model groups and access** mean two different things. "Model groups and access" is a specific term used in the language of dbt-core. Refer to [Model access](/docs/collaborate/govern/model-access) for more info on what it means in dbt-core. +This page is specific to user groups and access, which includes: +- User licenses, permissions, and group memberships +- Role-based access controls for projects and environments +- Single sign-on and secure authentication -::: +"Model groups and access" is a feature specific to models and their availability across projects. Refer to [Model access](/docs/collaborate/govern/model-access) for more info on what it means for your dbt projects. -dbt Cloud administrators can use dbt Cloud's permissioning model to control -user-level access in a dbt Cloud account. This access control comes in two flavors: -License-based and Role-based. +::: -- **License-based Access Controls:** User are configured with account-wide - license types. These licenses control the specific parts of the dbt Cloud application - that a given user can access. -- **Role-based Access Control (RBAC):** Users are assigned to _groups_ that have - specific permissions on specific projects or the entire account. A user may be - a member of multiple groups, and those groups may have permissions on multiple - projects. +# About user access +You can regulate access to dbt Cloud by various measures, including licenses, groups, permissions, and role-based access control (RBAC). To understand the possible approaches to user access to dbt Cloud features and functionality, you should first know how we approach users and groups. -## License-based access control +### Users -Each user on an account is assigned a license type when the user is first -invited to a given account. This license type may change over time, but a -user can only have one type of license at any given time. +Individual users in dbt Cloud can be people you [manually invite](/docs/cloud/manage-access/invite-users) or grant access via an external identity provider (IdP), such as Microsoft Entra ID, Okta, or Google Workspace. -A user's license type controls the features in dbt Cloud that the user is able -to access. dbt Cloud's three license types are: +In either scenario, when you add a user to dbt Cloud, they are assigned a [license](#licenses). You assign licenses at the individual user or group levels. When you manually invite a user, you will assign the license in the invitation window. - - **Developer** — User may be granted _any_ permissions. - - **Read-Only** — User has read-only permissions applied to all dbt Cloud resources regardless of the role-based permissions that the user is assigned. - - **IT** — User has [Security Admin](/docs/cloud/manage-access/enterprise-permissions#security-admin) and [Billing Admin](/docs/cloud/manage-access/enterprise-permissions#billing-admin) permissions applied regardless of the role-based permissions that the user is assigned. + -For more information on these license types, see [Seats & Users](/docs/cloud/manage-access/seats-and-users). 
+You can edit an existing user's license by navigating to the **Users** section of the **Account settings**, clicking on a user, and clicking **Edit** on the user pane. Delete users from this same window to free up licenses for new users. -## Role-based access control + -:::info dbt Cloud Enterprise -Role-based access control is a feature of the dbt Cloud Enterprise plan +### Groups -::: +Groups in dbt Cloud serve much of the same purpose as they do in traditional directory tools — to gather individual users together to make bulk assignment of permissions easier. Admins use groups in dbt Cloud to assign [licenses](#licenses) and [permissions](#permissions). The permissions are more granular than licenses, and you only assign them at the group level; _you can’t assign permissions at the user level._ Every user in dbt Cloud must be assigned to at least one group. -Role-based access control allows for fine-grained permissioning in the dbt Cloud -application. With role-based access control, users can be assigned varying -permissions to different projects within a dbt Cloud account. For teams on the -Enterprise tier, role-based permissions can be generated dynamically from -configurations in an [Identity Provider](sso-overview). +There are three default groups available as soon as you create your dbt Cloud account (the person who created the account is added to all three automatically): -Role-based permissions are applied to _groups_ and pertain to _projects_. The -assignable permissions themselves are granted via _permission sets_. +- **Owner:** This group is for individuals responsible for the entire account and will give them elevated account admin privileges. You cannot change the permissions. +- **Member:** This group is for the general members of your organization, who will also have full access to the account. You cannot change the permissions. By default, dbt Cloud adds new users to this group. +- **Everyone:** A general group for all members of your organization. Customize the permissions to fit your organizational needs. By default, dbt Cloud adds new users to this group. +We recommend deleting the default `Owner`, `Member`, and `Everyone` groups before deploying and replacing them with your organizational groups. This prevents users from receiving more elevated privileges than they should and helps admins ensure they are properly placed. -### Groups +Create new groups from the **Groups & Licenses** section of the **Account settings**. If you use an external IdP for SSO, you can sync those SSO groups to dbt Cloud from the **Group details** pane when creating or editing existing groups. -A group is a collection of users. Users may belong to multiple groups. Members -of a group inherit any permissions applied to the group itself. + -Users can be added to a dbt Cloud group based on their group memberships in the -configured [Identity Provider](sso-overview) for the account. In this way, dbt -Cloud administrators can manage access to dbt Cloud resources via identity -management software like Microsoft Entra ID (formerly Azure AD), Okta, or GSuite. See _SSO Mappings_ below for -more information. +:::important -You can view the groups in your account or create new groups from the **Groups & Licenses** -page in your Account Settings.
+If a user is assigned licenses and permissions from multiple groups, the group that grants the most access will take precedence. You must assign a permission set to any groups created beyond the three defaults, or users assigned will not have access to features beyond their user profile. - +::: -### SSO mappings +#### SSO mappings -SSO Mappings connect Identity Provider (IdP) group membership to dbt Cloud group -membership. When a user logs into dbt Cloud via a supported identity provider, -their IdP group memberships are synced with dbt Cloud. Upon logging in -successfully, the user's group memberships (and therefore, permissions) are -adjusted accordingly within dbt Cloud automatically. +SSO Mappings connect an identity provider (IdP) group membership to a dbt Cloud group. When users log into dbt Cloud via a supported identity provider, their IdP group memberships sync with dbt Cloud. Upon logging in successfully, the user's group memberships (and permissions) will automatically adjust within dbt Cloud. :::tip Creating SSO Mappings -While dbt Cloud supports mapping multiple IdP groups to a single dbt Cloud -group, we recommend using a 1:1 mapping to make administration as simple as -possible. Consider using the same name for your dbt Cloud groups and your IdP -groups. +While dbt Cloud supports mapping multiple IdP groups to a single dbt Cloud group, we recommend using a 1:1 mapping to make administration as simple as possible. Use the same names for your dbt Cloud groups and your IdP groups. ::: +Create an SSO mapping in the group view: + +1. Open an existing group to edit or create a new group. +2. In the **SSO** portion of the group screen, enter the name of the SSO group exactly as it appears in the IdP. If the name is not the same, the users will not be properly placed into the group. +3. In the **Users** section, ensure the **Add all users by default** option is disabled. +4. Save the group configuration. New SSO users will be added to the group upon login, and existing users will be added to the group upon their next login. + + + +Refer to [role-based access control](#role-based-access-control) for more information about mapping SSO groups for user assignment to dbt Cloud groups. + +## Grant access + +dbt Cloud users have both a license (assigned to an individual user or by group membership) and permissions (by group membership only) that determine what actions they can take. Licenses are account-wide, and permissions provide more granular access or restrictions to specific features. + +### Licenses + +Every user in dbt Cloud will have a license assigned. Licenses consume "seats" which impact how your account is [billed](/docs/cloud/billing), depending on your [service plan](https://www.getdbt.com/pricing). + +There are three license types in dbt Cloud: + +- **Developer** — User can be granted _any_ permissions. +- **Read-Only** — User has read-only permissions applied to all dbt Cloud resources regardless of the role-based permissions that the user is assigned. +- **IT** — User has Security Admin and Billing Admin [permissions](/docs/cloud/manage-access/enterprise-permissions) applied, regardless of the group permissions assigned. + +Developer licenses will make up a majority of the users in your environment and have the highest impact on billing, so it's important to monitor how many you have at any given time. 
+ +For more information on these license types, see [Seats & Users](/docs/cloud/manage-access/seats-and-users) + +### Permissions + +Permissions determine what a developer-licensed user can do in your dbt Cloud account. By default, members of the `Owner` and `Member` groups have full access to all areas and features. When you want to restrict access to features, assign users to groups with stricter permission sets. Keep in mind that if a user belongs to multiple groups, the most permissive group will take precedence. + +The permissions available depends on whether you're on an [Enterprise](/docs/cloud/manage-access/enterprise-permissions) or [self-service Team](/docs/cloud/manage-access/self-service-permissions) plan. Developer accounts only have a single user, so permissions aren't applicable. + + + +Some permissions (those that don't grant full access, like admins) allow groups to be "assigned" to specific projects and environments only. Read about [environment-level permissions](/docs/cloud/manage-access/environment-permissions-setup) for more information on restricting environment access. + + + +## Role-based access control + +Role-based access control (RBAC) allows you to grant users access to features and functionality based on their group membership. With this method, you can grant users varying access levels to different projects and environments. You can take access and security to the next level by integrating dbt Cloud with a third-party identity provider (IdP) to grant users access when they authenticate with your SSO or OAuth service. + +There are a few things you need to know before you configure RBAC for SSO users: +- New SSO users join any groups with the **Add all new users by default** option enabled. By default, the `Everyone` and `Member` groups have this option enabled. Disable this option across all groups for the best RBAC experience. +- You must have the appropriate SSO groups configured in the group details SSO section. If the SSO group name does not match _exactly_, users will not be placed in the group correctly. + +- dbt Labs recommends that your dbt Cloud group names match the IdP group names. + +Let's say you have a new employee being onboarded into your organization using [Okta](/docs/cloud/manage-access/set-up-sso-okta) as the IdP and dbt Cloud groups with SSO mappings. In this scenario, users are working on `The Big Project` and a new analyst named `Euclid Ean` is joining the group. + +Check out the following example configurations for an idea of how you can implement RBAC for your organization (these examples assume you have already configured [SSO](/docs/cloud/manage-access/sso-overview)): + + -### Permission sets +You and your IdP team add `Euclid Ean` to your Okta environment and assign them to the `dbt Cloud` SSO app via a group called `The Big Project`. -Permission sets are predefined collections of granular permissions. Permission -sets combine low-level permission grants into high-level roles that can be -assigned to groups. Some examples of existing permission sets are: - - Account Admin - - Git Admin - - Job Admin - - Job Viewer - - ...and more + -For a full list of enterprise permission sets, see [Enterprise Permissions](/docs/cloud/manage-access/enterprise-permissions). -These permission sets are available for assignment to groups and control the ability -for users in these groups to take specific actions in the dbt Cloud application. +Configure the group attribute statements the `dbt Cloud` application in Okta. 
The group statements in the following example are set to the group name exactly (`The Big Project`), but yours will likely be a much broader configuration. Companies often use the same prefix across all dbt groups in their IdP. For example `DBT_GROUP_` -In the following example, the _dbt Cloud Owners_ group is configured with the -**Account Admin** permission set on _All Projects_ and the **Job Admin** permission -set on the _Internal Analytics_ project. + - + + -### Manual assignment +You and your dbt Cloud admin team configure the groups in your account's settings: +1. Navigate to the **Account settings** and click **Groups & Licenses** on the left-side menu. +2. Click **Create group** or select an existing group and click **Edit**. +3. Enter the group name in the **SSO** field. +4. Configure the **Access and permissions** fields to your needs. Select a [permission set](/docs/cloud/manage-access/enterprise-permissions), the project they can access, and [environment-level access](/docs/cloud/manage-access/environment-permissions). -dbt Cloud administrators can manually assign users to groups independently of -IdP attributes. If a dbt Cloud group is configured _without_ any -SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust -group membership automatically when users log into dbt Cloud via an identity -provider. This behavior may be desirable for teams that have connected an identity -provider, but have not yet configured SSO Mappings between dbt Cloud and the -IdP. + -If an SSO Mapping is added to an _unmanaged_ group, then it will become -_managed_, and dbt Cloud may add or remove users to the group automatically at -sign-in time based on the user's IdP-provided group membership information. +Euclid is limited to the `Analyst` role, the `Jaffle Shop` project, and the `Development`, `Staging`, and `General` environments of that project. Euclid has no access to the `Production` environment in their role. + + + + +Euclid takes the following steps to log in: + +1. Access the SSO URL or the dbt Cloud app in their Okta account. The URL can be found on the **Single sign-on** configuration page in the **Account settings**. + + + +2. Login with their Okta credentials. + + + +3. Since it's their first time logging in with SSO, Euclid Ean is presented with a message and no option to move forward until they check the email address associated with their Okta account. + + + +4. They now open their email and click the link to join dbt Labs, which completes the process. + + + +Euclid is now logged in to their account. They only have access to the `Jaffle Shop` pr, and the project selection option is removed from their UI entirely. + + + +They can now configure development credentials. The `Production` environment is visible, but it is `read-only`, and they have full access in the `Staging` environment. + + + + + + + +With RBAC configured, you now have granular control over user access to features across dbt Cloud. ## FAQs -- **When are IdP group memberships updated for SSO Mapped groups?**
- Group memberships are updated whenever a user logs into dbt Cloud via a supported SSO provider. If you've changed group memberships in your identity provider or dbt Cloud, ask your users to log back into dbt Cloud to synchronize these group memberships. -- **Can I set up SSO without RBAC?**
+ + +Group memberships are updated whenever a user logs into dbt Cloud via a supported SSO provider. If you've changed group memberships in your identity provider or dbt Cloud, ask your users to log back into dbt Cloud to synchronize these group memberships. + + + + + Yes, see the documentation on [Manual Assignment](#manual-assignment) above for more information on using SSO without RBAC. -- **Can I configure a user's License Type based on IdP Attributes?**
- Yes, see the docs on [managing license types](/docs/cloud/manage-access/seats-and-users#managing-license-types) for more information. -- **Why can't I edit a user's group membership?**
-Make sure you're not trying to edit your own user as this isn't allowed for security reasons. To edit the group membership of your own user, you'll need a different user to make those changes. +
+ + -- **How do I add or remove users**?
-Each dbt Cloud plan comes with a base number of Developer and Read-Only licenses. You can add or remove licenses by modifying the number of users in your account settings. - - If you're on an Enterprise plans and have the correct [permissions](/docs/cloud/manage-access/enterprise-permissions), you can add or remove developers by adjusting your developer user seat count in **Account settings** -> **Users**. +Yes, see the docs on [managing license types](/docs/cloud/manage-access/seats-and-users#managing-license-types) for more information. + +
+ + + +Don't try to edit your own user, as this isn't allowed for security reasons. You'll need a different user to make changes to your own user's group membership. + + + + + +Each dbt Cloud plan has a base number of Developer and Read-Only licenses. You can add or remove licenses by modifying the number of users in your account settings. + - If you're on an Enterprise plan and have the correct [permissions](/docs/cloud/manage-access/enterprise-permissions), you can add or remove developers by adjusting your developer user seat count in **Account settings** -> **Users**. - If you're on a Team plan and have the correct [permissions](/docs/cloud/manage-access/self-service-permissions), you can add or remove developers by making two changes: adjust your developer user seat count AND your developer billing seat count in **Account settings** -> **Users** and then in **Account settings** -> **Billing**. - Refer to [Users and licenses](/docs/cloud/manage-access/seats-and-users#licenses) for detailed steps. +For detailed steps, refer to [Users and licenses](/docs/cloud/manage-access/seats-and-users#licenses). + + \ No newline at end of file diff --git a/website/docs/docs/cloud/manage-access/audit-log.md b/website/docs/docs/cloud/manage-access/audit-log.md index 0abf54ff991..a7be86a7f99 100644 --- a/website/docs/docs/cloud/manage-access/audit-log.md +++ b/website/docs/docs/cloud/manage-access/audit-log.md @@ -9,12 +9,12 @@ pagination_prev: "docs/cloud/manage-access/about-user-access" To review actions performed by people in your organization, dbt provides logs of audited user and system events in real time. The audit log appears as events happen and includes details such as who performed the action, what the action was, and when it was performed. You can use these details to troubleshoot access issues, perform security audits, or analyze specific events. -You must be an **Account Admin** to access the audit log and this feature is only available on Enterprise plans. +You must be an **Account Admin** or an **Account Viewer** to access the audit log and this feature is only available on Enterprise plans. The dbt Cloud audit log stores all the events that occurred in your organization in real-time, including: - For events within 90 days, the dbt Cloud audit log has a selectable date range that lists events triggered. -- For events beyond 90 days, **Account Admins** can [export all events](#exporting-logs) by using **Export All**. +- For events beyond 90 days, **Account Admins** and **Account Viewers** can [export all events](#exporting-logs) by using **Export All**. ## Accessing the audit log @@ -32,7 +32,7 @@ On the audit log page, you will see a list of various events and their associate ### Event details -Click the event card to see the details about the activity that triggered the event. This view provides important details, including when it happened and what type of event was triggered. For example, if someone changes the settings for a job, you can use the event details to see which job was changed (type of event: `job_definition.Changed`), by whom (person who triggered the event: `actor`), and when (time it was triggered: `created_at_utc`). For types of events and their descriptions, see [Events in audit log](#events-in-audit-log). +Click the event card to see the details about the activity that triggered the event. This view provides important details, including when it happened and what type of event was triggered. 
For example, if someone changes the settings for a job, you can use the event details to see which job was changed (type of event: `job_definition.Changed`), by whom (person who triggered the event: `actor`), and when (time it was triggered: `created_at_utc`). For types of events and their descriptions, see [Events in audit log](#audit-log-events). The event details provide the key factors of an event: @@ -60,10 +60,9 @@ The audit log supports various events for different objects in dbt Cloud. You wi | Event Name | Event Type | Description | | -------------------------- | ---------------------------------------- | ------------------------------------------------------ | | Auth Provider Changed | auth_provider.Changed | Authentication provider settings changed | -| Credential Login Failed | auth.CredentialsLoginFailed | User login via username and password failed | | Credential Login Succeeded | auth.CredentialsLoginSucceeded | User successfully logged in with username and password | | SSO Login Failed | auth.SsoLoginFailed | User login via SSO failed | -| SSO Login Succeeded | auth.SsoLoginSucceeded | User successfully logged in via SSO +| SSO Login Succeeded | auth.SsoLoginSucceeded | User successfully logged in via SSO | ### Environment @@ -94,7 +93,7 @@ The audit log supports various events for different objects in dbt Cloud. You wi | ------------- | ----------------------------- | ------------------------------ | | Group Added | user_group.Added | New Group successfully created | | Group Changed | user_group.Changed | Group settings changed | -| Group Removed | user_group.Changed | Group successfully removed | +| Group Removed | user_group.Removed | Group successfully removed | ### User @@ -150,12 +149,65 @@ The audit log supports various events for different objects in dbt Cloud. 
You wi ### Credentials -| Event Name | Event Type | Description | -| -------------------------------- | ----------------------------- | -------------------------------- | +| Event Name | Event Type | Description | +| -------------------------------- | ----------------------------- | -----------------------| | Credentials Added to Project | credentials.Added | Project credentials added | | Credentials Changed in Project | credentials.Changed | Credentials changed in project | | Credentials Removed from Project | credentials.Removed | Credentials removed from project | + +### Git integration + +| Event Name | Event Type | Description | +| -------------------------------- | ----------------------------- | -----------------------| +| GitLab Application Changed | gitlab_application.changed | GitLab configuration in dbt Cloud changed | + +### Webhooks + +| Event Name | Event Type | Description | +| -------------------------------- | ----------------------------- | -----------------------| +| Webhook Subscriptions Added | webhook_subscription.added | New webhook configured in settings | +| Webhook Subscriptions Changed | webhook_subscription.changed | Existing webhook configuration altered | +| Webhook Subscriptions Removed | webhook_subscription.removed | Existing webhook deleted | + + +### Semantic Layer + +| Event Name | Event Type | Description | +| -------------------------------- | ----------------------------- | -----------------------| +| Semantic Layer Config Added | semantic_layer_config.added | Semantic Layer config added | +| Semantic Layer Config Changed | semantic_layer_config.changed | Semantic Layer config (not related to credentials) changed | +| Semantic Layer Config Removed | semantic_layer_config.removed | Semantic Layer config removed | +| Semantic Layer Credentials Added | semantic_layer_credentials.added | Semantic Layer credentials added | +| Semantic Layer Credentials Changed| semantic_layer_credentials.changed | Semantic Layer credentials changed. 
Does not trigger semantic_layer_config.changed| +| Semantic Layer Credentials Removed| semantic_layer_credentials.removed | Semantic Layer credentials removed | + +### Extended attributes + +| Event Name | Event Type | Description | +| -------------------------------- | ----------------------------- | -----------------------| +| Extended Attribute Added | extended_attributes.added | Extended attribute added to a project | +| Extended Attribute Changed | extended_attributes.changed | Extended attribute changed or removed | + + +### Account-scoped personal access token + +| Event Name | Event Type | Description | +| -------------------------------- | ----------------------------- | -----------------------| +| Account Scoped Personal Access Token Created | account_scoped_pat.created | An account-scoped PAT was created | +| Account Scoped Personal Access Token Deleted | account_scoped_pat.deleted | An account-scoped PAT was deleted | + +### IP restrictions + +| Event Name | Event Type | Description | +| -------------------------------- | ----------------------------- | -----------------------| +| IP Restrictions Toggled | ip_restrictions.toggled | IP restrictions feature enabled or disabled | +| IP Restrictions Rule Added | ip_restrictions.rule.added | IP restriction rule created | +| IP Restrictions Rule Changed | ip_restrictions.rule.changed | IP restriction rule edited | +| IP Restrictions Rule Removed | ip_restrictions.rule.removed | IP restriction rule deleted | + + + ## Searching the audit log You can search the audit log to find a specific event or actor, which is limited to the ones listed in [Events in audit log](#events-in-audit-log). The audit log successfully lists historical events spanning the last 90 days. You can search for an actor or event using the search bar, and then narrow your results using the time window. @@ -170,6 +222,6 @@ You can use the audit log to export all historical audit results for security, c - **For events within 90 days** — dbt Cloud will automatically display the 90-day selectable date range. Select **Export Selection** to download a CSV file of all the events that occurred in your organization within 90 days. -- **For events beyond 90 days** — Select **Export All**. The Account Admin will receive an email link to download a CSV file of all the events that occurred in your organization. +- **For events beyond 90 days** — Select **Export All**. The Account Admin or Account Viewer will receive an email link to download a CSV file of all the events that occurred in your organization. diff --git a/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md index f636be796d3..f814d58777a 100644 --- a/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md +++ b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md @@ -3,7 +3,7 @@ title: "Users and licenses" description: "Learn how dbt Cloud administrators can use licenses and seats to control access in a dbt Cloud account." id: "seats-and-users" sidebar: "Users and licenses" -pagination_next: "docs/cloud/manage-access/self-service-permissions" +pagination_next: "docs/cloud/manage-access/enterprise-permissions" pagination_prev: null --- @@ -55,7 +55,7 @@ If you're on an Enterprise plan and have the correct [permissions](/docs/cloud/m - To add a user, go to **Account Settings** and select **Users**. - Click the [**Invite Users**](/docs/cloud/manage-access/invite-users) button. 
- - For fine-grained permission configuration, refer to [Role based access control](/docs/cloud/manage-access/enterprise-permissions). + - For fine-grained permission configuration, refer to [Role based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control-).
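As an optional aside to the UI steps above: before inviting or removing developers, it can help to audit who currently holds a seat. This is a minimal sketch, assuming the v2 Administrative API's "List Users" endpoint (`/api/v2/accounts/{account_id}/users/`), a token with permission to read account users, the standard `data` response envelope, and the multi-tenant `cloud.getdbt.com` host — substitute your region's access URL and a real account ID.

```bash
#!/usr/bin/env bash
# Sketch: list the users on a dbt Cloud account before adjusting developer seat counts.
# Assumes DBT_CLOUD_API_TOKEN is exported and the account ID below is replaced with yours.
set -euo pipefail

DBT_CLOUD_HOST="${DBT_CLOUD_HOST:-cloud.getdbt.com}"  # use your region's access URL
ACCOUNT_ID="12345"                                    # hypothetical account ID

curl -sf "https://${DBT_CLOUD_HOST}/api/v2/accounts/${ACCOUNT_ID}/users/" \
  -H "Authorization: Token ${DBT_CLOUD_API_TOKEN}" \
  -H "Accept: application/json" \
  | jq '.data | length'   # count of users returned; inspect .data[] for per-user details
```

The UI remains the supported path for changing seat counts; the call above is only a quick way to check current usage.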
@@ -124,9 +124,7 @@ Great work! After completing these steps, your dbt Cloud user count and billing ## Managing license types -Licenses can be assigned manually, or automatically based on IdP configuration -(enterprise only). By default, new users in an account will be assigned a -Developer license. +Licenses can be assigned to users individually or through group membership. To assign a license via group membership, you can manually add a user to a group during the invitation process or assign them to a group after they’ve enrolled in dbt Cloud. Alternatively, with [SSO configuration](/docs/cloud/manage-access/sso-overview) and [role-based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control-) (Enterprise only), users can be automatically assigned to groups. By default, new users in an account are assigned a Developer license. ### Manual configuration @@ -142,16 +140,9 @@ change. -### Mapped configuration +### Mapped configuration -**Note:** This feature is only available on the Enterprise plan. - -If your account is connected to an Identity Provider (IdP) for [Single Sign -On](/docs/cloud/manage-access/sso-overview), you can automatically map IdP user -groups to specific license types in dbt Cloud. To configure license mappings, -navigate to the Account Settings > Team > License Mappings page. From -here, you can create or edit SSO mappings for both Read-Only and Developer -license types. +If your account is connected to an Identity Provider (IdP) for [Single Sign On](/docs/cloud/manage-access/sso-overview), you can automatically map IdP user groups to specific groups in dbt Cloud and assign license types to those groups. To configure license mappings, navigate to the **Account Settings** > **Groups & Licenses** > **License Mappings** page. From here, you can create or edit SSO mappings for both Read-Only and Developer license types. By default, all new members of a dbt Cloud account will be assigned a Developer license. To assign Read-Only licenses to certain groups of users, create a new diff --git a/website/docs/docs/cloud/manage-access/enterprise-permissions.md b/website/docs/docs/cloud/manage-access/enterprise-permissions.md index a1f6d795c23..5a56900d529 100644 --- a/website/docs/docs/cloud/manage-access/enterprise-permissions.md +++ b/website/docs/docs/cloud/manage-access/enterprise-permissions.md @@ -22,22 +22,14 @@ The following roles and permission sets are available for assignment in dbt Clou :::tip Licenses or Permission sets -The user's [license](/docs/cloud/manage-access/seats-and-users) type always overrides their assigned permission set. This means that even if a user belongs to a dbt Cloud group with 'Account Admin' permissions, having a 'Read-Only' license would still prevent them from performing administrative actions on the account. +The user's [license](/docs/cloud/manage-access/about-user-access) type always overrides their assigned permission set. This means that even if a user belongs to a dbt Cloud group with 'Account Admin' permissions, having a 'Read-Only' license would still prevent them from performing administrative actions on the account. ::: -## How to set up RBAC Groups in dbt Cloud +## Additional resources -Role-Based Access Control (RBAC) is helpful for automatically assigning permissions to dbt admins based on their SSO provider group associations. RBAC does not apply to [model groups](/docs/collaborate/govern/model-access#groups). 
+- [Grant users access](/docs/cloud/manage-access/about-user-access#grant-access) +- [Role-based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control-) +- [Environment-level permissions](/docs/cloud/manage-access/environment-permissions) -1. Click the gear icon to the top right and select **Account Settings**. Click **Groups & Licenses** - - - -2. Select an existing group or create a new group to add RBAC. Name the group (this can be any name you like, but it's recommended to keep it consistent with the SSO groups). If you have configured SSO with SAML 2.0, you may have to use the GroupID instead of the name of the group. -3. Configure the SSO provider groups you want to add RBAC by clicking **Add** in the **SSO** section. These fields are case-sensitive and must match the source group formatting. -4. Configure the permissions for users within those groups by clicking **Add** in the **Access** section of the window. - - -5. When you've completed your configurations, click **Save**. Users will begin to populate the group automatically once they have signed in to dbt Cloud with their SSO credentials. diff --git a/website/docs/docs/cloud/manage-access/environment-permissions.md b/website/docs/docs/cloud/manage-access/environment-permissions.md index 44cf2dc9a64..b99da64609c 100644 --- a/website/docs/docs/cloud/manage-access/environment-permissions.md +++ b/website/docs/docs/cloud/manage-access/environment-permissions.md @@ -77,4 +77,4 @@ If the user has the same roles across projects, you can apply environment access ## Related docs --[Environment-level permissions setup](/docs/cloud/manage-access/environment-permissions-setup) +- [Environment-level permissions setup](/docs/cloud/manage-access/environment-permissions-setup) diff --git a/website/docs/docs/cloud/manage-access/external-oauth.md b/website/docs/docs/cloud/manage-access/external-oauth.md index 112b6aa1d92..deb23f36f09 100644 --- a/website/docs/docs/cloud/manage-access/external-oauth.md +++ b/website/docs/docs/cloud/manage-access/external-oauth.md @@ -1,29 +1,26 @@ --- -title: "Set up external Oauth" +title: "Set up external OAuth" id: external-oauth -description: "Configuration instructions for dbt Cloud and external Oauth connections" -sidebar_label: "Set up external Oauth" -unlisted: true +description: "Configuration instructions for dbt Cloud and external OAuth connections" +sidebar_label: "Set up external OAuth" pagination_next: null pagination_prev: null --- -# Set up external Oauth +# Set up external OAuth -:::note Beta feature +:::note -External OAuth for authentication is available in a limited beta. If you are interested in joining the beta, please contact your account manager. - -This feature is currently only available for the Okta and Entra ID identity providers and Snowflake connections. Only available to Enterprise accounts. +This feature is currently only available for the Okta and Entra ID identity providers and [Snowflake connections](/docs/cloud/connect-data-platform/connect-snowflake). ::: -dbt Cloud Enterprise supports [OAuth authentication](https://docs.snowflake.net/manuals/user-guide/oauth-intro.html) with external providers. When External OAuth is enabled, users can authorize their Development credentials using single sign-on (SSO) via the identity provider (IdP). This grants users authorization to access multiple applications, including dbt Cloud, without their credentials being shared with the service. 
Not only does this make the process of authenticating for development environments easier on the user, it provides an additional layer of security to your dbt Cloud account. +dbt Cloud Enterprise supports [external OAuth authentication](https://docs.snowflake.com/en/user-guide/oauth-ext-overview) with external providers. When External OAuth is enabled, users can authorize their Development credentials using single sign-on (SSO) via the identity provider (IdP). This grants users authorization to access multiple applications, including dbt Cloud, without their credentials being shared with the service. Not only does this make the process of authenticating for development environments easier on the user, it provides an additional layer of security to your dbt Cloud account. ## Getting started -The process of setting up external Oauth will require a little bit of back-and-forth between your dbt Cloud, IdP, and Snowflake accounts, and having them open in multiple browser tabs will help speed up the configuration process: +The process of setting up external OAuth will require a little bit of back-and-forth between your dbt Cloud, IdP, and Snowflake accounts, and having them open in multiple browser tabs will help speed up the configuration process: - **dbt Cloud:** You’ll primarily be working in the **Account Settings** —> **Integrations** page. You will need [proper permission](/docs/cloud/manage-access/enterprise-permissions) to set up the integration and create the connections. - **Snowflake:** Open a worksheet in an account that has permissions to [create a security integration](https://docs.snowflake.com/en/sql-reference/sql/create-security-integration). @@ -34,7 +31,7 @@ If the admins that handle these products are all different people, it’s better ### Snowflake commands -The following is a template for creating the Oauth configurations in the Snowflake environment: +The following is a template for creating the OAuth configurations in the Snowflake environment: ```sql @@ -53,41 +50,45 @@ external_oauth_any_role_mode = 'ENABLE' The `external_oauth_token_user_mapping_claim` and `external_oauth_snowflake_user_mapping_attribute` can be modified based on the your organizations needs. These values point to the claim in the users’ token. In the example, Snowflake will look up the Snowflake user whose `email` matches the value in the `sub` claim. -**Note:** The Snowflake default roles ACCOUNTADMIN, ORGADMIN, or SECURITYADMIN, are blocked from external Oauth by default and they will likely fail to authenticate. See the [Snowflake documentation](https://docs.snowflake.com/en/sql-reference/sql/create-security-integration-oauth-external) for more information. +**Note:** The Snowflake default roles ACCOUNTADMIN, ORGADMIN, or SECURITYADMIN, are blocked from external OAuth by default and they will likely fail to authenticate. See the [Snowflake documentation](https://docs.snowflake.com/en/sql-reference/sql/create-security-integration-oauth-external) for more information. + +## Identity provider configuration -## Set up with Okta +Select a supported identity provider (IdP) for instructions on configuring external OAuth in their environment and completing the integration in dbt Cloud. + + ### 1. Initialize the dbt Cloud settings -1. In your dbt Cloud account, navigate to **Account settings** —> **Integrations**. +1. In your dbt Cloud account, navigate to **Account settings** —> **Integrations**. 2. Scroll down to **Custom integrations** and click **Add integrations** -3. Leave this window open. 
You can set the **Integration type** to Okta and make a note of the **Redirect URI** at the bottom of the page. Copy this to your clipboard for use in the next steps. +3. Leave this window open. You can set the **Integration type** to Okta and note the **Redirect URI** at the bottom of the page. Copy this to your clipboard for use in the next steps. ### 2. Create the Okta app -1. From the Okta dashboard, expand the **Applications** section and click **Applications.** Click the **Create app integration** button. +1. Expand the **Applications** section from the Okta dashboard and click **Applications.** Click the **Create app integration** button. 2. Select **OIDC** as the sign-in method and **Web applications** as the application type. Click **Next**. -3. Give the application an appropriate name, something like “External Oauth app for dbt Cloud” that will make it easily identifiable. +3. Give the application an appropriate name, something like “External OAuth app for dbt Cloud,” that will make it easily identifiable. 4. In the **Grant type** section, enable the **Refresh token** option. -5. Scroll down to the **Sign-in redirect URIs** option. Here, you’ll need to paste the redirect URI you gathered from dbt Cloud in step 1.3. +5. Scroll down to the **Sign-in redirect URIs** option. You’ll need to paste the redirect URI you gathered from dbt Cloud in step 1.3. - + -6. Save the app configuration. You’ll come back to it, but for now move on to the next steps. +6. Save the app configuration. You’ll come back to it, but move on to the next steps for now. ### 3. Create the Okta API -1. From the Okta sidebar menu, expand the **Security** section and clicl **API**. -2. On the API screen, click **Add authorization server**. Give the authorizations server a name (a nickname for your Snowflake account would be appropriate). For the **Audience** field, copy and paste your Snowflake login URL (for example, https://abdc-ef1234.snowflakecomputing.com). Give the server an appropriate description and click **Save**. +1. Expand the **Security** section and click **API** from the Okta sidebar menu. +2. On the API screen, click **Add authorization server**. Give the authorization server a name (a nickname for your Snowflake account would be appropriate). For the **Audience** field, copy and paste your Snowflake login URL (for example, https://abdc-ef1234.snowflakecomputing.com). Give the server an appropriate description and click **Save**. -3. On the authorization server config screen, open the **Metadata URI** in a new tab. You’ll need information from this screen in later steps. +3. On the authorization server config screen, open the **Metadata URI** in a new tab. You’ll need information from this screen in later steps. @@ -97,7 +98,7 @@ The `external_oauth_token_user_mapping_claim` and `external_oauth_snowflake_u -5. Open the **Access policies** tab and click **Add policy**. Give the policy a **Name** and **Description** and set **Assign to** as **The following clients**. Start typing the name of the app you created in step 2.3 and you’ll see it autofill. Select the app and click **Create Policy**. +5. Open the **Access policies** tab and click **Add policy**. Give the policy a **Name** and **Description** and set **Assign to** as **The following clients**. Start typing the name of the app you created in step 2.3, and you’ll see it autofill. Select the app and click **Create Policy**. @@ -105,13 +106,13 @@ The `external_oauth_token_user_mapping_claim` and `external_oauth_snowflake_u -7. 
Give the rule a descriptive name and scroll down to **token lifetimes**. Configure the **Access token lifetime is**, **Refresh token lifetime is**, and **but will expire if not used every** settings according to your organizational policies. We recommend the defaults of 1 hour and 90 days. Stricter rules increases the odds of your users having to re-authenticate. +7. Give the rule a descriptive name and scroll down to **token lifetimes**. Configure the **Access token lifetime is**, **Refresh token lifetime is**, and **but will expire if not used every** settings according to your organizational policies. We recommend the defaults of 1 hour and 90 days. Stricter rules increase the odds of your users having to re-authenticate. 8. Navigate back to the **Settings** tab and leave it open in your browser. You’ll need some of the information in later steps. -### 4. Create the Oauth settings in Snowflake +### 4. Create the OAuth settings in Snowflake 1. Open up a Snowflake worksheet and copy/paste the following: @@ -130,9 +131,9 @@ external_oauth_any_role_mode = 'ENABLE' ``` -2. Change `your_integration_name` to something appropriately descriptive. For example, `dev_OktaAccountNumber_okta`. Copy the `external_oauth_issuer` and `external_oauth_jws_keys_url` from the metadate URI in step 3.3. Use the same Snowflake URL that you entered in step 3.2 as the `external_oauth_audience_list`. +2. Change `your_integration_name` to something appropriately descriptive. For example, `dev_OktaAccountNumber_okta`. Copy the `external_oauth_issuer` and `external_oauth_jws_keys_url` from the metadata URI in step 3.3. Use the same Snowflake URL you entered in step 3.2 as the `external_oauth_audience_list`. -Adjust the other settings as needed to meet your organizations configurations in Okta and Snowflake. +Adjust the other settings as needed to meet your organization's configurations in Okta and Snowflake. @@ -140,39 +141,47 @@ Adjust the other settings as needed to meet your organizations configurations in ### 5. Configuring the integration in dbt Cloud -1. Navigate back to the dbt Cloud **Account settings** —> **Integrations** page you were on at the beginning. It’s time to start filling out all of the fields. - 1. `Integration name`: Give the integration a descriptive name that includes identifying information about the Okta environment so future users won’t have to guess where it belongs. - 2. `Client ID` and `Client secrets`: Retrieve these from the Okta application page. - - 3. Authorize URL and Token URL: Found in the metadata URI. - +1. Navigate back to the dbt Cloud **Account settings** —> **Integrations** page you were on at the beginning. It’s time to start filling out all of the fields. + 1. `Integration name`: Give the integration a descriptive name that includes identifying information about the Okta environment so future users won’t have to guess where it belongs. + 2. `Client ID` and `Client secrets`: Retrieve these from the Okta application page. + + 3. Authorize URL and Token URL: Found in the metadata URI. + 2. **Save** the configuration + ### 6. Create a new connection in dbt Cloud -1. Navigate the **Account settings** and click **Connections** from the menu. Click **Add connection**. -2. Configure the `Account`, `Database`, and `Warehouse` as you normally would and for the `Oauth method` select the external Oauth you just created. - +1. Navigate the **Account settings** and click **Connections** from the menu. Click **Add connection**. +2. 
Configure the `Account`, `Database`, and `Warehouse` as you normally would, and for the `OAuth method`, select the external OAuth you just created. + + + + -3. Scroll down to the **External Oauth** configurations box and select the config from the list. +3. Scroll down to the **External OAuth** configurations box and select the config from the list. - -4. **Save** the connection and you have now configured External Oauth with Okta and Snowflake! + -## Set up with Entra ID + +4. **Save** the connection, and you have now configured External OAuth with Okta and Snowflake! + + + + ### 1. Initialize the dbt Cloud settings -1. In your dbt Cloud account, navigate to **Account settings** —> **Integrations**. +1. In your dbt Cloud account, navigate to **Account settings** —> **Integrations**. 2. Scroll down to **Custom integrations** and click **Add integrations**. -3. Leave this window open. You can set the **Integration type** to Okta and make a note of the **Redirect URI** at the bottom of the page. Copy this to your clipboard for use in the next steps. +3. Leave this window open. You can set the **Integration type** to Entra ID and note the **Redirect URI** at the bottom of the page. Copy this to your clipboard for use in the next steps. ### Entra ID -You’ll create two different `apps` in the Azure portal &mdash: A resource server and a client app. +You’ll create two apps in the Azure portal: A resource server and a client app. :::important @@ -187,68 +196,74 @@ In your Azure portal, open the **Entra ID** and click **App registrations** from ### 1. Create a resource server 1. From the app registrations screen, click **New registration**. - 1. Give the app a name. - 2. Ensure **Supported account types** are set to “Accounts in this organizational directory only (`Org name` - Single Tenant).” - 3. Click **Register** and you will be taken to the apps overview. + 1. Give the app a name. + 2. Ensure **Supported account types** are set to “Accounts in this organizational directory only (`Org name` - Single Tenant).” + 3. Click **Register**to see the application’s overview. 2. From the app overview page, click **Expose an API** from the left menu. -3. Click **Add** next to **Application ID URI**. The field will automatically populate. Click **Save**. -4. Record the `value` field as it will be used in a future step. *This is only displayed once. Be sure to record it immediately. It will be hidden when you leave the page and come back.* +3. Click **Add** next to **Application ID URI**. The field will automatically populate. Click **Save**. +4. Record the `value` field for use in a future step. _This is only displayed once. Be sure to record it immediately. Microsoft hides the field when you leave the page and come back._ 5. From the same screen, click **Add scope**. - 1. Give the scope a name. - 2. Set “Who can consent?” to **Admins and users**. - 3. Set **Admin consent display name** session:role-any and give it a description. - 4. Ensure **State** is set to **Enabled**. - 5. Click **Add scope**. + 1. Give the scope a name. + 2. Set “Who can consent?” to **Admins and users**. + 3. Set **Admin consent display name** session:role-any and give it a description. + 4. Ensure **State** is set to **Enabled**. + 5. Click **Add scope**. ### 2. Create a client app 1. From the **App registration page**, click **New registration**. - 1. Give the app a name that uniquely identifies it as the client app. - 2. 
Ensure **Supported account types** are set to “Accounts in this organizational directory only (`Org name` - Single Tenant).” - 3. Set the **Redirect URI** to **Web** and copy/paste the **Redirect URI** from dbt Cloud into the field. - 4. Click **Register**. + 1. Give the app a name that uniquely identifies it as the client app. + 2. Ensure **Supported account types** are set to “Accounts in this organizational directory only (`Org name` - Single Tenant).” + 3. Set the **Redirect URI** to **Web** and copy/paste the **Redirect URI** from dbt Cloud into the field. + 4. Click **Register**. 2. From the app overview page, click **API permissions** from the left menu, and click **Add permission**. 3. From the pop-out screen, click **APIs my organization uses**, search for the resource server name from the previous steps, and click it. 4. Ensure the box for the **Permissions** `session:role-any` is enabled and click **Add permissions**. 5. Click **Grant admin consent** and from the popup modal click **Yes**. -6. From the left menu, click **Certificates and secrets** and cllick **New client secret**. Name the secret, set an expiration, and click **Add**. -**Note**: Microsoft does not allow “forever” as an expiration. The maximum time is two years. It’s essential to document the expiration date so that the secret can be refreshed before the expiration or user authorization will fail. -7. Record the `value` for use in a future step and record it immediately. -**Note**: This value will not be displayed again once you navigate away from this screen. +6. From the left menu, click **Certificates and secrets** and click **New client secret**. Name the secret, set an expiration, and click **Add**. +**Note**: Microsoft does not allow “forever” as an expiration date. The maximum time is two years. Documenting the expiration date so you can refresh the secret before the expiration or user authorization fails is essential. +7. Record the `value` for use in a future step and record it immediately. +**Note**: Entra ID will not display this value again once you navigate away from this screen. ### 3. Snowflake configuration -You'll be switching between the Entra ID site and Snowflake. Keep your Entra ID account open for this process. +You'll be switching between the Entra ID site and Snowflake. Keep your Entra ID account open for this process. Copy and paste the following as a template in a Snowflake worksheet: ```sql + create or replace security integration - type = external_oauth - enabled = true - external_oauth_type = azure - external_oauth_issuer = '' - external_oauth_jws_keys_url = '' - external_oauth_audience_list = ('') - external_oauth_token_user_mapping_claim = 'upn' - external_oauth_any_role_mode = 'ENABLE' - external_oauth_snowflake_user_mapping_attribute = 'login_name'; + type = external_oauth + enabled = true + external_oauth_type = azure + external_oauth_issuer = '' + external_oauth_jws_keys_url = '' + external_oauth_audience_list = ('') + external_oauth_token_user_mapping_claim = 'upn' + external_oauth_any_role_mode = 'ENABLE' + external_oauth_snowflake_user_mapping_attribute = 'login_name'; + ``` + On the Entra ID site: -1. From the Client ID app in Entra ID, click **Endpoints** and open the **Federation metadata document** in a new tab. - - The **entity ID** on this page maps to the `external_oauth_issuer` field in the Snowflake config. +1. From the Client ID +app in Entra ID, click **Endpoints** and open the **Federation metadata document** in a new tab. 
+ - The **entity ID** on this page maps to the `external_oauth_issuer` field in the Snowflake config. 2. Back on the list of endpoints, open the **OpenID Connect metadata document** in a new tab. - - The **jwks_uri** field maps to the `external_oauth_jws_keys_url` field in Snowflake. + - The **jwks_uri** field maps to the `external_oauth_jws_keys_url` field in Snowflake. 3. Navigate to the resource server in previous steps. - - The **Application ID URI** maps to teh `external_oauth_audience_list` field in Snowflake. -4. Run the configurations. Be sure the admin who created the Microsoft apps is also a user in Snowflake, or the configuration will fail. + - The **Application ID URI** maps to the `external_oauth_audience_list` field in Snowflake. +4. Run the configurations. Be sure the admin who created the Microsoft apps is also a user in Snowflake, or the configuration will fail. ### 4. Configuring the integration in dbt Cloud -1. Navigate back to the dbt Cloud **Account settings** —> **Integrations** page you were on at the beginning. It’s time to start filling out all of the fields. There will be some back-and-forth between the Entra ID account and dbt Cloud. -2. `Integration name`: Give the integration a descriptive name that includes identifying information about the Entra ID environment so future users won’t have to guess where it belongs. -3. `Client secrets`: These are found in the Client ID from the **Certificates and secrets** page. `Value` is the `Client secret` . Note that it only appears when created; if you return later, it will be hidden, and you must recreate the secret. +1. Navigate back to the dbt Cloud **Account settings** —> **Integrations** page you were on at the beginning. It’s time to start filling out all of the fields. There will be some back-and-forth between the Entra ID account and dbt Cloud. +2. `Integration name`: Give the integration a descriptive name that includes identifying information about the Entra ID environment so future users won’t have to guess where it belongs. +3. `Client secrets`: Found in the Client ID from the **Certificates and secrets** page. `Value` is the `Client secret`. Note that it only appears when created; _Microsoft hides the secret if you return later, and you must recreate it._ 4. `Client ID`: Copy the’ Application (client) ID’ on the overview page for the client ID app. -5. `Authorization URL` and `Token URL`: From the client ID app, open the `Endpoints` tab. The `Oauth 2.0 authorization endpoint (v2)` and `Oauth 2.0 token endpoint (v2)` fields map to these. *You must use v2 of the `Oauth 2.0 authorization endpoint`. Do not use V1.* You can use either version of the `Oauth 2.0 token endpoint`. +5. `Authorization URL` and `Token URL`: From the client ID app, open the `Endpoints` tab. These URLs map to the `OAuth 2.0 authorization endpoint (v2)` and `OAuth 2.0 token endpoint (v2)` fields. *You must use v2 of the `OAuth 2.0 authorization endpoint`. Do not use V1.* You can use either version of the `OAuth 2.0 token endpoint`. 6. `Application ID URI`: Copy the `Application ID URI` field from the resource server’s Overview screen. + + diff --git a/website/docs/docs/cloud/manage-access/invite-users.md b/website/docs/docs/cloud/manage-access/invite-users.md index c82e15fd48f..0922b4dc991 100644 --- a/website/docs/docs/cloud/manage-access/invite-users.md +++ b/website/docs/docs/cloud/manage-access/invite-users.md @@ -17,19 +17,16 @@ You must have proper permissions to invite new users: ## Invite new users -1. 
In your dbt Cloud account, select the gear menu in the upper right corner and then select **Account Settings**. -2. From the left sidebar, select **Users**. - - - -3. Click on **Invite Users**. +1. In your dbt Cloud account, select your account name in the bottom left corner. Then select **Account settings**. +2. Under **Settings**, select **Users**. +3. Click on **Invite users**. -4. In the **Email Addresses** field, enter the email addresses of the users you would like to invite separated by comma, semicolon, or a new line. +4. In the **Email Addresses** field, enter the email addresses of the users you want to invite separated by a comma, semicolon, or a new line. 5. Select the license type for the batch of users from the **License** dropdown. -6. Select the group(s) you would like the invitees to belong to. -7. Click **Send Invitations**. +6. Select the group(s) you want the invitees to belong to. +7. Click **Send invitations**. - If the list of invitees exceeds the number of licenses your account has available, you will receive a warning when you click **Send Invitations** and the invitations will not be sent. diff --git a/website/docs/docs/cloud/manage-access/licenses-and-groups.md b/website/docs/docs/cloud/manage-access/licenses-and-groups.md deleted file mode 100644 index b91af80f9b3..00000000000 --- a/website/docs/docs/cloud/manage-access/licenses-and-groups.md +++ /dev/null @@ -1,145 +0,0 @@ ---- -title: "Licenses and groups" -id: "licenses-and-groups" ---- - -## Overview - -dbt Cloud administrators can use dbt Cloud's permissioning model to control -user-level access in a dbt Cloud account. This access control comes in two flavors: -License-based and Role-based. - -- **License-based Access Controls:** User are configured with account-wide - license types. These licenses control the specific parts of the dbt Cloud application - that a given user can access. -- **Role-based Access Control (RBAC):** Users are assigned to _groups_ that have - specific permissions on specific projects or the entire account. A user may be - a member of multiple groups, and those groups may have permissions on multiple - projects. - -## License-based access control - -Each user on an account is assigned a license type when the user is first -invited to a given account. This license type may change over time, but a -user can only have one type of license at any given time. - -A user's license type controls the features in dbt Cloud that the user is able -to access. dbt Cloud's three license types are: - - **Read-Only** - - **Developer** - - **IT** - -For more information on these license types, see [Seats & Users](/docs/cloud/manage-access/seats-and-users). -At a high-level, Developers may be granted _any_ permissions, whereas Read-Only -users will have read-only permissions applied to all dbt Cloud resources -regardless of the role-based permissions that the user is assigned. IT users will have Security Admin and Billing Admin permissions applied regardless of the role-based permissions that the user is assigned. - -## Role-based access control - -:::info dbt Cloud Enterprise - -Role-based access control is a feature of the dbt Cloud Enterprise plan - -::: - -Role-based access control allows for fine-grained permissioning in the dbt Cloud -application. With role-based access control, users can be assigned varying -permissions to different projects within a dbt Cloud account. 
For teams on the -Enterprise tier, role-based permissions can be generated dynamically from -configurations in an [Identity Provider](sso-overview). - -Role-based permissions are applied to _groups_ and pertain to _projects_. The -assignable permissions themselves are granted via _permission sets_. - - -### Groups - -A group is a collection of users. Users may belong to multiple groups. Members -of a group inherit any permissions applied to the group itself. - -Users can be added to a dbt Cloud group based on their group memberships in the -configured [Identity Provider](sso-overview) for the account. In this way, dbt -Cloud administrators can manage access to dbt Cloud resources via identity -management software like Microsoft Entra ID (formerly Azure AD), Okta, or GSuite. See _SSO Mappings_ below for -more information. - -You can view the groups in your account or create new groups from the **Team > Groups** -page in your Account Settings. - - - - -### SSO Mappings - -SSO Mappings connect Identity Provider (IdP) group membership to dbt Cloud group -membership. When a user logs into dbt Cloud via a supported identity provider, -their IdP group memberships are synced with dbt Cloud. Upon logging in -successfully, the user's group memberships (and therefore, permissions) are -adjusted accordingly within dbt Cloud automatically. - -:::tip Creating SSO Mappings - -While dbt Cloud supports mapping multiple IdP groups to a single dbt Cloud -group, we recommend using a 1:1 mapping to make administration as simple as -possible. Consider using the same name for your dbt Cloud groups and your IdP -groups. - -::: - - -### Permission Sets - -Permission sets are predefined collections of granular permissions. Permission -sets combine low-level permission grants into high-level roles that can be -assigned to groups. Some examples of existing permission sets are: - - Account Admin - - Git Admin - - Job Admin - - Job Viewer - - ...and more - -For a full list of enterprise permission sets, see [Enterprise Permissions](/docs/cloud/manage-access/enterprise-permissions). -These permission sets are available for assignment to groups and control the ability -for users in these groups to take specific actions in the dbt Cloud application. - -In the following example, the _dbt Cloud Owners_ group is configured with the -**Account Admin** permission set on _All Projects_ and the **Job Admin** permission -set on the _Internal Analytics_ project. - - - - -### Manual assignment - -dbt Cloud administrators can manually assign users to groups independently of -IdP attributes. If a dbt Cloud group is configured _without_ any -SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust -group membership automatically when users log into dbt Cloud via an identity -provider. This behavior may be desirable for teams that have connected an identity -provider, but have not yet configured SSO Mappings between dbt Cloud and the -IdP. - -If an SSO Mapping is added to an _unmanaged_ group, then it will become -_managed_, and dbt Cloud may add or remove users to the group automatically at -sign-in time based on the user's IdP-provided group membership information. - - -## FAQs -- **When are IdP group memberships updated for SSO Mapped groups?** Group memberships - are updated every time a user logs into dbt Cloud via a supported SSO provider. If - you've changed group memberships in your identity provider or dbt Cloud, ask your - users to log back into dbt Cloud for these group memberships to be synchronized. 
- -- **Can I set up SSO without RBAC?** Yes, see the documentation on - [Manual Assignment](#manual-assignment) above for more information on using - SSO without RBAC. - -- **Can I configure a user's License Type based on IdP Attributes?** Yes, see - the docs on [managing license types](/docs/cloud/manage-access/seats-and-users#managing-license-types) - for more information. diff --git a/website/docs/docs/cloud/manage-access/mfa.md b/website/docs/docs/cloud/manage-access/mfa.md index a06251e6468..bcddc04f072 100644 --- a/website/docs/docs/cloud/manage-access/mfa.md +++ b/website/docs/docs/cloud/manage-access/mfa.md @@ -7,6 +7,13 @@ sidebar: null # Multi-factor authentication +:::important + + +dbt Cloud enforces multi-factor authentication (MFA) for all users with username and password credentials. If MFA is not set up, you will see a notification bar prompting you to configure one of the supported methods when you log in. If you do not, you will have to configure MFA upon subsequent logins, or you will be unable to access dbt Cloud. + +::: + dbt Cloud provides multiple options for multi-factor authentication (MFA). MFA provides an additional layer of security to username and password logins for Developer and Team plan accounts. The available MFA methods are: - SMS verification code (US-based phone numbers only) diff --git a/website/docs/docs/cloud/manage-access/self-service-permissions.md b/website/docs/docs/cloud/manage-access/self-service-permissions.md index 24e1283b126..a5bdba825c2 100644 --- a/website/docs/docs/cloud/manage-access/self-service-permissions.md +++ b/website/docs/docs/cloud/manage-access/self-service-permissions.md @@ -1,42 +1,84 @@ --- -title: "Self-service permissions" -description: "Learn how dbt Cloud administrators can use self-service permissions to control access in a dbt Cloud account." +title: "Self-service Team account permissions" +description: "Learn how dbt Cloud administrators can use self-service permissions to control access in a dbt Cloud Team account." +sidebar_label: "Team permissions" id: "self-service-permissions" --- -import Permissions from '/snippets/_self-service-permissions-table.md'; +Self-service Team accounts are a quick and easy way to get dbt Cloud up and running for a small team. For teams looking to scale and access advanced features like SSO, group management, and support for larger user bases, upgrading to an [Enterprise](/docs/cloud/manage-access/enterprise-permissions) account unlocks these capabilities. +If you're interested in upgrading, contact [dbt Labs today](https://www.getdbt.com/contact). - +## Groups and permissions -## Read-Only vs. Developer License Types +Groups determine a user's permissions, and there are three groups available for Team plan dbt Cloud accounts: Owner, Member, and Everyone. The first Owner user is the person who created the dbt Cloud account. -Users configured with Read-Only license types will experience a restricted set of permissions in dbt Cloud. If a user is associated with a _Member_ permission set and a Read-Only seat license, then they will only have access to what a Read-Only seat allows. See [Seats and Users](/docs/cloud/manage-access/seats-and-users) for more information on the impact of licenses on these permissions. +New users are added to the Member and Everyone groups when they onboard, but this can be changed when the invitation is created. These groups only affect users with a [Developer license](#licenses) assigned. 
-## Owner and Member Groups in dbt Cloud Enterprise +The group access permissions are as follows: -By default, new users are added to the Member and Owner groups when they onboard to a new dbt Cloud account. Member and Owner groups are included with every new dbt Cloud account because they provide access for administrators to add users and groups, and to apply permission sets. +- **Owner** — Full access to account features. +- **Member** — Robust access to the account with restrictions on features that can alter billing or security. +- **Everyone** — A catch-all group for all users in the account. This group does not have any permission assignments beyond the user's profile. Users must be assigned to either the Member or Owner group to work in dbt Cloud. -You will need owner and member groups to help with account onboarding, but these groups can create confusion when initially setting up SSO and RBAC for dbt Cloud Enterprise accounts as described in the [Enterprise Permissions](enterprise-permissions) guide. Owner and Member groups are **account level** groups, so their permissions override any project-level permissions you wish to apply. +## Licenses -After onboarding administrative users and configuring RBAC/SSO groups, we recommend the following steps for onboarding users to a dbt Cloud Enterprise account. +You assign licenses to every user onboarded into dbt Cloud. You only assign Developer-licensed users to the Owner and Member groups. The groups have no impact on Read-only or IT-licensed users. +There are three license types: -### Prerequisites +- **Developer** — The default license. Developer licenses don't restrict access to any features, so users with this license should be assigned to either the Owner or Member group. You're allotted up to 8 developer licenses per account. +- **Read-Only** — Read-only access to your project, including environments and dbt Explorer. Doesn't have access to account settings at all. Functions the same regardless of group assignments. You're allotted up to 5 read-only licenses per account. +- **IT** — Partial access to the account settings, including users, integrations, billing, and API settings. Cannot create or edit connections or access the project at all. Functions the same regardless of group assignments. You're allocated 1 seat per account. -You need to create an Account Admins group before removing any other groups. +See [Seats and Users](/docs/cloud/manage-access/seats-and-users) for more information on the impact of licenses on these permissions. -1. Create an Account Admins group. -2. Assign at least one user to the Account Admins group. The assigned user can manage future group, SSO mapping, and user or group assignment. +## Table of groups, licenses, and permissions -### Remove the Owner and Member groups +Key: -Follow these steps for both Owner and Member groups: +* (W)rite — Create new or modify existing. Includes `send`, `create`, `delete`, `allocate`, `modify`, and `read`. +* (R)ead — Can view but cannot create or change any fields. +* No value — No access to the feature. + +Permissions: + +* [Account-level permissions](#account-permissions-for-account-roles) — Permissions related to management of the dbt Cloud account. For example, billing and account settings. +* [Project-level permissions](#project-permissions-for-account-roles) — Permissions related to the projects in dbt Cloud. For example, Explorer and the IDE. 
+ +The following tables outline the access that users have if they are assigned a Developer license and the Owner or Member group, Read-only license, or IT license. + +#### Account permissions for account roles + +| Account-level permission| Owner | Member | Read-only license| IT license | +|:------------------------|:-----:|:------:|:----------------:|:------------:| +| Account settings | W | W | | W | +| Billing | W | | | W | +| Invitations | W | W | | W | +| Licenses | W | R | | W | +| Users | W | R | | W | +| Project (create) | W | W | | W | +| Connections | W | W | | W | +| Service tokens | W | | | W | +| Webhooks | W | W | | | + +#### Project permissions for account roles + +|Project-level permission | Owner | Member | Read-only | IT license | +|:------------------------|:-----:|:-------:|:---------:|:----------:| +| Adapters | W | W | R | | +| Connections | W | W | R | | +| Credentials | W | W | R | | +| Custom env. variables | W | W | R | | +| Develop (IDE or dbt Cloud CLI)| W | W | | | +| Environments | W | W | R | | +| Jobs | W | W | R | | +| dbt Explorer | W | W | R | | +| Permissions | W | R | | | +| Profile | W | W | R | | +| Projects | W | W | R | | +| Repositories | W | W | R | | +| Runs | W | W | R | | +| Semantic Layer Config | W | W | R | | -1. Log into dbt Cloud. -2. Click the gear icon at the top right and select **Account settings**. -3. Select **Groups** then select **OWNER** or **MEMBER**** group. -4. Click **Edit**. -5. At the bottom of the Group page, click **Delete**. -The Account Admin can add additional SSO mapping groups, permission sets, and users as needed. diff --git a/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md b/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md index 1cd24c16481..27c09cbca09 100644 --- a/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md +++ b/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md @@ -10,14 +10,34 @@ This guide describes a feature of the dbt Cloud Enterprise plan. If you’re int ::: -dbt Cloud Enterprise supports [OAuth authentication](https://docs.snowflake.net/manuals/user-guide/oauth-intro.html) with Snowflake. When Snowflake OAuth is enabled, users can authorize their Development credentials using Single Sign On (SSO) via Snowflake rather than submitting a username and password to dbt Cloud. If Snowflake is setup with SSO through a third-party identity provider, developers can use this method to log into Snowflake and authorize the dbt Development credentials without any additional setup. +dbt Cloud Enterprise supports [OAuth authentication](https://docs.snowflake.net/manuals/user-guide/oauth-intro.html) with Snowflake. When Snowflake OAuth is enabled, users can authorize their Development credentials using Single Sign On (SSO) via Snowflake rather than submitting a username and password to dbt Cloud. If Snowflake is set up with SSO through a third-party identity provider, developers can use this method to log into Snowflake and authorize the dbt Development credentials without any additional setup. -### Configuring a security integration -To enable Snowflake OAuth, you will need to create a [security integration](https://docs.snowflake.net/manuals/sql-reference/sql/create-security-integration.html) in Snowflake to manage the OAuth connection between dbt Cloud and Snowflake. +To set up Snowflake OAuth in dbt Cloud, admins from both are required for the following steps: +1. [Locate the redirect URI value](#locate-the-redirect-uri-value) in dbt Cloud. +2. 
[Create a security integration](#create-a-security-integration) in Snowflake. +3. [Configure a connection](#configure-a-connection-in-dbt-cloud) in dbt Cloud. + +To use Snowflake in the dbt Cloud IDE, all developers must [authenticate with Snowflake](#authorize-developer-credentials) in their profile credentials. + +### Locate the redirect URI value + +To get started, copy the connection's redirect URI from dbt Cloud: +1. Navigate to **Account settings**. +1. Select **Projects** and choose a project from the list. +1. Select the connection to view its details and set the **OAuth method** to "Snowflake SSO". +1. Copy the **Redirect URI** to use in the later steps. + + ### Create a security integration -In Snowflake, execute a query to create a security integration. Please find the complete documentation on creating a security integration for custom clients [here](https://docs.snowflake.net/manuals/sql-reference/sql/create-security-integration.html#syntax). In the following example `create or replace security integration` query, replace `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/access-regions-ip-addresses) for your region and plan. +In Snowflake, execute a query to create a security integration. Please find the complete documentation on creating a security integration for custom clients [here](https://docs.snowflake.net/manuals/sql-reference/sql/create-security-integration.html#syntax). + +In the following `CREATE OR REPLACE SECURITY INTEGRATION` example query, replace `` value with the Redirect URI (also referred to as the [access URL](/docs/cloud/about-cloud/access-regions-ip-addresses)) copied in dbt Cloud. To locate the Redirect URI, refer to the previous [locate the redirect URI value](#locate-the-redirect-uri-value) section. ``` CREATE OR REPLACE SECURITY INTEGRATION DBT_CLOUD @@ -25,7 +45,7 @@ CREATE OR REPLACE SECURITY INTEGRATION DBT_CLOUD ENABLED = TRUE OAUTH_CLIENT = CUSTOM OAUTH_CLIENT_TYPE = 'CONFIDENTIAL' - OAUTH_REDIRECT_URI = 'https://YOUR_ACCESS_URL/complete/snowflake' + OAUTH_REDIRECT_URI = '' OAUTH_ISSUE_REFRESH_TOKENS = TRUE OAUTH_REFRESH_TOKEN_VALIDITY = 7776000; ``` @@ -42,7 +62,7 @@ CREATE OR REPLACE SECURITY INTEGRATION DBT_CLOUD | ENABLED | Required | | OAUTH_CLIENT | Required | | OAUTH_CLIENT_TYPE | Required | -| OAUTH_REDIRECT_URI | Required. Use the access URL that corresponds to your server [region](/docs/cloud/about-cloud/access-regions-ip-addresses). | +| OAUTH_REDIRECT_URI | Required. Use the value in the [dbt Cloud account settings](#locate-the-redirect-uri-value). | | OAUTH_ISSUE_REFRESH_TOKENS | Required | | OAUTH_REFRESH_TOKEN_VALIDITY | Required. This configuration dictates the number of seconds that a refresh token is valid for. Use a smaller value to force users to re-authenticate with Snowflake more frequently. | @@ -70,11 +90,11 @@ Enter the Client ID and Client Secret into dbt Cloud to complete the creation of -### Authorize Developer Credentials +### Authorize developer credentials Once Snowflake SSO is enabled, users on the project will be able to configure their credentials in their Profiles. By clicking the "Connect to Snowflake Account" button, users will be redirected to Snowflake to authorize with the configured SSO provider, then back to dbt Cloud to complete the setup process. At this point, users should now be able to use the dbt IDE with their development credentials. 
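A small command-line convenience related to the security integration above: once the integration exists, the Client ID and Client Secret that dbt Cloud asks for can be pulled with SnowSQL instead of a Snowflake worksheet. A minimal sketch, assuming SnowSQL is installed with a connection profile named `dev`, the integration is named `DBT_CLOUD` as in the example, and you run it with a role that can manage the integration (typically ACCOUNTADMIN):

```bash
# Verify the integration's settings (redirect URI, refresh token validity, and so on).
snowsql -c dev -q "DESC SECURITY INTEGRATION DBT_CLOUD;"

# Retrieve the OAuth client ID and client secret to paste into dbt Cloud.
# The dollar sign is escaped so bash doesn't treat it as a shell variable.
snowsql -c dev -q "SELECT SYSTEM\$SHOW_OAUTH_CLIENT_SECRETS('DBT_CLOUD');"
```

If you used a different integration name in the `CREATE SECURITY INTEGRATION` statement, substitute it in both commands.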
-### SSO OAuth Flow Diagram +### SSO OAuth flow diagram diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md index e4ff998015c..2b2575efc57 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md @@ -117,7 +117,7 @@ If the verification information looks appropriate, then you have completed the c ## Setting up RBAC Now you have completed setting up SSO with GSuite, the next steps will be to set up -[RBAC groups](/docs/cloud/manage-access/enterprise-permissions) to complete your access control configuration. +[RBAC groups](/docs/cloud/manage-access/about-user-access#role-based-access-control-) to complete your access control configuration. ## Troubleshooting diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-microsoft-entra-id.md b/website/docs/docs/cloud/manage-access/set-up-sso-microsoft-entra-id.md index 5dbbf219ccf..4658141034c 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-microsoft-entra-id.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-microsoft-entra-id.md @@ -126,8 +126,8 @@ To complete setup, follow the steps below in the dbt Cloud application. | Field | Value | | ----- | ----- | | **Log in with** | Microsoft Entra ID Single Tenant | -| **Client ID** | Paste the **Application (client) ID** recorded in the steps above | -| **Client Secret** | Paste the **Client Secret** (remember to use the Secret Value instead of the Secret ID) recorded in the steps above;
**Note:** When the client secret expires, an Entra ID admin will have to generate a new one to be pasted into dbt Cloud for uninterrupted application access. | +| **Client ID** | Paste the **Application (client) ID** recorded in the steps above | +| **Client Secret** | Paste the **Client Secret** (remember to use the Secret Value instead of the Secret ID) from the steps above;
**Note:** When the client secret expires, an Entra ID admin will have to generate a new one to be pasted into dbt Cloud for uninterrupted application access. | | **Tenant ID** | Paste the **Directory (tenant ID)** recorded in the steps above | | **Domain** | Enter the domain name for your Azure directory (such as `fishtownanalytics.com`). Only use the primary domain; this won't block access for other domains. | | **Slug** | Enter your desired login slug. Users will be able to log into dbt Cloud by navigating to `https://YOUR_ACCESS_URL/enterprise-login/LOGIN-SLUG`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/manage-access/sso-overview#auth0-multi-tenant-uris) for your region and plan. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. | diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-okta.md b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md index 53986513ce2..fda32f118ef 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-okta.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md @@ -190,4 +190,4 @@ configured in the steps above. ## Setting up RBAC Now you have completed setting up SSO with Okta, the next steps will be to set up -[RBAC groups](/docs/cloud/manage-access/enterprise-permissions) to complete your access control configuration. +[RBAC groups](/docs/cloud/manage-access/about-user-access#role-based-access-control-) to complete your access control configuration. diff --git a/website/docs/docs/cloud/manage-access/sso-overview.md b/website/docs/docs/cloud/manage-access/sso-overview.md index 560be72e31d..6b6527df753 100644 --- a/website/docs/docs/cloud/manage-access/sso-overview.md +++ b/website/docs/docs/cloud/manage-access/sso-overview.md @@ -43,7 +43,7 @@ Then, assign all of these (and only these) to the user license. This step will a ## SSO enforcement -* **SSO Enforcement:** If you have SSO turned on in your organization, dbt Cloud will enforce SSO-only logins for all non-admin users. If an Account Admin already has a password, they can continue logging in with a password. +* **SSO Enforcement:** If SSO is turned on in your organization, dbt Cloud will enforce SSO-only logins for all non-admin users. By default, if an Account Admin or Security Admin already has a password, they can continue logging in with a password. To restrict admins from using passwords, turn off **Allow password logins for account administrators** in the **Single sign-on** section of your organization's **Account settings**. * **SSO Re-Authentication:** dbt Cloud will prompt you to re-authenticate using your SSO provider every 24 hours to ensure high security. ### How should non-admin users log in? diff --git a/website/docs/docs/cloud/migration.md b/website/docs/docs/cloud/migration.md index 8bdf47eae5a..2665b8f6a97 100644 --- a/website/docs/docs/cloud/migration.md +++ b/website/docs/docs/cloud/migration.md @@ -7,34 +7,52 @@ pagination_next: null pagination_prev: null --- -dbt Labs is in the process of migrating dbt Cloud to a new _cell-based architecture_. This architecture will be the foundation of dbt Cloud for years to come, and will bring improved scalability, reliability, and security to all customers and users of dbt Cloud. +dbt Labs is in the process of rolling out a new cell-based architecture for dbt Cloud. This architecture provides the foundation of dbt Cloud for years to come, and brings improved reliability, performance, and consistency to users of dbt Cloud. 
-There is some preparation required to ensure a successful migration. +We're scheduling migrations by account. When we're ready to migrate your account, you will receive a banner or email communication with your migration date. If you have not received this communication, then you don't need to take action at this time. dbt Labs will share information about your migration with you, with appropriate advance notice, when applicable to your account. -Migrations are being scheduled on a per-account basis. _If you haven't received any communication (either with a banner or by email) about a migration date, you don't need to take any action at this time._ dbt Labs will share migration date information with you, with appropriate advance notice, before we complete any migration steps in the dbt Cloud backend. +Your account will be automatically migrated on or after its scheduled date. However, if you use certain features, you must take action before that date to avoid service disruptions. -This document outlines the steps that you must take to prevent service disruptions before your environment is migrated over to the cell-based architecture. This will impact areas such as login, IP restrictions, and API access. +## Recommended actions -## Pre-migration checklist +:::info Rescheduling your migration -Prior to your migration date, your dbt Cloud account admin will need to make some changes to your account. Most of your configurations will be migrated automatically, but a few will require manual intervention. +If you're on the dbt Cloud Enterprise tier, you can postpone your account migration by up to 45 days. To reschedule your migration, navigate to **Account Settings** → **Migration guide**. -If your account is scheduled for migration, you will see a banner indicating your migration date when you log in. If you don't see a banner, you don't need to take any action. +For help, contact the dbt Support Team at [support@getdbt.com](mailto:support@getdbt.com). +::: -1. **IP addresses** — dbt Cloud will be using new IPs to access your warehouse after the migration. Make sure to allow inbound traffic from these IPs in your firewall and include it in any database grants. All six of the IPs below should be added to allowlists. - * Old IPs: `52.45.144.63`, `54.81.134.249`, `52.22.161.231` - * New IPs: `52.3.77.232`, `3.214.191.130`, `34.233.79.135` -2. **User invitations** — Any pending user invitations will be invalidated during the migration. You can resend the invitations after the migration is complete. -3. **SSO integrations** — If you've completed the Auth0 migration, your account SSO configurations will be automatically transferred. If you haven't completed the Auth0 migration, dbt Labs recommends doing that before starting the mult-cell migration to avoid service disruptions. -4. **IDE sessions** — Any unsaved changes in the IDE might be lost during migration. dbt Labs _strongly_ recommends committing all changes in the IDE before your scheduled migration time. +We highly recommended you take these actions: -## Post-migration +- Ensure pending user invitations are accepted or note outstanding invitations. Pending user invitations might be voided during the migration. You can resend user invitations after the migration is complete. +- Commit unsaved changes in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). Unsaved changes might be lost during migration. 
+- Export and download [audit logs](/docs/cloud/manage-access/audit-log) older than 90 days, as they will be unavailable from dbt Cloud after the migration is complete. Logs older than 90 days that are still within the data retention period are not deleted, but you will have to work with the dbt Labs Customer Support team to recover them. -After migration, if you completed all the [Pre-migration checklist](#pre-migration-checklist) items, your dbt Cloud resources and jobs will continue to work as they did before. ## Required actions -You have the option to log in to dbt Cloud at a different URL: - * If you were previously logging in at `cloud.getdbt.com`, you should instead plan to login at `us1.dbt.com`. The original URL will still work, but you’ll have to click through to be redirected upon login. - * You may also log in directly with your account’s unique [access URL](/docs/cloud/about-cloud/access-regions-ip-addresses#accessing-your-account). +These actions are required to prevent users from losing access to dbt Cloud: -:::info Login with GitHub -Users who previously used the "Login with GitHub" functionality will no longer be able to use this method to login to dbt Cloud after migration. To continue accessing your account, you can use your existing email and password. +- If you still need to, complete [Auth0 migration for SSO](/docs/cloud/manage-access/auth0-migration) before your scheduled migration date to avoid service disruptions. If you've completed the Auth0 migration, your account SSO configurations will be transferred automatically. +- Update your IP allow lists. dbt Cloud will be using new IPs to access your warehouse post-migration. Allow inbound traffic from all of the following new IPs in your firewall and include them in any database grants: + + - `52.3.77.232` + - `3.214.191.130` + - `34.233.79.135` + + Keep the old dbt Cloud IPs listed until the migration is complete. + +## Post-migration + +Complete all of these items to ensure your dbt Cloud resources and jobs will continue working without interruption. + +Use one of these two URL login options: + +- `us1.dbt.com`: If you were previously logging in at `cloud.getdbt.com`, you should instead plan to log in at `us1.dbt.com`. The original URL will still work, but you’ll have to click through to be redirected upon login. +- `ACCOUNT_PREFIX.us1.dbt.com`: A unique URL specifically for your account. If you belong to multiple accounts, each will have a unique URL available as long as it has been migrated to multi-cell. +Check out [access, regions, and IP addresses](/docs/cloud/about-cloud/access-regions-ip-addresses) for more information. + +Remove the following old IP addresses from your firewall and database grants: + +- `52.45.144.63` +- `54.81.134.249` +- `52.22.161.231` diff --git a/website/docs/docs/cloud/secure/about-privatelink.md b/website/docs/docs/cloud/secure/about-privatelink.md index 731cef3f019..f19790fd708 100644 --- a/website/docs/docs/cloud/secure/about-privatelink.md +++ b/website/docs/docs/cloud/secure/about-privatelink.md @@ -7,10 +7,13 @@ sidebar_label: "About PrivateLink" import SetUpPages from '/snippets/_available-tiers-privatelink.md'; import PrivateLinkHostnameWarning from '/snippets/_privatelink-hostname-restriction.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; -PrivateLink enables a private connection from any dbt Cloud Multi-Tenant environment to your data platform hosted on AWS using [AWS PrivateLink](https://aws.amazon.com/privatelink/) technology.
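Returning to the IP allowlist update in the migration steps above: if your warehouse happens to be Snowflake and you restrict inbound access with network policies, the following is a minimal sketch of staging the new dbt Cloud IPs alongside the old ones during the migration window. The rule and policy names (`dbt_cloud_new_cell_ips`, `my_network_policy`) are hypothetical placeholders, not values from this page.

```sql
-- Allow the new dbt Cloud IPs; leave the rule containing the old IPs in place until the migration completes
CREATE NETWORK RULE dbt_cloud_new_cell_ips
  MODE = INGRESS
  TYPE = IPV4
  VALUE_LIST = ('52.3.77.232', '3.214.191.130', '34.233.79.135');

ALTER NETWORK POLICY my_network_policy
  ADD ALLOWED_NETWORK_RULE_LIST = ('dbt_cloud_new_cell_ips');
```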
PrivateLink allows dbt Cloud customers to meet security and compliance controls as it allows connectivity between dbt Cloud and your data platform without traversing the public internet. This feature is supported in most regions across NA, Europe, and Asia, but [contact us](https://www.getdbt.com/contact/) if you have questions about availability. +PrivateLink enables a private connection from any dbt Cloud Multi-Tenant environment to your data platform hosted on a cloud provider, such as [AWS](https://aws.amazon.com/privatelink/) or [Azure](https://azure.microsoft.com/en-us/products/private-link), using that provider’s PrivateLink technology. PrivateLink allows dbt Cloud customers to meet security and compliance controls as it allows connectivity between dbt Cloud and your data platform without traversing the public internet. This feature is supported in most regions across NA, Europe, and Asia, but [contact us](https://www.getdbt.com/contact/) if you have questions about availability. + + ### Cross-region PrivateLink diff --git a/website/docs/docs/cloud/secure/databricks-privatelink.md b/website/docs/docs/cloud/secure/databricks-privatelink.md index c83e9c67c95..d754f2b76c4 100644 --- a/website/docs/docs/cloud/secure/databricks-privatelink.md +++ b/website/docs/docs/cloud/secure/databricks-privatelink.md @@ -7,39 +7,58 @@ pagination_next: null --- import SetUpPages from '/snippets/_available-tiers-privatelink.md'; +import PrivateLinkSLA from '/snippets/_PrivateLink-SLA.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; -The following steps will walk you through the setup of a Databricks AWS PrivateLink endpoint in the dbt Cloud multi-tenant environment. +The following steps will walk you through the setup of a Databricks AWS PrivateLink or Azure Private Link endpoint in the dbt Cloud multi-tenant environment. + + -## Configure PrivateLink +## Configure AWS PrivateLink 1. Locate your [Databricks instance name](https://docs.databricks.com/en/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) - - Example: `cust-success.cloud.databricks.com` -2. Add the required information to the template below, and submit your request to [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support): -``` -Subject: New Multi-Tenant PrivateLink Request -- Type: Databricks -- Databricks instance name: -- Databricks cluster AWS Region (e.g., us-east-1, eu-west-2): -- dbt Cloud multi-tenant environment (US, EMEA, AU): -``` + - Example: `cust-success.cloud.databricks.com` -import PrivateLinkSLA from '/snippets/_PrivateLink-SLA.md'; +1. Add the required information to the following template and submit your AWS PrivateLink request to [dbt Support](https://docs.getdbt.com/docs/dbt-support#dbt-cloud-support): + ``` + Subject: New AWS Multi-Tenant PrivateLink Request + - Type: Databricks + - Databricks instance name: + - Databricks cluster AWS Region (e.g., us-east-1, eu-west-2): + - dbt Cloud multi-tenant environment (US, EMEA, AU): + ``` + + +1. Once dbt Cloud support has notified you that setup is complete, [register the VPC endpoint in Databricks](https://docs.databricks.com/administration-guide/cloud-configurations/aws/privatelink.html#step-3-register-privatelink-objects-and-attach-them-to-a-workspace) and attach it to the workspace: + - [Register your VPC endpoint](https://docs.databricks.com/en/security/network/classic/vpc-endpoints.html) — Register the VPC endpoint using the VPC endpoint ID provided by dbt Support. 
+ - [Create a Private Access Settings object](https://docs.databricks.com/en/security/network/classic/private-access-settings.html) — Create a Private Access Settings (PAS) object with your desired public access settings, and setting Private Access Level to **Endpoint**. Choose the registered endpoint created in the previous step. + - [Create or update your workspace](https://docs.databricks.com/en/security/network/classic/privatelink.html#step-3d-create-or-update-the-workspace-front-end-back-end-or-both) — Create a workspace, or update your an existing workspace. Under **Advanced configurations → Private Link** choose the private access settings object created in the previous step. + + :::warning + If using an existing Databricks workspace, all workloads running in the workspace need to be stopped to enable Private Link. Workloads also can't be started for another 20 minutes after making changes. From the [Databricks documentation](https://docs.databricks.com/en/security/network/classic/privatelink.html#step-3d-create-or-update-the-workspace-front-end-back-end-or-both): - + "After creating (or updating) a workspace, wait until it’s available for using or creating clusters. The workspace status stays at status RUNNING and the VPC change happens immediately. However, you cannot use or create clusters for another 20 minutes. If you create or use clusters before this time interval elapses, clusters do not launch successfully, fail, or could cause other unexpected behavior." -3. Once dbt Cloud support has notified you that setup is complete, [register the VPC endpoint in Databricks](https://docs.databricks.com/administration-guide/cloud-configurations/aws/privatelink.html#step-3-register-privatelink-objects-and-attach-them-to-a-workspace) and attach it to the workspace: - - [Register your VPC endpoint](https://docs.databricks.com/en/security/network/classic/vpc-endpoints.html) — Register the VPC endpoint using the VPC endpoint ID provided by dbt Support. - - [Create a Private Access Settings object](https://docs.databricks.com/en/security/network/classic/private-access-settings.html) — Create a Private Access Settings (PAS) object with your desired public access settings, and setting Private Access Level to **Endpoint**. Choose the registered endpoint created in the previous step. - - [Create or update your workspace](https://docs.databricks.com/en/security/network/classic/privatelink.html#step-3d-create-or-update-the-workspace-front-end-back-end-or-both) — Create a workspace, or update your an existing workspace. Under **Advanced configurations → Private Link** choose the private access settings object created in the previous step. + ::: -:::note Warning -If using an existing Databricks workspace, all workloads running in the workspace will need to be stopped to enable Private Link! Workloads also cannot be started for another 20 minutes after making changes. From the [Databricks documentation](https://docs.databricks.com/en/security/network/classic/privatelink.html#step-3d-create-or-update-the-workspace-front-end-back-end-or-both): +## Configure Azure Private Link ->After creating (or updating) a workspace, wait until it’s available for using or creating clusters. The workspace status stays at status RUNNING and the VPC change happens immediately. However, you cannot use or create clusters for another 20 minutes. If you create or use clusters before this time interval elapses, clusters do not launch successfully, fail, or could cause other unexpected behavior. +1. 
Navigate to your Azure Databricks workspace. + The path format is: `/subscriptions//resourceGroups//providers/Microsoft.Databricks/workspaces/`. +2. From the workspace overview, click **JSON view**. +3. Copy the value in the `resource_id` field. +4. Add the required information to the following template and submit your Azure Private Link request to [dbt Support](https://docs.getdbt.com/docs/dbt-support#dbt-cloud-support): + ``` + Subject: New Azure Multi-Tenant Private Link Request + - Type: Databricks + - Databricks instance name: + - Databricks Azure resource ID: + - dbt Cloud multi-tenant environment: EMEA + ``` +5. Once our Support team confirms the resources are available in the Azure portal, navigate to the Azure Databricks Workspace and browse to **Networking** > **Private Endpoint Connections**. Then, highlight the `dbt` named option and select **Approve**. -::: ## Create Connection in dbt Cloud diff --git a/website/docs/docs/cloud/secure/ip-restrictions.md b/website/docs/docs/cloud/secure/ip-restrictions.md index 034b3a6c144..d39960dab42 100644 --- a/website/docs/docs/cloud/secure/ip-restrictions.md +++ b/website/docs/docs/cloud/secure/ip-restrictions.md @@ -13,7 +13,7 @@ import SetUpPages from '/snippets/_available-tiers-iprestrictions.md'; IP Restrictions help control which IP addresses are allowed to connect to dbt Cloud. IP restrictions allow dbt Cloud customers to meet security and compliance controls by only allowing approved IPs to connect to their dbt Cloud environment. This feature is supported in all regions across NA, Europe, and Asia-Pacific, but contact us if you have questions about availability. -## Configuring IP Restrictions +## Configuring IP restrictions To configure IP restrictions, go to **Account Settings** → **IP Restrictions**. IP restrictions provide two methods for determining which IPs can access dbt Cloud: an allowlist and a blocklist. IPs in the allowlist are allowed to access dbt Cloud, and IPs in the deny list will be blocked from accessing dbt Cloud. IP Restrictions can be used for a range of use cases, including: @@ -29,7 +29,7 @@ For any version control system integrations (Github, Gitlab, ADO, etc.) inbound To add an IP to the allowlist, from the **IP Restrictions** page: -1. Click **edit** +1. Click **Edit** 2. Click **Add Rule** 3. Add name and description for the rule - For example, Corporate VPN CIDR Range @@ -39,7 +39,9 @@ To add an IP to the allowlist, from the **IP Restrictions** page: - You can add multiple ranges in the same rule. 6. Click **Save** -Note that simply adding the IP Ranges will not enforce IP restrictions. For more information, see the section “Enabling Restrictions.” +Add multiple IP ranges by clicking the **Add IP range** button to create a new text field. + +Note that simply adding the IP Ranges will not enforce IP restrictions. For more information, see the [Enabling restrictions](#enabling-restrictions) section. If you only want to allow the IP ranges added to this list and deny all other requests, adding a denylist is not necessary. By default, if only an allow list is added, dbt Cloud will only allow IPs in the allowable range and deny all other IPs. However, you can add a denylist if you want to deny specific IP addresses within your allowlist CIDR range. 
@@ -65,9 +67,9 @@ It is possible to put an IP range on one list and then a sub-range or IP address ::: -## Enabling Restrictions +## Enabling restrictions -Once you are done adding all your ranges, IP restrictions can be enabled by selecting the **Enable IP restrictions** button and clicking **Save**. If your IP address is in any of the denylist ranges, you won’t be able to save or enable IP restrictions - this is done to prevent accidental account lockouts. If you do get locked out due to IP changes on your end, please reach out to support@dbtlabs.com +Once you are done adding all your ranges, IP restrictions can be enabled by selecting the **Enable IP restrictions** button and clicking **Save**. If your IP address is in any of the denylist ranges, you won’t be able to save or enable IP restrictions - this is done to prevent accidental account lockouts. If you do get locked out due to IP changes on your end, please reach out to support@getdbt.com Once enabled, when someone attempts to access dbt Cloud from a restricted IP, they will encounter one of the following messages depending on whether they use email & password or SSO login. diff --git a/website/docs/docs/cloud/secure/postgres-privatelink.md b/website/docs/docs/cloud/secure/postgres-privatelink.md index 58098f4c23a..4d670354686 100644 --- a/website/docs/docs/cloud/secure/postgres-privatelink.md +++ b/website/docs/docs/cloud/secure/postgres-privatelink.md @@ -5,11 +5,16 @@ description: "Configuring PrivateLink for Postgres" sidebar_label: "PrivateLink for Postgres" --- import SetUpPages from '/snippets/_available-tiers-privatelink.md'; +import PrivateLinkTroubleshooting from '/snippets/_privatelink-troubleshooting.md'; +import PrivateLinkCrossZone from '/snippets/_privatelink-cross-zone-load-balancing.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; A Postgres database, hosted either in AWS or in a properly connected on-prem data center, can be accessed through a private network connection using AWS Interface-type PrivateLink. The type of Target Group connected to the Network Load Balancer (NLB) may vary based on the location and type of Postgres instance being connected, as explained in the following steps. + + ## Configuring Postgres interface-type PrivateLink ### 1. Provision AWS resources @@ -40,9 +45,16 @@ Creating an Interface VPC PrivateLink connection requires creating multiple AWS - Target Group protocol: **TCP** - **Network Load Balancer (NLB)** — Requires creating a Listener that attaches to the newly created Target Group for port `5432` + - **Scheme:** Internal + - **IP address type:** IPv4 + - **Network mapping:** Choose the VPC that the VPC Endpoint Service and NLB are being deployed in, and choose subnets from at least two Availability Zones. + - **Security Groups:** The Network Load Balancer (NLB) associated with the VPC endpoint service must either not have an associated security group, or the security group must have a rule that allows requests from the appropriate dbt Cloud **private CIDR(s)**. Note that _this is different_ than the static public IPs listed on the dbt Cloud [Access, Regions, & IP addresses](https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses) page. dbt Support can provide the correct private CIDR(s) upon request. If necessary, until you can refine the rule to the smaller CIDR provided by dbt, allow connectivity by temporarily adding an allow rule of `10.0.0.0/8`. 
+ - **Listeners:** Create one listener per target group that maps the appropriate incoming port to the corresponding target group ([details](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/load-balancer-listeners.html)). - **VPC Endpoint Service** — Attach to the newly created NLB. - Acceptance required (optional) — Requires you to [accept our connection request](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests) after dbt creates the endpoint. + + ### 2. Grant dbt AWS account access to the VPC Endpoint Service On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. @@ -86,3 +98,5 @@ Once dbt Cloud support completes the configuration, you can start creating new c 3. Select the private endpoint from the dropdown (this will automatically populate the hostname/account field). 4. Configure the remaining data platform details. 5. Test your connection and save it. + + diff --git a/website/docs/docs/cloud/secure/redshift-privatelink.md b/website/docs/docs/cloud/secure/redshift-privatelink.md index 23e2b4382fc..75924cf76a9 100644 --- a/website/docs/docs/cloud/secure/redshift-privatelink.md +++ b/website/docs/docs/cloud/secure/redshift-privatelink.md @@ -6,6 +6,9 @@ sidebar_label: "PrivateLink for Redshift" --- import SetUpPages from '/snippets/_available-tiers-privatelink.md'; +import PrivateLinkTroubleshooting from '/snippets/_privatelink-troubleshooting.md'; +import PrivateLinkCrossZone from '/snippets/_privatelink-cross-zone-load-balancing.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; @@ -15,6 +18,8 @@ AWS provides two different ways to create a PrivateLink VPC endpoint for a Redsh dbt Cloud supports both types of endpoints, but there are a number of [considerations](https://docs.aws.amazon.com/redshift/latest/mgmt/managing-cluster-cross-vpc.html#managing-cluster-cross-vpc-considerations) to take into account when deciding which endpoint type to use. Redshift-managed provides a far simpler setup with no additional cost, which might make it the preferred option for many, but may not be an option in all environments. Based on these criteria, you will need to determine which is the right type for your system. Follow the instructions from the section below that corresponds to your chosen endpoint type. + + :::note Redshift Serverless While Redshift Serverless does support Redshift-managed type VPC endpoints, this functionality is not currently available across AWS accounts. Due to this limitation, an Interface-type VPC endpoint service must be used for Redshift Serverless cluster PrivateLink connectivity from dbt Cloud. ::: @@ -78,9 +83,16 @@ Creating an Interface VPC PrivateLink connection requires creating multiple AWS - Target Group protocol: **TCP** - **Network Load Balancer (NLB)** — Requires creating a Listener that attaches to the newly created Target Group for port `5439` + - **Scheme:** Internal + - **IP address type:** IPv4 + - **Network mapping:** Choose the VPC that the VPC Endpoint Service and NLB are being deployed in, and choose subnets from at least two Availability Zones. 
+ - **Security Groups:** The Network Load Balancer (NLB) associated with the VPC endpoint service must either not have an associated security group, or the security group must have a rule that allows requests from the appropriate dbt Cloud **private CIDR(s)**. Note that _this is different_ than the static public IPs listed on the dbt Cloud [Access, Regions, & IP addresses](https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses) page. dbt Support can provide the correct private CIDR(s) upon request. If necessary, until you can refine the rule to the smaller CIDR provided by dbt, allow connectivity by temporarily adding an allow rule of `10.0.0.0/8`. + - **Listeners:** Create one listener per target group that maps the appropriate incoming port to the corresponding target group ([details](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/load-balancer-listeners.html)). - **VPC Endpoint Service** — Attach to the newly created NLB. - Acceptance required (optional) — Requires you to [accept our connection request](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests) after dbt creates the endpoint. + + ### 2. Grant dbt AWS Account access to the VPC Endpoint Service On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. @@ -115,3 +127,5 @@ Once dbt Cloud support completes the configuration, you can start creating new c 3. Select the private endpoint from the dropdown (this will automatically populate the hostname/account field). 4. Configure the remaining data platform details. 5. Test your connection and save it. + + diff --git a/website/docs/docs/cloud/secure/snowflake-privatelink.md b/website/docs/docs/cloud/secure/snowflake-privatelink.md index 6cffc373d3b..b943791292f 100644 --- a/website/docs/docs/cloud/secure/snowflake-privatelink.md +++ b/website/docs/docs/cloud/secure/snowflake-privatelink.md @@ -6,10 +6,13 @@ sidebar_label: "PrivateLink for Snowflake" --- import SetUpPages from '/snippets/_available-tiers-privatelink.md'; +import CloudProviders from '/snippets/_privatelink-across-providers.md'; -The following steps walk you through the setup of a Snowflake AWS PrivateLink and Azure Private Link endpoint in the dbt Cloud multi-tenant environment. +The following steps walk you through the setup of a Snowflake AWS PrivateLink or Azure Private Link endpoint in a dbt Cloud multi-tenant environment. + + :::note Snowflake SSO with PrivateLink Users connecting to Snowflake using SSO over a PrivateLink connection from dbt Cloud will also require access to a PrivateLink endpoint from their local workstation. @@ -19,26 +22,37 @@ Users connecting to Snowflake using SSO over a PrivateLink connection from dbt C - [Snowflake SSO with Private Connectivity](https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-overview#label-sso-private-connectivity) ::: -## Configure PrivateLink +## About private connectivity for Snowflake + +dbt Cloud supports private connectivity for Snowflake using one of the following services: + +- AWS [PrivateLink](#configure-aws-privatelink) +- Azure [Private Link](#configure-azure-private-link) + +## Configure AWS PrivateLink + +To configure Snowflake instances hosted on AWS for [PrivateLink](https://aws.amazon.com/privatelink): -1. 
Open a Support case with Snowflake to allow access from the dbt Cloud AWS account -- Snowflake prefers that the account owner opens the Support case directly, rather than dbt Labs acting on their behalf. For more information, refer to [Snowflake's knowledge base article](https://community.snowflake.com/s/article/HowtosetupPrivatelinktoSnowflakefromCloudServiceVendors) +1. Open a support case with Snowflake to allow access from the dbt Cloud AWS or Entra ID account. +- Snowflake prefers that the account owner opens the support case directly rather than dbt Labs acting on their behalf. For more information, refer to [Snowflake's knowledge base article](https://community.snowflake.com/s/article/HowtosetupPrivatelinktoSnowflakefromCloudServiceVendors). - Provide them with your dbt Cloud account ID along with any other information requested in the article. - - AWS account ID: `346425330055` - _NOTE: This account ID only applies to dbt Cloud Multi-Tenant environments. For Virtual Private/Single-Tenant account IDs please contact [Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support)._ + - **AWS account ID**: `346425330055` — _NOTE: This account ID only applies to AWS dbt Cloud multi-tenant environments. For AWS Virtual Private/Single-Tenant account IDs, please contact [Support](https://docs.getdbt.com/docs/dbt-support#dbt-cloud-support)._ - You will need to have `ACCOUNTADMIN` access to the Snowflake instance to submit a Support request. 2. After Snowflake has granted the requested access, run the Snowflake system function [SYSTEM$GET_PRIVATELINK_CONFIG](https://docs.snowflake.com/en/sql-reference/functions/system_get_privatelink_config.html) and copy the output. -3. Add the required information to the template below, and submit your request to [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support): +3. Add the required information to the following template and submit your request to [dbt Support](https://docs.getdbt.com/docs/dbt-support#dbt-cloud-support): ``` -Subject: New Multi-Tenant PrivateLink Request +Subject: New Multi-Tenant (Azure or AWS) PrivateLink Request - Type: Snowflake - SYSTEM$GET_PRIVATELINK_CONFIG output: - *Use privatelink-account-url or regionless-privatelink-account-url?: -- dbt Cloud multi-tenant environment (US, EMEA, AU): +- dbt Cloud multi-tenant environment + - AWS: US, EMEA, or AU + - Azure: EMEA only ``` _*By default dbt Cloud will be configured to use `privatelink-account-url` from the provided [SYSTEM$GET_PRIVATELINK_CONFIG](https://docs.snowflake.com/en/sql-reference/functions/system_get_privatelink_config.html) as the PrivateLink endpoint. Upon request, `regionless-privatelink-account-url` can be used instead._ @@ -47,6 +61,32 @@ import PrivateLinkSLA from '/snippets/_PrivateLink-SLA.md'; +## Configure Azure Private Link + +To configure Snowflake instances hosted on Azure for [Private Link](https://learn.microsoft.com/en-us/azure/private-link/private-link-overview): + +1. In your Snowflake account, run the following SQL statements and copy the output: + +```sql + +USE ROLE ACCOUNTADMIN; +SELECT SYSTEM$GET_PRIVATELINK_CONFIG(); + +``` + + +2. Add the required information to the following template and submit your request to [dbt Support](https://docs.getdbt.com/docs/dbt-support#dbt-cloud-support): + +``` +Subject: New Multi-Tenant (Azure or AWS) PrivateLink Request +- Type: Snowflake +- The output from SYSTEM$GET_PRIVATELINK_CONFIG: + - Include the privatelink-pls-id +- dbt Cloud Azure multi-tenant environment: +``` + +3. dbt Support will provide the `resource_id` of our private endpoint and the `CIDR` range. Contact the Snowflake Support team with these values to complete the [PrivateLink configuration](https://community.snowflake.com/s/article/HowtosetupPrivatelinktoSnowflakefromCloudServiceVendors). + ## Create Connection in dbt Cloud Once dbt Cloud support completes the configuration, you can start creating new connections using PrivateLink. @@ -57,6 +97,27 @@ Once dbt Cloud support completes the configuration, you can start creating new c 4. Configure the remaining data platform details. 5. Test your connection and save it. +## Enable the connection in Snowflake + +To complete the setup, follow the remaining steps from the Snowflake setup guides. The instructions vary based on the platform: + +- [Snowflake AWS PrivateLink](https://docs.snowflake.com/en/user-guide/admin-security-privatelink) +- [Snowflake Azure Private Link](https://docs.snowflake.com/en/user-guide/privatelink-azure) + +There are some nuances for each connection, and you will need a Snowflake administrator to complete them. As the Snowflake administrator, call the `SYSTEM$AUTHORIZE_STAGE_PRIVATELINK_ACCESS` function, passing the AWS VPC ID (AWS) or the Azure private endpoint resource ID (Azure) as the function argument. This authorizes access to the Snowflake internal stage through the private endpoint. + +```sql + +USE ROLE ACCOUNTADMIN; + +-- AWS PrivateLink +SELECT SYSTEM$AUTHORIZE_STAGE_PRIVATELINK_ACCESS('<AWS VPC ID>'); + +-- Azure Private Link +SELECT SYSTEM$AUTHORIZE_STAGE_PRIVATELINK_ACCESS('<AZURE PRIVATE ENDPOINT RESOURCE ID>'); + +``` + ## Configuring Network Policies If your organization uses [Snowflake Network Policies](https://docs.snowflake.com/en/user-guide/network-policies) to restrict access to your Snowflake account, you will need to add a network rule for dbt Cloud. @@ -84,19 +145,23 @@ Open the Snowflake UI and take the following steps: ### Using SQL + For quick and automated setup of network rules via SQL in Snowflake, the following commands allow you to create and configure access rules for dbt Cloud. These SQL examples demonstrate how to add a network rule and update your network policy accordingly; a short verification sketch follows these steps. 1. Create a new network rule with the following SQL: ```sql + CREATE NETWORK RULE allow_dbt_cloud_access MODE = INGRESS TYPE = AWSVPCEID VALUE_LIST = (''); -- Replace '' with the actual ID provided + ``` 2.
Add the rule to a network policy with the following SQL: ```sql + ALTER NETWORK POLICY ADD ALLOWED_NETWORK_RULE_LIST =('allow_dbt_cloud_access'); -``` +``` diff --git a/website/docs/docs/cloud/secure/vcs-privatelink.md b/website/docs/docs/cloud/secure/vcs-privatelink.md index b08154d2e72..28b4df8f706 100644 --- a/website/docs/docs/cloud/secure/vcs-privatelink.md +++ b/website/docs/docs/cloud/secure/vcs-privatelink.md @@ -6,6 +6,8 @@ sidebar_label: "PrivateLink for VCS" --- import SetUpPages from '/snippets/_available-tiers-privatelink.md'; +import PrivateLinkTroubleshooting from '/snippets/_privatelink-troubleshooting.md'; +import PrivateLinkCrossZone from '/snippets/_privatelink-cross-zone-load-balancing.md'; @@ -43,12 +45,15 @@ Creating an Interface VPC PrivateLink connection requires creating multiple AWS - **Scheme:** Internal - **IP address type:** IPv4 - **Network mapping:** Choose the VPC that the VPC Endpoint Service and NLB are being deployed in, and choose subnets from at least two Availability Zones. + - **Security Groups:** The Network Load Balancer (NLB) associated with the VPC Endpoint Service must either not have an associated Security Group, or the Security Group must have a rule that allows requests from the appropriate dbt Cloud **private CIDR(s)**. Note that **this is different** than the static public IPs listed on the dbt Cloud [Access, Regions, & IP addresses](https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses) page. The correct private CIDR(s) can be provided by dbt Support upon request. If necessary, temporarily adding an allow rule of `10.0.0.0/8` should allow connectivity until the rule can be refined to the smaller dbt provided CIDR. - **Listeners:** Create one Listener per Target Group that maps the appropriate incoming port to the corresponding Target Group ([details](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/load-balancer-listeners.html)). - **Endpoint Service** - The VPC Endpoint Service is what allows for the VPC to VPC connection, routing incoming requests to the configured load balancer. - **Load balancer type:** Network. - **Load balancer:** Attach the NLB created in the previous step. - **Acceptance required (recommended)**: When enabled, requires a new connection request to the VPC Endpoint Service to be accepted by the customer before connectivity is allowed ([details](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests)). + + ### 2. Grant dbt AWS account access to the VPC Endpoint Service Once these resources have been provisioned, access needs to be granted for the dbt Labs AWS account to create a VPC Endpoint in our VPC. On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the following IAM role in the appropriate production AWS account and save your changes ([details](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#add-remove-permissions)). 
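Circling back to the Snowflake network policy steps above: after adding the network rule to your policy, you can confirm it took effect with a quick check. A sketch, where `my_network_policy` is a hypothetical policy name:

```sql
-- List the rule and inspect the policy to confirm dbt Cloud access is in place
SHOW NETWORK RULES LIKE 'allow_dbt_cloud_access';
DESCRIBE NETWORK POLICY my_network_policy;  -- ALLOWED_NETWORK_RULE_LIST should include allow_dbt_cloud_access
```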
@@ -106,3 +111,5 @@ Once dbt confirms that the PrivateLink integration is complete, you can use it i + + \ No newline at end of file diff --git a/website/docs/docs/cloud/use-dbt-assist.md b/website/docs/docs/cloud/use-dbt-assist.md deleted file mode 100644 index 4eef6c87329..00000000000 --- a/website/docs/docs/cloud/use-dbt-assist.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "Use dbt Assist" -sidebar_label: "Use dbt Assist" -description: "Use dbt Assist to generate documentation and tests from scratch, giving you the flexibility to modify or fix generated code." ---- - -# Use dbt Assist - -Use dbt Assist to generate documentation and tests from scratch, giving you the flexibility to modify or fix generated code. To access and use dbt Assist: - -1. Navigate to the dbt Cloud IDE and select a SQL model file under the **File Explorer**. - -2. In the **Console** section (under the **File Editor**), select the **dbt Assist** to view the available AI options. - -3. Select the available options: **Documentation** or **Tests** to generate the YAML config. - - To generate both for the same model, click each option separately. dbt Assist intelligently saves the YAML config in the same file. - -4. Verify the AI-generated code. You can update or fix the code as needed. - -5. Click **Save** to save the code. You should see the file changes under the **Version control** section. - - diff --git a/website/docs/docs/cloud/use-dbt-copilot.md b/website/docs/docs/cloud/use-dbt-copilot.md new file mode 100644 index 00000000000..30def967f96 --- /dev/null +++ b/website/docs/docs/cloud/use-dbt-copilot.md @@ -0,0 +1,22 @@ +--- +title: "Use dbt Copilot" +sidebar_label: "Use dbt Copilot" +description: "Use the dbt Copilot AI engine to generate documentation, tests, and semantic models from scratch, giving you the flexibility to modify or fix generated code." +--- + +# Use dbt Copilot + +Use dbt Copilot to generate documentation, tests, and semantic models from scratch, giving you the flexibility to modify or fix generated code. To access and use this AI engine: + +1. Navigate to the dbt Cloud IDE and select a SQL model file under the **File Explorer**. + +2. In the **Console** section (under the **File Editor**), click **dbt Copilot** to view the available AI options. + +3. Select the available options to generate the YAML config: **Generate Documentation**, **Generate Tests**, or **Generate Semantic Model**. + - To generate multiple YAML configs for the same model, click each option separately. dbt Copilot intelligently saves the YAML config in the same file. + +4. Verify the AI-generated code. You can update or fix the code as needed. + +5. Click **Save As**. You should see the file changes under the **Version control** section. + + diff --git a/website/docs/docs/collaborate/auto-exposures.md b/website/docs/docs/collaborate/auto-exposures.md new file mode 100644 index 00000000000..9b25a2fb305 --- /dev/null +++ b/website/docs/docs/collaborate/auto-exposures.md @@ -0,0 +1,22 @@ +--- +title: "Auto-exposures" +sidebar_label: "Auto-exposures" +description: "Import and auto-generate exposures from dashboards and understand how models are used in downstream tools for a richer lineage." +pagination_prev: null +pagination_next: "docs/collaborate/data-tile" +image: /img/docs/cloud-integrations/auto-exposures/explorer-lineage.jpg +--- + +# Auto-exposures + +As a data team, it’s critical that you have context into the downstream use cases and users of your data products. 
Auto-exposures integrates natively with Tableau (Power BI coming soon) and auto-generates downstream lineage in dbt Explorer for a richer experience. + +Auto-exposures helps users understand how their models are used in downstream analytics tools to inform investments and reduce incidents — ultimately building trust and confidence in data products. It imports and auto-generates exposures based on Tableau dashboards, with user-defined curation. + +Auto-exposures is available on [Versionless](/docs/dbt-versions/versionless-cloud) and on [dbt Cloud Enterprise](https://www.getdbt.com/pricing/) plans. + +For more information on how to set up auto-exposures, prerequisites, and more — refer to [configure auto-exposures in Tableau and dbt Cloud](/docs/cloud-integrations/configure-auto-exposures). + +import ViewExposures from '/snippets/_auto-exposures-view.md'; + + diff --git a/website/docs/docs/collaborate/data-tile.md b/website/docs/docs/collaborate/data-tile.md new file mode 100644 index 00000000000..1d5b26e26b7 --- /dev/null +++ b/website/docs/docs/collaborate/data-tile.md @@ -0,0 +1,271 @@ +--- +title: "Data health tile" +id: "data-tile" +sidebar_label: "Data health tile" +description: "Embed data health tiles in your dashboards to distill trust signals for data consumers." +image: /img/docs/collaborate/dbt-explorer/data-tile-pass.jpg +--- + +With data health tiles, stakeholders will get an at-a-glance confirmation on whether the data they’re looking at is stale or degraded. This trust signal allows teams to immediately go back into Explorer to see more details and investigate issues. + +The data health tile: + +- Distills trust signals for data consumers. +- Deep links you into dbt Explorer where you can further dive into upstream data issues. +- Provides richer information and makes it easier to debug. +- Revamps the existing, [job-based tiles](#job-based-data-health). + +Data health tiles rely on [exposures](/docs/build/exposures) to surface trust signals in your dashboards. When you configure exposures in your dbt project, you are explicitly defining how specific outputs—like dashboards or reports—depend on your data models. + + + + + + +## Prerequisites + +- You must have a dbt Cloud account on a [Team or Enterprise plan](https://www.getdbt.com/pricing/). +- You must be an account admin to set up [service tokens](/docs/dbt-cloud-apis/service-tokens#permissions-for-service-account-tokens). +- You must have [develop permissions](/docs/cloud/manage-access/seats-and-users). +- Have [exposures](/docs/build/exposures) configured in your project and [source freshness](/docs/deploy/source-freshness) enabled in the job that generates this exposure. + +## View exposure in dbt Explorer + +First, be sure to enable [source freshness](/docs/deploy/source-freshness) in the job that generates this exposure. + +1. Navigate to dbt Explorer by clicking on the **Explore** link in the navigation. +2. In the main **Overview** page, go to the left navigation. +3. Under the **Resources** tab, click on **Exposures** to view the [exposures](/docs/build/exposures) list. +4. Select a dashboard exposure and go to the **General** tab to view the data health information. +5. In this tab, you’ll see: + - Name of the exposure. + - Data health status: Data freshness passed, Data quality passed, Data may be stale, Data quality degraded. + - Resource type (model, source, and so on). + - Dashboard status: Failure, Pass, Stale. + - You can also see the last check completed, the last check time, and the last check duration. 
+6. You can click the **Open Dashboard** button on the upper right to immediately view this in your analytics tool. + + + +## Embed in your dashboard + +Once you’ve navigated to the exposure in dbt Explorer, you’ll need to set up your data health tile and [service token](/docs/dbt-cloud-apis/service-tokens). You can embed data health tile to any analytics tool that supports URL or iFrame embedding. + +Follow these steps to set up your data health tile: + +1. Go to **Account settings** in dbt Cloud. +2. Select **API tokens** in the left sidebar and then **Service tokens**. +3. Click on **Create service token** and give it a name. +4. Select the [**Metadata Only**](/docs/dbt-cloud-apis/service-tokens) permission. This token will be used to embed the tile in your dashboard in the later steps. + + +5. Copy the **Metadata Only** token and save it in a secure location. You'll need it token in the next steps. +6. Navigate back to dbt Explorer and select an exposure. +7. Below the **Data health** section, expand on the toggle for instructions on how to embed the exposure tile (if you're an account admin with develop permissions). +8. In the expanded toggle, you'll see a text field where you can paste your **Metadata Only token**. + + +9. Once you’ve pasted your token, you can select either **URL** or **iFrame** depending on which you need to add to your dashboard. + +If your analytics tool supports iFrames, you can embed the dashboard tile within it. + +### Examples +The following examples show how to embed the data health tile in Tableau and PowerBI. + + + + + +You can embed the data health tile iFrame in PowerBI using PowerBI Pro Online, Fabric PowerBI, or PowerBI Desktop. + + + +Follow these steps to embed the data health tile in PowerBI: + +1. Create a dashboard in PowerBI and connect to your database to pull in the data. +2. Create a new PowerBI measure by right-clicking on your **Data**, **More options**, and then **New measure**. + + +3. Navigate to dbt Explorer, select the exposure, and expand the [**Embed data health into your dashboard**](/docs/collaborate/data-tile#embed-in-your-dashboard) toggle. +4. Go to the **iFrame** tab and copy the iFrame code. Make sure the Metadata Only token is already set up. +5. In PowerBI, paste the iFrame code you copied into your measure calculation window. The iFrame code should look like this: + + ```html + Website = + "" + ``` + + + +6. PowerBI desktop doesn't support HTML rendering by default, so you need to install an HTML component from the PowerBI Visuals Store. +7. To do this, go to **Build visuals** and then **Get more visuals**. +8. Login with your PowerBI account. +9. There are several third-party HTML visuals. The one tested for this guide is [HTML content](https://appsource.microsoft.com/en-us/product/power-bi-visuals/WA200001930?tab=Overview). Install it, but please keep in mind it's a third-party plugin not created or supported by dbt Labs. +10. Drag the metric with the iFrame code into the HTML content widget in PowerBI. This should now display your data health tile. + + + +*Refer to [this tutorial](https://www.youtube.com/watch?v=SUm9Hnq8Th8) for additional information on embedding a website into your Power BI report.* + + + + + +Follow these steps to embed the data health tile in Tableau: + + + +1. Create a dashboard in Tableau and connect to your database to pull in the data. +2. Ensure you've copied the URL or iFrame snippet available in dbt Explorer's **Data health** section, under the **Embed data health into your dashboard** toggle. 
+3. Insert a **Web Page** object. +4. Insert the URL and click **Ok**. + + ```html + https://metadata.ACCESS_URL/exposure-tile?uniqueId=exposure.EXPOSURE_NAME&environmentType=production&environmentId=220370&token= + ``` + + *Note, replace the placeholders with your actual values.* +5. You should now see the data health tile embedded in your Tableau dashboard. + + + + + +Follow these steps to embed the data health tile in Sigma: + + + +1. Create a dashboard in Sigma and connect to your database to pull in the data. +2. Ensure you've copied the URL or iFrame snippet available in dbt Explorer's **Data health** section, under the **Embed data health into your dashboard** toggle. +3. Add a new embedded UI element in your Sigma Workbook in the following format: + + ```html + https://metadata.ACCESS_URL/exposure-tile?uniqueId=exposure.EXPOSURE_NAME&environmentType=production&environmentId=ENV_ID_NUMBER&token= + ``` + + *Note, replace the placeholders with your actual values.* +4. You should now see the data health tile embedded in your Sigma dashboard. + + + + + +## Job-based data health + +The default experience is the [environment-based data health tile](#view-exposure-in-dbt-explorer) with dbt Explorer. + +This section is for legacy job-based data health tiles. If you're using the revamped environment-based exposure tile, refer to the previous section. Expand the following to learn more about the legacy job-based data health tile. + + +In dbt Cloud, the [Discovery API](/docs/dbt-cloud-apis/discovery-api) can power dashboard status tiles, which are job-based. A dashboard status tile is placed on a dashboard (specifically: anywhere you can embed an iFrame) to give insight into the quality and freshness of the data feeding into that dashboard. This is done in dbt [exposures](/docs/build/exposures). + +#### Functionality +The dashboard status tile looks like this: + + + +The data freshness check fails if any sources feeding into the exposure are stale. The data quality check fails if any dbt tests fail. A failure state could look like this: + + + +Clicking into **see details** from the Dashboard Status Tile takes you to a landing page where you can learn more about the specific sources, models, and tests feeding into this exposure. + +#### Setup +First, be sure to enable [source freshness](/docs/deploy/source-freshness) in the job that generates this exposure. + +In order to set up your dashboard status tile, here is what you need: + +1. **Metadata Only token.** You can learn how to set up a Metadata-Only token [here](/docs/dbt-cloud-apis/service-tokens). + +2. **Exposure name.** You can learn more about how to set up exposures [here](/docs/build/exposures). + +3. **Job iD.** Remember that you can select your job ID directly from the URL when looking at the relevant job in dbt Cloud. + +You can insert these three fields into the following iFrame, and then embed it **anywhere that you can embed an iFrame**: + +``` + +``` + +:::tip Replace `YOUR_ACCESS_URL` with your region and plan's Access URL + +dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. Replace `YOUR_ACCESS_URL` with the appropriate [Access URL](/docs/cloud/about-cloud/access-regions-ip-addresses) for your region and plan. For example, if your account is hosted in the EMEA region, you would use the following iFrame code: + +``` + +``` + +::: + +#### Embedding with BI tools +The dashboard status tile should work anywhere you can embed an iFrame. 
But below are some tactical tips on how to integrate with common BI tools. + + + + +#### Mode +Mode allows you to directly [edit the HTML](https://mode.com/help/articles/report-layout-and-presentation/#html-editor) of any given report, where you can embed the iFrame. + +Note that Mode has also built its own [integration](https://mode.com/get-dbt/) with the dbt Cloud Discovery API! + + + + +#### Looker +Looker does not allow you to directly embed HTML and instead requires creating a [custom visualization](https://docs.looker.com/admin-options/platform/visualizations). One way to do this for admins is to: +- Add a [new visualization](https://fishtown.looker.com/admin/visualizations) on the visualization page for Looker admins. You can use [this URL](https://metadata.cloud.getdbt.com/static/looker-viz.js) to configure a Looker visualization powered by the iFrame. It will look like this: + + + +- Once you have set up your custom visualization, you can use it on any dashboard! You can configure it with the exposure name, job ID, and token relevant to that dashboard. + + + + + + +#### Tableau +Tableau does not require you to embed an iFrame. You only need to use a Web Page object on your Tableau Dashboard and a URL in the following format: + +``` +https://metadata.YOUR_ACCESS_URL/exposure-tile?name=&jobId=&token= +``` + +:::tip Replace `YOUR_ACCESS_URL` with your region and plan's Access URL + +dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. Replace `YOUR_ACCESS_URL` with the appropriate [Access URL](/docs/cloud/about-cloud/access-regions-ip-addresses) for your region and plan. For example, if your account is hosted in the North American region, you would use the following code: + +``` +https://metadata.cloud.getdbt.com/exposure-tile?name=&jobId=&token= + +``` +::: + + + + + + +#### Sigma + +Sigma does not require you to embed an iFrame. Add a new embedded UI element in your Sigma Workbook in the following format: + +``` +https://metadata.YOUR_ACCESS_URL/exposure-tile?name=&jobId=&token= +``` + +:::tip Replace `YOUR_ACCESS_URL` with your region and plan's Access URL + +dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. Replace `YOUR_ACCESS_URL` with the appropriate [Access URL](/docs/cloud/about-cloud/access-regions-ip-addresses) for your region and plan. For example, if your account is hosted in the APAC region, you would use the following code: + +``` +https://metadata.au.dbt.com/exposure-tile?name=&jobId=&token= + +``` +::: + + + + + + diff --git a/website/docs/docs/collaborate/dbt-explorer-faqs.md b/website/docs/docs/collaborate/dbt-explorer-faqs.md index 7533aa8ff99..f9c2a8ae631 100644 --- a/website/docs/docs/collaborate/dbt-explorer-faqs.md +++ b/website/docs/docs/collaborate/dbt-explorer-faqs.md @@ -2,7 +2,7 @@ title: "dbt Explorer FAQs" sidebar_label: "dbt Explorer FAQs" description: "Learn more with the FAQs about dbt Explorer, how it works, how to interact with it, and more." -pagination_next: null +pagination_next: "docs/collaborate/auto-exposures" --- [dbt Explorer](/docs/collaborate/explore-projects) is dbt Cloud’s new knowledge base and lineage visualization experience. It offers an interactive and high-level view of your company’s entire data estate, where you can dive deep into the context you need to understand and improve lineage so your teams can trust the data they’re using to make decisions. 
diff --git a/website/docs/docs/collaborate/explore-projects.md b/website/docs/docs/collaborate/explore-projects.md index 2c4646fa4d4..a4388a8696e 100644 --- a/website/docs/docs/collaborate/explore-projects.md +++ b/website/docs/docs/collaborate/explore-projects.md @@ -2,11 +2,12 @@ title: "Discover data with dbt Explorer" sidebar_label: "Discover data with dbt Explorer" description: "Learn about dbt Explorer and how to interact with it to understand, improve, and leverage your dbt projects." -pagination_next: "docs/collaborate/column-level-lineage" +image: /img/docs/collaborate/dbt-explorer/example-project-lineage-graph.png +pagination_next: "docs/collaborate/access-from-dbt-cloud" pagination_prev: null --- -With dbt Explorer, you can view your project's [resources](/docs/build/projects) (such as models, tests, and metrics) and their lineage to gain a better understanding of its latest production state. Navigate and manage your projects within dbt Cloud to help you and other data developers, analysts, and consumers discover and leverage your dbt resources. +With dbt Explorer, you can view your project's [resources](/docs/build/projects) (such as models, tests, and metrics), their lineage, and [model consumption](/docs/collaborate/auto-exposures) to gain a better understanding of its latest production state. Navigate and manage your projects within dbt Cloud to help you and other data developers, analysts, and consumers discover and leverage your dbt resources. import ExplorerCourse from '/snippets/_explorer-course-link.md'; @@ -19,15 +20,17 @@ import ExplorerCourse from '/snippets/_explorer-course-link.md'; - You have at least one successful job run in the deployment environment. Note that [CI jobs](/docs/deploy/ci-jobs) do not update dbt Explorer. - You are on the dbt Explorer page. To do this, select **Explore** from the navigation in dbt Cloud. -## Overview page +## Overview page -Navigate the dbt Explorer overview page to access your project's resources and metadata, available in beta. The page includes the following sections: +Navigate the dbt Explorer overview page to access your project's resources and metadata. The page includes the following sections: - **Search bar** — [Search](#search-resources) for resources in your project by keyword. You can also use filters to refine your search results. - **Sidebar** — Use the left sidebar to access model [performance](/docs/collaborate/model-performance), [project recommendations](/docs/collaborate/project-recommendations) in the **Project details** section. Browse your project's [resources, file tree, and database](#browse-with-the-sidebar) in the lower section of the sidebar. - **Lineage graph** — Explore your project's or account's [lineage graph](#project-lineage) to visualize the relationships between resources. - **Latest updates** — View the latest changes or issues related to your project's resources, including the most recent job runs, changed properties, lineage, and issues. -- **Marts and public models** — View the [marts](/best-practices/how-we-structure/1-guide-overview#guide-structure-overview) and [public models](/docs/collaborate/govern/model-access#access-modifiers) in your project. +- **Marts and public models** — View the [marts](/best-practices/how-we-structure/1-guide-overview#guide-structure-overview) and [public models](/docs/collaborate/govern/model-access#access-modifiers) in your project. You can also navigate to all public models in your account through this view. 
+- **Model query history** — Use [model query history](/docs/collaborate/model-query-history) to track consumption queries on your models for deeper insights. +- **Auto-exposures** — [Set up and view auto-exposures](/docs/collaborate/auto-exposures) to automatically expose relevant data models from Tableau to enhance visibility. @@ -93,7 +96,7 @@ To explore the lineage graphs of tests and macros, view [their resource details ### Example of full lineage graph -Example of exploring the `order_items` model in the project's lineage graph: +Example of exploring a model in the project's lineage graph: @@ -110,13 +113,14 @@ Lenses are helpful to analyze a subset of the DAG if you're zoomed in, or to fin A resource in your project is characterized by resource type, materialization type, or model layer, as well as its latest run or latest test status. Lenses are available for the following metadata: - **Relationship**: Organizes resources by resource type, such as models, tests, seeds, and [more](/reference/node-selection/syntax). Resource type uses the `resource_type` selector. -- **Materialization Type**: Identifies the strategy for building the dbt models in your data platform. -- **Latest Status**: The status from the latest execution of the resource in the current environment. For example, diagnosing a failed DAG region. -- **Model Layer**: The modeling layer that the model belongs to according to [best practices guide](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview#guide-structure-overview). For example, discovering marts models to analyze. +- **Materialization type**: Identifies the strategy for building the dbt models in your data platform. +- **Latest status**: The status from the latest execution of the resource in the current environment. For example, diagnosing a failed DAG region. +- **Model layer**: The modeling layer that the model belongs to according to [best practices guide](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview#guide-structure-overview). For example, discovering marts models to analyze. - **Marts** — A model with the prefix `fct_` or `dim_` or a model that lives in the `/marts/` subdirectory. - **Intermediate** — A model with the prefix `int_`. Or, a model that lives in the `/int/` or `/intermediate/` subdirectory. - **Staging** — A model with the prefix `stg_`. Or, a model that lives in the `/staging/` subdirectory. -- **Test Status**: The status from the latest execution of the tests that ran again this resource. In the case that a model has multiple tests with different results, the lens reflects the 'worst case' status. +- **Test status**: The status from the latest execution of the tests that ran again this resource. In the case that a model has multiple tests with different results, the lens reflects the 'worst case' status. +- **Usage queries**: The number of queries against this resource over a given time period. @@ -158,12 +162,64 @@ Under the the **Models** option, you can filter on model properties (access or m + + +Trust signal icons offer a quick, at-a-glance view of data health when browsing your models in dbt Explorer. These icons keep you informed on the status of your model's health using the indicators **Healthy**, **Caution**, **Degraded**, and **Unknown**. For accurate health data, ensure the resource is up-to-date and has had a recent job run. 
+ +Each trust signal icon reflects key data health components, such as test success status, missing resource descriptions, absence of builds in 30-day windows, and more. + +To access trust signals: +- Use the search function or click on **Models** or **Sources** under the **Resource** tab. +- View the icons under the **Health** column. +- Hover over or click the trust signal to see detailed information. +- For sources, the trust signal also indicates the source freshness status. + + + + + + + ### Example of keyword search -Example of results from searching on the keyword `item` and applying the filters models, description, and code: +Example of results from searching on the keyword `customers` and applying the filters models, description, and code. Trust signals are visible to the right of the model name in the search results. - ## Browse with the sidebar From the sidebar, you can browse your project's resources, its file tree, and the database. @@ -197,6 +253,7 @@ In the upper right corner of the resource details page, you can: +- Trust signal icon — Icons offering a quick, at-a-glance view of data health. These icons indicate whether a model is Healthy, Caution, Degraded, or Unknown. Hover over an icon to view detailed information about the model's health. - **Status bar** (below the page title) — Information on the last time the model ran, whether the run was successful, how the data is materialized, number of rows, and the size of the model. - **General** tab includes: - **Lineage** graph — The model’s lineage graph that you can interact with. The graph includes one upstream node and one downstream node from the model. Click the Expand icon in the graph's upper right corner to view the model in full lineage graph mode. @@ -214,11 +271,12 @@ In the upper right corner of the resource details page, you can: - **Status bar** (below the page title) — Information on the last time the exposure was updated. - **General** tab includes: + - **Data health** — The status on data freshness and data quality. - **Status** section — The status on data freshness and data quality. - - **Lineage** graph — The exposure’s lineage graph. Click the Expand icon in the graph's upper right corner to view the exposure in full lineage graph mode. + - **Lineage** graph — The exposure’s lineage graph. Click the **Expand** icon in the graph's upper right corner to view the exposure in full lineage graph mode. Integrates natively with Tableau and auto-generates downstream lineage. - **Description** section — A description of the exposure. - **Details** section — Details like exposure type, maturity, owner information, and more. - - **Relationships** section — The nodes the exposure **Depends On**. + - **Relationships** section — The nodes the exposure **Depends On**. @@ -257,9 +315,13 @@ Example of the Tests view: ### Example of model details -Example of the details view for the model `customers`: + + +Example of the details view for the model `customers`:
## Staging environment @@ -269,7 +331,6 @@ You can explore the metadata from your production or staging environment to info - ## Related content - [Enterprise permissions](/docs/cloud/manage-access/enterprise-permissions) - [About model governance](/docs/collaborate/govern/about-model-governance) diff --git a/website/docs/docs/collaborate/git/version-control-basics.md b/website/docs/docs/collaborate/git/version-control-basics.md index edde1bdb48c..f72f05d7508 100644 --- a/website/docs/docs/collaborate/git/version-control-basics.md +++ b/website/docs/docs/collaborate/git/version-control-basics.md @@ -23,7 +23,7 @@ Check out some common git terms below that you might encounter when developing: | Branch | A branch is a parallel version of a repository. It is contained within the repository but does not affect the primary or main branch allowing you to work freely without disrupting the live version. When you've made the changes you want to make, you can merge your branch back into the main branch to publish your changes | | Checkout | The `checkout` command is used to create a new branch, change your current working branch to a different branch, or switch to a different version of a file from a different branch. | | Commit | A commit is a user’s change to a file (or set of files). When you make a commit to save your work, Git creates a unique ID that allows you to keep a record of the specific changes committed along with who made them and when. Commits usually contain a commit message which is a brief description of what changes were made. | -| main | The primary, base branch of all repositories. All committed and accepted changes should be on the main branch.

In the Cloud IDE, the main branch is protected. This means you can directly edit, format, or lint files and execute dbt commands in your protected primary git branch. Since the dbt Cloud IDE prevents commits to the protected branch, you can commit those changes to a new branch.| +| main | The primary, base branch of all repositories. All committed and accepted changes should be on the main branch.

In the Cloud IDE, the main branch is protected. This means you can't directly edit, format, or lint files and execute dbt commands in your protected primary git branch. Since the dbt Cloud IDE prevents commits to the protected branch, you can commit those changes to a new branch.| | Merge | Merge takes the changes from one branch and adds them into another (usually main) branch. These commits are usually first requested via pull request before being merged by a maintainer. | | Pull Request | If someone has changed code on a separate branch of a project and wants it to be reviewed to add to the main branch, they can submit a pull request. Pull requests ask the repo maintainers to review the commits made, and then, if acceptable, merge the changes upstream. A pull happens when adding the changes to the main branch. | | Push | A `push` updates a remote branch with the commits made to the current branch. You are literally _pushing_ your changes into the remote. | diff --git a/website/docs/docs/collaborate/govern/about-model-governance.md b/website/docs/docs/collaborate/govern/about-model-governance.md index a845e941e54..195b1d03caa 100644 --- a/website/docs/docs/collaborate/govern/about-model-governance.md +++ b/website/docs/docs/collaborate/govern/about-model-governance.md @@ -11,3 +11,5 @@ pagination_prev: null [**Model contracts**](model-contracts): Guarantee the shape of a model while it is building to avoid surprises or breaking changes for downstream queries. Explicitly define column names, data types, and constraints (as supported by your data platform). [**Model versions**](model-versions): When a breaking change is unavoidable, provide a smoother upgrade pathway by creating a new version of the model. These model versions share a common reference name and can reuse properties & configurations. + +[**Project dependencies**](/docs/collaborate/govern/project-dependencies): Use cross project dependencies to reference public models across dbt projects using the [two-argument ref](/reference/dbt-jinja-functions/ref#ref-project-specific-models), which includes the project name. diff --git a/website/docs/docs/collaborate/govern/model-contracts.md b/website/docs/docs/collaborate/govern/model-contracts.md index 9e59d089e49..d30024157c8 100644 --- a/website/docs/docs/collaborate/govern/model-contracts.md +++ b/website/docs/docs/collaborate/govern/model-contracts.md @@ -28,7 +28,8 @@ At present, model contracts are supported for: Model contracts are _not_ supported for: - Python models. -- `ephemeral`-materialized SQL models. +- `materialized view` or `ephemeral`-materialized SQL models. +- Custom materializations (unless added by the author). - Models with recursive 's in BigQuery. - Other resource types, such as `sources`, `seeds`, `snapshots`, and so on. @@ -177,14 +178,14 @@ Currently, `not_null` and `check` constraints are enforced only after a model is ### Which models should have contracts? Any model meeting the criteria described above _can_ define a contract. We recommend defining contracts for ["public" models](model-access) that are being relied on downstream. -- Inside of dbt: Shared with other groups, other teams, and (in the future) other dbt projects. +- Inside of dbt: Shared with other groups, other teams, and [other dbt projects](/best-practices/how-we-mesh/mesh-1-intro). - Outside of dbt: Reports, dashboards, or other systems & processes that expect this model to have a predictable structure. You might reflect these downstream uses with [exposures](/docs/build/exposures). 
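For example, a public model that downstream consumers rely on might enforce its contract like this (a minimal sketch; the model and column names are illustrative):

```yaml
# models/marts/schema.yml (illustrative example)
models:
  - name: dim_customers
    access: public
    config:
      contract:
        enforced: true
    columns:
      - name: customer_id
        data_type: int
        constraints:
          - type: not_null
      - name: customer_name
        data_type: string
```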
### How are contracts different from tests? A model's contract defines the **shape** of the returned dataset. If the model's logic or input data doesn't conform to that shape, the model does not build. -[Data Tests](/docs/build/data-tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the data test. Data tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures, because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures). +[Data Tests](/docs/build/data-tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the data test. Data tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures). In some cases, you can replace a data test with its equivalent constraint. This has the advantage of guaranteeing the validation at build time, and it probably requires less compute (cost) in your data platform. The prerequisites for replacing a data test with a constraint are: - Making sure that your data platform can support and enforce the constraint that you need. Most platforms only enforce `not_null`. @@ -205,9 +206,12 @@ At the same time, for models with many columns, we understand that this can mean When comparing to a previous project state, dbt will look for breaking changes that could impact downstream consumers. If breaking changes are detected, dbt will present a contract error. Breaking changes include: -- Removing an existing column -- Changing the `data_type` of an existing column -- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher) +- Removing an existing column. +- Changing the `data_type` of an existing column. +- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher). +- Removing a contracted model by deleting, renaming, or disabling it (dbt v1.9 or higher). + - versioned models will raise an error. + - unversioned models will raise a warning. More details are available in the [contract reference](/reference/resource-configs/contract#detecting-breaking-changes). diff --git a/website/docs/docs/collaborate/govern/model-versions.md b/website/docs/docs/collaborate/govern/model-versions.md index f255aa9db1a..eefcf76e824 100644 --- a/website/docs/docs/collaborate/govern/model-versions.md +++ b/website/docs/docs/collaborate/govern/model-versions.md @@ -69,7 +69,7 @@ When you make updates to a model's source code — its logical definition, i **Versioned models are different.** Defining model `versions` is appropriate when people, systems, and processes beyond your team's control, inside or outside of dbt, depend on your models. You can neither simply go migrate them all, nor break their queries on a whim. You need to offer a migration path, with clear diffs and deprecation dates. -Multiple versions of a model will live in the same code repository at the same time, and be deployed into the same data environment simultaneously. 
This is similar to how web APIs are versioned: Multiple versions are live simultaneously, two or three, and not more). Over time, newer versions come online, and older versions are sunsetted . +Multiple versions of a model will live in the same code repository at the same time, and be deployed into the same data environment simultaneously. This is similar to how web APIs are versioned: Multiple versions live simultaneously, two or three, and not more). Over time, newer versions come online, and older versions are sunsetted . ## How is this different from just creating a new model? diff --git a/website/docs/docs/collaborate/govern/project-dependencies.md b/website/docs/docs/collaborate/govern/project-dependencies.md index a56646b0d0b..7813e25efcb 100644 --- a/website/docs/docs/collaborate/govern/project-dependencies.md +++ b/website/docs/docs/collaborate/govern/project-dependencies.md @@ -18,9 +18,10 @@ This year, dbt Labs is introducing an expanded notion of `dependencies` across m ## Prerequisites - Available in [dbt Cloud Enterprise](https://www.getdbt.com/pricing). If you have an Enterprise account, you can unlock these features by designating a [public model](/docs/collaborate/govern/model-access) and adding a [cross-project ref](#how-to-write-cross-project-ref). -- Use a supported version of dbt (v1.6, v1.7, or go versionless with "[Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless)") for both the upstream ("producer") project and the downstream ("consumer") project. +- Use a supported version of dbt (v1.6 or newer or go versionless with "[Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless)") for both the upstream ("producer") project and the downstream ("consumer") project. - Define models in an upstream ("producer") project that are configured with [`access: public`](/reference/resource-configs/access). You need at least one successful job run after defining their `access`. - Define a deployment environment in the upstream ("producer") project [that is set to be your Production environment](/docs/deploy/deploy-environments#set-as-production-environment), and ensure it has at least one successful job run in that environment. +- If the upstream project has a Staging environment, run a job in that Staging environment to ensure the downstream cross-project ref resolves. - Each project `name` must be unique in your dbt Cloud account. For example, if you have a dbt project (codebase) for the `jaffle_marketing` team, you should not create separate projects for `Jaffle Marketing - Dev` and `Jaffle Marketing - Prod`. That isolation should instead be handled at the environment level. - We are adding support for environment-level permissions and data warehouse connections; please contact your dbt Labs account team for beta access. - The `dbt_project.yml` file is case-sensitive, which means the project name must exactly match the name in your `dependencies.yml`. For example, if your project name is `jaffle_marketing`, you should use `jaffle_marketing` (not `JAFFLE_MARKETING`) in all related files. @@ -30,9 +31,6 @@ import UseCaseInfo from '/snippets/_packages_or_dependencies.md'; -Refer to the [FAQs](#faqs) for more info. - - ## Example As an example, let's say you work on the Marketing team at the Jaffle Shop. 
The name of your team's project is `jaffle_marketing`: @@ -105,13 +103,18 @@ For more guidance on how to use dbt Mesh, refer to the dedicated [dbt Mesh guide ### Safeguarding production data with staging environments -When working in a Development environment, cross-project `ref`s normally resolve to the Production environment of the project. However, to protect production data, set up a [Staging deployment environment](/docs/deploy/deploy-environments#staging-environment) within your projects. With a staging environment integrated into the project, any references from external projects during development workflows resolve to the Staging environment. This adds a layer of security between your Deployment and Production environments by limiting access to production data. +When working in a Development environment, cross-project `ref`s normally resolve to the Production environment of the project. However, to protect production data, set up a [Staging deployment environment](/docs/deploy/deploy-environments#staging-environment) within your projects. + +With a staging environment integrated into the project, dbt Mesh automatically fetches public model information from the producer’s staging environment if the consumer is also in staging. Similarly, dbt Mesh fetches from the producer’s production environment if the consumer is in production. This ensures consistency between environments and adds a layer of security by preventing access to production data during development workflows. Read [Why use a staging environment](/docs/deploy/deploy-environments#why-use-a-staging-environment) for more information about the benefits. #### Staging with downstream dependencies -dbt Cloud begins using the Staging environment to resolve cross-project references from downstream projects as soon as it exists in a project without "fail-over" to Production. To avoid causing downtime for downstream developers, you should define and trigger a job before marking the environment as Staging: +dbt Cloud begins using the Staging environment to resolve cross-project references from downstream projects as soon as it exists in a project without "fail-over" to Production. This means that dbt Cloud will consistently use metadata from the Staging environment to resolve references in downstream projects, even if there haven't been any successful runs in the configured Staging environment. + +To avoid causing downtime for downstream developers, you should define and trigger a job before marking the environment as Staging: + 1. Create a new environment, but do NOT mark it as **Staging**. 2. Define a job in that environment. 3. Trigger the job to run, and ensure it completes successfully. diff --git a/website/docs/docs/collaborate/model-query-history.md b/website/docs/docs/collaborate/model-query-history.md new file mode 100644 index 00000000000..0180757f980 --- /dev/null +++ b/website/docs/docs/collaborate/model-query-history.md @@ -0,0 +1,113 @@ +--- +title: "Model query history" +sidebar_label: "Model query history" +description: "Import and auto-generate exposures from dashboards and understand how models are used in downstream tools for a richer lineage." +image: /img/docs/collaborate/dbt-explorer/model-query-queried-models.jpg +--- + +# About model query history + +Model query history allows you to: + +- View the count of consumption queries for a model based on the data warehouse's query logs. +- Provides data teams insight, so they can focus their time and infrastructure spend on the worthwhile used data products. 
+- Enable analysts to find the most popular models used by other people. + +Model query history is powered by a single consumption query of the query log table in your data warehouse aggregated on a daily basis. It currently supports Snowflake and BigQuery only, with additional platforms coming soon. + +:::info What is a consumption query? +Consumption query is a metric of queries in your dbt project that has used the model in a given time. It filters down to `select` statements only to gauge model consumption and excludes dbt model build and test executions. + +So for example, if `model_super_santi` was queried 10 times in the past week, it would count as having 10 consumption queries for that particular time period. +::: + +## Prerequisites + +To access the features, you should meet the following: + +1. You have a dbt Cloud account on the [Enterprise plan](https://www.getdbt.com/pricing/). Single-tenant accounts should contact their account representative for setup. +2. You have set up a [production](https://docs.getdbt.com/docs/deploy/deploy-environments#set-as-production-environment) deployment environment for each project you want to explore, with at least one successful job run. +3. You have [admin permissions](/docs/cloud/manage-access/enterprise-permissions) in dbt Cloud to edit project settings or production environment settings. +4. Use Snowflake or BigQuery as your data warehouse and can enable query history permissions or work with an admin to do so. Support for additional data platforms coming soon. + +## Enable query history in dbt Cloud + +To enable model query history in dbt Cloud, follow these steps: + +1. Navigate to **Deploy** and then **Environments**. +2. Select the environment marked **PROD** and click **Settings**. +3. Click **Edit** and scroll to the **Query History** section to enable the query history toggle. When it’s green and to the right, it's enabled. +4. Click the **Test Permissions** button to validate the deployment credentials permissions are sufficient to support query history. + + + + + + + + +## Credential permissions + +This section explains the permissions and steps you need to enable and view model query history in dbt Explorer. + +The model query history feature uses the credentials in your production environment to gather metadata from your data warehouse’s query logs. This means you may need elevated permissions with the warehouse. Before making any changes to your data platform permissions, confirm the configured permissions in dbt Cloud: + +1. Navigate to **Deploy** and then **Environments**. +2. Select the Environment marked **PROD** and click **Settings**. +3. Look at the information under **Deployment credentials**. + - Note: Querying query history entails warehouse costs / uses credits. + + +4. Copy or cross reference those credential permissions with the warehouse permissions and grant your user the right permissions. + + + + This feature makes use of metadata tables available to Snowflake Enterprise tier accounts or higher, `QUERY_HISTORY` and `ACCESS_HISTORY`. The Snowflake user used in the production environment must have `GOVERNANCE_VIEWER` permissions to view the data. + + This can be granted to this user by your `ACCOUNTADMIN` user in Snowflake. For more details, view the snowflake docs [here](https://docs.snowflake.com/en/sql-reference/account-usage#enabling-other-roles-to-use-schemas-in-the-snowflake-database). + + + + + + This feature uses the metadata from the `INFORMATION_SCHEMA.JOBS` view in BigQuery. 
To access this, the user configured for your production environment must have the following [IAM roles](https://cloud.google.com/bigquery/docs/access-control) for your BigQuery project: + + - `roles/bigquery.resourceViewer` + - `roles/bigquery.jobs.create` + + + +## View query history in Explorer + +To enhance your discovery, you can view your model query history in various locations within dbt Explorer: +- [View from Performance charts](#view-from-performance-charts) +* [View from Project lineage](#view-from-project-lineage) +- [View from Model list](#view-from-model-list) + +### View from Performance charts + +1. Navigate to dbt Explorer by clicking on the **Explore** link in the navigation. +2. In the main **Overview** page, click on **Performance** under the **Project details** section. Scroll down to view the **Most consumed models**. +3. Use the dropdown menu on the right to select the desired time period, with options available for up to the past 3 months. + + + +4. Click on a model for more details and go to the **Performance** tab. +5. On the **Performance** tab, scroll down to the **Model performance** section. +6. Select the **Consumption queries** tab to view the consumption queries over a given time for that model. + + +### View from Project lineage + +1. To view your model in your project lineage, go to the main **Overview page** and click on **Project lineage.** +2. In the lower left of your lineage, click on **Lenses** and select **Consumption queries**. + + +3. Your lineage should display a small red box above each model, indicating the consumption query number. The number for each model represents the model consumption over the last 30 days. + +### View from Model list + +1. To view a list of models, go to the main **Overview page**. +2. In the left navigation, go to the **Resources** tab and click on **Models** to view the models list. +3. You can view the consumption query count for the models and sort by most or least consumed. The consumption query number for each model represents the consumption over the last 30 days. + diff --git a/website/docs/docs/collaborate/project-recommendations.md b/website/docs/docs/collaborate/project-recommendations.md index 12007c6b88b..c9499579e54 100644 --- a/website/docs/docs/collaborate/project-recommendations.md +++ b/website/docs/docs/collaborate/project-recommendations.md @@ -20,7 +20,7 @@ The Recommendations overview page includes two top-level metrics measuring the t - **Model test coverage** — The percent of models in your project (models not from a package or imported via dbt Mesh) with at least one dbt test configured on them. - **Model documentation coverage** — The percent of models in your project (models not from a package or imported via dbt Mesh) with a description. - + ## List of rules The following table lists the rules currently defined in the `dbt_project_evaluator` [package](https://hub.getdbt.com/dbt-labs/dbt_project_evaluator/latest/). 
diff --git a/website/docs/docs/community-adapters.md b/website/docs/docs/community-adapters.md index 22dd2404765..3af4e15b32b 100644 --- a/website/docs/docs/community-adapters.md +++ b/website/docs/docs/community-adapters.md @@ -9,7 +9,6 @@ Community adapters are adapter plugins contributed and maintained by members of | ------------------------------------------ | -------------------------------- | ------------------------------------- | | [Clickhouse](/docs/core/connect-data-platform/clickhouse-setup) | [Databend Cloud](/docs/core/connect-data-platform/databend-setup) | [Doris & SelectDB](/docs/core/connect-data-platform/doris-setup) | | [DuckDB](/docs/core/connect-data-platform/duckdb-setup) | [Exasol Analytics](/docs/core/connect-data-platform/exasol-setup) | [Extrica](/docs/core/connect-data-platform/extrica-setup) | -| [fal - Python models](/docs/core/connect-data-platform/fal-setup) | [Firebolt](/docs/core/connect-data-platform/firebolt-setup) | [Greenplum](/docs/core/connect-data-platform/greenplum-setup) | | [Hive](/docs/core/connect-data-platform/hive-setup) | [IBM DB2](/docs/core/connect-data-platform/ibmdb2-setup) | [Impala](/docs/core/connect-data-platform/impala-setup) | | [Infer](/docs/core/connect-data-platform/infer-setup) | [iomete](/docs/core/connect-data-platform/iomete-setup) | [MindsDB](/docs/core/connect-data-platform/mindsdb-setup) | | [MySQL](/docs/core/connect-data-platform/mysql-setup) | [RisingWave](/docs/core/connect-data-platform/risingwave-setup) | [Rockset](/docs/core/connect-data-platform/rockset-setup) | diff --git a/website/docs/docs/core/connect-data-platform/about-core-connections.md b/website/docs/docs/core/connect-data-platform/about-core-connections.md index 461aeea2e87..221f495d054 100644 --- a/website/docs/docs/core/connect-data-platform/about-core-connections.md +++ b/website/docs/docs/core/connect-data-platform/about-core-connections.md @@ -32,8 +32,6 @@ If you're using dbt from the command line (CLI), you'll need a profiles.yml file For detailed info, you can refer to the [Connection profiles](/docs/core/connect-data-platform/connection-profiles). - - ## Adapter features The following table lists the features available for adapters: @@ -55,5 +53,3 @@ For adapters that support it, you can partially build the catalog. This allows t ### Source freshness You can measure source freshness using the warehouse metadata tables on supported adapters. This allows for calculating source freshness without using the [`loaded_at_field`](/reference/resource-properties/freshness#loaded_at_field) and without querying the table directly. This is faster and more flexible (though it might sometimes be inaccurate, depending on how the warehouse tracks altered tables). You can override this with the `loaded_at_field` in the [source config](/reference/source-configs). If the adapter doesn't support this, you can still use the `loaded_at_field`. - - diff --git a/website/docs/docs/core/connect-data-platform/athena-setup.md b/website/docs/docs/core/connect-data-platform/athena-setup.md index 457dd84781f..825d3071ad2 100644 --- a/website/docs/docs/core/connect-data-platform/athena-setup.md +++ b/website/docs/docs/core/connect-data-platform/athena-setup.md @@ -2,12 +2,12 @@ title: "Athena setup" description: "Read this guide to learn about the Athena warehouse setup in dbt." 
meta: - maintained_by: Community - authors: Community - github_repo: 'dbt-athena/dbt-athena' + maintained_by: dbt Labs + authors: dbt Labs + github_repo: 'dbt-labs/dbt-athena' pypi_package: 'dbt-athena-community' min_core_version: 'v1.3.0' - cloud_support: Not Supported + cloud_support: Supported min_supported_version: 'engine version 2 and 3' slack_channel_name: '#db-athena' slack_channel_link: 'https://getdbt.slack.com/archives/C013MLFR7BQ' diff --git a/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md b/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md index 8a4d6b61004..0a0347df9ea 100644 --- a/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md +++ b/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md @@ -7,7 +7,7 @@ meta: github_repo: 'Microsoft/dbt-synapse' pypi_package: 'dbt-synapse' min_core_version: 'v0.18.0' - cloud_support: Not Supported + cloud_support: Supported min_supported_version: 'Azure Synapse 10' slack_channel_name: '#db-synapse' slack_channel_link: 'https://getdbt.slack.com/archives/C01DRQ178LQ' diff --git a/website/docs/docs/core/connect-data-platform/bigquery-setup.md b/website/docs/docs/core/connect-data-platform/bigquery-setup.md index eedc3646f89..8b1867ef620 100644 --- a/website/docs/docs/core/connect-data-platform/bigquery-setup.md +++ b/website/docs/docs/core/connect-data-platform/bigquery-setup.md @@ -390,9 +390,9 @@ my-profile: ### Running Python models on Dataproc -To run dbt Python models on GCP, dbt uses companion services, Dataproc and Cloud Storage, that offer tight integrations with BigQuery. You may use an existing Dataproc cluster and Cloud Storage bucket, or create new ones: -- https://cloud.google.com/dataproc/docs/guides/create-cluster -- https://cloud.google.com/storage/docs/creating-buckets +import BigQueryDataproc from '/snippets/_bigquery-dataproc.md'; + + Then, add the bucket name, cluster name, and cluster region to your connection profile: diff --git a/website/docs/docs/core/connect-data-platform/fal-setup.md b/website/docs/docs/core/connect-data-platform/fal-setup.md deleted file mode 100644 index 5dfb5967d3a..00000000000 --- a/website/docs/docs/core/connect-data-platform/fal-setup.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: "fal setup (Python models)" -description: "Read this guide to learn about the fal warehouse setup in dbt." -meta: - maintained_by: fal.ai - authors: 'Features & Labels' - github_repo: 'fal-ai/fal' - pypi_package: 'dbt-fal' - min_core_version: 'v1.3.0' - max_core_version: 'v1.5.0' - cloud_support: Not Supported - min_supported_version: 'n/a' - slack_channel_name: '#tools-fal' - slack_channel_link: 'https://getdbt.slack.com/archives/C02V8QW3Q4Q' - platform_name: 'fal' - config_page: '/reference/resource-configs/fal-configs' ---- - -:::info Adapter no longer maintained -The [`dbt-fal` adapter](https://github.com/fal-ai/dbt-fal) is no longer actively maintained. This means although the adapter is still operational, there is no further development or bug fixes planned and it may not be compatible with future versions of dbt. `dbt-fal` was test until dbt v1.5. - -Documentation for `dbt-fal` are kept for reference purposes only and will eventually be removed from the site in the future. -::: - -import SetUpPages from '/snippets/_setup-pages-intro.md'; - - - -## Setting up fal with other adapter - -[fal](http://github.com/fal-ai/fal) offers a Python runtime independent from what database you are using and integrates seamlessly with dbt. 
It works by downloading the data as a Pandas DataFrame, transforming it in a local Python runtime and uploading it to the database. The only configuration change you need to do is adding it to the `profiles.yml` and setting the `db_profile` property as the database profile you are already using. - -It will run all the SQL dbt models with the main adapter you specified in your `profiles.yml` and all the Python models are executed by the fal adapter. - -Example: - - - -```yaml -jaffle_shop: - target: dev_with_fal - outputs: - dev_with_fal: - type: fal - db_profile: dev_pg # This points to your main adapter - dev_pg: - type: postgres - ... -``` - - diff --git a/website/docs/docs/core/connect-data-platform/glue-setup.md b/website/docs/docs/core/connect-data-platform/glue-setup.md index f2cf717147a..a074038a87f 100644 --- a/website/docs/docs/core/connect-data-platform/glue-setup.md +++ b/website/docs/docs/core/connect-data-platform/glue-setup.md @@ -175,7 +175,7 @@ Please to update variables between **`<>`**, here are explanations of these argu ### Configuration of the local environment -Because **`dbt`** and **`dbt-glue`** adapters are compatible with Python versions 3.7, 3.8, and 3.9, check the version of Python: +Because **`dbt`** and **`dbt-glue`** adapters are compatible with Python versions 3.9 or higher, check the version of Python: ```bash $ python3 --version diff --git a/website/docs/docs/core/connect-data-platform/postgres-setup.md b/website/docs/docs/core/connect-data-platform/postgres-setup.md index 7720e82844d..b6f34a00e0b 100644 --- a/website/docs/docs/core/connect-data-platform/postgres-setup.md +++ b/website/docs/docs/core/connect-data-platform/postgres-setup.md @@ -5,7 +5,7 @@ id: "postgres-setup" meta: maintained_by: dbt Labs authors: 'core dbt maintainers' - github_repo: 'dbt-labs/dbt-core' + github_repo: 'dbt-labs/dbt-postgres' pypi_package: 'dbt-postgres' min_core_version: 'v0.4.0' cloud_support: Supported diff --git a/website/docs/docs/core/connect-data-platform/snowflake-setup.md b/website/docs/docs/core/connect-data-platform/snowflake-setup.md index 266840cafae..b692ba5c0d6 100644 --- a/website/docs/docs/core/connect-data-platform/snowflake-setup.md +++ b/website/docs/docs/core/connect-data-platform/snowflake-setup.md @@ -211,7 +211,7 @@ my-snowflake-db: -### SSO Authentication +### SSO authentication To use SSO authentication for Snowflake, omit a `password` and instead supply an `authenticator` config to your target. `authenticator` can be one of 'externalbrowser' or a valid Okta URL. @@ -332,7 +332,7 @@ my-snowflake-db: -### SSO Authentication +### SSO authentication To use SSO authentication for Snowflake, omit a `password` and instead supply an `authenticator` config to your target. `authenticator` can be one of 'externalbrowser' or a valid Okta URL. @@ -421,6 +421,30 @@ my-snowflake-db: Refer to the [Snowflake docs](https://docs.snowflake.com/en/sql-reference/parameters.html#label-allow-id-token) for info on how to enable this feature in your account. +### OAuth authorization + +To learn how to configure OAuth in Snowflake, refer to their [documentation](https://docs.snowflake.com/en/user-guide/oauth-snowflake-overview). Your Snowflake admin needs to generate an [OAuth token](https://community.snowflake.com/s/article/HOW-TO-OAUTH-TOKEN-GENERATION-USING-SNOWFLAKE-CUSTOM-OAUTH) for your configuration to work. + +Provide the OAUTH_REDIRECT_URI in Snowflake:`http://localhost:PORT_NUMBER`. For example, `http://localhost:8080`. 
+ +Once your Snowflake admin has configured OAuth, add the following to your `profiles.yml` file: + +```yaml + +my-snowflake-db: + target: dev + outputs: + dev: + type: snowflake + account: [account id] + + # The following fields are retrieved from the Snowflake configuration + authenticator: oauth + oauth_client_id: [OAuth client id] + oauth_client_secret: [OAuth client secret] + token: [OAuth refresh token] +``` + ## Configurations The "base" configs for Snowflake targets are shown below. Note that you should also specify auth-related configs specific to the authentication method you are using as described above. diff --git a/website/docs/docs/core/connect-data-platform/spark-setup.md b/website/docs/docs/core/connect-data-platform/spark-setup.md index 3b1429c246b..611642e91b7 100644 --- a/website/docs/docs/core/connect-data-platform/spark-setup.md +++ b/website/docs/docs/core/connect-data-platform/spark-setup.md @@ -197,17 +197,14 @@ connect_retries: 3 - - - - ### Server side configuration Spark can be customized using [Application Properties](https://spark.apache.org/docs/latest/configuration.html). Using these properties the execution can be customized, for example, to allocate more memory to the driver process. Also, the Spark SQL runtime can be set through these properties. For example, this allows the user to [set a Spark catalogs](https://spark.apache.org/docs/latest/configuration.html#spark-sql). - ## Caveats +When facing difficulties, run `poetry run dbt debug --log-level=debug`. The logs are saved at `logs/dbt.log`. + ### Usage with EMR To connect to Apache Spark running on an Amazon EMR cluster, you will need to run `sudo /usr/lib/spark/sbin/start-thriftserver.sh` on the master node of the cluster to start the Thrift server (see [the docs](https://aws.amazon.com/premiumsupport/knowledge-center/jdbc-connection-emr/) for more information). You will also need to connect to port 10001, which will connect to the Spark backend Thrift server; port 10000 will instead connect to a Hive backend, which will not work correctly with dbt. @@ -223,6 +220,6 @@ Delta-only features: ### Default namespace with Thrift connection method -If your Spark cluster doesn't have a default namespace, metadata queries that run before any dbt workflow will fail, causing the entire workflow to fail, even if your configurations are correct. The metadata queries fail there's no default namespace in which to run it. +To run metadata queries in dbt, you need to have a namespace named `default` in Spark when connecting with Thrift. You can check available namespaces by using Spark's `pyspark` and running `spark.sql("SHOW NAMESPACES").show()`. If the default namespace doesn't exist, create it by running `spark.sql("CREATE NAMESPACE default").show()`. -To debug, review the debug-level logs to confirm the query dbt is running when it encounters the error: `dbt run --debug` or `logs/dbt.log`. +If there's a network connection issue, your logs will display an error like `Could not connect to any of [('127.0.0.1', 10000)]` (or something similar). 
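+For reference, checking for the namespace and creating it from a `pyspark` session might look like the following sketch:
+
+```python
+# Run inside a pyspark shell connected to the same Spark cluster;
+# `spark` is the SparkSession that pyspark creates for you.
+spark.sql("SHOW NAMESPACES").show()                  # list the namespaces Spark can see
+spark.sql("CREATE NAMESPACE IF NOT EXISTS default")  # create `default` only if it's missing
+```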
diff --git a/website/docs/docs/core/connect-data-platform/teradata-setup.md b/website/docs/docs/core/connect-data-platform/teradata-setup.md index 7067104fb94..7b964b23b3d 100644 --- a/website/docs/docs/core/connect-data-platform/teradata-setup.md +++ b/website/docs/docs/core/connect-data-platform/teradata-setup.md @@ -26,20 +26,17 @@ import SetUpPages from '/snippets/_setup-pages-intro.md'; ## Python compatibility -| Plugin version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | Python 3.11 | -| -------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ------------ | -| 0.19.0.x | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ -| 0.20.0.x | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ -| 0.21.1.x | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ -| 1.0.0.x | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ -|1.1.x.x | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ -|1.2.x.x | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ -|1.3.x.x | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ -|1.4.x.x | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ -|1.5.x | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ -|1.6.x | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ -|1.7.x | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ -|1.8.x | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ +| Plugin version | Python 3.9 | Python 3.10 | Python 3.11 | +| -------------- | ----------- | ----------- | ------------ | +|1.0.0.x | ✅ | ❌ | ❌ +|1.1.x.x | ✅ | ✅ | ❌ +|1.2.x.x | ✅ | ✅ | ❌ +|1.3.x.x | ✅ | ✅ | ❌ +|1.4.x.x | ✅ | ✅ | ✅ +|1.5.x | ✅ | ✅ | ✅ +|1.6.x | ✅ | ✅ | ✅ +|1.7.x | ✅ | ✅ | ✅ +|1.8.x | ✅ | ✅ | ✅ ## dbt dependent packages version compatibility @@ -67,7 +64,7 @@ To connect to Teradata Vantage from dbt, you'll need to add a [profile](https:// password: schema: tmode: ANSI - threads: 1 + threads: [optional, 1 or more] #optional fields ``` diff --git a/website/docs/docs/core/connect-data-platform/vertica-setup.md b/website/docs/docs/core/connect-data-platform/vertica-setup.md index 8e499d68b3e..9e70bc46306 100644 --- a/website/docs/docs/core/connect-data-platform/vertica-setup.md +++ b/website/docs/docs/core/connect-data-platform/vertica-setup.md @@ -6,9 +6,9 @@ meta: authors: 'Vertica (Former authors: Matthew Carter, Andy Regan, Andrew Hedengren)' github_repo: 'vertica/dbt-vertica' pypi_package: 'dbt-vertica' - min_core_version: 'v1.7.0' + min_core_version: 'v1.8.5' cloud_support: 'Not Supported' - min_supported_version: 'Vertica 23.4.0' + min_supported_version: 'Vertica 24.3.0' slack_channel_name: 'n/a' slack_channel_link: 'https://www.getdbt.com/community/' platform_name: 'Vertica' @@ -50,7 +50,8 @@ your-profile: schema: [dbt schema] connection_load_balance: True backup_server_node: [list of backup hostnames or IPs] - retries: [1 or more] + retries: [1 or more] + autocommit: False threads: [1 or more] target: dev @@ -79,6 +80,7 @@ backup_server_node| List of hosts to connect to if the primary host specified in retries |The retry times after an unsuccessful connection.| No| 2 |3| threads |The number of threads the dbt project will run on.| No| 1| 3| label| A session label to identify the connection. |No |An auto-generated label with format of: dbt_username |dbt_dbadmin| +autocommit | Boolean value that indicates if the connection can enable or disable auto-commit.| No | True | False For more information on Vertica’s connection properties please refer to [Vertica-Python](https://github.com/vertica/vertica-python#create-a-connection) Connection Properties. 
diff --git a/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md b/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md index d4ec0a82d5f..2183b19d38f 100644 --- a/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md +++ b/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md @@ -18,20 +18,14 @@ Many endpoints of the Administrative API can also be called through the [dbt Clo
- - diff --git a/website/docs/docs/dbt-cloud-apis/apis-overview.md b/website/docs/docs/dbt-cloud-apis/apis-overview.md index 055edea72b6..05964ace871 100644 --- a/website/docs/docs/dbt-cloud-apis/apis-overview.md +++ b/website/docs/docs/dbt-cloud-apis/apis-overview.md @@ -20,4 +20,4 @@ If you want to learn more about webhooks, refer to [Webhooks for your jobs](/doc ## How to Access the APIs -dbt Cloud supports two types of API Tokens: [user tokens](/docs/dbt-cloud-apis/user-tokens) and [service account tokens](/docs/dbt-cloud-apis/service-tokens). Requests to the dbt Cloud APIs can be authorized using these tokens. +dbt Cloud supports two types of API Tokens: [personal access tokens](/docs/dbt-cloud-apis/user-tokens) and [service account tokens](/docs/dbt-cloud-apis/service-tokens). Requests to the dbt Cloud APIs can be authorized using these tokens. diff --git a/website/docs/docs/dbt-cloud-apis/authentication.md b/website/docs/docs/dbt-cloud-apis/authentication.md index 8729cc0641d..43a08d84fd7 100644 --- a/website/docs/docs/dbt-cloud-apis/authentication.md +++ b/website/docs/docs/dbt-cloud-apis/authentication.md @@ -8,7 +8,7 @@ pagination_prev: null
@@ -23,9 +23,7 @@ pagination_prev: null

## Types of API access tokens

-**User API keys (Legacy):** User API keys were historically the only method available to access dbt Cloud APIs on the user’s behalf. They are scoped to the user and not the account. User API Keys will eventually be deprecated for the more secure personal access tokens.
-
-**Personal access tokens (New):** Personal access tokens (PATs) are the new, preferred, and secure way of accessing dbt Cloud APIs on behalf of a user. They are more secure than user API Keys. PATs are scoped to an account and can be enhanced with more granularity and control.
+**Personal access tokens:** Preferred and secure way of accessing dbt Cloud APIs on behalf of a user. PATs are scoped to an account and can be enhanced with more granularity and control.

**Service tokens:** Service tokens are similar to service accounts and are the preferred method to enable access on behalf of the dbt Cloud account.

@@ -33,7 +31,7 @@ pagination_prev: null

You should use service tokens broadly for any production workflow where you need a service account. You should use PATs only for developmental workflows _or_ dbt Cloud client workflows that require user context. The following examples show you when to use a personal access token (PAT) or a service token:

-* **Connecting a partner integration to dbt Cloud** — Some examples include the [dbt Semantic Layer Google Sheets integration](/docs/cloud-integrations/avail-sl-integrations), Hightouch, Datafold, a custom app you’ve created, etc. These types of integrations should use a service token instead of a PAT because service tokens give you visibility, and you can scope them to only what the integration needs and ensure the least privilege. We highly recommend switching to a service token if you’re using a user API key for these integrations today.
+* **Connecting a partner integration to dbt Cloud** — Some examples include the [dbt Semantic Layer Google Sheets integration](/docs/cloud-integrations/avail-sl-integrations), Hightouch, Datafold, a custom app you’ve created, etc. These types of integrations should use a service token instead of a PAT because service tokens give you visibility, and you can scope them to only what the integration needs and ensure the least privilege. We highly recommend switching to a service token if you’re using a personal access token for these integrations today.
* **Production Terraform** — Use a service token since this is a production workflow and is acting as a service account and not a user account.
* **Cloud CLI** — Use a PAT since the dbt Cloud CLI works within the context of a user (the user is making the requests and has to operate within the context of their user account).
* **Testing a custom script and staging Terraform or Postman** — We recommend using a PAT as this is a developmental workflow and is scoped to the user making the changes. When you push this script or Terraform into production, use a service token instead.
diff --git a/website/docs/docs/dbt-cloud-apis/discovery-api.md b/website/docs/docs/dbt-cloud-apis/discovery-api.md
index 0345c647dd9..db6819a5e09 100644
--- a/website/docs/docs/dbt-cloud-apis/discovery-api.md
+++ b/website/docs/docs/dbt-cloud-apis/discovery-api.md
@@ -7,7 +7,7 @@ Every time dbt Cloud runs a project, it generates and stores information about t
By leveraging the metadata in dbt Cloud, you can create systems for data monitoring and alerting, lineage exploration, and automated reporting.
This can help you improve data discovery, data quality, and pipeline operations within your organization.

-You can access the Discovery API through [ad hoc queries](/docs/dbt-cloud-apis/discovery-querying), custom applications, a wide range of [partner ecosystem integrations](https://www.getdbt.com/product/integrations/) (like BI/analytics, catalog and governance, and quality and observability), and by using dbt Cloud features like [model timing](/docs/deploy/run-visibility#model-timing) and [dashboard status tiles](/docs/deploy/dashboard-status-tiles).
+You can access the Discovery API through [ad hoc queries](/docs/dbt-cloud-apis/discovery-querying), custom applications, a wide range of [partner ecosystem integrations](https://www.getdbt.com/product/integrations/) (like BI/analytics, catalog and governance, and quality and observability), and by using dbt Cloud features like [model timing](/docs/deploy/run-visibility#model-timing) and [data health tiles](/docs/collaborate/data-tile).

@@ -32,7 +32,7 @@ Use the API to look at historical information like model build time to determine

You can use, for example, the [model timing](/docs/deploy/run-visibility#model-timing) tab to help identify and optimize bottlenecks in model builds:

-
+

@@ -50,7 +50,7 @@ Use the API to find and understand dbt assets in integrated tools using informat

Data producers must manage and organize data for stakeholders, while data consumers need to quickly and confidently analyze data on a large scale to make informed decisions that improve business outcomes and reduce organizational overhead. The API is useful for discovery data experiences in catalogs, analytics, apps, and machine learning (ML) tools. It can help you understand the origin and meaning of datasets for your analysis.

-
+

@@ -65,7 +65,9 @@ Use the API to review who developed the models and who uses them to help establi

Use the API to review dataset changes and uses by examining exposures, lineage, and dependencies. From the investigation, you can learn how to define and build more effective dbt projects. For more details, refer to [Development](/docs/dbt-cloud-apis/discovery-use-cases-and-examples#development).

-
+
+
+

diff --git a/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md b/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md
index b01fe6a3b5e..e095374343f 100644
--- a/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md
+++ b/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md
@@ -25,7 +25,7 @@ For performance use cases, people typically query the historical or latest appli

### How long does it take to build each model and test?

It’s helpful to understand how long it takes to build models (tables) and tests to execute during a dbt run. Longer model build times result in higher infrastructure costs and fresh data arriving later to stakeholders. Analyses like these can be in observability tools or ad-hoc queries, like in a notebook.

-
+
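As a rough sketch, you can send an ad hoc request like this straight to the GraphQL endpoint with `curl` (the host depends on your region's Access URL, and the fields requested are illustrative assumptions; confirm them against the API schema and the documented example query that follows):

```shell
# Sketch: ask the Discovery API for recent model execution times.
# YOUR_ACCESS_URL, the environment ID, and the token are placeholders.
curl -s "https://metadata.YOUR_ACCESS_URL/graphql" \
  -H "Authorization: Bearer $DBT_CLOUD_SERVICE_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{
        "query": "query ($environmentId: BigInt!) { environment(id: $environmentId) { applied { models(first: 10) { edges { node { name executionInfo { executionTime executeCompletedAt } } } } } } }",
        "variables": { "environmentId": 123456 }
      }'
```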
Example query with code @@ -1040,7 +1040,7 @@ For development use cases, people typically query the historical or latest defin ### How is this model or metric used in downstream tools? [Exposures](/docs/build/exposures) provide a method to define how a model or metric is actually used in dashboards and other analytics tools and use cases. You can query an exposure’s definition to see how project nodes are used and query its upstream lineage results to understand the state of the data used in it, which powers use cases like a freshness and quality status tile. - +
diff --git a/website/docs/docs/dbt-cloud-apis/migrating-to-v2.md b/website/docs/docs/dbt-cloud-apis/migrating-to-v2.md deleted file mode 100644 index 72616f4b19c..00000000000 --- a/website/docs/docs/dbt-cloud-apis/migrating-to-v2.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -title: "Migrating to dbt Cloud Administrative API v2" -description: "You should migrate to API v2 while we deprecate API v4 " -sidebar_label: "Migrating to API v2" -id: "migrating-to-v2" ---- - - -In an attempt to provide an improved dbt Cloud Administrative API experience, the dbt Cloud API v4 will be deprecated by April 30th, 2023. We suggest you migrate to dbt Cloud Administrative API v2. When migrating from API v4 to API v2, there are a few differences you should consider when querying your dbt Cloud account. - -## Key differences - -When using the [List runs](/dbt-cloud/api-v2-legacy#tag/Runs) endpoint, you can include triggered runs and sort by ID. You can use the following request in v2 to get a similar response as v4, replacing the `{accountId}` with your own and `{YOUR_ACCESS_URL}` with the appropriate [Access URL](https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses) for your region and plan: - -```shell -GET https://{YOUR_ACCESS_URL}/api/v2/accounts/{accountId}/runs/?include_related=[%22trigger%22]&order_by=-id -``` -For example, if your region is EMEA multi-tenant and your account ID is `001`, your endpoint would be: - -```shell -GET https://emea.dbt.com/api/v2/accounts/001/runs/?include_related=[%22trigger%22]&order_by=-id` -``` - -Differences in responses include: - -| Property description | API v4 | API v2 | -|---------------------|-----------|-------------| -| Reverse sort order when you use sort by `-id` | Defaults to order by most recent | Defaults to order by least recent | -| Update to timestamps | Unix timestamps | ISO strings | -| Update to IDs: `id`, `environment_id`, `account_id`, `project_id`, `job_id` | Values are the same, but they are strings | Values are the same, but they are numeric | -| New property for returning runs with the specified status | `status` property | Maps to `status_humanized` | -| New property for including related field with run | `replace` property | Maps to the `trigger` property | diff --git a/website/docs/docs/dbt-cloud-apis/service-tokens.md b/website/docs/docs/dbt-cloud-apis/service-tokens.md index 1a5920fab8a..a077b230c28 100644 --- a/website/docs/docs/dbt-cloud-apis/service-tokens.md +++ b/website/docs/docs/dbt-cloud-apis/service-tokens.md @@ -12,7 +12,7 @@ If you have service tokens created on or before July 18, 2023, please read [this ::: -Service account tokens enable you to securely authenticate with the dbt Cloud API by assigning each token a narrow set of permissions that more precisely manages access to the API. While similar to [User API tokens](user-tokens), service account tokens belong to an account rather than a user. +Service account tokens enable you to securely authenticate with the dbt Cloud API by assigning each token a narrow set of permissions that more precisely manages access to the API. While similar to [personal access tokens](user-tokens), service account tokens belong to an account rather than a user. You can use service account tokens for system-level integrations that do not run on behalf of any one user. 
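To make the service-account framing concrete, here's a minimal sketch of calling the dbt Cloud Administrative API with a service token. It reuses the `Authorization: Token` header and the list-runs endpoint shown elsewhere in these docs; `YOUR_ACCESS_URL`, `YOUR_ACCOUNT_ID`, and `YOUR_SERVICE_TOKEN` are placeholders you'd replace with your own values.

```bash
# Minimal sketch: list an account's most recent runs with a service token.
# Replace YOUR_ACCESS_URL, YOUR_ACCOUNT_ID, and YOUR_SERVICE_TOKEN with your own values.
curl --request GET \
  'https://YOUR_ACCESS_URL/api/v2/accounts/YOUR_ACCOUNT_ID/runs/?order_by=-id' \
  --header 'Authorization: Token YOUR_SERVICE_TOKEN'
```

Because the token belongs to the account rather than to any one user, the integration keeps working even as individual users come and go.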
Assign any permission sets available in dbt Cloud to your service account token, which can vary slightly depending on your plan: @@ -36,77 +36,37 @@ You can assign service account tokens to any permission set available in dbt Clo ### Team plans using service account tokens -The following permissions can be assigned to a service account token on a Team plan. +The following permissions can be assigned to a service account token on a Team plan. Refer to [Enterprise permissions](/docs/cloud/manage-access/enterprise-permissions) for more information about these roles. -**Account Admin**
-Account Admin service tokens have full `read + write` access to an account, so please use them with caution. A Team plan refers to this permission set as an "Owner role." For more on these permissions, see [Account Admin](/docs/cloud/manage-access/enterprise-permissions#account-admin). - -**Metadata Only**
-Metadata-only service tokens authorize requests to the Discovery API. - -**Semantic Layer Only**
-Semantic Layer-only service tokens authorize requests to the Semantic Layer APIs. - -**Job Admin**
-Job admin service tokens can authorize requests for viewing, editing, and creating environments, triggering runs, and viewing historical runs. - -**Member**
-Member service tokens can authorize requests for viewing and editing resources, triggering runs, and inviting members to the account. Tokens assigned the Member permission set will have the same permissions as a Member user. For more information about Member users, see "[Self-service permissions](/docs/cloud/manage-access/self-service-permissions)". - -**Read-only**
-Read-only service tokens can authorize requests for viewing a read-only dashboard, viewing generated documentation, and viewing source freshness reports. This token can access and retrieve account-level information endpoints on the [Admin API](/docs/dbt-cloud-apis/admin-cloud-api) and authorize requests to the [Discovery API](/docs/dbt-cloud-apis/discovery-api). +- Account Admin — Account Admin service tokens have full `read + write` access to an account, so please use them with caution. A Team plan refers to this permission set as an "Owner role." +- Billing Admin +- Job Admin +- Metadata Only +- Member +- Read-only +- Semantic Layer Only ### Enterprise plans using service account tokens -The following permissions can be assigned to a service account token on an Enterprise plan. For more details about these permissions, see "[Enterprise permissions](/docs/cloud/manage-access/enterprise-permissions)." - -**Account Admin**
-Account Admin service tokens have full `read + write` access to an account, so please use them with caution. For more on these permissions, see [Account Admin](/docs/cloud/manage-access/enterprise-permissions#account-admin). - -**Security Admin**
-Security Admin service tokens have certain account-level permissions. For more on these permissions, see [Security Admin](/docs/cloud/manage-access/enterprise-permissions#security-admin). - -**Billing Admin**
-Billing Admin service tokens have certain account-level permissions. For more on these permissions, see [Billing Admin](/docs/cloud/manage-access/enterprise-permissions#billing-admin). - -**Manage marketplace apps**
-Used only for service tokens assigned to marketplace apps (for example, the [Snowflake Native app](/docs/cloud-integrations/snowflake-native-app)). - -**Metadata Only**
-Metadata-only service tokens authorize requests to the Discovery API. - -**Semantic Layer Only**
-Semantic Layer-only service tokens authorize requests to the Semantic Layer APIs. - -**Job Admin**
-Job Admin service tokens can authorize requests for viewing, editing, and creating environments, triggering runs, and viewing historical runs. For more on these permissions, see [Job Admin](/docs/cloud/manage-access/enterprise-permissions#job-admin). - -**Account Viewer**
-Account Viewer service tokens have read-only access to dbt Cloud accounts. For more on these permissions, see [Account Viewer](/docs/cloud/manage-access/enterprise-permissions#account-viewer) on the Enterprise Permissions page. - -**Admin**
-Admin service tokens have unrestricted access to projects in dbt Cloud accounts. You have the option to grant that permission all projects in the account or grant the permission only on specific projects. For more on these permissions, see [Admin Service](/docs/cloud/manage-access/enterprise-permissions#admin-service) on the Enterprise Permissions page. - -**Git Admin**
-Git admin service tokens have all the permissions listed in [Git admin](/docs/cloud/manage-access/enterprise-permissions#git-admin) on the Enterprise Permissions page. - -**Database Admin**
-Database admin service tokens have all the permissions listed in [Database admin](/docs/cloud/manage-access/enterprise-permissions#database-admin) on the Enterprise Permissions page. - -**Team Admin**
-Team admin service tokens have all the permissions listed in [Team admin](/docs/cloud/manage-access/enterprise-permissions#team-admin) on the Enterprise Permissions page. - -**Job Viewer**
-Job viewer admin service tokens have all the permissions listed in [Job viewer](/docs/cloud/manage-access/enterprise-permissions#job-viewer) on the Enterprise Permissions page. - -**Developer**
-Developer service tokens have all the permissions listed in [Developer](/docs/cloud/manage-access/enterprise-permissions#developer) on the Enterprise Permissions page. - -**Analyst**
-Analyst admin service tokens have all the permissions listed in [Analyst](/docs/cloud/manage-access/enterprise-permissions#analyst) on the Enterprise Permissions page. - -**Stakeholder**
-Stakeholder service tokens have all the permissions listed in [Stakeholder](/docs/cloud/manage-access/enterprise-permissions#stakeholder) on the Enterprise Permissions page. +Refer to [Enterprise permissions](/docs/cloud/manage-access/enterprise-permissions) for more information about these roles. + +- Account Admin — Account Admin service tokens have full `read + write` access to an account, so please use them with caution. +- Account Viewer +- Admin +- Analyst +- Billing Admin +- Database Admin +- Developer +- Git Admin +- Job Admin +- Job Runner +- Job Viewer +- Manage marketplace apps +- Metadata Only +- Semantic Layer Only +- Security Admin +- Stakeholder +- Team Admin ## Service token update diff --git a/website/docs/docs/dbt-cloud-apis/sl-api-overview.md b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md index 1c4d5f387e9..e4e2a91791d 100644 --- a/website/docs/docs/dbt-cloud-apis/sl-api-overview.md +++ b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md @@ -43,15 +43,9 @@ plan="dbt Cloud Team or Enterprise" icon="dbt-bit"/> - -
diff --git a/website/docs/docs/dbt-cloud-apis/sl-graphql.md b/website/docs/docs/dbt-cloud-apis/sl-graphql.md index 2898b6e5c0a..13149ca4953 100644 --- a/website/docs/docs/dbt-cloud-apis/sl-graphql.md +++ b/website/docs/docs/dbt-cloud-apis/sl-graphql.md @@ -627,3 +627,16 @@ mutation { } } ``` + +### Multi-hop joins + +In cases where you need to query across multiple related tables (multi-hop joins), use the `entity_path` argument to specify the path between related entities. The following are examples of how you can define these joins: + +- In this example, you're querying the `location_name` dimension but specifying that it should be joined through the `order_id` entity. + ```sql + {{Dimension('location__location_name', entity_path=['order_id'])}} + ``` +- In this example, you're querying the `region` dimension of the `salesforce_account_owner` entity, with the join path going through `salesforce_account`. + ```sql + {{ Dimension('salesforce_account_owner__region', entity_path=['salesforce_account']) }} + ``` diff --git a/website/docs/docs/dbt-cloud-apis/sl-jdbc.md b/website/docs/docs/dbt-cloud-apis/sl-jdbc.md index ea0afbcfbad..d9ce3bf4fd1 100644 --- a/website/docs/docs/dbt-cloud-apis/sl-jdbc.md +++ b/website/docs/docs/dbt-cloud-apis/sl-jdbc.md @@ -56,7 +56,7 @@ The Semantic Layer JDBC API has built-in metadata calls which can provide a user Expand the following toggles for examples and metadata commands: - + You can use this query to fetch all defined metrics in your dbt project: @@ -65,9 +65,9 @@ select * from {{ semantic_layer.metrics() }} ``` - + - + You can use this query to fetch all dimensions for a metric. @@ -77,9 +77,9 @@ Note, metrics is a required argument that lists one or multiple metrics in it. select * from {{ semantic_layer.dimensions(metrics=['food_order_amount'])}} ``` - + - + You can use this query to fetch dimension values for one or multiple metrics and a single dimension. @@ -89,9 +89,9 @@ Note, metrics is a required argument that lists one or multiple metrics, and a s select * from {{ semantic_layer.dimension_values(metrics=['food_order_amount'], group_by=['customer__customer_name'])}} ``` - + - + You can use this query to fetch queryable granularities for a list of metrics. @@ -103,9 +103,9 @@ select * from {{ semantic_layer.queryable_granularities(metrics=['food_order_amount', 'order_gross_profit'])}} ``` - + - + You can use this query to fetch available metrics given dimensions. This command is essentially the opposite of getting dimensions given a list of metrics. @@ -117,9 +117,9 @@ select * from {{ }} ``` - + - + You can use this example query to fetch available granularities for all time dimensions (the similar queryable granularities API call only returns granularities for the primary time dimensions for metrics). @@ -133,9 +133,9 @@ select NAME, QUERYABLE_GRANULARITIES from {{ }} ``` - + - + It may be useful in your application to expose the names of the time dimensions that represent metric_time or the common thread across all metrics. @@ -147,9 +147,44 @@ select * from {{ }} ``` - + + + + +You can filter your metrics to include only those that contain a specific substring (a sequence of characters contained within a larger string). Use the `search` argument to specify the substring you want to match. + +```sql +select * from {{ semantic_layer.metrics(search='order') }} +``` + +If no substring is provided, the query returns all metrics.
+ + - + + +In the case when you don't want to return the full result set from a metadata call, you can paginate the results for both `semantic_layer.metrics()` and `semantic_layer.dimensions()` calls using the `page_size` and `page_number` parameters. + +- `page_size`: This is an optional variable which sets the number of records per page. If left as None, there is no page limit. +- `page_number`: This is an optional variable which specifies the page number to retrieve. Defaults to `1` (first page) if not specified. + +Examples: + +```sql +-- Retrieves the 5th page with a page size of 10 metrics +select * from {{ semantic_layer.metrics(page_size=10, page_number=5) }} + +-- Retrieves the 1st page with a page size of 10 metrics +select * from {{ semantic_layer.metrics(page_size=10) }} + +-- Retrieves all metrics without pagination +select * from {{ semantic_layer.metrics() }} +``` + +You can use the same pagination parameters for `semantic_layer.dimensions(...)`. + + + You can use this example query to list all available saved queries in your dbt project. @@ -165,7 +200,7 @@ select * from semantic_layer.saved_queries() | NAME | DESCRIPTION | LABEL | METRICS | GROUP_BY | WHERE_FILTER | ``` - + import Features from '/snippets/_sl-plan-info.md' @@ -25,54 +28,28 @@ product="dbt Semantic Layer" plan="dbt Cloud Team or Enterprise" /> -
- - - - - - - - - - - - - - - - - -
+This page points to various resources available to help you understand, configure, deploy, and integrate the dbt Semantic Layer. The following sections contain links to specific pages that explain each aspect in detail. Use these links to navigate directly to the information you need, whether you're setting up the Semantic Layer for the first time, deploying metrics, or integrating with downstream tools. + + +Refer to the following resources to get started with the dbt Semantic Layer: +- [Quickstart with the dbt Cloud Semantic Layer](/guides/sl-snowflake-qs) — Build and define metrics, set up the dbt Semantic Layer, and query them using our first-class integrations. +- [dbt Semantic Layer FAQs](/docs/use-dbt-semantic-layer/sl-faqs) — Discover answers to frequently asked questions about the dbt Semantic Layer, such as availability, integrations, and more. + +## Configure the dbt Semantic Layer + +The following resources provide information on how to configure the dbt Semantic Layer: +- [Set up the dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) — Learn how to set up the dbt Semantic Layer in dbt Cloud using intuitive navigation. +- [Architecture](/docs/use-dbt-semantic-layer/sl-architecture) — Explore the powerful components that make up the dbt Semantic Layer. + +## Deploy metrics +This section provides information on how to deploy the dbt Semantic Layer and materialize your metrics: +- [Deploy your Semantic Layer](/docs/use-dbt-semantic-layer/deploy-sl) — Run a dbt Cloud job to deploy the dbt Semantic Layer and materialize your metrics. +- [Write queries with exports](/docs/use-dbt-semantic-layer/exports) — Use exports to write commonly used queries directly within your data platform, on a schedule. +- [Cache common queries](/docs/use-dbt-semantic-layer/sl-cache) — Leverage result caching and declarative caching for common queries to speed up performance and reduce query computation. + +## Consume metrics and integrate +Consume metrics and integrate the dbt Semantic Layer with downstream tools and applications: +- [Consume metrics](/docs/use-dbt-semantic-layer/consume-metrics) — Query and consume metrics in downstream tools and applications using the dbt Semantic Layer. +- [Available integrations](/docs/cloud-integrations/avail-sl-integrations) — Review a wide range of partners you can integrate and query with the dbt Semantic Layer. +- [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) — Use the dbt Semantic Layer APIs to query metrics in downstream tools for consistent, reliable data metrics. + diff --git a/website/docs/docs/use-dbt-semantic-layer/deploy-sl.md b/website/docs/docs/use-dbt-semantic-layer/deploy-sl.md new file mode 100644 index 00000000000..63a77d9cab0 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/deploy-sl.md @@ -0,0 +1,29 @@ +--- +title: "Deploy your metrics" +id: deploy-sl +description: "Deploy the dbt Semantic Layer in dbt Cloud by running a job to materialize your metrics." +sidebar_label: "Deploy your metrics" +tags: [Semantic Layer] +pagination_next: "docs/use-dbt-semantic-layer/exports" +--- + + + +import RunProdJob from '/snippets/_sl-run-prod-job.md'; + + + +## Next steps +After you've executed a job and deployed your Semantic Layer: +- [Set up your Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) in dbt Cloud. +- Discover the [available integrations](/docs/cloud-integrations/avail-sl-integrations), such as Tableau, Google Sheets, Microsoft Excel, and more. 
+- Start querying your metrics with the [API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata). + + +## Related docs +- [Optimize querying performance](/docs/use-dbt-semantic-layer/sl-cache) using declarative caching. +- [Validate semantic nodes in CI](/docs/deploy/ci-jobs#semantic-validations-in-ci) to ensure code changes made to dbt models don't break these metrics. +- If you haven't already, learn how to [build your metrics and semantic models](/docs/build/build-metrics-intro) in your development tool of choice. diff --git a/website/docs/docs/use-dbt-semantic-layer/exports.md b/website/docs/docs/use-dbt-semantic-layer/exports.md index f8269d42942..5d6e4c0d996 100644 --- a/website/docs/docs/use-dbt-semantic-layer/exports.md +++ b/website/docs/docs/use-dbt-semantic-layer/exports.md @@ -2,6 +2,7 @@ title: "Write queries with exports" description: "Use exports to write tables to the data platform on a schedule." sidebar_label: "Write queries with exports" +keywords: [DBT_INCLUDE_SAVED_QUERY, exports, DBT_EXPORTS_SAVED_QUERY, dbt Cloud, Semantic Layer] --- Exports enhance [saved queries](/docs/build/saved-queries) by running your saved queries and writing the output to a table or view within your data platform. Saved queries are a way to save and reuse commonly used queries in MetricFlow, exports take this functionality a step further by: @@ -53,7 +54,7 @@ Before you're able to run exports in development or production, you'll need to m There are two ways to run an export: -- [Run exports in development](#exports-in-development) using the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) to test the output before production (dbt Cloud IDE isn't supported yet). +- [Run exports in development](#exports-in-development) using the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) to test the output before production (you can configure exports in the dbt Cloud IDE; however, running them directly in the IDE isn't supported yet). If you're using the dbt Cloud IDE, use `dbt build` to run exports. Make sure you have the [environment variable](#set-environment-variable) enabled. - [Run exports in production](#exports-in-production) using the [dbt Cloud job scheduler](/docs/deploy/job-scheduler) to write these queries within your data platform. ## Exports in development @@ -181,8 +182,9 @@ If exports aren't needed, you can set the value(s) to `FALSE` (`DBT_INCLUDE_SAVE 1. Click **Deploy** in the top navigation bar and choose **Environments**. 2. Select **Environment variables**. 3. [Set the environment variable](/docs/build/environment-variables#setting-and-overriding-environment-variables) key to `DBT_EXPORT_SAVED_QUERIES` and the environment variable's value to `TRUE` (`DBT_EXPORT_SAVED_QUERIES=TRUE`). +*Note: If you're on dbt v1.7, set the environment variable key to `DBT_INCLUDE_SAVED_QUERY`. Use the documentation toggle to select version "1.7" to view more details.* -Doing this ensures saved queries and exports are included in your dbt build job. For example, running `dbt build sq_name` runs the equivalent of `dbt sl export --saved-query sq_name` in the dbt Cloud Job scheduler. +Doing this ensures saved queries and exports are included in your dbt build job. For example, running `dbt build -s sq_name` runs the equivalent of `dbt sl export --saved-query sq_name` in the dbt Cloud Job scheduler. If exports aren't needed, you can set the value(s) to `FALSE` (`DBT_EXPORT_SAVED_QUERIES=FALSE`).
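To make the equivalence above concrete, here's a minimal sketch using a hypothetical saved query named `order_metrics` (substitute the name of your own saved query):

```bash
# Development (dbt Cloud CLI): run a single export directly to test its output
dbt sl export --saved-query order_metrics

# Deployment job (with DBT_EXPORT_SAVED_QUERIES=TRUE set): the same export runs as part of the build
dbt build -s order_metrics
```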
diff --git a/website/docs/docs/use-dbt-semantic-layer/setup-sl.md b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md index adad5bd9fd1..3dfa7f3aa7d 100644 --- a/website/docs/docs/use-dbt-semantic-layer/setup-sl.md +++ b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md @@ -2,8 +2,10 @@ title: "Set up the dbt Semantic Layer" id: setup-sl description: "Seamlessly set up the dbt Semantic Layer in dbt Cloud using intuitive navigation." -sidebar_label: "Set up your Semantic Layer" +sidebar_label: "Set up the Semantic Layer" tags: [Semantic Layer] +pagination_next: "docs/use-dbt-semantic-layer/sl-architecture" +pagination_prev: "guides/sl-snowflake-qs" --- With the dbt Semantic Layer, you can centrally define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. diff --git a/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md index 2062f9e405e..9239275ebdf 100644 --- a/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md +++ b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md @@ -4,7 +4,6 @@ id: sl-architecture description: "dbt Semantic Layer product architecture and related questions." sidebar_label: "Semantic Layer architecture" tags: [Semantic Layer] -pagination_next: null --- The dbt Semantic Layer allows you to define metrics and use various interfaces to query them. The Semantic Layer does the heavy lifting to find where the queried data exists in your data platform and generates the SQL to make the request (including performing joins). diff --git a/website/docs/docs/use-dbt-semantic-layer/sl-faqs.md b/website/docs/docs/use-dbt-semantic-layer/sl-faqs.md index 40b84ada40a..d206e4f1488 100644 --- a/website/docs/docs/use-dbt-semantic-layer/sl-faqs.md +++ b/website/docs/docs/use-dbt-semantic-layer/sl-faqs.md @@ -28,6 +28,8 @@ The primary value of the dbt Semantic Layer is to centralize and bring consisten - **Simplify your code** by not duplicating metric logic and allowing MetricFlow to perform complex calculations for you. - **Empower stakeholders** with rich context and flexible, yet governed experiences. + + @@ -110,6 +112,9 @@ You can use tables and dbt models to calculate metrics as an option, but it's a If you create a table with a metric, you’ll need to create numerous other tables derived from that table to show the desired metric cut by the desired dimension or time grain. Mature data models have thousands of dimensions, so you can see how this will quickly result in unnecessary duplication, maintenance, and costs. It's also incredibly hard to predict all the slices of data that a user is going to need ahead of time. With the dbt Semantic Layer, you don’t need to pre-join or build any tables; rather, you can simply add a few lines of code to your semantic model, and that data will only be computed upon request. + + + diff --git a/website/docs/faqs/API/rotate-token.md b/website/docs/faqs/API/rotate-token.md deleted file mode 100644 index 8dea2d0b875..00000000000 --- a/website/docs/faqs/API/rotate-token.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -title: How can I rotate my user API token? -description: "Instructions on how to rotate API token" -sidebar_label: 'Rotate your user API token' -id: rotate-token ---- - -For security reasons and best practices, you should aim to rotate API keys every so often. You can rotate your API key automatically with the push of a button in your dbt Cloud environment or manually using the command line. 
- - - - - -To automatically rotate your API key: - -1. Navigate to the Account settings by clicking the **gear icon** in the top right of your dbt Cloud account. -2. Select **API Access** from the lefthand side. -3. In the **API** pane, click `Rotate`. - - - - - - - -1. Rotate your [User API token](/docs/dbt-cloud-apis/user-tokens) by replacing `YOUR_USER_ID`, `YOUR_CURRENT_PAT_TOKEN`, and `YOUR_ACCESS_URL` with your information in the following request. - -``` -curl --location --request POST 'https://cloud.getdbt.com/api/v3/accounts/YOUR_ACCOUNT_ID/users/YOUR_USER_ID/apikey/' \ ---header 'Authorization: Token YOUR_CURRENT_PAT_TOKEN' -``` - -* Find your `YOUR_USER_ID` by reading [How to find your user ID](/faqs/Accounts/find-user-id). -* Find your `YOUR_CURRENT_TOKEN` by going to **Profile Settings** -> **API Access** and copying the API key. -* Find [`YOUR_ACCESS_URL`](/docs/cloud/about-cloud/access-regions-ip-addresses) for your region and plan. - -If `YOUR_USER_ID` = `123`, `YOUR_CURRENT_TOKEN` = `abcf9g`, then your curl request will be: - -``` -curl --location --request POST 'https://YOUR_ACCESS_URL/api/v2/users/123/apikey/' \ - ---header 'Authorization: Token abcf9g' -``` - -2. Find the new key in the API response or in dbt Cloud. - -3. To find the new key in dbt Cloud, go to **Profile Settings** -> **API Access**. - -### dbt Cloud deployments - -If your [dbt Cloud deployment](/docs/cloud/about-cloud/access-regions-ip-addresses) uses a different access URL, replace `YOUR_ACCESS_URL` with the URL of your instance. - -For example, if your deployment is Virtual Private dbt: - -✅ `http://cloud.customizedurl.getdbt.com/`
-❌ `http://cloud.getdbt.com/`
- -
- -
diff --git a/website/docs/faqs/Accounts/find-user-id.md b/website/docs/faqs/Accounts/find-user-id.md index 09e3ed35a0b..c7c810d9b3c 100644 --- a/website/docs/faqs/Accounts/find-user-id.md +++ b/website/docs/faqs/Accounts/find-user-id.md @@ -5,7 +5,7 @@ sidebar_label: 'Where can I find my user ID' id: find-user-id --- -Knowing your dbt Cloud user ID can help with actions related to [rotating your API token](/faqs/API/rotate-token), interacting with support, and more. +Knowing your dbt Cloud user ID can help with interacting with support. To find your user ID in dbt Cloud, read the following steps: diff --git a/website/docs/faqs/Accounts/transfer-account.md b/website/docs/faqs/Accounts/transfer-account.md index 693061c55c6..e694636cf68 100644 --- a/website/docs/faqs/Accounts/transfer-account.md +++ b/website/docs/faqs/Accounts/transfer-account.md @@ -10,7 +10,7 @@ You can transfer your dbt Cloud [access control](/docs/cloud/manage-access/about | Account plan| Steps | | ------ | ---------- | -| **Developer** | You can transfer ownership by changing the email directly on your dbt Cloud profile page, which you can access using this URL when you replace `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/access-regions-ip-addresses) for your region and plan: `https://YOUR_ACCESS_URL/settings/profile`. Before doing this, please ensure that you unlink your GitHub profile. | +| **Developer** | You can transfer ownership by changing the email directly on your dbt Cloud profile page, which you can access using this URL when you replace `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/access-regions-ip-addresses) for your region and plan: `https://YOUR_ACCESS_URL/settings/profile`. Before doing this, please ensure that you unlink your GitHub profile. The email address of the new account owner cannot be associated with another dbt Cloud account.| | **Team** | Existing account admins with account access can add users to, or remove users from the owner group. | | **Enterprise** | Account admins can add users to, or remove users from a group with Account Admin permissions. | | **If all account owners left the company** | If the account owner has left your organization, you will need to work with _your_ IT department to have incoming emails forwarded to the new account owner. Once your IT department has redirected the emails, you can request to reset the user password. Once you log in, you can change the email on the Profile page when you replace `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/access-regions-ip-addresses) for your region and plan: `https://YOUR_ACCESS_URL/settings/profile`. | diff --git a/website/docs/faqs/Core/install-pip-os-prereqs.md b/website/docs/faqs/Core/install-pip-os-prereqs.md index c8435b44f33..e25c15ee570 100644 --- a/website/docs/faqs/Core/install-pip-os-prereqs.md +++ b/website/docs/faqs/Core/install-pip-os-prereqs.md @@ -33,7 +33,7 @@ python --version ``` -If you need a compatible version, you can download and install [Python version 3.8 or higher for MacOS](https://www.python.org/downloads/macos). +If you need a compatible version, you can download and install [Python version 3.9 or higher for MacOS](https://www.python.org/downloads/macos). If your machine runs on an Apple M1 architecture, we recommend that you install dbt via [Rosetta](https://support.apple.com/en-us/HT211861). This is necessary for certain dependencies that are only supported on Intel processors. 
### Ubuntu/Debian @@ -55,6 +55,6 @@ pip install cryptography~=3.4 Windows requires Python and git to successfully install and run dbt Core. -Install [Git for Windows](https://git-scm.com/downloads) and [Python version 3.8 or higher for Windows](https://www.python.org/downloads/windows/). +Install [Git for Windows](https://git-scm.com/downloads) and [Python version 3.9 or higher for Windows](https://www.python.org/downloads/windows/). For further questions, please see the [Python compatibility FAQ](/faqs/Core/install-python-compatibility) diff --git a/website/docs/faqs/Core/install-python-compatibility.md b/website/docs/faqs/Core/install-python-compatibility.md index aee2d16318e..92b4ae8698b 100644 --- a/website/docs/faqs/Core/install-python-compatibility.md +++ b/website/docs/faqs/Core/install-python-compatibility.md @@ -1,6 +1,6 @@ --- title: What version of Python can I use? -description: "Python versions 3.8 and newer can be used with dbt Core" +description: "Python versions supported with dbt Core" sidebar_label: 'Python version' id: install-python-compatibility --- diff --git a/website/docs/faqs/Git/gitignore.md b/website/docs/faqs/Git/gitignore.md index 16575861289..4386a27d4f2 100644 --- a/website/docs/faqs/Git/gitignore.md +++ b/website/docs/faqs/Git/gitignore.md @@ -80,9 +80,9 @@ dbt_modules/ * `target`, `dbt_modules`, `dbt_packages`, `logs` 7. Commit (save) the deletions to the main branch. 8. Switch to the dbt Cloud IDE, and open the project that you're fixing. -9. Reclone your repo in the IDE by clicking on the three dots next to the **IDE Status** button on the lower right corner of the IDE screen, then select **Reclone Repo**. - * **Note** — Any saved but uncommitted changes will be lost, so make sure you copy any modified code that you want to keep in a temporary location outside of dbt Cloud. -10. Once you reclone the repo, open the `.gitignore` file in the branch you're working in. If the new changes aren't included, you'll need to merge the latest commits from the main branch into your working branch. +9. [Rollback your repo to remote](/docs/collaborate/git/version-control-basics#the-git-button-in-the-cloud-ide) in the IDE by clicking on the three dots next to the **IDE Status** button on the lower right corner of the IDE screen, then select **Rollback to remote**. + * **Note** — Rollback to remote resets your repo back to an earlier clone from your remote. Any saved but uncommitted changes will be lost, so make sure you copy any modified code that you want to keep in a temporary location outside of dbt Cloud. +10. Once you rollback to remote, open the `.gitignore` file in the branch you're working in. If the new changes aren't included, you'll need to merge the latest commits from the main branch into your working branch. 11. Go to the **File Explorer** to verify the `.gitignore` file contains the correct entries and make sure the untracked files/folders in the .gitignore file are in *italics*. 12. Great job 🎉! You've configured the `.gitignore` correctly and can continue with your development! @@ -111,9 +111,9 @@ dbt_modules/ 8. Open a merge request using the git provider web interface. The merge request should attempt to merge the changes into the 'main' branch that all development branches are created from. 9. Follow the necessary procedures to get the branch approved and merged into the 'main' branch. You can delete the branch after the merge is complete. 10. Once the merge is complete, go back to the dbt Cloud IDE, and open the project that you're fixing. -11. 
Reclone your repo in the IDE by clicking on the three dots next to the **IDE Status** button on the lower right corner of the IDE screen, then select **Reclone Repo**. - * **Note** — Any saved but uncommitted changes will be lost, so make sure you copy any modified code that you want to keep in a temporary location outside of dbt Cloud. -12. Once you reclone the repo, open the `.gitignore` file in the branch you're working in. If the new changes aren't included, you'll need to merge the latest commits from the main branch into your working branch. +11. [Rollback your repo to remote](/docs/collaborate/git/version-control-basics#the-git-button-in-the-cloud-ide) in the IDE by clicking on the three dots next to the **IDE Status** button on the lower right corner of the IDE screen, then select **Rollback to remote**. + * **Note** — Rollback to remote resets your repo back to an earlier clone from your remote. Any saved but uncommitted changes will be lost, so make sure you copy any modified code that you want to keep in a temporary location outside of dbt Cloud. +12. Once you rollback to remote, open the `.gitignore` file in the branch you're working in. If the new changes aren't included, you'll need to merge the latest commits from the main branch into your working branch. 13. Go to the **File Explorer** to verify the `.gitignore` file contains the correct entries and make sure the untracked files/folders in the .gitignore file are in *italics*. 14. Great job 🎉! You've configured the `.gitignore` correctly and can continue with your development! diff --git a/website/docs/faqs/Models/unique-model-names.md b/website/docs/faqs/Models/unique-model-names.md deleted file mode 100644 index 6d8bd18ac00..00000000000 --- a/website/docs/faqs/Models/unique-model-names.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Do model names need to be unique? -description: "Unique model names to build dependencies" -sidebar_label: 'Model names need to be unique' -id: unique-model-names - ---- - -Within one project: yes! To build dependencies between models, you need to use the `ref` function, and pass in the model name as an argument. dbt uses that model name to uniquely resolve the `ref` to a specific model. As a result, these model names need to be unique, _even if they are in distinct folders_. - -A model in one project can have the same name as a model in another project (installed as a dependency). dbt uses the project name to uniquely identify each model. We call this "namespacing." If you `ref` a model with a duplicated name, it will resolve to the model within the same namespace (package or project), or raise an error because of an ambiguous reference. Use [two-argument `ref`](/reference/dbt-jinja-functions/ref#ref-project-specific-models) to disambiguate references by specifying the namespace. - -Those models will still need to land in distinct locations in the data warehouse. Read the docs on [custom aliases](/docs/build/custom-aliases) and [custom schemas](/docs/build/custom-schemas) for details on how to achieve this. diff --git a/website/docs/faqs/Project/unique-resource-names.md b/website/docs/faqs/Project/unique-resource-names.md new file mode 100644 index 00000000000..1792f21c0f7 --- /dev/null +++ b/website/docs/faqs/Project/unique-resource-names.md @@ -0,0 +1,12 @@ +--- +title: Do ref-able resource names need to be unique? +description: "Unique resource names to build dependencies" +sidebar_label: 'Resource names need to be unique' +id: unique-resource-names +--- + +Within one project: yes! 
To build dependencies between resources (such as models, seeds, and snapshots), you need to use the `ref` function, and pass in the resource name as an argument. dbt uses that resource name to uniquely resolve the `ref` to a specific resource. As a result, these resource names need to be unique, _even if they are in distinct folders_. + +A resource in one project can have the same name as a resource in another project (installed as a dependency). dbt uses the project name to uniquely identify each resource. We call this "namespacing." If you `ref` a resource with a duplicated name, it will resolve to the resource within the same namespace (package or project), or raise an error because of an ambiguous reference. Use [two-argument `ref`](/reference/dbt-jinja-functions/ref#ref-project-specific-models) to disambiguate references by specifying the namespace. + +Those resources will still need to land in distinct locations in the data warehouse. Read the docs on [custom aliases](/docs/build/custom-aliases) and [custom schemas](/docs/build/custom-schemas) for details on how to achieve this. diff --git a/website/docs/faqs/Snapshots/snapshot-target-is-not-a-snapshot-table.md b/website/docs/faqs/Snapshots/snapshot-target-is-not-a-snapshot-table.md index 5ce8f380008..0175588bf6f 100644 --- a/website/docs/faqs/Snapshots/snapshot-target-is-not-a-snapshot-table.md +++ b/website/docs/faqs/Snapshots/snapshot-target-is-not-a-snapshot-table.md @@ -27,3 +27,4 @@ A snapshot must have a materialized value of 'snapshot' This tells you to change your `materialized` config to `snapshot`. But when you make that change, you might encounter an error message saying that certain fields like `dbt_scd_id` are missing. This error happens because, previously, when dbt treated snapshots as tables, it didn't include the necessary [snapshot meta-fields](/docs/build/snapshots#snapshot-meta-fields) in your target table. Since those meta-fields don't exist, dbt correctly identifies that you're trying to create a snapshot in a table that isn't actually a snapshot. When this happens, you have to start from scratch — re-snapshotting your source data as if it was the first time by dropping your "snapshot" which isn't a real snapshot table. Then dbt snapshot will create a new snapshot and insert the snapshot meta-fields as expected. + diff --git a/website/docs/faqs/Troubleshooting/access_token_error.md deleted file mode 100644 index db59d3ba17a..00000000000 --- a/website/docs/faqs/Troubleshooting/access_token_error.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -title: I'm receiving an `access_token` error when trying to run queries in the IDE. -description: "Reauthenticate warehouse when seeing `access_token` error" -sidebar_label: 'Receiving `access_token` error in the IDE' -id: access_token_error - ---- - -If you're seeing a database error labeled `access_token` when you try to run queries in the IDE, this means your [OAuth](/docs/cloud/manage-access/set-up-snowflake-oauth) connection between Snowflake and dbt Cloud has expired. - -To fix this, you'll need to re-connect the two tools. - -Your Snowflake administrator can [configure](/docs/cloud/manage-access/set-up-snowflake-oauth#create-a-security-integration) the refresh tokens' validity, which has a maximum 90-day validity period. - -To resolve the issue, complete the following steps: - -1. Go to your **Profile settings** page (accessible from the gear icon at the upper right corner of dbt Cloud). -2.
Click on the correct warehouse connection under **Credentials**. -3. Click the green **Reconnect Snowflake Account** button in the **Development Credentials** section. This drives you through reauthentication through the SSO flow. - -If you've tried the step above and are still experiencing this behavior, reach out to the Support team at support@getdbt.com for further assistance. diff --git a/website/docs/faqs/Troubleshooting/auth-expired-error.md new file mode 100644 index 00000000000..267407a8c70 --- /dev/null +++ b/website/docs/faqs/Troubleshooting/auth-expired-error.md @@ -0,0 +1,19 @@ +--- +title: Receiving an `authentication has expired` error when trying to run queries in the IDE. +description: "Reauthenticate warehouse when seeing `authentication has expired` error" +sidebar_label: 'Receiving `authentication has expired` error in the IDE' +--- + +If you see an `authentication has expired` error when you try to run queries in the dbt Cloud IDE, this means your [OAuth](/docs/cloud/manage-access/set-up-snowflake-oauth) connection between Snowflake and dbt Cloud has expired. + +To fix this, you must reconnect the two tools. + +Your Snowflake administrator can [configure](/docs/cloud/manage-access/set-up-snowflake-oauth#create-a-security-integration) the refresh tokens' validity, which has a maximum 90-day validity period. + +To resolve the issue, complete the following steps: + +1. Go to your **Profile settings** page, accessible from the navigation menu. +2. Navigate to **Credentials** and click on the project you're experiencing the issue with. +3. Under **Development credentials**, click the **Reconnect Snowflake Account** (green) button. This steps you through reauthentication using the SSO workflow. + +If you've tried these steps and are still getting this error, please contact the Support team at support@getdbt.com for further assistance. diff --git a/website/docs/faqs/Troubleshooting/generate-har-file.md new file mode 100644 index 00000000000..0cb16711942 --- /dev/null +++ b/website/docs/faqs/Troubleshooting/generate-har-file.md @@ -0,0 +1,71 @@ +--- +title: "How to generate HAR files" +description: "How to generate HAR files for debugging" +sidebar_label: 'Generate HAR files' +sidebar_position: 1 +keywords: + - HAR files + - HTTP Archive + - Troubleshooting + - Debugging +--- + +HTTP Archive (HAR) files are used to gather data from users’ browsers, which dbt Support uses to troubleshoot network or resource issues. This includes detailed timing information about the requests made between the browser and the server. + +The following sections describe how to generate HAR files using common browsers such as [Google Chrome](#google-chrome), [Mozilla Firefox](#mozilla-firefox), [Apple Safari](#apple-safari), and [Microsoft Edge](#microsoft-edge). + +:::info +Remove or hide any confidential or personally identifying information before you send the HAR file to dbt Labs. You can edit the file using a text editor. +::: + +### Google Chrome + +1. Open Google Chrome. +2. Click on **View** --> **Developer Tools**. +3. Select the **Network** tab. +4. Ensure that Google Chrome is recording. A red button (🔴) indicates that a recording is already in progress. Otherwise, click **Record network log**. +5. Select **Preserve Log**. +6. Clear any existing logs by clicking **Clear network log** (🚫). +7. Go to the page where the issue occurred and reproduce the issue. +8.
Click **Export HAR** (the down arrow icon) to export the file as HAR. The icon is located on the same row as the **Clear network log** button. +9. Save the HAR file. +10. Upload the HAR file to the dbt Support ticket thread. + +### Mozilla Firefox + +1. Open Firefox. +2. Click the application menu and then **More tools** --> **Web Developer Tools**. +3. In the developer tools docked tab, select **Network**. +4. Go to the page where the issue occurred and reproduce the issue. The page automatically starts recording as you navigate. +5. When you're finished, click **Pause/Resume recording network log**. +6. Right-click anywhere in the **File** column and select **Save All as HAR**. +7. Save the HAR file. +8. Upload the HAR file to the dbt Support ticket thread. + +### Apple Safari + +1. Open Safari. +2. If the **Develop** menu doesn't appear in the menu bar, go to **Safari** and then **Settings**. +3. Click **Advanced**. +4. Select the **Show features for web developers** checkbox. +5. From the **Develop** menu, select **Show Web Inspector**. +6. Click the **Network** tab. +7. Go to the page where the issue occurred and reproduce the issue. +8. When you're finished, click **Export**. +9. Save the file. +10. Upload the HAR file to the dbt Support ticket thread. + +### Microsoft Edge + +1. Open Microsoft Edge. +2. Click the **Settings and more** menu (...) to the right of the toolbar and then select **More tools** --> **Developer tools**. +3. Click **Network**. +4. Ensure that Microsoft Edge is recording. A red button (🔴) indicates that a recording is already in progress. Otherwise, click **Record network log**. +5. Go to the page where the issue occurred and reproduce the issue. +6. When you're finished, click **Stop recording network log**. +7. Click **Export HAR** (the down arrow icon) or press **Ctrl + S** to export the file as HAR. +8. Save the HAR file. +9. Upload the HAR file to the dbt Support ticket thread. + +### Additional resources
Check out the [How to generate a HAR file in Chrome](https://www.loom.com/share/cabdb7be338243f188eb619b4d1d79ca) video for a visual guide on how to generate HAR files in Chrome. diff --git a/website/docs/faqs/Troubleshooting/ide-session-unknown-error.md new file mode 100644 index 00000000000..4165506993c --- /dev/null +++ b/website/docs/faqs/Troubleshooting/ide-session-unknown-error.md @@ -0,0 +1,19 @@ +--- +title: I'm receiving a 'Your IDE session experienced an unknown error and was terminated. Please contact support'. +description: "Add a repository when seeing IDE unknown error" +sidebar_label: 'Receiving unknown error in the IDE' + +--- + +If you're seeing the following error when you launch the dbt Cloud IDE, it could be due to a few scenarios, but it commonly indicates a missing repository: + +```shell + +Your IDE session experienced an unknown error and was terminated. Please contact support. + +``` + +You can try to resolve this by adding a repository, like a [managed repository](/docs/collaborate/git/managed-repository), or connecting your preferred Git account. To add your Git account, navigate to **Project** > **Repository** and select your repository. + + +If you're still running into this error, please contact the Support team at support@getdbt.com for help.
diff --git a/website/docs/faqs/Troubleshooting/job-memory-limits.md b/website/docs/faqs/Troubleshooting/job-memory-limits.md index 06f6a752507..abba43d18cd 100644 --- a/website/docs/faqs/Troubleshooting/job-memory-limits.md +++ b/website/docs/faqs/Troubleshooting/job-memory-limits.md @@ -6,14 +6,14 @@ sidebar_label: 'Job failures due to exceeded memory limits' If you're receiving a `This run exceeded your account's run memory limits` error in your failed job, it means that the job exceeded the [memory limits](/docs/deploy/job-scheduler#job-memory) set for your account. All dbt Cloud accounts have a pod memory of 600Mib and memory limits are on a per run basis. They're typically influenced by the amount of result data that dbt has to ingest and process, which is small but can become bloated unexpectedly by project design choices. -## Common reasons +### Common reasons Some common reasons for higher memory usage are: - dbt run/build: Macros that capture large result sets from run query may not all be necessary and may be memory inefficient. - dbt docs generate: Source or model schemas with large numbers of tables (even if those tables aren't all used by dbt) cause the ingest of very large results for catalog queries. -## Resolution +### Resolution There are various reasons why you could be experiencing this error but they are mostly the outcome of retrieving too much data back into dbt. For example, using the `run_query()` operations or similar macros, or even using database/schemas that have a lot of other non-dbt related tables/views. Try to reduce the amount of data / number of rows retrieved back into dbt by refactoring the SQL in your `run_query()` operation using `group`, `where`, or `limit` clauses. Additionally, you can also use a database/schema with fewer non-dbt related tables/views. @@ -26,5 +26,5 @@ As an additional resource, check out [this example video](https://www.youtube.co If you've tried the earlier suggestions and are still experiencing failed job runs with this error about hitting the memory limits of your account, please [reach out to support](mailto:support@getdbt.com). We're happy to help! -## Additional resources +### Additional resources - [Blog post on how we shaved 90 mins off](https://docs.getdbt.com/blog/how-we-shaved-90-minutes-off-model) diff --git a/website/docs/guides/adapter-creation.md b/website/docs/guides/adapter-creation.md index f737afa0392..278e2a9fe14 100644 --- a/website/docs/guides/adapter-creation.md +++ b/website/docs/guides/adapter-creation.md @@ -556,6 +556,108 @@ While much of dbt's adapter-specific functionality can be modified in adapter ma See [this GitHub discussion](https://github.com/dbt-labs/dbt-core/discussions/5468) for information on the macros required for `GRANT` statements: +### Behavior change flags + +Starting in `dbt-adapters==1.5.0` and `dbt-core==1.8.7`, adapter maintainers can implement their own behavior change flags. Refer to [Behavior changes](https://docs.getdbt.com/reference/global-configs/behavior-changes) for more information. + +Behavior Flags are not intended to be long-living feature flags. They should be implemented with the expectation that the behavior will be the default within an expected period of time. To implement a behavior change flag, you must provide a name for the flag, a default setting (`True` / `False`), an optional source, and a description and/or a link to the flag's documentation on docs.getdbt.com. + +We recommend having a description and documentation link whenever possible. 
The description and/or docs should provide end users context for why the flag exists, why they may see a warning, and why they may want to utilize the behavior flag. Behavior change flags can be implemented by overwriting `_behavior_flags()` on the adapter in `impl.py`: + + + +```python +class ABCAdapter(BaseAdapter): + ... + @property + def _behavior_flags(self) -> List[BehaviorFlag]: + return [ + { + "name": "enable_new_functionality_requiring_higher_permissions", + "default": False, + "source": "dbt-abc", + "description": ( + "The dbt-abc adapter is implementing a new method for sourcing metadata. " + "This is a more performant way for dbt to source metadata but requires higher permissions on the platform. " + "Enabling this without granting the requisite permissions will result in an error. " + "This feature is expected to be required by Spring 2025." + ), + "docs_url": "https://docs.getdbt.com/reference/global-configs/behavior-changes#abc-enable_new_functionality_requiring_higher_permissions", + } + ] +``` + + + +Once a behavior change flag has been implemented, it can be referenced on the adapter both in `impl.py` and in Jinja macros: + + + +```python +class ABCAdapter(BaseAdapter): + ... + def some_method(self, *args, **kwargs): + if self.behavior.enable_new_functionality_requiring_higher_permissions: + # do the new thing + else: + # do the old thing +``` + + + + + +```sql +{% macro some_macro(**kwargs) %} + {% if adapter.behavior.enable_new_functionality_requiring_higher_permissions %} + {# do the new thing #} + {% else %} + {# do the old thing #} + {% endif %} +{% endmacro %} +``` + + + +Every time the behavior flag evaluates to `False,` it warns the user, informing them that a change will occur in the future. + +This warning doesn't display when the flag evaluates to `True` as the user is already in the new experience. + +Recognizing that the warnings can be disruptive and are not always necessary, you can evaluate the flag without triggering the warning. Simply append `.no_warn` to the end of the flag. + + + + +```python + class ABCAdapter(BaseAdapter): + ... + def some_method(self, *args, **kwargs): + if self.behavior.enable_new_functionality_requiring_higher_permissions.no_warn: + # do the new thing + else: + # do the old thing +``` + + + + + +```sql +{% macro some_macro(**kwargs) %} + {% if adapter.behavior.enable_new_functionality_requiring_higher_permissions.no_warn %} + {# do the new thing #} + {% else %} + {# do the old thing #} + {% endif %} +{% endmacro %} +``` + + + +It's best practice to evaluate a behavior flag as few times as possible. This will make it easier to remove once the behavior change has matured. + +As a result, evaluating the flag earlier in the logic flow is easier. Then, take either the old or the new path. While this may create some duplication in code, using behavior flags in this way provides a safer way to implement a change, which we are already admitting is risky or even breaking in nature. + ### Other files #### `profile_template.yml` diff --git a/website/docs/guides/airflow-and-dbt-cloud.md b/website/docs/guides/airflow-and-dbt-cloud.md index 51ac7668aa9..c55940212bc 100644 --- a/website/docs/guides/airflow-and-dbt-cloud.md +++ b/website/docs/guides/airflow-and-dbt-cloud.md @@ -61,14 +61,14 @@ Follow the instructions [here](https://docs.docker.com/desktop/) to install Dock ## Clone the airflow-dbt-cloud repository -Open your terminal and clone the [airflow-dbt-cloud repository](https://github.com/sungchun12/airflow-dbt-cloud). 
This contains example Airflow DAGs that you’ll use to orchestrate your dbt Cloud job. Once cloned, navigate into the `airflow-dbt-cloud` project. +Open your terminal and clone the [airflow-dbt-cloud repository](https://github.com/dbt-labs/airflow-dbt-cloud). This contains example Airflow DAGs that you’ll use to orchestrate your dbt Cloud job. Once cloned, navigate into the `airflow-dbt-cloud` project. ```bash -git clone https://github.com/sungchun12/airflow-dbt-cloud.git +git clone https://github.com/dbt-labs/airflow-dbt-cloud.git cd airflow-dbt-cloud ``` - +For more information about cloning GitHub repositories, refer to "[Cloning a repository](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository)" in the GitHub documentation. ## Start the Docker container @@ -140,29 +140,16 @@ Now you have all the working pieces to get up and running with Airflow + dbt Clo ## Update the placeholders in the sample code - Add your `account_id` and `job_id` to the python file [dbt_cloud_provider_eltml.py](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/dags/dbt_cloud_provider_eltml.py). + Add your `account_id` and `job_id` to the python file [dbt_cloud_run_job.py](https://github.com/dbt-labs/airflow-dbt-cloud/blob/main/dags/dbt_cloud_run_job.py). Both IDs are included inside of the dbt Cloud job URL as shown in the following snippets: ```python # For the dbt Cloud Job URL https://YOUR_ACCESS_URL/#/accounts/16173/projects/36467/jobs/65767/ -# The account_id is 16173 - -# Update line 28 -default_args={"dbt_cloud_conn_id": "dbt_cloud", "account_id": 16173}, -``` - -```python -# For the dbt Cloud Job URL https://YOUR_ACCESS_URL/#/accounts/16173/projects/36467/jobs/65767/ -# The job_id is 65767 - -# Update line 39 -trigger_dbt_cloud_job_run = DbtCloudRunJobOperator( - task_id="trigger_dbt_cloud_job_run", - job_id=65767, - check_interval=10, - timeout=300, - ) +# The account_id is 16173 and the job_id is 65767 +# Update lines 34 and 35 +ACCOUNT_ID = "16173" +JOB_ID = "65767" ``` @@ -247,4 +234,4 @@ Yes, either through [Airflow's email/slack](https://www.astronomer.io/guides/err Check out [this recording](https://www.youtube.com/watch?v=n7IIThR8hGk) of a dbt meetup for some tips. -
\ No newline at end of file + diff --git a/website/docs/guides/azure-synapse-analytics-qs.md b/website/docs/guides/azure-synapse-analytics-qs.md index ea70030d351..4f0285e6623 100644 --- a/website/docs/guides/azure-synapse-analytics-qs.md +++ b/website/docs/guides/azure-synapse-analytics-qs.md @@ -297,7 +297,7 @@ Later, you can connect your business intelligence (BI) tools to these views and #### FAQs {#faq-2} - + diff --git a/website/docs/guides/bigquery-qs.md b/website/docs/guides/bigquery-qs.md index e608efeffc7..19a4ff8fbb0 100644 --- a/website/docs/guides/bigquery-qs.md +++ b/website/docs/guides/bigquery-qs.md @@ -299,7 +299,7 @@ Later, you can connect your business intelligence (BI) tools to these views and #### FAQs {#faq-2} - + diff --git a/website/docs/guides/core-cloud-2.md b/website/docs/guides/core-cloud-2.md index 93e9e92bfa4..cee1e8029c2 100644 --- a/website/docs/guides/core-cloud-2.md +++ b/website/docs/guides/core-cloud-2.md @@ -20,6 +20,20 @@ import CoretoCloudTable from '/snippets/_core-to-cloud-guide-table.md'; + + + - dbt Cloud is the fastest and most reliable way to deploy dbt. It enables you to develop, test, deploy, and explore data products using a single, fully managed service. It also supports: + - Development experiences tailored to multiple personas ([dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or [dbt Cloud CLI](/docs/cloud/cloud-cli-installation)) + - Out-of-the-box [CI/CD workflows](/docs/deploy/ci-jobs) + - The [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) for consistent metrics + - Domain ownership of data with multi-project [dbt Mesh](/best-practices/how-we-mesh/mesh-1-intro) setups + - [dbt Explorer](/docs/collaborate/explore-projects) for easier data discovery and understanding + + Learn more about [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features). +- dbt Core is an open-source tool that enables data teams to define and execute data transformations in a cloud data warehouse following analytics engineering best practices. While this can work well for ‘single players’ and small technical teams, all development happens on a command-line interface, and production deployments must be self-hosted and maintained. This requires significant, costly work that adds up over time to maintain and scale. + + + ## What you'll learn Today thousands of companies, with data teams ranging in size from 2 to 2,000, rely on dbt Cloud to accelerate data work, increase collaboration, and win the trust of the business. Understanding what you'll need to do in order to move between dbt Cloud and your current Core deployment will help you strategize and plan for your move. @@ -182,6 +196,7 @@ This guide should now have given you some insight and equipped you with a framew + Congratulations on finishing this guide, we hope it's given you insight into the considerations you need to take to best plan your move to dbt Cloud. For the next steps, you can continue exploring our 3-part-guide series on moving from dbt Core to dbt Cloud: diff --git a/website/docs/guides/core-to-cloud-1.md b/website/docs/guides/core-to-cloud-1.md index 171e844d7e5..efed66c862a 100644 --- a/website/docs/guides/core-to-cloud-1.md +++ b/website/docs/guides/core-to-cloud-1.md @@ -24,17 +24,19 @@ import CoretoCloudTable from '/snippets/_core-to-cloud-guide-table.md'; + -dbt Cloud is the fastest and most reliable way to deploy dbt. It enables you to develop, test, deploy, and explore data products using a single, fully managed service. 
It also supports: -- Development experiences tailored to multiple personas ([dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or [dbt Cloud CLI](/docs/cloud/cloud-cli-installation)) -- Out-of-the-box [CI/CD workflows](/docs/deploy/ci-jobs) -- The [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) for consistent metrics -- Domain ownership of data with multi-project [dbt Mesh](/best-practices/how-we-mesh/mesh-1-intro) setups -- [dbt Explorer](/docs/collaborate/explore-projects) for easier data discovery and understanding + - dbt Cloud is the fastest and most reliable way to deploy dbt. It enables you to develop, test, deploy, and explore data products using a single, fully managed service. It also supports: + - Development experiences tailored to multiple personas ([dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or [dbt Cloud CLI](/docs/cloud/cloud-cli-installation)) + - Out-of-the-box [CI/CD workflows](/docs/deploy/ci-jobs) + - The [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) for consistent metrics + - Domain ownership of data with multi-project [dbt Mesh](/best-practices/how-we-mesh/mesh-1-intro) setups + - [dbt Explorer](/docs/collaborate/explore-projects) for easier data discovery and understanding -Learn more about [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features). + Learn more about [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features). +- dbt Core is an open-source tool that enables data teams to define and execute data transformations in a cloud data warehouse following analytics engineering best practices. While this can work well for ‘single players’ and small technical teams, all development happens on a command-line interface, and production deployments must be self-hosted and maintained. This requires significant, costly work that adds up over time to maintain and scale. -dbt Core is an open-source tool that enables data teams to define and execute data transformations in a cloud data warehouse following analytics engineering best practices. While this can work well for ‘single players’ and small technical teams, all development happens on a command-line interface, and production deployments must be self-hosted and maintained. This requires significant, costly work that adds up over time to maintain and scale. + ## What you'll learn @@ -57,7 +59,7 @@ This guide outlines the steps you need to take to move from dbt Core to dbt Clou ## Prerequisites - You have an existing dbt Core project connected to a Git repository and data platform supported in [dbt Cloud](/docs/cloud/connect-data-platform/about-connections). -- A [supported version](/docs/dbt-versions/core) of dbt or select [**Versionless**](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) of dbt. +- A [supported version](/docs/dbt-versions/core) of dbt or select [**Versionless**](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) of dbt. - You have a dbt Cloud account. **[Don't have one? Start your free trial today](https://www.getdbt.com/signup)**! ## Account setup @@ -74,9 +76,9 @@ This section outlines the steps to set up your dbt Cloud account and configure i ### Additional configuration Explore these additional configurations for performance and reliability improvements: -1. In **Account settings**, enable [partial parsing](/docs/deploy/deploy-environments#partial-parsing) to only reparse changed files, saving time. +1. 
In **Account settings**, enable [partial parsing](/docs/cloud/account-settings#partial-parsing) to only reparse changed files, saving time. -2. In **Account settings**, enable [Git repo caching](/docs/deploy/deploy-environments#git-repository-caching) for job reliability & third-party outage protection. +2. In **Account settings**, enable [Git repo caching](/docs/cloud/account-settings#git-repository-caching) for job reliability & third-party outage protection. ## Data platform setup @@ -84,8 +86,10 @@ This section outlines the considerations and methods to connect your data platfo 1. In dbt Cloud, set up your [data platform connections](/docs/cloud/connect-data-platform/about-connections) and [environment variables](/docs/build/environment-variables). dbt Cloud can connect with a variety of data platform providers including: - [AlloyDB](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) + - [Amazon Athena](/docs/cloud/connect-data-platform/connect-amazon-athena) (beta) - [Amazon Redshift](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) - [Apache Spark](/docs/cloud/connect-data-platform/connect-apache-spark) + - [Azure Synapse Analytics](/docs/cloud/connect-data-platform/connect-azure-synapse-analytics) - [Databricks](/docs/cloud/connect-data-platform/connect-databricks) - [Google BigQuery](/docs/cloud/connect-data-platform/connect-bigquery) - [Microsoft Fabric](/docs/cloud/connect-data-platform/connect-microsoft-fabric) @@ -138,7 +142,7 @@ The most common data environments are production, staging, and development. The - Streamlining the process of switching between development, staging, and production contexts. - Making it easy to configure environments through the dbt Cloud UI instead of manually editing the `profiles.yml` file. You can also [set up](/reference/dbt-jinja-functions/target) or [customize](/docs/build/custom-target-names) target names in dbt Cloud. - Adding `profiles.yml` attributes to dbt Cloud environment settings with [Extended Attributes](/docs/dbt-cloud-environments#extended-attributes). -- Using [Git repo caching](/docs/dbt-cloud-environments#git-repository-caching) to protect you from third-party outages, Git auth failures, and more. +- Using [Git repo caching](/docs/cloud/account-settings#git-repository-caching) to protect you from third-party outages, Git auth failures, and more. ### Initial setup steps 1. **Set up development environment** — Set up your [development](/docs/dbt-cloud-environments#create-a-development-environment) environment and [development credentials](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud#access-the-cloud-ide). You’ll need this to access your dbt project and start developing. @@ -230,6 +234,8 @@ Explore these additional configurations to optimize your dbt Cloud orchestration Building a custom solution to efficiently check code upon pull requests is complicated. With dbt Cloud, you can enable [continuous integration / continuous deployment (CI/CD)](/docs/deploy/continuous-integration) and configure dbt Cloud to run your dbt projects in a temporary schema when new commits are pushed to open pull requests. + + This build-on-PR functionality is a great way to catch bugs before deploying to production, and an essential tool for data practitioners. 1. Set up an integration with a native Git application (such as Azure DevOps, GitHub, GitLab) and a CI environment in dbt Cloud. 
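The [Extended Attributes](/docs/dbt-cloud-environments#extended-attributes) option mentioned above takes profile-style key-value pairs in YAML, mirroring what you would otherwise set in `profiles.yml`. The snippet below is only an illustrative sketch: the keys accepted depend on your data platform adapter, and every value shown is a placeholder.

```yaml
# Illustrative Extended Attributes entry (placeholder values; valid keys depend on your adapter)
dbname: analytics
schema: dbt_staging
threads: 8
```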
diff --git a/website/docs/guides/core-to-cloud-3.md b/website/docs/guides/core-to-cloud-3.md index 0ea22de8478..7d482d54471 100644 --- a/website/docs/guides/core-to-cloud-3.md +++ b/website/docs/guides/core-to-cloud-3.md @@ -99,11 +99,11 @@ dbt Cloud provides robust orchestration that enables you to schedule, run, and m ### Tips -- Enable [partial parsing](/docs/deploy/deploy-environments#partial-parsing) between jobs in dbt Cloud to significantly speed up project parsing by only processing changed files, optimizing performance for large projects. +- Enable [partial parsing](/docs/cloud/account-settings#partial-parsing) between jobs in dbt Cloud to significantly speed up project parsing by only processing changed files, optimizing performance for large projects. - [Run multiple CI/CD](/docs/deploy/continuous-integration) jobs at the same time which will not block production runs. The Job scheduler automatically cancels stale runs when a newer commit is pushed. This is because each PR will run in its own schema. - dbt Cloud automatically [cancels](/docs/deploy/job-scheduler#run-cancellation-for-over-scheduled-jobs) a scheduled run if the existing run is still executing. This prevents unnecessary, duplicative executions. -- Protect you and your data freshness from third-party outages by enabling dbt Cloud’s [Git repository caching](/docs/deploy/deploy-environments#git-repository-caching), which keeps a cache of the project's Git repository. -- [Link deploy jobs](/docs/deploy/deploy-jobs#trigger-on-job-completion--) across dbt Cloud projects by configuring your job or using the [Create Job API](/dbt-cloud/api-v2#/operations/Create%20Job) to do this. +- Protect you and your data freshness from third-party outages by enabling dbt Cloud’s [Git repository caching](/docs/cloud/account-settings#git-repository-caching), which keeps a cache of the project's Git repository. +- [Link deploy jobs](/docs/deploy/deploy-jobs#trigger-on-job-completion) across dbt Cloud projects by configuring your job or using the [Create Job API](/dbt-cloud/api-v2#/operations/Create%20Job) to do this. - [Rerun your jobs](/docs/deploy/retry-jobs) from the start or the point of failure if your dbt job run completed with a status of **`Error.`** ### Caveats diff --git a/website/docs/guides/custom-cicd-pipelines.md b/website/docs/guides/custom-cicd-pipelines.md index 59a7767c69b..be23524d096 100644 --- a/website/docs/guides/custom-cicd-pipelines.md +++ b/website/docs/guides/custom-cicd-pipelines.md @@ -10,6 +10,9 @@ hide_table_of_contents: true tags: ['dbt Cloud', 'Orchestration', 'CI'] level: 'Intermediate' recently_updated: true +search_weight: "heavy" +keywords: + - bitbucket pipeline, custom pipelines, github, gitlab, azure devops, ci/cd custom pipeline ---
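Several of the tips above lean on the dbt Cloud Administrative API (for example, linking deploy jobs with the Create Job API), and the custom CI/CD pipelines guide below is built around calling dbt Cloud jobs from your own pipeline. A minimal GitHub Actions job that triggers a dbt Cloud job run could look like the following sketch. The workflow name, secret name, and IDs are placeholders, and `cloud.getdbt.com` should be replaced with your access URL if it differs.

```yaml
# Hypothetical GitHub Actions workflow that triggers a dbt Cloud job on merge to main.
# Replace the account ID, job ID, and secret name with your own values.
name: Trigger dbt Cloud job on merge

on:
  push:
    branches:
      - main

jobs:
  trigger-dbt-cloud-job:
    runs-on: ubuntu-latest
    steps:
      - name: Call the dbt Cloud Administrative API
        env:
          DBT_CLOUD_API_TOKEN: ${{ secrets.DBT_CLOUD_API_TOKEN }}
          DBT_CLOUD_ACCOUNT_ID: "16173"   # placeholder account ID
          DBT_CLOUD_JOB_ID: "65767"       # placeholder job ID
        run: |
          # POST to the "trigger job run" endpoint of the dbt Cloud API v2
          curl --fail -X POST \
            "https://cloud.getdbt.com/api/v2/accounts/${DBT_CLOUD_ACCOUNT_ID}/jobs/${DBT_CLOUD_JOB_ID}/run/" \
            -H "Authorization: Token ${DBT_CLOUD_API_TOKEN}" \
            -H "Content-Type: application/json" \
            -d '{"cause": "Triggered by GitHub Actions"}'
```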
@@ -19,7 +22,6 @@ One of the core tenets of dbt is that analytic code should be version controlled A note on parlance in this article since each code hosting platform uses different terms for similar concepts. The terms `pull request` (PR) and `merge request` (MR) are used interchangeably to mean the process of merging one branch into another branch. - ### What are pipelines? Pipelines (which are known by many names, such as workflows, actions, or build steps) are a series of pre-defined jobs that are triggered by specific events in your repository (PR created, commit pushed, branch merged, etc). Those jobs can do pretty much anything your heart desires assuming you have the proper security access and coding chops. diff --git a/website/docs/guides/databricks-qs.md b/website/docs/guides/databricks-qs.md index bb248e09320..ba93ff74540 100644 --- a/website/docs/guides/databricks-qs.md +++ b/website/docs/guides/databricks-qs.md @@ -41,36 +41,33 @@ You can check out [dbt Fundamentals](https://learn.getdbt.com/courses/dbt-fundam ## Create a Databricks workspace -1. Use your existing account or [sign up for a Databricks account](https://databricks.com/). Complete the form with your user information. +1. Use your existing account or [sign up for a Databricks account](https://databricks.com/). Complete the form with your user information and click **Continue**.
-2. For the purpose of this tutorial, you will be selecting AWS as our cloud provider but if you use Azure or GCP internally, please choose one of them. The setup process will be similar. -3. Check your email to complete the verification process. -4. After setting up your password, you will be guided to choose a subscription plan. Select the `Premium` or `Enterprise` plan to access the SQL Compute functionality required for using the SQL warehouse for dbt. We have chosen `Premium` for this tutorial. Click **Continue** after selecting your plan. - -
- +2. On the next screen, select your cloud provider. This tutorial uses AWS as the cloud provider, but if you use Azure or GCP internally, please select your platform. The setup process will be similar. Do not select the **Get started with Community Edition** option, as this will not provide the required compute for this guide. + +
+
-5. Click **Get Started** when you come to this below page and then **Confirm** after you validate that you have everything needed.
+3. Check your email and complete the verification process.
+4. After completing the verification process, you will be brought to the first setup screen. Databricks defaults to the `Premium` plan and you can change the trial to `Enterprise` on this page.
+
- -
-
- +
-6. Now it's time to create your first workspace. A Databricks workspace is an environment for accessing all of your Databricks assets. The workspace organizes objects like notebooks, SQL warehouses, clusters, etc into one place. Provide the name of your workspace and choose the appropriate AWS region and click **Start Quickstart**. You might get the checkbox of **I have data in S3 that I want to query with Databricks**. You do not need to check this off for the purpose of this tutorial.
+5. Now, it's time to create your first workspace. A Databricks workspace is an environment for accessing all of your Databricks assets. The workspace organizes objects like notebooks, SQL warehouses, clusters, and more into one place. Provide the name of your workspace, choose the appropriate AWS region, and click **Start Quickstart**. You might see the checkbox **I have data in S3 that I want to query with Databricks**. You do not need to select it for this tutorial.
- +
-7. By clicking on `Start Quickstart`, you will be redirected to AWS and asked to log in if you haven’t already. After logging in, you should see a page similar to this. +6. By clicking on `Start Quickstart`, you will be redirected to AWS and asked to log in if you haven’t already. After logging in, you should see a page similar to this.
@@ -80,7 +77,7 @@ You can check out [dbt Fundamentals](https://learn.getdbt.com/courses/dbt-fundam If you get a session error and don’t get redirected to this page, you can go back to the Databricks UI and create a workspace from the interface. All you have to do is click **create workspaces**, choose the quickstart, fill out the form and click **Start Quickstart**. ::: -8. There is no need to change any of the pre-filled out fields in the Parameters. Just add in your Databricks password under **Databricks Account Credentials**. Check off the Acknowledgement and click **Create stack**. +7. There is no need to change any of the pre-filled out fields in the Parameters. Just add in your Databricks password under **Databricks Account Credentials**. Check off the Acknowledgement and click **Create stack**.
@@ -89,11 +86,11 @@ If you get a session error and don’t get redirected to this page, you can go b
-10. Go back to the Databricks tab. You should see that your workspace is ready to use. +8. Go back to the Databricks tab. You should see that your workspace is ready to use.
-11. Now let’s jump into the workspace. Click **Open** and log into the workspace using the same login as you used to log into the account. +9. Now let’s jump into the workspace. Click **Open** and log into the workspace using the same login as you used to log into the account. ## Load data @@ -425,7 +422,7 @@ Later, you can connect your business intelligence (BI) tools to these views and #### FAQs {#faq-2} - +
diff --git a/website/docs/guides/debug-errors.md b/website/docs/guides/debug-errors.md index 11f02f325a4..442284fc6ee 100644 --- a/website/docs/guides/debug-errors.md +++ b/website/docs/guides/debug-errors.md @@ -390,4 +390,17 @@ _(More likely for dbt Core users)_ If you just opened a SQL file in the `target/` directory to help debug an issue, it's not uncommon to accidentally edit that file! To avoid this, try changing your code editor settings to grey out any files in the `target/` directory — the visual cue will help avoid the issue. -
\ No newline at end of file +## FAQs + +Here are some useful FAQs to help you debug your dbt project: + +- +- +- +- +- +- +- +- + + diff --git a/website/docs/guides/explorer-qs.md b/website/docs/guides/explorer-qs.md new file mode 100644 index 00000000000..0017831fb95 --- /dev/null +++ b/website/docs/guides/explorer-qs.md @@ -0,0 +1,157 @@ +--- +title: "Quickstart for the dbt Explorer workshop" +id: "explorer-quickstart" +description: "Use this guide to build and define metrics, set up the dbt Cloud Semantic Layer, and query them using Google Sheets." +sidebar_label: "Quickstart dbt Explorer" +icon: 'guides' +hide_table_of_contents: true +tags: ['Explorer', 'Snowflake', 'dbt Cloud','Quickstart'] +keywords: ['dbt Explorer','Mesh','dbt Cloud', 'Snowflake', 'Multi-Project'] +level: 'Beginner' +recently_updated: true +--- + +## Introduction + +Unlock the power of [dbt Explorer](/docs/collaborate/explore-projects) in this hands-on workshop designed for analytics engineers, data analysts, stakeholders, and data leaders. + +This quickstart guide accompanies the Explorer hands-on workshop and helps you dive into a production-level dbt Mesh implementation and discover how to explore your data workflows.⁠ Whether you're looking to streamline your data operations, improve data quality, or self-serve information about your data platform, this workshop will equip you with the tools and knowledge to take your dbt projects to the next level. + +By the end of the guide and workshop, you'll understand how to leverage dbt Explorer and have the confidence to navigate multiple dbt projects, trace dependencies, and identify opportunities to improve performance and data quality. + +### What you'll learn +In this guide, you will learn how to: +- Navigate multiple dbt projects using dbt Explorer +- Self-serve on data documentation +- Trace dependencies at the model and column level +- Identify opportunities to improve performance and data quality + +### Prerequisites +- Familiarity with data platforms + +## Setup +Now we’ll be creating your dbt Cloud account and connecting it to a data warehouse. +1. Go to this URL (sign out if you're already logged in): https://cloud.getdbt.com/coalesce-workshop-signup +2. Enter your first name and last name. +3. Select the **Exploring a dbt Mesh implementation with dbt Explorer** option. +4. Use the passcode provided by the workshop facilitator. +5. Agree to the terms of service and click the **Complete Registration** button. +6. Wait about 30 seconds, you’ll be in the dbt Cloud account for this workshop and already connected to a data warehouse. +7. Toggle into the **Platform project**. Go to the **Deploy** tab and select **Jobs** from the dropdown menu. +8. Run each job you see by clicking on the job and then selecting **Run**. This will run the *upstream* project job in both a production and staging environment. +9. Toggle into the **Analytics project**. Go to the **Deploy** tab and select **Jobs** from the dropdown menu. +10. Run each job you see by clicking on the job and then selecting **Run**. This will run the *downstream* project job in both a production and staging environment. + + + + +## Performance + +dbt Explorer will show you your project's most executed models, longest model executions, most failed models and tests, and most consumed models all in one place: The performance tab. + +### Hands-On +- Trigger the Daily Prod job to run again +- Explore the **Performance** tab on the **Project details** page + - Which model took the longest over the last two weeks? 
Over the last month?
+  - Which model failed the most tests?
+  - Click on the model that took the longest to run in the _Longest model executions_ graph
+    - What is the average duration time over the last two weeks? Over the last month?
+    - How often is the model being built? What is the Model Test Failure Rate?
+
+## Resources
+
+With dbt Explorer, you can view your project's resources (such as models, tests, and metrics), their lineage, and model consumption to gain a better understanding of its latest production state.
+
+Navigate and manage your projects within dbt Cloud to help you and other data developers, analysts, and consumers discover and leverage your dbt resources.
+
+
+### Hands-On
+- Explore the **Model** tab
+  - Pick a model. What’s its row count?
+  - Use the test results drop down to see if this model’s tests passed. What other models does it depend on?
+- Explore the **Tests** tab
+  - What tests do we see? Which tests have warnings? Failures?
+- Explore the **Sources** tab
+  - What sources can we see? Which sources have stale data? Which sources have fresh data?
+- Explore **Exposures**
+  - Use the lineage graph to find an exposure. Which models and metrics does the Exposure reference?
+
+## Lineage
+
+dbt Explorer provides a visualization of your project’s lineage that you can interact with. The nodes in the lineage graph represent the project’s resources and the edges represent the relationships between the nodes. Nodes are color-coded and include iconography according to their resource type.
+
+- Use the search bar and [node selectors](/reference/node-selection/syntax) to filter your DAG.
+- [Lenses](/docs/collaborate/explore-projects#lenses) make it easier to understand your project’s contextual metadata at scale, especially to distinguish a particular model or a subset of models.
+  - Applying a lens adds tags to the nodes, showing metadata like layer values, with color coding to help you distinguish them.
+
+
+- Use the [advanced search](/docs/collaborate/explore-projects#search-resources) feature to locate resources in your project.
+  - Perform hard searches and keyword searches.
+  - All resource names, column names, resource descriptions, warehouse relations, and code matching your search criteria will appear in the center of the page.
+  - Apply filters to fully refine your search.
+- When searching for a column name, the results show all relational nodes containing that column in their schemas.
+
+
+### Hands-On
+- Explore **Project-Level lineage**
+  - Pick a model and review its upstream and downstream dependencies
+  - Which sources does this model depend on? Which models depend on this model?
+- Explore **Lenses**
+  - Apply the Test Status Lenses. Which models passed tests? Which had warnings?
+  - Explore different lenses (Model Layer, Materialization Type, Resource). What information do you see?
+- Explore **Column-Level Lineage**
+  - Navigate to the model’s **Model resource** page and explore the primary key column’s **Column-Level Lineage**
+
+## Multi-project
+Use dbt Explorer to gain a deeper understanding of *all* your dbt Cloud projects with its [multi-project capabilities](/docs/collaborate/explore-multiple-projects).
+- See the number of public, protected, and private models, as well as metrics for each project.
+- View cross-project lineage and navigate between individual projects’ lineage graphs.
+- Explore column-level lineage across projects.
+ +### Hands-On +- In the lineage graph, filter the Platform Project’s Project-Level Lineage for Public models using the `access:public` filter + - Make a note of which models are referenced by the analytics project. +- Explore the Analytics Project’s lineage + - Choose a model in the Platform project referenced by the Analytics project. + - Look at the multi-project column-level lineage of its primary key column. + - Open the Analytics project’s lineage graph. Which models does it reference? + + +## Project recommendations + +These recommendations are designed to build trust in your project and reduce confusion. + +To learn more about the specific suggestions and the reasons behind them, check out [our docs](/docs/collaborate/project-recommendations). + + + +### Hands-On +- Review your project recommendations. +- Find the project recommendation for the model `agg_daily_returned_orders`. +- Add documentation to this model in the `aggregates.yml` file. + +## What's next + + + +Congratulations! You've completed the dbt Explorer workshop. You now have the tools and knowledge to navigate multiple dbt projects, trace dependencies, and identify opportunities to improve performance and data quality. + +You've learned how to: +- Use dbt Explorer to visualize your project’s lineage and interact with the DAG +- Search for resources in your project and apply filters to refine your search +- Explore lenses and find table materializations in your current project +- Navigate multiple dbt projects using dbt Explorer +- Trace dependencies at the model and column level +- Review project recommendations and implement improvements + +For the next steps, you can check out the [dbt Explorer documentation](/docs/collaborate/explore-projects) and [FAQs](/docs/collaborate/dbt-explorer-faqs) to learn more about how to use dbt Explorer. + +Keep an eye out for new features coming out soon, like: +- More [auto-exposure](/docs/collaborate/auto-exposures) integrations (like PowerBI and Tableau). +- [Model query history](/docs/collaborate/model-query-history) for additional warehouses (like Redshift and Databricks) +- Improvements to [data health tiles](/docs/collaborate/data-tile) + + diff --git a/website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md b/website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md index f420b7845a2..60d67218642 100644 --- a/website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md +++ b/website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md @@ -33,7 +33,7 @@ Using Databricks workflows to call the dbt Cloud job API can be useful for sever ## Set up a Databricks secret scope -1. Retrieve **[User API Token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens#user-api-tokens) **or **[Service Account Token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#generating-service-account-tokens) **from dbt Cloud +1. Retrieve **[personal access token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) **or **[Service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#generating-service-account-tokens) **from dbt Cloud 2. Set up a **Databricks secret scope**, which is used to securely store your dbt Cloud API key. 3. 
Enter the **following commands** in your terminal: diff --git a/website/docs/guides/manual-install-qs.md b/website/docs/guides/manual-install-qs.md index b68c43db42d..2e10cdac07c 100644 --- a/website/docs/guides/manual-install-qs.md +++ b/website/docs/guides/manual-install-qs.md @@ -421,7 +421,7 @@ This time, when you performed a `dbt run`, separate views/tables were created fo ### FAQs {#faq-2} - + ### Next steps diff --git a/website/docs/guides/mesh-qs.md b/website/docs/guides/mesh-qs.md index 0d13d043059..47ece7b29ec 100644 --- a/website/docs/guides/mesh-qs.md +++ b/website/docs/guides/mesh-qs.md @@ -300,6 +300,8 @@ To run your first deployment dbt Cloud job, you will need to create a new dbt Cl 5. After the run is complete, click **Explore** from the upper menu bar. You should now see your lineage, tests, and documentation coming through successfully. +For details on how dbt Cloud uses metadata from the Staging environment to resolve references in downstream projects, check out the section on [Staging with downstream dependencies](/docs/collaborate/govern/project-dependencies#staging-with-downstream-dependencies). + ## Reference a public model in your downstream project In this section, you will set up the downstream project, "Jaffle | Finance", and [cross-project reference](/docs/collaborate/govern/project-dependencies) the `fct_orders` model from the foundational project. Navigate to the **Develop** page to set up our project: diff --git a/website/docs/guides/microsoft-fabric-qs.md b/website/docs/guides/microsoft-fabric-qs.md index c3800e63f7a..157ab2e6b89 100644 --- a/website/docs/guides/microsoft-fabric-qs.md +++ b/website/docs/guides/microsoft-fabric-qs.md @@ -314,7 +314,7 @@ Later, you can connect your business intelligence (BI) tools to these views and #### FAQs {#faq-2} - + diff --git a/website/docs/guides/productionize-your-dbt-databricks-project.md b/website/docs/guides/productionize-your-dbt-databricks-project.md index bada787e01f..1e757e9cf0a 100644 --- a/website/docs/guides/productionize-your-dbt-databricks-project.md +++ b/website/docs/guides/productionize-your-dbt-databricks-project.md @@ -109,7 +109,7 @@ The deployment monitor in dbt Cloud offers a higher-level view of your run histo -By adding [status tiles](/docs/deploy/dashboard-status-tiles) to your BI dashboards, you can give stakeholders visibility into the health of your data pipeline without leaving their preferred interface. Status tiles instill confidence in your data and help prevent unnecessary inquiries or context switching. To implement dashboard status tiles, you'll need to have dbt docs with [exposures](/docs/build/exposures) defined. +By adding [data health tiles](/docs/collaborate/data-tile) to your BI dashboards, you can give stakeholders visibility into the health of your data pipeline without leaving their preferred interface. Data tiles instill confidence in your data and help prevent unnecessary inquiries or context switching. To implement dashboard status tiles, you'll need to have dbt docs with [exposures](/docs/build/exposures) defined. 
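The paragraph above notes that data health tiles require [exposures](/docs/build/exposures) to be defined in your dbt project. As a minimal, hedged sketch (the exposure name, URL, owner, and upstream model below are placeholders), an exposure for a BI dashboard might look like this:

```yaml
# models/marts/exposures.yml (illustrative; placeholder names and URL)
version: 2

exposures:
  - name: orders_dashboard
    label: Orders dashboard
    type: dashboard
    maturity: high
    url: https://bi.example.com/dashboards/orders   # placeholder dashboard URL
    description: Revenue and order volume dashboard built on dbt marts.
    depends_on:
      - ref('fct_orders')   # placeholder upstream model
    owner:
      name: Analytics team
      email: analytics@example.com
```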
## Set up notifications diff --git a/website/docs/guides/qs-cloud-cli.md b/website/docs/guides/qs-cloud-cli.md new file mode 100644 index 00000000000..1e2a548114f --- /dev/null +++ b/website/docs/guides/qs-cloud-cli.md @@ -0,0 +1,313 @@ +--- +title: "Coalesce: Quickstart for dbt Cloud CLI" +id: "dbt-cloud-cli" +# time_to_complete: '30 minutes' commenting out until we test +level: 'Beginner' +icon: 'guides' +hide_table_of_contents: true +tags: ['Cloud CLI', 'dbt Cloud','Quickstart'] +recently_updated: true +--- + +
+ +## Introduction + +In this quickstart guide, you'll learn how to configure and use dbt Cloud CLI as part of the Coalesce 24 Workshop. + +It will show you how to: + +- Set up a dbt Cloud sandbox. +- Install the dbt Cloud CLI and connect to dbt Cloud. +- Run commands locally using the dbt Cloud CLI. +- Defer to different production environments. +- Leverage cross-project ref. +- Install dbt Power User. +- Use dbt Power User to supercharge development. + +### Prerequisites​ + +- Familiarity with dbt projects and common commands (for example, `dbt build`) +- Git is installed +- An editor, such as Visual Studio Code (preferred), is installed + +### Related content + +- Learn more with [dbt Learn courses](https://learn.getdbt.com) + +## Install Git and Visual Studio Code (Prerequisites) + +You will need to have Git installed locally and a code editor (preferably Visual Studio Code). + +### Check your installation status + +Run `git --version` in your terminal to check if it's installed. For example: + +
+ +
+ +Check your installed applications for Visual Studio Code (vscode) or another editor. For example: + +
+ +
+
+### Install Git and Visual Studio Code
+
+Navigate to the following Git installation page and install it for your operating system:
+
+https://git-scm.com/downloads
+
+Navigate to the following Visual Studio Code installation page and install it for your operating system.
+
+https://code.visualstudio.com/download
+
+## Set up dbt Cloud (Coalesce Workshop Only)
+
+Let's get set up with a dbt Cloud sandbox that's already connected to a Snowflake account for the workshop.
+
+1. Go to [bit.ly/coalesce-24-sandboxes](https://bit.ly/coalesce-24-sandboxes) to create an account. Make sure you log out of any other dbt Cloud accounts.
+
+   a. Enter your **First Name** and **Last Name**
+
+   b. For **Workshop**, choose **Test driving dbt Cloud CLI and dbt power user** from the dropdown
+
+   c. The **Passcode** will be provided by your facilitators
+
+   d. Accept the terms and click **Complete Registration**
+
+1. Navigate to the platform project by selecting **Project** from the left sidebar and choosing **Platform Analytics**.
+
+1. Select **Deploy >> Runs** to find the created jobs. For each job, click on the job and click **run**.
+
+1. Now repeat for the **Analytics project**. Toggle into the Analytics project.
+
+1. Select **Deploy >> Runs** to find the created jobs. For the single job listed, click on the job and click **run**.
+
+1. Select **Explore** from the navigation and choose XX. Now you can visualize your dbt Mesh. Click into each project to see project-level lineage.
+
+You've now successfully run your project in deployment environments so you can use cross-project ref and deferral later in the workshop.
+
+## Configure dbt Cloud CLI
+
+Now we'll clone the project repository and configure dbt Cloud CLI to connect to your sandbox.
+
+### Clone the repo
+
+1. Navigate to a folder on your computer to clone the repository.
+
+1. In your terminal, run the following command to clone the downstream (analytics) project:
+
+   ```shell
+   git clone https://github.com/dbt-labs/c24-workshops-analytics.git
+   ```
+
+### Install Cloud CLI
+
+1. In dbt Cloud, select Platform Analytics and choose **Develop >> Configure Cloud CLI**.
+
+1. Based on your current local setup, use the following guidance to determine your installation approach:
+
+   a. Check if you have dbt in your PATH by running `dbt --version`
+
+   b. If you don't have dbt in your PATH, we recommend the macOS or Windows installation method.
+
+   c. If you do have dbt in your PATH (global environment), we recommend:
+      1. Uninstalling dbt globally
+      2. Installing dbt Cloud CLI with a Python virtual environment
+
+   d. If you have dbt in a virtual environment, install dbt Cloud CLI with a separate Python virtual environment. Be sure to activate it with `source <venv-path>/bin/activate`.
+
+1. Download the CLI configuration file from the dbt Cloud UI. Save it in your `.dbt` folder.
+
+1. Navigate to the dbt project folder that you cloned earlier and open the `dbt_project.yml` file to add your `project_id`.
+
+### Confirm the installation
+
+Run `dbt compile` to verify your installation.
+
+There you go! You've installed the dbt Cloud CLI! Let's dive in!
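For reference, the configuration step above has you add your `project_id` to `dbt_project.yml`. A minimal, hedged sketch follows (the project name and ID are placeholders; the `dbt-cloud` block mirrors the `defer-env-id` example shown later in this guide):

```yaml
# dbt_project.yml (excerpt) with placeholder project name and ID
name: "analytics"

dbt-cloud:
  project-id: "123456"   # placeholder dbt Cloud project ID
```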
+
+### Additional resources
+Consult the following docs if you run into problems when trying to install the dbt Cloud CLI:
+- [Install dbt Cloud CLI](https://docs.getdbt.com/docs/cloud/cloud-cli-installation)
+- [Configure and use dbt Cloud CLI](https://docs.getdbt.com/docs/cloud/configure-cloud-cli)
+
+## Leverage dbt Cloud CLI
+
+Let's run a few commands together to get comfortable with the dbt Cloud CLI:
+* `dbt debug` — Displays your connection details and information
+* `dbt compile --select stg_campaigns` — Compiles your dbt project
+* `dbt run --select stg_campaigns` — Materializes your dbt models
+* `dbt show --select stg_campaigns` — Previews the results of a model
+* `dbt test --select stg_campaigns` — Executes tests against your materialized models
+
+Now let's dive into some more advanced components of the dbt Cloud CLI.
+
+### Deferral
+
+Deferral is a powerful feature that lets you leverage upstream assets that exist outside of your personal development environment. As a result, you can speed up your development workflows and save on warehouse compute costs. Let's run a few commands using deferral:
+
+1. Run `dbt compile -s stg_campaigns`. Notice how we're able to resolve dependencies in the compiled SQL without seeding `campaigns.csv`.
+1. Now let's modify the `stg_campaigns` model by adding a timestamp:
+   ```sql
+   current_timestamp() as updated_at
+   ```
+
+   Let's build that model with the next command.
+1. Run `dbt build --select stg_campaigns`. We're utilizing deferral and the concept of "statefulness" to check which objects have been modified and to resolve dependencies on upstream assets where they exist.
+
+   By default, the dbt Cloud CLI defers to a [Staging](https://docs.getdbt.com/docs/deploy/deploy-environments#staging-environment) environment if one exists. If not, dbt uses the assets from the Production environment.
+
+   To override which environment the dbt Cloud CLI defers to, you can set a `defer-env-id` key in either your `dbt_project.yml` or `dbt_cloud.yml` file. For example:
+
+   ```yml
+   dbt-cloud:
+     defer-env-id: '123456'
+   ```
+
+### dbt Mesh
+
+You have access to cross-project refs, powered by dbt Cloud metadata (the upstream project must also be declared as a dependency of your project; see the `dependencies.yml` sketch at the end of this section).
+
+1. Open the `agg_campaign_customer_contacts` model.
+1. Find the reference called `{{ ref('platform', 'dim_customers', v=1) }}`.
+1. Run the command:
+
+   ```shell
+   dbt run --select agg_campaign_customer_contacts
+   ```
+
+1. Navigate to dbt Cloud Explorer and find a public model. Let's use the `fct_order_items` model.
+1. Create a new model called `agg_orders` in your project with the following code:
+
+   ```sql
+   with orders as (
+
+       select * from {{ ref('platform', 'fct_order_items') }}
+
+   ),
+
+   final as (
+
+       select
+           customer_key as customer_id,
+           is_return as return_status,
+           count(*) as count_orders
+
+       from orders
+       group by
+           customer_key,
+           is_return
+   )
+
+   select * from final
+   ```
+
+### Linting and fixing SQL files
+
+With SQLFluff built in, you can check your code against a style guide and automatically make fixes.
+
+1. Run the SQLFluff command `lint`:
+
+   ```shell
+   dbt sqlfluff lint models/staging/campaigns/stg_campaigns.sql --dialect snowflake
+   ```
+
+   This identifies tweaks to make in the `stg_campaigns` model.
+2. Run the SQLFluff command `fix`:
+
+   ```shell
+   dbt sqlfluff fix models/staging/campaigns/stg_campaigns.sql --dialect snowflake
+   ```
+
+   This attempts to directly make fixes in the `stg_campaigns` model.
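The cross-project `ref()` calls in the dbt Mesh section above resolve only when the upstream project is declared as a dependency of the downstream project. A minimal sketch, assuming the upstream project is named `platform` as in the examples above:

```yaml
# dependencies.yml in the downstream (analytics) project
projects:
  - name: platform   # upstream dbt Cloud project exposing public models
```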
+ +### Change branches + +You can quickly change branches without fully pushing to your Git provider (such as GitHub): + +```shell +git checkout -b my-new-branch + +git checkout main +``` + +Now you've taken a tour of what you can do with dbt Cloud CLI. Let's dive into dbt Power User next. + +## Install dbt Power User + +Let's get dbt Power User installed to supercharge our workflow. + +1. From Visual Studio Code, click on extensions and search for "Power User for dbt". + +
+ +
+1. Click on install. +1. Click **Switch to dbt Cloud**. You might need to refresh. +
+ +
+1. Complete the setup steps. (Click **Welcome** in VS Code and choose **dbt Power User**.)
+
+ +
+1. Make an account to sign up and get an API Key: https://app.myaltimate.com/register + +1. Copy your API key and enter this into the dbt Power User extension settings. + +Now let's dive in! + +## Leverage dbt Power User + +There is a ton you can do to supercharge your workflow with dbt Cloud. Let's cover some highlights. + +### Preview your upstream/downstream changes + +Open the Power User extension on the left-hand side. You can see the upstream and downstream projects. + +
+ +
+ +### Preview results + +Press Command-Enter (or Control-Enter for Windows) and instantly see the results of your model below. + +
+ +
+ +### SQL visualization + +While looking at a model file, click the Altimate logo in the top right and click **Visualize SQL** to see a breakdown of your SQL model. + +
+ +
+ +### Generate test and documentation YML with user-friendly UX and AI + +At the top of your model file, click on generate documentation for a UI to rapidly create documentation and tests with AI + +
+ +
+ +There is a whole lot more too! Check out the dbt Power User docs here: https://docs.myaltimate.com/ + +## Conclusion + +You've successfully installed dbt Cloud CLI and dbt Power User! Now you can get the benefits of local development _and_ dbt Cloud working together. + +Be on the look out for the following enhancements to dbt Cloud CLI: +- Deeper integration with dbt Explorer for visual interaction +- Support for invoking production jobs directly from the CLI +- Continued optimization for performance and scalability improvements + +
+ + diff --git a/website/docs/guides/redshift-qs.md b/website/docs/guides/redshift-qs.md index 544c18a75d5..8b950472506 100644 --- a/website/docs/guides/redshift-qs.md +++ b/website/docs/guides/redshift-qs.md @@ -43,6 +43,9 @@ Check out [dbt Fundamentals](https://learn.getdbt.com/courses/dbt-fundamentals) 1. Sign in to your [AWS account](https://signin.aws.amazon.com/console) as a root user or an IAM user depending on your level of access. 2. Use a CloudFormation template to quickly set up a Redshift cluster. A CloudFormation template is a configuration file that automatically spins up the necessary resources in AWS. [Start a CloudFormation stack](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?stackName=dbt-workshop&templateURL=https://tpch-sample-data.s3.amazonaws.com/create-dbtworkshop-infr) and you can refer to the [create-dbtworkshop-infr JSON file](https://github.com/aws-samples/aws-modernization-with-dbtlabs/blob/main/resources/cloudformation/create-dbtworkshop-infr) for more template details. +:::tip +To avoid connectivity issues with dbt Cloud, make sure to allow inbound traffic on port 5439 from [dbt Cloud's IP addresses](/docs/cloud/about-cloud/access-regions-ip-addresses) in your Redshift security groups and Network Access Control Lists (NACLs) settings. +::: 3. Click **Next** for each page until you reach the **Select acknowledgement** checkbox. Select **I acknowledge that AWS CloudFormation might create IAM resources with custom names** and click **Create Stack**. You should land on the stack page with a CREATE_IN_PROGRESS status. @@ -165,6 +168,7 @@ Now we are going to load our sample data into the S3 bucket that our Cloudformat select * from jaffle_shop.orders; select * from stripe.payment; ``` + ## Connect dbt Cloud to Redshift 1. Create a new project in [dbt Cloud](/docs/cloud/about-cloud/access-regions-ip-addresses). From **Account settings** (using the gear menu in the top right corner), click **+ New Project**. 2. Enter a project name and click **Continue**. @@ -173,17 +177,19 @@ Now we are going to load our sample data into the S3 bucket that our Cloudformat - **Hostname** — Your entire hostname. - **Port** — `5439` - **Database** — `dbtworkshop`. -
- -
+ + + + :::tip + To avoid connectivity issues with dbt Cloud, make sure to allow inbound traffic on port 5439 from [dbt Cloud's IP addresses](/docs/cloud/about-cloud/access-regions-ip-addresses) in your Redshift security groups and Network Access Control Lists (NACLs) settings. + ::: 5. Set your development credentials. These credentials will be used by dbt Cloud to connect to Redshift. Those credentials (as provided in your CloudFormation output) will be: - **Username** — `dbtadmin` - **Password** — This is the autogenerated password that you used earlier in the guide - **Schema** — dbt Cloud automatically generates a schema name for you. By convention, this is `dbt_`. This is the schema connected directly to your development environment, and it's where your models will be built when running dbt within the Cloud IDE. -
+ -
6. Click **Test Connection**. This verifies that dbt Cloud can access your Redshift cluster. 7. Click **Next** if the test succeeded. If it failed, you might need to check your Redshift settings and credentials. @@ -391,7 +397,7 @@ Later, you can connect your business intelligence (BI) tools to these views and #### FAQs {#faq-2} - + diff --git a/website/docs/guides/serverless-datadog.md b/website/docs/guides/serverless-datadog.md index 10444ccae9a..dcb4a851663 100644 --- a/website/docs/guides/serverless-datadog.md +++ b/website/docs/guides/serverless-datadog.md @@ -108,7 +108,7 @@ Wrote config file fly.toml
## Store secrets The application requires four secrets to be set, using these names: -- `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metdata Only` permission. +- `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [personal access token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metdata Only` permission. - `DBT_CLOUD_AUTH_TOKEN`: the Secret Key for the dbt Cloud webhook you created earlier. - `DD_API_KEY`: the API key you created earlier. - `DD_SITE`: The Datadog site for your organisation, e.g. `datadoghq.com`. diff --git a/website/docs/guides/serverless-pagerduty.md b/website/docs/guides/serverless-pagerduty.md index ffd25f8989c..a4df65e0304 100644 --- a/website/docs/guides/serverless-pagerduty.md +++ b/website/docs/guides/serverless-pagerduty.md @@ -113,7 +113,7 @@ Make note of the Webhook Secret Key for later. ## Store secrets The application requires three secrets to be set, using these names: -- `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metdata Only` permission. +- `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [personal access token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metdata Only` permission. - `DBT_CLOUD_AUTH_TOKEN`: the Secret Key for the dbt Cloud webhook you created earlier. - `PD_ROUTING_KEY`: the integration key for the PagerDuty integration you created earlier. diff --git a/website/docs/guides/sl-snowflake-qs.md b/website/docs/guides/sl-snowflake-qs.md index 782073fdc38..b5a0e559c5b 100644 --- a/website/docs/guides/sl-snowflake-qs.md +++ b/website/docs/guides/sl-snowflake-qs.md @@ -97,7 +97,7 @@ Open a new tab and follow these quick steps for account setup and data loading i ## Prerequisites - You need a [dbt Cloud](https://www.getdbt.com/signup/) Trial, Team, or Enterprise account for all deployments. Contact your representative for Single-tenant setup; otherwise, create an account using this guide. -- Have the correct [dbt Cloud license](/docs/cloud/manage-access/seats-and-users) and [permissions](/docs/cloud/manage-access/self-service-permissions) based on your plan: +- Have the correct [dbt Cloud license](/docs/cloud/manage-access/seats-and-users) and [permissions](/docs/cloud/manage-access/enterprise-permissions) based on your plan: - Enterprise — Developer license with Account Admin permissions. Or "Owner" with a Developer license, assigned Project Creator, Database Admin, or Admin permissions. @@ -339,9 +339,7 @@ If you used Partner Connect, you can skip to [initializing your dbt project](#in ## Initialize your dbt project and start developing -This guide assumes you use the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) to develop your dbt project and define metrics. However, the dbt Cloud IDE doesn't support using [MetricFlow commands](/docs/build/metricflow-commands) to query or preview metrics (support coming soon). 
- -To query and preview metrics in your development tool, you can use the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) to run the [MetricFlow commands](/docs/build/metricflow-commands). +This guide assumes you use the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) to develop your dbt project, define metrics, and query and preview metrics using [MetricFlow commands](/docs/build/metricflow-commands). Now that you have a repository configured, you can initialize your project and start development in dbt Cloud using the IDE: @@ -619,6 +617,11 @@ select * from final In the following steps, semantic models enable you to define how to interpret the data related to orders. It includes entities (like ID columns serving as keys for joining data), dimensions (for grouping or filtering data), and measures (for data aggregations). 1. In the `metrics` sub-directory, create a new file `fct_orders.yml`. + +:::tip +Make sure to save all semantic models and metrics under the directory defined in the [`model-paths`](/reference/project-configs/model-paths) (or a subdirectory of it, like `models/semantic_models/`). If you save them outside of this path, it will result in an empty `semantic_manifest.json` file, and your semantic models or metrics won't be recognized. +::: + 2. Add the following code to that newly created file: @@ -661,7 +664,8 @@ semantic_models: entities: - name: order_id type: primary - - name: customer_id + - name: customer + expr: customer_id type: foreign ``` @@ -686,8 +690,9 @@ semantic_models: entities: - name: order_id type: primary - - name: customer_id - type: foreign + - name: customer + expr: customer_id + type: foreign # Newly added dimensions: - name: order_date @@ -717,7 +722,8 @@ semantic_models: entities: - name: order_id type: primary - - name: customer_id + - name: customer + expr: customer_id type: foreign dimensions: - name: order_date @@ -762,7 +768,11 @@ There are different types of metrics you can configure: Once you've created your semantic models, it's time to start referencing those measures you made to create some metrics: -Add metrics to your `fct_orders.yml` semantic model file: +1. Add metrics to your `fct_orders.yml` semantic model file: + +:::tip +Make sure to save all semantic models and metrics under the directory defined in the [`model-paths`](/reference/project-configs/model-paths) (or a subdirectory of it, like `models/semantic_models/`). If you save them outside of this path, it will result in an empty `semantic_manifest.json` file, and your semantic models or metrics won't be recognized. +::: @@ -777,7 +787,8 @@ semantic_models: entities: - name: order_id type: primary - - name: customer_id + - name: customer + expr: customer_id type: foreign dimensions: - name: order_date @@ -811,21 +822,24 @@ metrics: type: simple label: "order_total" type_params: - measure: order_total + measure: + name: order_total - name: "order_count" description: "number of orders" type: simple label: "order_count" type_params: - measure: order_count + measure: + name: order_count - name: large_orders description: "Count of orders with order total over 20." 
type: simple label: "Large Orders" type_params: - measure: order_count + measure: + name: order_count filter: | - {{ Dimension('order_id__order_total_dim') }} >= 20 + {{ Metric('order_total', group_by=['order_id']) }} >= 20 # Ratio type metric - name: "avg_order_value" label: "avg_order_value" @@ -840,7 +854,8 @@ metrics: description: "The month to date value of all orders" type: cumulative type_params: - measure: order_total + measure: + name: order_total grain_to_date: month # Derived metric - name: "pct_of_orders_that_are_large" @@ -912,12 +927,12 @@ metrics: description: "Unique count of customers placing orders" type: simple type_params: - measure: customers + measure: + name: customers ``` - This semantic model uses simple metrics to focus on customer metrics and emphasizes customer dimensions like name, type, and order dates. It uniquely analyzes customer behavior, lifetime value, and order patterns. ## Test and query metrics @@ -938,15 +953,6 @@ https://github.com/dbt-labs/docs.getdbt.com/blob/current/website/snippets/_sl-ru -
- -What’s happening internally? - -- Merging the code into your main branch allows dbt Cloud to pull those changes and build the definition in the manifest produced by the run.
-- Re-running the job in the deployment environment helps materialize the models, which the metrics depend on, in the data platform. It also makes sure that the manifest is up to date.
-- The Semantic Layer APIs pull in the most recent manifest and enables your integration to extract metadata from it. - -
## Set up dbt Semantic Layer diff --git a/website/docs/guides/snowflake-qs.md b/website/docs/guides/snowflake-qs.md index 01646691c8e..bc27d1e1a4f 100644 --- a/website/docs/guides/snowflake-qs.md +++ b/website/docs/guides/snowflake-qs.md @@ -408,7 +408,7 @@ Later, you can connect your business intelligence (BI) tools to these views and #### FAQs {#faq-2} - + ## Build models on top of sources diff --git a/website/docs/guides/starburst-galaxy-qs.md b/website/docs/guides/starburst-galaxy-qs.md index 3863d80a1b8..316e392483d 100644 --- a/website/docs/guides/starburst-galaxy-qs.md +++ b/website/docs/guides/starburst-galaxy-qs.md @@ -414,7 +414,7 @@ Later, you can connect your business intelligence (BI) tools to these views and #### FAQs {#faq-2} - + diff --git a/website/docs/guides/teradata-qs.md b/website/docs/guides/teradata-qs.md new file mode 100644 index 00000000000..d662f3bce3f --- /dev/null +++ b/website/docs/guides/teradata-qs.md @@ -0,0 +1,673 @@ +--- +title: "Quickstart for dbt Cloud and Teradata" +id: "teradata" +level: 'Beginner' +icon: 'teradata' +tags: ['dbt Cloud','Quickstart','Teradata'] +hide_table_of_contents: true +--- + +
+ +## Introduction + +In this quickstart guide, you'll learn how to use dbt Cloud with Teradata Vantage. It will show you how to: + +- Create a new Teradata Clearscape instance +- Load sample data into your Teradata Database +- Connect dbt Cloud to Teradata. +- Take a sample query and turn it into a model in your dbt project. A model in dbt is a select statement. +- Add tests to your models. +- Document your models. +- Schedule a job to run. + +:::tip Videos for you +You can check out [dbt Fundamentals](https://learn.getdbt.com/courses/dbt-fundamentals) for free if you're interested in course learning with videos. +::: + +### Prerequisites​ + +- You have a [dbt Cloud account](https://www.getdbt.com/signup/). +- You have access to a Teradata Vantage instance. You can provision one for free at https://clearscape.teradata.com. See [the ClearScape Analytics Experience guide](https://developers.teradata.com/quickstarts/get-access-to-vantage/clearscape-analytics-experience/getting-started-with-csae/) for details. + +### Related content + +- Learn more with [dbt Learn courses](https://learn.getdbt.com) +- [How we provision Teradata Clearscape Vantage instance](https://developers.teradata.com/quickstarts/get-access-to-vantage/clearscape-analytics-experience/getting-started-with-csae/) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [Job notifications](/docs/deploy/job-notifications) +- [Source freshness](/docs/deploy/source-freshness) + +## Load data + +The following steps will guide you through how to get the data stored as CSV files in a public S3 bucket and insert it into the tables. + +:::tip SQL IDE + +If you created your Teradata Vantage database instance at https://clearscape.teradata.com and you don't have an SQL IDE handy, use the JupyterLab bundled with your database to execute SQL: + +1. Navigate to [ClearScape Analytics Experience dashboard](https://clearscape.teradata.com/dashboard) and click the **Run Demos** button. The demo will launch JupyterLab. + +2. In JupyterLab, go to **Launcher** by clicking the blue **+** icon in the top left corner. Find the **Notebooks** section and click **Teradata SQL**. + +3. In the notebook's first cell, connect to the database using `connect` magic. You will be prompted to enter your database password when you execute it: + ```ipynb + %connect local + ``` +4. Use additional cells to type and run SQL statements. + +::: + +1. Use your preferred SQL IDE editor to create the database, `jaffle_shop`: + + ```sql + CREATE DATABASE jaffle_shop AS PERM = 1e9; + ``` + +2. 
In `jaffle_shop` database, create three foreign tables and reference the respective csv files located in object storage: + + ```sql + CREATE FOREIGN TABLE jaffle_shop.customers ( + id integer, + first_name varchar (100), + last_name varchar (100), + email varchar (100) + ) + USING ( + LOCATION ('/gs/storage.googleapis.com/clearscape_analytics_demo_data/dbt/raw_customers.csv') + ) + NO PRIMARY INDEX; + + CREATE FOREIGN TABLE jaffle_shop.orders ( + id integer, + user_id integer, + order_date date, + status varchar(100) + ) + USING ( + LOCATION ('/gs/storage.googleapis.com/clearscape_analytics_demo_data/dbt/raw_orders.csv') + ) + NO PRIMARY INDEX; + + CREATE FOREIGN TABLE jaffle_shop.payments ( + id integer, + orderid integer, + paymentmethod varchar (100), + amount integer + ) + USING ( + LOCATION ('/gs/storage.googleapis.com/clearscape_analytics_demo_data/dbt/raw_payments.csv') + ) + NO PRIMARY INDEX; + ``` + +## Connect dbt Cloud to Teradata + +1. Create a new project in dbt Cloud. From **Account settings** (using the gear menu in the top right corner), click **New Project**. +2. Enter a project name and click **Continue**. +3. In **Configure your development environment**, click **Add new connection**. +4. Select **Teradata**, fill in all the required details in the **Settings** section, and test the connection. + + + + + +5. Enter your **Development Credentials** for Teradata with: + * **Username** — The username of Teradata database. + * **Password** — The password of Teradata database. + * **Schema** — The default database to use + + + +6. Click **Test Connection** to verify that dbt Cloud can access your Teradata Vantage instance. +7. If the connection test succeeds, click **Next**. If it fails, check your Teradata settings and credentials. + +## Set up a dbt Cloud managed repository + + + +## Initialize your dbt project​ and start developing + +Now that you have a repository configured, you can initialize your project and start development in dbt Cloud: + +1. Click **Start developing in the IDE**. It might take a few minutes for your project to spin up for the first time as it establishes your git connection, clones your repo, and tests the connection to the warehouse. +2. Above the file tree to the left, click **Initialize your project** to build out your folder structure with example models. +3. Make your initial commit by clicking **Commit and sync**. Use the commit message `initial commit` to create the first commit to your managed repo. Once you’ve created the commit, you can open a branch to add new dbt code. + +## Delete the example models + +You can now delete the files that dbt created when you initialized the project: + +1. Delete the `models/example/` directory. +2. Delete the `example:` key from your `dbt_project.yml` file, and any configurations that are listed under it. + + + + ```yaml + # before + models: + my_new_project: + +materialized: table + example: + +materialized: view + ``` + + + + + + ```yaml + # after + models: + my_new_project: + +materialized: table + ``` + + + +3. Save your changes. +4. Commit your changes and merge to the main branch. + +#### FAQs + + + + + +## Build your first model + +You have two options for working with files in the dbt Cloud IDE: + +- Create a new branch (recommended) — Create a new branch to edit and commit your changes. Navigate to **Version Control** on the left sidebar and click **Create branch**. 
+- Edit in the protected primary branch — If you prefer to edit, format, lint files, or execute dbt commands directly in your primary git branch. The dbt Cloud IDE prevents commits to the protected branch, so you will receive a prompt to commit your changes to a new branch. + +Name the new branch `add-customers-model`. + +1. Click the **...** next to the `models` directory, then select **Create file**. +2. Name the file `bi_customers.sql`, then click **Create**. +3. Copy the following query into the file and click **Save**. + +```sql + +with customers as ( + + select + id as customer_id, + first_name, + last_name + + from jaffle_shop.customers + +), + +orders as ( + + select + id as order_id, + user_id as customer_id, + order_date, + status + + from jaffle_shop.orders + +), + +customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + +), + +final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders on customers.customer_id = customer_orders.customer_id + +) + +select * from final + +``` + +4. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see the three models. + +You can connect your business intelligence (BI) tools to these views and tables so they only read cleaned-up data rather than raw data in your BI tool. + +## Change the way your model is materialized + +One of the most powerful features of dbt is that you can change the way a model is materialized in your warehouse, simply by changing a configuration value. You can change things between tables and views by changing a keyword rather than writing the data definition language (DDL) to do this behind the scenes. + +By default, everything gets created as a view. You can override that at the directory level so everything in that directory will materialize to a different materialization. + +1. Edit your `dbt_project.yml` file. + - Update your project `name` to: + + + ```yaml + name: 'jaffle_shop' + ``` + + + - Configure `jaffle_shop` so everything in it will be materialized as a table; and configure `example` so everything in it will be materialized as a view. Update your `models` config block to: + + + + ```yaml + models: + jaffle_shop: + +materialized: table + ``` + + + - Click **Save**. + +2. Enter the `dbt run` command. Your `bi_customers` model should now be built as a table! + :::info + To do this, dbt had to first run a `drop view` statement (or API call on BigQuery), then a `create table as` statement. + ::: + +3. Edit `models/bi_customers.sql` to override the `dbt_project.yml` for the `customers` model only by adding the following snippet to the top, and click **Save**: + + + + ```sql + {{ + config( + materialized='view' + ) + }} + + with customers as ( + + select + id as customer_id + ... + + ) + + ``` + + + +4. Enter the `dbt run` command. Your model, `bi_customers`, should now build as a view. + +### FAQs + + + + + +## Build models on top of other models + + + +1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in your original query. + + + ```sql + select + id as customer_id, + first_name, + last_name + + from jaffle_shop.customers + ``` + + + +2. 
Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in your original query. + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from jaffle_shop.orders + ``` + + + +3. Edit the SQL in your `models/bi_customers.sql` file as follows: + + + + ```sql + with customers as ( + + select * from {{ ref('stg_customers') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders') }} + + ), + + customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + + ), + + final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders on customers.customer_id = customer_orders.customer_id + + ) + + select * from final + + ``` + + + +4. Execute `dbt run`. + + This time, when you performed a `dbt run`, it created separate views/tables for `stg_customers`, `stg_orders`, and `customers`. dbt inferred the order in which these models should run. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You don’t need to define these dependencies explicitly. + +#### FAQs {#faq-2} + + + + + +## Build models on top of sources + +Sources make it possible to name and describe the data loaded into your warehouse by your extract and load tools. By declaring these tables as sources in dbt, you can: +- Select from source tables in your models using the `{{ source() }}` function, helping define the lineage of your data +- Test your assumptions about your source data +- Calculate the freshness of your source data + +1. Create a new YML file, `models/sources.yml`. +2. Declare the sources by copying the following into the file and clicking **Save**. + + + + ```yml + version: 2 + + sources: + - name: jaffle_shop + description: This is a replica of the Postgres database used by the app + database: raw + schema: jaffle_shop + tables: + - name: customers + description: One record per customer. + - name: orders + description: One record per order. Includes canceled and deleted orders. + ``` + + + +3. Edit the `models/stg_customers.sql` file to select from the `customers` table in the `jaffle_shop` source. + + + + ```sql + select + id as customer_id, + first_name, + last_name + + from {{ source('jaffle_shop', 'customers') }} + ``` + + + +4. Edit the `models/stg_orders.sql` file to select from the `orders` table in the `jaffle_shop` source. + + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from {{ source('jaffle_shop', 'orders') }} + ``` + + + +5. Execute `dbt run`. + + Your `dbt run` results will be the same as those in the previous step. Your `stg_customers` and `stg_orders` + models will still query from the same raw data source in Teradata. By using `source`, you can + test and document your raw data and also understand the lineage of your sources. + + +
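One of the source benefits listed earlier — calculating the freshness of your source data — isn't exercised in this quickstart. As a minimal sketch only: if your raw tables carried a load-timestamp column (the `_etl_loaded_at` column below is hypothetical and not part of the sample CSV files in this guide), you could extend `models/sources.yml` so that `dbt source freshness` can check how recently the data landed:

```yaml
version: 2

sources:
  - name: jaffle_shop
    database: raw
    schema: jaffle_shop
    # Hypothetical column holding the load timestamp; the sample CSVs in this
    # guide don't include one, so treat this block as an illustration only.
    loaded_at_field: _etl_loaded_at
    freshness:
      warn_after: {count: 12, period: hour}
      error_after: {count: 24, period: hour}
    tables:
      - name: customers
      - name: orders
```

With a configuration like this, `dbt source freshness` compares the most recent `_etl_loaded_at` value against the current time and warns or errors when the data is older than the thresholds.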
+ +## Add tests to your models + +Adding [tests](/docs/build/data-tests) to a project helps validate that your models are working correctly. + +To add tests to your project: + +1. Create a new YAML file in the `models` directory, named `models/schema.yml` +2. Add the following contents to the file: + + + + ```yaml + version: 2 + + models: + - name: bi_customers + columns: + - name: customer_id + tests: + - unique + - not_null + + - name: stg_customers + columns: + - name: customer_id + tests: + - unique + - not_null + + - name: stg_orders + columns: + - name: order_id + tests: + - unique + - not_null + - name: status + tests: + - accepted_values: + values: ['placed', 'shipped', 'completed', 'return_pending', 'returned'] + - name: customer_id + tests: + - not_null + - relationships: + to: ref('stg_customers') + field: customer_id + + ``` + + + +3. Run `dbt test`, and confirm that all your tests passed. + +When you run `dbt test`, dbt iterates through your YAML files, and constructs a query for each test. Each query will return the number of records that fail the test. If this number is 0, then the test is successful. + +#### FAQs + + + + + + + + + + +## Document your models + +Adding [documentation](/docs/build/documentation) to your project allows you to describe your models in rich detail, and share that information with your team. Here, we're going to add some basic documentation to our project. + +1. Update your `models/schema.yml` file to include some descriptions, such as those below. + + + + ```yaml + version: 2 + + models: + - name: bi_customers + description: One record per customer + columns: + - name: customer_id + description: Primary key + tests: + - unique + - not_null + - name: first_order_date + description: NULL when a customer has not yet placed an order. + + - name: stg_customers + description: This model cleans up customer data + columns: + - name: customer_id + description: Primary key + tests: + - unique + - not_null + + - name: stg_orders + description: This model cleans up order data + columns: + - name: order_id + description: Primary key + tests: + - unique + - not_null + - name: status + tests: + - accepted_values: + values: ['placed', 'shipped', 'completed', 'return_pending', 'returned'] + - name: customer_id + tests: + - not_null + - relationships: + to: ref('stg_customers') + field: customer_id + ``` + + + +2. Run `dbt docs generate` to generate the documentation for your project. dbt introspects your project and your warehouse to generate a file with rich documentation about your project. + + +3. Click the book icon in the Develop interface to launch documentation in a new tab. + +#### FAQs + + + + + + +## Commit your changes + +Now that you've built your customer model, you need to commit the changes you made to the project so that the repository has your latest code. + +**If you edited directly in the protected primary branch:**
+1. Click the **Commit and sync git** button. This action prepares your changes for commit. +2. A modal titled **Commit to a new branch** will appear. +3. In the modal window, name your new branch `add-customers-model`. This branches off from your primary branch with your new changes. +4. Add a commit message, such as "Add customers model, tests, docs" and commit your changes. +5. Click **Merge this branch to main** to add these changes to the main branch on your repo. + + +**If you created a new branch before editing:**
+1. Since you already branched out of the primary protected branch, go to **Version Control** on the left. +2. Click **Commit and sync** to add a message. +3. Add a commit message, such as "Add customers model, tests, docs." +4. Click **Merge this branch to main** to add these changes to the main branch on your repo. + +## Deploy dbt + +Use dbt Cloud's Scheduler to deploy your production jobs confidently and build observability into your processes. You'll learn to create a deployment environment and run a job in the following steps. + +### Create a deployment environment + +1. In the upper left, select **Deploy**, then click **Environments**. +2. Click **Create Environment**. +3. In the **Name** field, write the name of your deployment environment. For example, "Production." +4. In the **dbt Version** field, select the latest version from the dropdown. +5. Under **Deployment connection**, enter the name of the dataset you want to use as the target, such as `jaffle_shop_prod`. This will allow dbt to build and work with that dataset. +6. Click **Save**. + +### Create and run a job + +Jobs are a set of dbt commands that you want to run on a schedule. For example, `dbt build`. + +As the `jaffle_shop` business gains more customers, and those customers create more orders, you will see more records added to your source data. Because you materialized the `bi_customers` model as a table, you'll need to periodically rebuild your table to ensure that the data stays up-to-date. This update will happen when you run a job. + +1. After creating your deployment environment, you should be directed to the page for a new environment. If not, select **Deploy** in the upper left, then click **Jobs**. +2. Click **+ Create job** and then select **Deploy job**. Provide a name, for example, "Production run", and link it to the Environment you just created. +3. Scroll down to the **Execution Settings** section. +4. Under **Commands**, add this command as part of your job if you don't see it: + * `dbt build` +5. Select the **Generate docs on run** checkbox to automatically [generate updated project docs](/docs/collaborate/build-and-view-your-docs) each time your job runs. +6. For this exercise, do _not_ set a schedule for your project to run — while your organization's project should run regularly, there's no need to run this example project on a schedule. Scheduling a job is sometimes referred to as _deploying a project_. +7. Select **Save**, then click **Run now** to run your job. +8. Click the run and watch its progress under "Run history." +9. Once the run is complete, click **View Documentation** to see the docs for your project. + + +Congratulations 🎉! You've just deployed your first dbt project! + + +#### FAQs + + + + + diff --git a/website/docs/guides/zapier-ms-teams.md b/website/docs/guides/zapier-ms-teams.md index 171ed19193a..500ae4bf9e5 100644 --- a/website/docs/guides/zapier-ms-teams.md +++ b/website/docs/guides/zapier-ms-teams.md @@ -56,7 +56,7 @@ The sample body's values are hard-coded and not reflective of your project, but ## Store secrets -In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). 
+In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [personal access token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). diff --git a/website/docs/guides/zapier-refresh-mode-report.md b/website/docs/guides/zapier-refresh-mode-report.md index c3bd1a11778..23dd19d0b4c 100644 --- a/website/docs/guides/zapier-refresh-mode-report.md +++ b/website/docs/guides/zapier-refresh-mode-report.md @@ -46,7 +46,7 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development. ## Store secrets -In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens), as well as a [Mode API token and secret](https://mode.com/developer/api-reference/authentication/). +In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [personal access token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens), as well as a [Mode API token and secret](https://mode.com/developer/api-reference/authentication/). Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). diff --git a/website/docs/guides/zapier-slack.md b/website/docs/guides/zapier-slack.md index c3e7383c007..e11da493b67 100644 --- a/website/docs/guides/zapier-slack.md +++ b/website/docs/guides/zapier-slack.md @@ -50,7 +50,7 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test The sample body's values are hardcoded and not reflective of your project, but they give Zapier a correctly-shaped object during development. ## Store secrets -In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). +In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [personal access token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps). This prevents your keys from being displayed as plaintext in the Zap code. 
You can access them with the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). @@ -215,7 +215,7 @@ Sometimes dbt Cloud posts the message about the run failing before the run's art A one-minute delay is generally sufficient. ### 5. Store secrets -In the next step, you will need either a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). +In the next step, you will need either a dbt Cloud [personal access token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps). This prevents your keys from being displayed as plaintext in the Zap code. You can access them with the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). diff --git a/website/docs/reference/artifacts/dbt-artifacts.md b/website/docs/reference/artifacts/dbt-artifacts.md index 8d3e1ae29e8..b8998dba261 100644 --- a/website/docs/reference/artifacts/dbt-artifacts.md +++ b/website/docs/reference/artifacts/dbt-artifacts.md @@ -22,12 +22,14 @@ dbt has produced artifacts since the release of dbt-docs in v0.11.0. Starting in ### When are artifacts produced? Most dbt commands (and corresponding RPC methods) produce artifacts: -- [semantic manifest](/docs/dbt-cloud-apis/sl-manifest): produced whenever your dbt project is parsed +- [semantic manifest](/reference/artifacts/sl-manifest): produced whenever your dbt project is parsed - [manifest](/reference/artifacts/manifest-json): produced by commands that read and understand your project - [run results](/reference/artifacts/run-results-json): produced by commands that run, compile, or catalog nodes in your DAG - [catalog](catalog-json): produced by `docs generate` - [sources](/reference/artifacts/sources-json): produced by `source freshness` +When running commands from the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation), all artifacts are downloaded by default. If you want to change this behavior, refer to [How to skip artifacts from being downloaded](/docs/cloud/configure-cloud-cli#how-to-skip-artifacts-from-being-downloaded). + ## Where are artifacts produced? By default, artifacts are written to the `/target` directory of your dbt project. You can configure the location using the [`target-path` flag](/reference/global-configs/json-artifacts). diff --git a/website/docs/reference/artifacts/other-artifacts.md b/website/docs/reference/artifacts/other-artifacts.md index 0216acccff0..e37662ae28c 100644 --- a/website/docs/reference/artifacts/other-artifacts.md +++ b/website/docs/reference/artifacts/other-artifacts.md @@ -39,7 +39,7 @@ Each of those points in time contains the `name` and `type` of each node and `su ### semantic_manifest.json -The [`semantic_manifest.json`](/docs/dbt-cloud-apis/sl-manifest) file is useful as an internal interface between `dbt-core` and MetricFlow. As such, it functions as a behind-the-scenes bridge for interaction between the two systems. You can find all of the `semantic_manifest.json` information in the [`semantic_manifest.json`](/docs/dbt-cloud-apis/sl-manifest). 
+The [`semantic_manifest.json`](/reference/artifacts/sl-manifest) file is useful as an internal interface between `dbt-core` and MetricFlow. As such, it functions as a behind-the-scenes bridge for interaction between the two systems. You can find all of the `semantic_manifest.json` information in the [`semantic_manifest.json`](/reference/artifacts/sl-manifest). There are two reasons why `semantic_manifest.json` exists alongside `manifest.json`: diff --git a/website/docs/reference/artifacts/run-results-json.md b/website/docs/reference/artifacts/run-results-json.md index ff8da3559fa..13ad528d185 100644 --- a/website/docs/reference/artifacts/run-results-json.md +++ b/website/docs/reference/artifacts/run-results-json.md @@ -44,8 +44,6 @@ import RowsAffected from '/snippets/_run-result.md'; - - The run_results.json includes three attributes related to the `applied` state that complement `unique_id`: - `compiled`: Boolean entry of the node compilation status (`False` after parsing, but `True` after compiling). @@ -195,5 +193,3 @@ Here's a printed snippet from the `run_results.json`: } ], ``` - - diff --git a/website/docs/docs/dbt-cloud-apis/sl-manifest.md b/website/docs/reference/artifacts/sl-manifest.md similarity index 73% rename from website/docs/docs/dbt-cloud-apis/sl-manifest.md rename to website/docs/reference/artifacts/sl-manifest.md index e203f4a0754..03e661841c4 100644 --- a/website/docs/docs/dbt-cloud-apis/sl-manifest.md +++ b/website/docs/reference/artifacts/sl-manifest.md @@ -7,26 +7,24 @@ sidebar_label: "Semantic manifest" pagination_next: null --- +**Produced by:** Any command that parses your project. This includes all commands _except_ [`deps`](/reference/commands/deps), [`clean`](/reference/commands/clean), [`debug`](/reference/commands/debug), and [`init`](/reference/commands/init). + dbt creates an [artifact](/reference/artifacts/dbt-artifacts) file called the _Semantic Manifest_ (`semantic_manifest.json`), which MetricFlow requires to build and run metric queries properly for the dbt Semantic Layer. This artifact contains comprehensive information about your dbt Semantic Layer. It is an internal file that acts as the integration point with MetricFlow. By using the semantic manifest produced by dbt Core, MetricFlow will instantiate a data flow plan and generate SQL from Semantic Layer query requests. It's a valuable reference that you can use to understand the structure and details of your data models. Similar to the [`manifest.json` file](/reference/artifacts/manifest-json), the `semantic_manifest.json` file also lives in the [target directory](/reference/global-configs/json-artifacts) of your dbt project where dbt stores various artifacts (such as compiled models and tests) generated during the execution of your project. -## How it's produced - -Just like `manifest.json`, the `semantic_manifest.json` is produced whenever your dbt project is parsed. All dbt commands will parse your project and create a `semantic_manifest.json` file, _except_ [`deps`](/reference/commands/deps), [`clean`](/reference/commands/clean), [`debug`](/reference/commands/debug), and [`init`](/reference/commands/init). - - -## Top level keys +## Top-level keys Top-level keys for the semantic manifest are: - `semantic_models` — Starting points of data with entities, dimensions, and measures, and correspond to models in your dbt project. - `metrics` — Functions combining measures, constraints, and so on to define quantitative indicators. 
- `project_configuration` — Contains information about your project configurations.
-Example target/semantic_manifest.json file +### Example + + ```json { @@ -77,13 +75,42 @@ Top-level keys for the semantic manifest are: ], "metadata": null, "dsi_package_version": {} - } + }, + "saved_queries": [ + { + "name": "name of the saved query", + "query_params": { + "metrics": [ + "metrics used in the saved query" + ], + "group_by": [ + "TimeDimension('model_primary_key__date_column', 'day')", + "Dimension('model_primary_key__metric_one')", + "Dimension('model__dimension')" + ], + "where": null + }, + "description": "Description of the saved query", + "metadata": null, + "label": null, + "exports": [ + { + "name": "saved_query_name", + "config": { + "export_as": "view", + "schema_name": null, + "alias": null + } + } + ] + } + ] } ] } ``` -
+
## Related docs diff --git a/website/docs/reference/commands/build.md b/website/docs/reference/commands/build.md index ae5e5805c31..9f8e83d2abd 100644 --- a/website/docs/reference/commands/build.md +++ b/website/docs/reference/commands/build.md @@ -31,6 +31,9 @@ In DAG order, for selected resources or an entire project. The `build` command supports the `--empty` flag for building schema-only dry runs. The `--empty` flag limits the refs and sources to zero rows. dbt will still execute the model SQL against the target data warehouse but will avoid expensive reads of input data. This validates dependencies and ensures your models will build properly. +import SQLCompilationError from '/snippets/_render-method.md'; + + ## Tests diff --git a/website/docs/reference/commands/cmd-docs.md b/website/docs/reference/commands/cmd-docs.md index cceb8c2ec6e..03e11ae89f0 100644 --- a/website/docs/reference/commands/cmd-docs.md +++ b/website/docs/reference/commands/cmd-docs.md @@ -10,30 +10,28 @@ id: "cmd-docs" The command is responsible for generating your project's documentation website by -1. Copying the website `index.html` file into the `target/` directory -2. Compiling the resources in your project, so that their `compiled_code` will be included in [`manifest.json`](/reference/artifacts/manifest-json) +1. Copying the website `index.html` file into the `target/` directory. +2. Compiling the resources in your project, so that their `compiled_code` will be included in [`manifest.json`](/reference/artifacts/manifest-json). 3. Running queries against database metadata to produce the [`catalog.json`](/reference/artifacts/catalog-json) file, which contains metadata about the tables and views produced by the models in your project. **Example**: + ``` dbt docs generate ``` - - Use the `--select` argument to limit the nodes included within `catalog.json`. When this flag is provided, step (3) will be restricted to the selected nodes. All other nodes will be excluded. Step (2) is unaffected. **Example**: + ```shell dbt docs generate --select +orders ``` - - - Use the `--no-compile` argument to skip re-compilation. When this flag is provided, `dbt docs generate` will skip step (2) described above. **Example**: + ``` dbt docs generate --no-compile ``` @@ -43,26 +41,54 @@ Use the `--empty-catalog` argument to skip running the database queries to popul This is not recommended for production environments, as it means that your documentation will be missing information gleaned from database metadata (the full set of columns in each table, and statistics about those tables). It can speed up `docs generate` in development, when you just want to visualize lineage and other information defined within your project. To learn how to build your documentation in dbt Cloud, refer to [build your docs in dbt Cloud](/docs/collaborate/build-and-view-your-docs). **Example**: + ``` dbt docs generate --empty-catalog ``` ### dbt docs serve -This command starts a webserver on port 8080 to serve your documentation locally and opens the documentation site in your default browser. The webserver is rooted in your `target/` directory. Be sure to run `dbt docs generate` before `dbt docs serve` because the `generate` command produces a [catalog metadata artifact](/reference/artifacts/catalog-json) that the `serve` command depends upon. You will see an error message if the catalog is missing. + +This command starts a webserver on port 8080 to serve your documentation locally and opens the documentation site in your default browser. 
The webserver is rooted in your `target/` directory. Be sure to run `dbt docs generate` before `dbt docs serve` because the `generate` command produces a [catalog metadata artifact](/reference/artifacts/catalog-json) that the `serve` command depends upon. You will see an error message if the catalog is missing. Use the `dbt docs serve` command if you're developing locally with the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) or [dbt Core](/docs/core/installation-overview). The [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) doesn't support this command. **Usage:** + + +``` +dbt docs serve [--profiles-dir PROFILES_DIR] + [--profile PROFILE] [--target TARGET] + [--port PORT] + [--no-browser] +``` + + ``` dbt docs serve [--profiles-dir PROFILES_DIR] [--profile PROFILE] [--target TARGET] + [--host HOST] [--port PORT] [--no-browser] ``` + You may specify a different port using the `--port` flag. **Example**: + ``` dbt docs serve --port 8001 ``` + + + +You may specify a different host using the `--host` flag. + +**Example**: + +```shell +dbt docs serve --host "" +``` + +As of 1.8.1, the default host is `127.0.0.1`. For versions 1.8.0 and prior, the default host was `""`. + diff --git a/website/docs/reference/commands/deps.md b/website/docs/reference/commands/deps.md index 85c103e6337..0cb8e50f7a6 100644 --- a/website/docs/reference/commands/deps.md +++ b/website/docs/reference/commands/deps.md @@ -58,8 +58,6 @@ Updates available for packages: ['tailsdotcom/dbt_artifacts', 'dbt-labs/snowplow Update your versions in packages.yml, then run dbt deps ``` - - ## Predictable package installs Starting in dbt Core v1.7, dbt generates a `package-lock.yml` file in the root of your project. This contains the complete set of resolved packages based on the `packages` configuration in `dependencies.yml` or `packages.yml`. Each subsequent invocation of `dbt deps` will install from the _locked_ set of packages specified in this file. Storing the complete set of required packages (with pinned versions) in version-controlled code ensures predictable installs in production and consistency across all developers and environments. @@ -97,5 +95,3 @@ dbt deps --add-package https://github.com/fivetran/dbt_amplitude@v0.3.0 --source # add package from local dbt deps --add-package /opt/dbt/redshift --source local ``` - - diff --git a/website/docs/reference/commands/init.md b/website/docs/reference/commands/init.md index 8945eb823db..112fff63a38 100644 --- a/website/docs/reference/commands/init.md +++ b/website/docs/reference/commands/init.md @@ -17,15 +17,10 @@ Then, it will: - Create a new folder with your project name and sample files, enough to get you started with dbt - Create a connection profile on your local machine. The default location is `~/.dbt/profiles.yml`. Read more in [configuring your profile](/docs/core/connect-data-platform/connection-profiles). - - When using `dbt init` to initialize your project, include the `--profile` flag to specify an existing `profiles.yml` as the `profile:` key to use instead of creating a new one. For example, `dbt init --profile profile_name`. - - If the profile does not exist in `profiles.yml` or the command is run inside an existing project, the command raises an error. 
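For orientation, here is a minimal sketch of what an existing `profiles.yml` entry might look like for this workflow. The profile name `analytics`, the Postgres adapter, and the credentials below are hypothetical placeholders, not values taken from this page:

```yaml
# ~/.dbt/profiles.yml (sketch) — an existing profile that
# `dbt init --profile analytics` would reuse instead of creating a new one
analytics:
  target: dev
  outputs:
    dev:
      type: postgres                      # any installed adapter works here
      host: localhost
      user: dbt_user
      password: "{{ env_var('DBT_PASSWORD') }}"
      port: 5432
      dbname: analytics
      schema: dbt_dev
      threads: 4
```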
- ## Existing project diff --git a/website/docs/reference/commands/retry.md b/website/docs/reference/commands/retry.md index 8da5d5a77a6..68d18dfd77a 100644 --- a/website/docs/reference/commands/retry.md +++ b/website/docs/reference/commands/retry.md @@ -10,6 +10,8 @@ Retry works with the following commands: - [`build`](/reference/commands/build) - [`compile`](/reference/commands/compile) +- [`clone`](/reference/commands/clone) +- [`docs generate`](/reference/commands/cmd-docs#dbt-docs-generate) - [`seed`](/reference/commands/seed) - [`snapshot`](/reference/commands/build) - [`test`](/reference/commands/test) diff --git a/website/docs/reference/commands/version.md b/website/docs/reference/commands/version.md index 2ed14117828..3847b3cd593 100644 --- a/website/docs/reference/commands/version.md +++ b/website/docs/reference/commands/version.md @@ -13,7 +13,7 @@ The `--version` command-line flag returns information about the currently instal ## Versioning To learn more about release versioning for dbt Core, refer to [How dbt Core uses semantic versioning](/docs/dbt-versions/core#how-dbt-core-uses-semantic-versioning). -If using [versionless dbt Cloud](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless), then `dbt_version` uses the latest (continuous) release version. This also follows semantic versioning guidelines, using the `YYYY.xx.yy` format, where the year is the major version (for example, `2024.04.1234`) +If using [versionless dbt Cloud](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless), then `dbt_version` uses the latest (continuous) release version. This also follows semantic versioning guidelines, using the `YYYY.MM.DD+` format. The year, month, and day represent the date the version was built (for example, `2024.10.28+996c6a8`). The suffix provides an additional unique identification for each build. ## Example usages diff --git a/website/docs/reference/configs-and-properties.md b/website/docs/reference/configs-and-properties.md index 20d762b7462..20892898a51 100644 --- a/website/docs/reference/configs-and-properties.md +++ b/website/docs/reference/configs-and-properties.md @@ -26,9 +26,18 @@ Whereas you can use **configurations** to: Depending on the resource type, configurations can be defined in the dbt project and also in an installed package by: + + +1. Using a [`config` property](/reference/resource-properties/config) in a `.yml` file in the `models/`, `snapshots/`, `seeds/`, `analyses`, or `tests/` directory +2. From the [`dbt_project.yml` file](dbt_project.yml), under the corresponding resource key (`models:`, `snapshots:`, `tests:`, etc) + + + + 1. Using a [`config()` Jinja macro](/reference/dbt-jinja-functions/config) within a `model`, `snapshot`, or `test` SQL file -2. Using a [`config` property](/reference/resource-properties/config) in a `.yml` file +2. Using a [`config` property](/reference/resource-properties/config) in a `.yml` file in the `models/`, `snapshots/`, `seeds/`, `analyses/`, or `tests/` directory. 3. 
From the [`dbt_project.yml` file](dbt_project.yml), under the corresponding resource key (`models:`, `snapshots:`, `tests:`, etc) + ### Config inheritance diff --git a/website/docs/reference/dbt-commands.md b/website/docs/reference/dbt-commands.md index 8386cf61731..ca9a7725eb2 100644 --- a/website/docs/reference/dbt-commands.md +++ b/website/docs/reference/dbt-commands.md @@ -11,7 +11,7 @@ A key distinction with the tools mentioned, is that dbt Cloud CLI and IDE are de ## Parallel execution -dbt Cloud allows for parallel execution of commands, enhancing efficiency without compromising data integrity. This enables you to run multiple commands at the same time, however it's important to understand which commands can be run in parallel and which can't. +dbt Cloud allows for concurrent execution of commands, enhancing efficiency without compromising data integrity. This enables you to run multiple commands at the same time. However, it's important to understand which commands can be run in parallel and which can't. In contrast, [`dbt-core` _doesn't_ support](/reference/programmatic-invocations#parallel-execution-not-supported) safe parallel execution for multiple invocations in the same process, and requires users to manage concurrency manually to ensure data integrity and system stability. diff --git a/website/docs/reference/dbt-jinja-functions/adapter.md b/website/docs/reference/dbt-jinja-functions/adapter.md index 7d2ae696a78..54e1e31fd84 100644 --- a/website/docs/reference/dbt-jinja-functions/adapter.md +++ b/website/docs/reference/dbt-jinja-functions/adapter.md @@ -190,7 +190,7 @@ Drops a schema (or equivalent) in the target database. If the target schema does ```sql -{% do adapter.drop_schema(api.Relation.create(database=target.database, schema="my_schema"))) %} +{% do adapter.drop_schema(api.Relation.create(database=target.database, schema="my_schema")) %} ``` diff --git a/website/docs/reference/dbt-jinja-functions/execute.md b/website/docs/reference/dbt-jinja-functions/execute.md index f99bfa64734..65cd4708dc8 100644 --- a/website/docs/reference/dbt-jinja-functions/execute.md +++ b/website/docs/reference/dbt-jinja-functions/execute.md @@ -9,7 +9,7 @@ description: "Use `execute` to return True when dbt is in 'execute' mode." When you execute a `dbt compile` or `dbt run` command, dbt: -1. Reads all of the files in your project and generates a "manifest" comprised of models, tests, and other graph nodes present in your project. During this phase, dbt uses the `ref` statements it finds to generate the DAG for your project. **No SQL is run during this phase**, and `execute == False`. +1. Reads all of the files in your project and generates a [manifest](/reference/artifacts/manifest-json) comprised of models, tests, and other graph nodes present in your project. During this phase, dbt uses the [`ref`](/reference/dbt-jinja-functions/ref) and [`source`](/reference/dbt-jinja-functions/source) statements it finds to generate the DAG for your project. **No SQL is run during this phase**, and `execute == False`. 2. Compiles (and runs) each node (eg. building models, or running tests). **SQL is run during this phase**, and `execute == True`. Any Jinja that relies on a result being returned from the database will error during the parse phase. 
For example, this SQL will return an error: diff --git a/website/docs/reference/dbt-jinja-functions/model.md b/website/docs/reference/dbt-jinja-functions/model.md index 903851617f2..516981e11e3 100644 --- a/website/docs/reference/dbt-jinja-functions/model.md +++ b/website/docs/reference/dbt-jinja-functions/model.md @@ -11,7 +11,7 @@ description: "`model` is the dbt graph object (or node) for the current model." For example: ```jinja -{% if model.config.materialization == 'view' %} +{% if model.config.materialized == 'view' %} {{ log(model.name ~ " is a view.", info=True) }} {% endif %} ``` diff --git a/website/docs/reference/dbt-jinja-functions/set.md b/website/docs/reference/dbt-jinja-functions/set.md index d85e0539924..fa4de60e968 100644 --- a/website/docs/reference/dbt-jinja-functions/set.md +++ b/website/docs/reference/dbt-jinja-functions/set.md @@ -27,6 +27,10 @@ __Args__: {% do log(my_set) %} {# None #} ``` +``` +{% set email_id = "'admin@example.com'" %} +``` + ### set_strict The `set_strict` context method can be used to convert any iterable to a sequence of iterable elements that are unique (a set). The difference to the `set` context method is that the `set_strict` method will raise an exception on a `TypeError`, if the provided value is not a valid iterable and cannot be converted to a set. diff --git a/website/docs/reference/dbt_project.yml.md b/website/docs/reference/dbt_project.yml.md index 08261dd6932..1bb9dd2cf9c 100644 --- a/website/docs/reference/dbt_project.yml.md +++ b/website/docs/reference/dbt_project.yml.md @@ -14,8 +14,6 @@ Every [dbt project](/docs/build/projects) needs a `dbt_project.yml` file — thi The following example is a list of all available configurations in the `dbt_project.yml` file: - - ```yml @@ -94,77 +92,6 @@ vars: ``` - - - - - - -```yml -[name](/reference/project-configs/name): string - -[config-version](/reference/project-configs/config-version): 2 -[version](/reference/project-configs/version): version - -[profile](/reference/project-configs/profile): profilename - -[model-paths](/reference/project-configs/model-paths): [directorypath] -[seed-paths](/reference/project-configs/seed-paths): [directorypath] -[test-paths](/reference/project-configs/test-paths): [directorypath] -[analysis-paths](/reference/project-configs/analysis-paths): [directorypath] -[macro-paths](/reference/project-configs/macro-paths): [directorypath] -[snapshot-paths](/reference/project-configs/snapshot-paths): [directorypath] -[docs-paths](/reference/project-configs/docs-paths): [directorypath] -[asset-paths](/reference/project-configs/asset-paths): [directorypath] - -[packages-install-path](/reference/project-configs/packages-install-path): directorypath - -[clean-targets](/reference/project-configs/clean-targets): [directorypath] - -[query-comment](/reference/project-configs/query-comment): string - -[require-dbt-version](/reference/project-configs/require-dbt-version): version-range | [version-range] - -[dbt-cloud](/docs/cloud/cloud-cli-installation): - [project-id](/docs/cloud/configure-cloud-cli#configure-the-dbt-cloud-cli): project_id # Required - [defer-env-id](/docs/cloud/about-cloud-develop-defer#defer-in-dbt-cloud-cli): environment_id # Optional - -[quoting](/reference/project-configs/quoting): - database: true | false - schema: true | false - identifier: true | false - -models: - [](/reference/model-configs) - -seeds: - [](/reference/seed-configs) - -snapshots: - [](/reference/snapshot-configs) - -sources: - [](source-configs) - -tests: - 
[](/reference/data-test-configs) - -vars: - [](/docs/build/project-variables) - -[on-run-start](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] -[on-run-end](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] - -[dispatch](/reference/project-configs/dispatch-config): - - macro_namespace: packagename - search_order: [packagename] - -[restrict-access](/docs/collaborate/govern/model-access): true | false - -``` - - - ## Naming convention @@ -177,9 +104,8 @@ It's important to follow the correct YAML naming conventions for the configs in ```yml saved-queries: # Use dashes for resource types in the dbt_project.yml file. my_saved_query: - config: - +cache: - enabled: true + +cache: + enabled: true ``` diff --git a/website/docs/reference/global-configs/about-global-configs.md b/website/docs/reference/global-configs/about-global-configs.md index bbbe63ac439..64d56d002fe 100644 --- a/website/docs/reference/global-configs/about-global-configs.md +++ b/website/docs/reference/global-configs/about-global-configs.md @@ -16,7 +16,7 @@ There is a significant overlap between dbt's flags and dbt's command line option ### Setting flags There are multiple ways of setting flags, which depend on the use case: -- **[Project-level `flags` in `dbt_project.yml`](/reference/global-configs/project-flags):** Define version-controlled defaults for everyone running this project. Preserve [legacy behaviors](/reference/global-configs/legacy-behaviors) until their slated deprecation. +- **[Project-level `flags` in `dbt_project.yml`](/reference/global-configs/project-flags):** Define version-controlled defaults for everyone running this project. Also, opt in or opt out of [behavior changes](/reference/global-configs/behavior-changes) to manage your migration off legacy functionality. - **[Environment variables](/reference/global-configs/environment-variable-configs):** Define different behavior in different runtime environments (development vs. production vs. [continuous integration](/docs/deploy/continuous-integration), or different behavior for different users in development (based on personal preferences). - **[CLI options](/reference/global-configs/command-line-options):** Define behavior specific to _this invocation_. Supported for all dbt commands. @@ -41,7 +41,7 @@ dbt run --no-fail-fast # set to False There are two categories of exceptions: 1. **Flags setting file paths:** Flags for file paths that are relevant to runtime execution (for example, `--log-path` or `--state`) cannot be set in `dbt_project.yml`. To override defaults, pass CLI options or set environment variables (`DBT_LOG_PATH`, `DBT_STATE`). Flags that tell dbt where to find project resources (for example, `model-paths`) are set in `dbt_project.yml`, but as a top-level key, outside the `flags` dictionary; these configs are expected to be fully static and never vary based on the command or execution environment. -2. **Opt-in flags:** Flags opting into [legacy dbt behaviors](/reference/global-configs/legacy-behaviors) can _only_ be defined in `dbt_project.yml`. These are intended to be set in version control and migrated via pull/merge request. Their values should not diverge indefinitely across invocations, environments, or users. +2. **Opt-in flags:** Flags opting in or out of [behavior changes](/reference/global-configs/behavior-changes) can _only_ be defined in `dbt_project.yml`. These are intended to be set in version control and migrated via pull/merge request. 
Their values should not diverge indefinitely across invocations, environments, or users. ### Accessing flags @@ -79,12 +79,12 @@ Because the values of `flags` can differ across invocations, we strongly advise | [print](/reference/global-configs/print-output#suppress-print-messages-in-stdout) | boolean | True | ❌ | `DBT_PRINT` | `--print` | ❌ | | [printer_width](/reference/global-configs/print-output#printer-width) | int | 80 | ✅ | `DBT_PRINTER_WIDTH` | `--printer-width` | ❌ | | [profile](/docs/core/connect-data-platform/connection-profiles#about-profiles) | string | None | ✅ (as top-level key) | `DBT_PROFILE` | `--profile` | ❌ | -| [profiles_dir](/docs/core/connect-data-platform/connection-profiles#about-profiles) | path | None (current dir, then HOME dir) | ❌ | `DBT_PROFILES_DIR` | `--log-path` | ❌ | +| [profiles_dir](/docs/core/connect-data-platform/connection-profiles#about-profiles) | path | None (current dir, then HOME dir) | ❌ | `DBT_PROFILES_DIR` | `--profiles-dir` | ❌ | | [project_dir](/reference/dbt_project.yml) | path | | ❌ | `DBT_PROJECT_DIR` | `--project-dir` | ❌ | | [quiet](/reference/global-configs/logs#suppress-non-error-logs-in-output) | boolean | False | ❌ | `DBT_QUIET` | `--quiet` | ✅ | | [resource-type](/reference/global-configs/resource-type) (v1.8+) | string | None | ❌ | `DBT_RESOURCE_TYPES`

`DBT_EXCLUDE_RESOURCE_TYPES` | `--resource-type`

`--exclude-resource-type` | ✅ | | [send_anonymous_usage_stats](/reference/global-configs/usage-stats) | boolean | True | ✅ | `DBT_SEND_ANONYMOUS_USAGE_STATS` | `--send-anonymous-usage-stats`, `--no-send-anonymous-usage-stats` | ❌ | -| [source_freshness_run_project_hooks](/reference/global-configs/legacy-behaviors#source_freshness_run_project_hooks) | boolean | False | ✅ | ❌ | ❌ | ❌ | +| [source_freshness_run_project_hooks](/reference/global-configs/behavior-changes#source_freshness_run_project_hooks) | boolean | False | ✅ | ❌ | ❌ | ❌ | | [state](/reference/node-selection/defer) | path | none | ❌ | `DBT_STATE`, `DBT_DEFER_STATE` | `--state`, `--defer-state` | ❌ | | [static_parser](/reference/global-configs/parsing#static-parser) | boolean | True | ✅ | `DBT_STATIC_PARSER` | `--static-parser`, `--no-static-parser` | ❌ | | [store_failures](/reference/resource-configs/store_failures) | boolean | False | ✅ (as resource config) | `DBT_STORE_FAILURES` | `--store-failures`, `--no-store-failures` | ✅ | diff --git a/website/docs/reference/global-configs/adapter-behavior-changes.md b/website/docs/reference/global-configs/adapter-behavior-changes.md new file mode 100644 index 00000000000..a755f8cfe50 --- /dev/null +++ b/website/docs/reference/global-configs/adapter-behavior-changes.md @@ -0,0 +1,30 @@ +--- +title: "About adapter-specific behavior changes" +id: "adapter-behavior-changes" +sidebar_label: "Adapter behavior changes" +description: "Adapter-specific behavior changes" +hide_table_of_contents: true +pagination_next: null +pagination_prev: null +--- + + +Some adapters can display behavior changes when certain flags are enabled. The following sections contain details about these adapter-specific behavior changes. + + +
diff --git a/website/docs/reference/global-configs/legacy-behaviors.md b/website/docs/reference/global-configs/behavior-changes.md similarity index 50% rename from website/docs/reference/global-configs/legacy-behaviors.md rename to website/docs/reference/global-configs/behavior-changes.md index 1450fda1459..299674ae9c1 100644 --- a/website/docs/reference/global-configs/legacy-behaviors.md +++ b/website/docs/reference/global-configs/behavior-changes.md @@ -1,9 +1,11 @@ --- -title: "Legacy behaviors" -id: "legacy-behaviors" -sidebar: "Legacy behaviors" +title: "Behavior changes" +id: "behavior-changes" +sidebar: "Behavior changes" --- +import StateModified from '/snippets/_state-modified-compare.md'; + Most flags exist to configure runtime behaviors with multiple valid choices. The right choice may vary based on the environment, user preference, or the specific invocation. Another category of flags provides existing projects with a migration window for runtime behaviors that are changing in newer releases of dbt. These flags help us achieve a balance between these goals, which can otherwise be in tension, by: @@ -12,15 +14,41 @@ Another category of flags provides existing projects with a migration window for - Providing maintainability of dbt software. Every fork in behavior requires additional testing & cognitive overhead that slows future development. These flags exist to facilitate migration from "current" to "better," not to stick around forever. These flags go through three phases of development: -1. **Introduction (disabled by default):** dbt adds logic to support both 'old' + 'new' behaviors. The 'new' behavior is gated behind a flag, disabled by default, preserving the old behavior. +1. **Introduction (disabled by default):** dbt adds logic to support both 'old' and 'new' behaviors. The 'new' behavior is gated behind a flag, disabled by default, preserving the old behavior. 2. **Maturity (enabled by default):** The default value of the flag is switched, from `false` to `true`, enabling the new behavior by default. Users can preserve the 'old' behavior and opt out of the 'new' behavior by setting the flag to `false` in their projects. They may see deprecation warnings when they do so. 3. **Removal (generally enabled):** After marking the flag for deprecation, we remove it along with the 'old' behavior it supported from the dbt codebases. We aim to support most flags indefinitely, but we're not committed to supporting them forever. If we choose to remove a flag, we'll offer significant advance notice. +## What is a behavior change? + +The same dbt project code and the same dbt commands return one result before the behavior change, and they return a different result after the behavior change. + +Examples of behavior changes: +- dbt begins raising a validation _error_ that it didn't previously. +- dbt changes the signature of a built-in macro. Your project has a custom reimplementation of that macro. This could lead to errors, because your custom reimplementation will be passed arguments it cannot accept. +- A dbt adapter renames or removes a method that was previously available on the `{{ adapter }}` object in the dbt-Jinja context. +- dbt makes a breaking change to contracted metadata artifacts by deleting a required field, changing the name or type of an existing field, or removing the default value of an existing field ([README](https://github.com/dbt-labs/dbt-core/blob/37d382c8e768d1e72acd767e0afdcb1f0dc5e9c5/core/dbt/artifacts/README.md#breaking-changes)). 
+- dbt removes one of the fields from [structured logs](/reference/events-logging#structured-logging). + +The following are **not** behavior changes: +- Fixing a bug where the previous behavior was defective, undesirable, or undocumented. +- dbt begins raising a _warning_ that it didn't previously. +- dbt updates the language of human-friendly messages in log events. +- dbt makes a non-breaking change to contracted metadata artifacts by adding a new field with a default, or deleting a field with a default ([README](https://github.com/dbt-labs/dbt-core/blob/37d382c8e768d1e72acd767e0afdcb1f0dc5e9c5/core/dbt/artifacts/README.md#non-breaking-changes)). + +The vast majority of changes are not behavior changes. Because introducing these changes does not require any action on the part of users, they are included in continuous releases of dbt Cloud and patch releases of dbt Core. + +By contrast, behavior change migrations happen slowly, over the course of months, facilitated by behavior change flags. The flags are loosely coupled to the specific dbt runtime version. By setting flags, users have control over opting in (and later opting out) of these changes. + ## Behavior change flags These flags _must_ be set in the `flags` dictionary in `dbt_project.yml`. They configure behaviors closely tied to project code, which means they should be defined in version control and modified through pull or merge requests, with the same testing and peer review. -The following example displays the current flags and their current default values in the latest dbt Cloud and dbt Core versions. To opt out of a specific behavior change, set the values of the flag to `False` in `dbt_project.yml`. You'll continue to see warnings for legacy behaviors that you have opted out of explicitly until you either resolve them (switch the flag to `True`) or choose to silence the warnings using the `warn_error_options.silence` flag. +The following example displays the current flags and their current default values in the latest dbt Cloud and dbt Core versions. To opt out of a specific behavior change, set the values of the flag to `False` in `dbt_project.yml`. You will continue to see warnings for legacy behaviors you’ve opted out of, until you either: + +- Resolve the issue (by switching the flag to `True`) +- Silence the warnings using the `warn_error_options.silence` flag + +Here's an example of the available behavior change flags with their default values: @@ -29,22 +57,48 @@ flags: require_explicit_package_overrides_for_builtin_materializations: False require_model_names_without_spaces: False source_freshness_run_project_hooks: False + restrict_direct_pg_catalog_access: False + require_yaml_configuration_for_mf_time_spines: False ``` -When we use dbt Cloud in the following table, we're referring to accounts that have gone "[Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless)." +When we use dbt Cloud in the following table, we're referring to accounts that have gone "[Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless)." This table outlines which version of dbt Core contains the behavior change or the date the behavior change was added to dbt Cloud. 
| Flag | dbt Cloud: Intro | dbt Cloud: Maturity | dbt Core: Intro | dbt Core: Maturity | |-----------------------------------------------------------------|------------------|---------------------|-----------------|--------------------| -| require_explicit_package_overrides_for_builtin_materializations | 2024.04.141 | 2024.06.192 | 1.6.14, 1.7.14 | 1.8.0 | -| require_resource_names_without_spaces | 2024.05.146 | TBD* | 1.8.0 | 1.9.0 | -| source_freshness_run_project_hooks | 2024.03.61 | TBD* | 1.8.0 | 1.9.0 | +| [require_explicit_package_overrides_for_builtin_materializations](#package-override-for-built-in-materialization) | 2024.04 | 2024.06 | 1.6.14, 1.7.14 | 1.8.0 | +| [require_resource_names_without_spaces](#no-spaces-in-resource-names) | 2024.05 | TBD* | 1.8.0 | 1.9.0 | +| [source_freshness_run_project_hooks](#project-hooks-with-source-freshness) | 2024.03 | TBD* | 1.8.0 | 1.9.0 | +| [Redshift] [restrict_direct_pg_catalog_access](/reference/global-configs/redshift-changes#the-restrict_direct_pg_catalog_access-flag) | 2024.09 | TBD* | dbt-redshift v1.9.0 | 1.9.0 | +| [skip_nodes_if_on_run_start_fails](#failures-in-on-run-start-hooks) | 2024.10 | TBD* | 1.9.0 | TBD* | +| [state_modified_compare_more_unrendered_values](#source-definitions-for-state) | 2024.10 | TBD* | 1.9.0 | TBD* | +| [require_yaml_configuration_for_mf_time_spines](#metricflow-time-spine-yaml) | 2024.10 | TBD* | 1.9.0 | TBD* | When the dbt Cloud Maturity is "TBD," it means we have not yet determined the exact date when these flags' default values will change. Affected users will see deprecation warnings in the meantime, and they will receive emails providing advance warning ahead of the maturity date. In the meantime, if you are seeing a deprecation warning, you can either: - Migrate your project to support the new behavior, and then set the flag to `True` to stop seeing the warnings. - Set the flag to `False`. You will continue to see warnings, and you will retain the legacy behavior even after the maturity date (when the default value changes). +### Failures in on-run-start hooks + +The flag is `False` by default. + +Set the `skip_nodes_if_on_run_start_fails` flag to `True` to skip all selected resources from running if there is a failure on an `on-run-start` hook. + +### Source definitions for state:modified + +:::info + + + +::: + +The flag is `False` by default. + +Set `state_modified_compare_more_unrendered_values` to `True` to reduce false positives during `state:modified` checks (especially when configs differ by target environment like `prod` vs. `dev`). + +Setting the flag to `True` changes the `state:modified` comparison from using rendered values to unrendered values instead. It accomplishes this by persisting `unrendered_config` during model parsing and `unrendered_database` and `unrendered_schema` configs during source parsing. + ### Package override for built-in materialization Setting the `require_explicit_package_overrides_for_builtin_materializations` flag to `True` prevents this automatic override. @@ -91,7 +145,7 @@ The names of dbt resources (models, sources, etc) should contain letters, number Set the `source_freshness_run_project_hooks` flag to `True` to include "project hooks" ([`on-run-start` / `on-run-end`](/reference/project-configs/on-run-start-on-run-end)) in the `dbt source freshness` command execution. 
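For example, a minimal `dbt_project.yml` sketch that opts into running project hooks during `dbt source freshness` could look like the following (shown in isolation; your `flags` dictionary may already contain other behavior change flags):

```yaml
# dbt_project.yml
flags:
  source_freshness_run_project_hooks: True
```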
-If you have specific project [`on-run-start` / `on-run-end`](/reference/project-configs/on-run-start-on-run-end) hooks that should not run before/after `source freshness` command, you can add a conditional check to those hooks: +If you have a specific project [`on-run-start` / `on-run-end`](/reference/project-configs/on-run-start-on-run-end) hooks that should not run before/after `source freshness` command, you can add a conditional check to those hooks: @@ -100,3 +154,13 @@ on-run-start: - '{{ ... if flags.WHICH != 'freshness' }}' ``` + + +### MetricFlow time spine YAML +The `require_yaml_configuration_for_mf_time_spines` flag is set to `False` by default. + +In previous versions (dbt Core 1.8 and earlier), the MetricFlow time spine configuration was stored in a `metricflow_time_spine.sql` file. + +When the flag is set to `True`, dbt will continue to support the SQL file configuration. When the flag is set to `False`, dbt will raise a deprecation warning if it detects a MetricFlow time spine configured in a SQL file. + +The MetricFlow YAML file should have the `time_spine:` field. Refer to [MetricFlow timespine](/docs/build/metricflow-time-spine) for more details. diff --git a/website/docs/reference/global-configs/cache.md b/website/docs/reference/global-configs/cache.md index 349bdcd9d6f..bcec51da3cd 100644 --- a/website/docs/reference/global-configs/cache.md +++ b/website/docs/reference/global-configs/cache.md @@ -6,7 +6,7 @@ sidebar: "Cache" ### Cache population -At the start of runs, dbt caches metadata about all the objects in all the schemas where it might materialize resources (such as models). By default, dbt populates the cache with information on all schemas related to the project. +At the start of runs, dbt caches metadata about all the objects in all the schemas where it might materialize resources (such as models). By default, dbt populates the relational cache with information on all schemas related to the project. There are two ways to optionally modify this behavior: - `POPULATE_CACHE` (default: `True`): Whether to populate the cache at all. To skip cache population entirely, use the `--no-populate-cache` flag or `DBT_POPULATE_CACHE: False`. Note that this does not _disable_ the cache; missed cache lookups will run queries, and update the cache afterward. @@ -27,3 +27,11 @@ Or, to improve speed and performance while focused on developing Salesforce mode dbt --cache-selected-only run --select salesforce ``` + +### Logging relational cache events + +import LogLevel from '/snippets/_log-relational-cache.md'; + + diff --git a/website/docs/reference/global-configs/databricks-changes.md b/website/docs/reference/global-configs/databricks-changes.md new file mode 100644 index 00000000000..ca24b822ae5 --- /dev/null +++ b/website/docs/reference/global-configs/databricks-changes.md @@ -0,0 +1,26 @@ +--- +title: "Databricks adapter behavior changes" +id: "databricks-changes" +sidebar: "Databricks" +--- + +The following are the current [behavior change flags](/docs/reference/global-configs/behavior-changes.md#behavior-change-flags) that are specific to `dbt-databricks`: + +| Flag | `dbt-databricks`: Intro | `dbt-databricks`: Maturity | +| ----------------------------- | ----------------------- | -------------------------- | +| `use_info_schema_for_columns` | 1.9.0 | TBD | +| `use_user_folder_for_python` | 1.9.0 | TBD | + +### Use information schema for columns + +The `use_info_schema_for_columns` flag is `False` by default. 
+ +Setting this flag to `True` will use `information_schema` rather than `describe extended` to get column metadata for Unity Catalog tables. This setting helps you avoid issues where `describe extended` truncates information when the type is a complex struct. However, this setting is not yet the default behavior, as there are performance impacts due to a Databricks metadata limitation because of the need to run `REPAIR TABLE {{relation}} SYNC METADATA` before querying to ensure the `information_schema` is complete. + +This flag will become the default behavior when this additional query is no longer needed. + +### Use user's folder for Python model notebooks + +The `use_user_folder_for_python` flag is `False` by default and results in writing uploaded python model notebooks to `/Shared/dbt_python_models/{{schema}}/`. Setting this flag to `True` will write notebooks to `/Users/{{current user}}/{{catalog}}/{{schema}}/` Writing to the `Shared` folder is deprecated by Databricks as it does not align with governance best practices. + +We plan to promote this flag to maturity in v1.10.0. diff --git a/website/docs/reference/global-configs/logs.md b/website/docs/reference/global-configs/logs.md index 972a731854d..682b9fc8393 100644 --- a/website/docs/reference/global-configs/logs.md +++ b/website/docs/reference/global-configs/logs.md @@ -137,11 +137,11 @@ You can use either of these parameters to ensure clean output that's compatible ### Logging relational cache events -The `LOG_CACHE_EVENTS` config allows detailed logging for [relational cache](/reference/global-configs/cache) events, which are disabled by default. +import LogLevel from '/snippets/_log-relational-cache.md'; -```text -dbt --log-cache-events compile -``` +relational cache} +/> ### Color diff --git a/website/docs/reference/global-configs/project-flags.md b/website/docs/reference/global-configs/project-flags.md index 896276d9735..cdbe3463b14 100644 --- a/website/docs/reference/global-configs/project-flags.md +++ b/website/docs/reference/global-configs/project-flags.md @@ -17,7 +17,7 @@ flags: Reference the [table of all flags](/reference/global-configs/about-global-configs#available-flags) to see which global configs are available for setting in [`dbt_project.yml`](/reference/dbt_project.yml). -The `flags` dictionary is the _only_ place you can opt out of [behavior changes](/reference/global-configs/legacy-behaviors), while the legacy behavior is still supported. +The `flags` dictionary is the _only_ place you can opt out of [behavior changes](/reference/global-configs/behavior-changes), while the legacy behavior is still supported. diff --git a/website/docs/reference/global-configs/redshift-changes.md b/website/docs/reference/global-configs/redshift-changes.md new file mode 100644 index 00000000000..2ba7c0f165f --- /dev/null +++ b/website/docs/reference/global-configs/redshift-changes.md @@ -0,0 +1,11 @@ +--- +title: "Amazon Redshift adapter behavior changes" +id: "redshift-changes" +sidebar: "Redshift" +--- + +## The restrict_direct_pg_catalog_access flag + +Originally, the `dbt-redshift` adapter was built on top of the `dbt-postgres` adapter and used Postgres tables for metadata access. When this flag is enabled, the adapter uses the Redshift API (through the Python client) if available, or queries Redshift's `information_schema` tables instead of using the `pg_` tables. 
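As a sketch, opting into this flag looks like any other behavior change flag in `dbt_project.yml`:

```yaml
# dbt_project.yml
flags:
  restrict_direct_pg_catalog_access: True
```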
+ +While you shouldn't notice any behavior changes due to this change, however, to be cautious dbt Labs is gating it behind a behavior-change flag and encouraging you to test it before it becoming the default. diff --git a/website/docs/reference/global-configs/resource-type.md b/website/docs/reference/global-configs/resource-type.md index eb3562b5175..431b6c049cb 100644 --- a/website/docs/reference/global-configs/resource-type.md +++ b/website/docs/reference/global-configs/resource-type.md @@ -4,7 +4,17 @@ id: "resource-type" sidebar: "resource type" --- -The `--resource-type` and `--exclude-resource-type` flags include or exclude resource types from the `dbt build`, `dbt clone`, and `dbt list` commands. + + +The `--resource-type` and `--exclude-resource-type` flags include or exclude resource types from the `dbt build`, `dbt clone`, and `dbt list` commands. In Versionless and from dbt v1.9 onwards, these flags are also supported in the `dbt test` command. + + + + + +The `--resource-type` and `--exclude-resource-type` flags include or exclude resource types from the `dbt build`, `dbt test`, `dbt clone`, and `dbt list` commands. + + This means the flags enable you to specify which types of resources to include or exclude when running the commands, instead of targeting specific resources. @@ -14,20 +24,7 @@ The `--exclude-resource-type` flag is only available in dbt version 1.8 and high The available resource types are: - - -- [`analysis`](/docs/build/analyses) -- [`exposure`](/docs/build/exposures) -- [`metric`](/docs/build/metrics-overview) -- [`model`](/docs/build/models) -- [`seed`](/docs/build/seeds) -- [`snapshot`](/docs/build/snapshots) -- [`source`](/docs/build/sources) -- [`test`](/docs/build/data-tests) - - - - + - [`analysis`](/docs/build/analyses) - [`exposure`](/docs/build/exposures) @@ -53,49 +50,80 @@ The available resource types are: - [`semantic_model`](/docs/build/semantic-models) - [`snapshot`](/docs/build/snapshots) - [`source`](/docs/build/sources) -- [`data_test`](/docs/build/data-tests) +- [`test`](/docs/build/data-tests) - [`unit_test`](/docs/build/unit-tests) ## Example -Instead of targeting specific resources, use the `--resource-flag` or `--exclude-resource-type` flags to target all resources of a certain type: `dbt build --resource-type RESOURCE_TYPE` replacing `RESOURCE_TYPE` with the resource type you want to exclude. +Instead of targeting specific resources, use the `--resource-flag` or `--exclude-resource-type` flags to target all resources of a certain type: `dbt build --resource-type RESOURCE_TYPE` replacing `RESOURCE_TYPE` with the resource type you want to include. - For example, use the following command to include _all_ snapshots from your dbt build process: - + -```text -dbt build --resource-type snapshot -``` + ```text + dbt build --resource-type snapshot + ``` - + - - In this example, run the following command to include _all_ saved queries with the `--resource-type` flag: - + + + ```text + dbt build --resource-type saved_query + ``` + + + + + +- In this example, use the following command to exclude _all_ unit tests from your dbt build process. 
Note that the `--exclude-resource-type` flag is only available in dbt version 1.8 and higher: + + + + ```text + dbt build --exclude-resource-type unit_test + ``` -```text -dbt build --resource-type saved_query -``` + - +- In this example, use the following command to include all data tests in your build process: + + + + ```text + dbt build --resource-type test + ``` + + - + + +- In this example, use the following command to exclude _all_ unit tests when running tests: + + + + ```text + dbt test --exclude-resource-type unit_test + ``` + + -- In this example, use the following command to exclude _all_ unit tests, from your dbt build process. Note that the `--exclude-resource-type` flag is only available in dbt version 1.8 and higher: +- In this example, use the following command to include all data tests when running tests: - + -```text -dbt build --exclude-resource-type unit_test -``` + ```text + dbt test --resource-type test + ``` - + diff --git a/website/docs/reference/global-configs/usage-stats.md b/website/docs/reference/global-configs/usage-stats.md index 62ead8834a6..73610c29586 100644 --- a/website/docs/reference/global-configs/usage-stats.md +++ b/website/docs/reference/global-configs/usage-stats.md @@ -25,6 +25,6 @@ For full transparency, you can see all the event definitions in [`tracking.py`]( send_anonymous_usage_stats: False ``` - dbt Core users can also use the` DO_NOT_TRACK` environment variable to enable or disable sending anonymous data. For more information, see [Environment variables](/docs/build/environment-variables). + dbt Core users can also use the `DO_NOT_TRACK` environment variable to enable or disable sending anonymous data. For more information, see [Environment variables](/docs/build/environment-variables). `DO_NOT_TRACK=1` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=False` diff --git a/website/docs/reference/global-configs/warnings.md b/website/docs/reference/global-configs/warnings.md index 97eb270338e..d432432d25f 100644 --- a/website/docs/reference/global-configs/warnings.md +++ b/website/docs/reference/global-configs/warnings.md @@ -46,7 +46,6 @@ flags: error: # Previously called "include" warn: # Previously called "exclude" silence: # To silence or ignore warnings - - TestsConfigDeprecation - NoNodesForSelectionCriteria ``` @@ -131,7 +130,6 @@ config: warn: # Previously called "exclude" - NoNodesForSelectionCriteria silence: # Silence or ignore warnings - - TestsConfigDeprecation - NoNodesForSelectionCriteria ``` diff --git a/website/docs/reference/model-configs.md b/website/docs/reference/model-configs.md index 0746fe92036..65133dcb25a 100644 --- a/website/docs/reference/model-configs.md +++ b/website/docs/reference/model-configs.md @@ -35,6 +35,7 @@ models: [](/reference/resource-configs/resource-path): [+](/reference/resource-configs/plus-prefix)[materialized](/reference/resource-configs/materialized): [+](/reference/resource-configs/plus-prefix)[sql_header](/reference/resource-configs/sql_header): + [+](/reference/resource-configs/plus-prefix)[on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail #only for materialized views on supported adapters ``` @@ -55,6 +56,7 @@ models: config: [materialized](/reference/resource-configs/materialized): [sql_header](/reference/resource-configs/sql_header): + [on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail #only for materialized views on supported adapters ``` @@ -72,6 +74,7 @@ models: {{ config( 
[materialized](/reference/resource-configs/materialized)="", [sql_header](/reference/resource-configs/sql_header)="" + [on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail #only for materialized views for supported adapters ) }} ``` @@ -136,8 +139,8 @@ models: config: [enabled](/reference/resource-configs/enabled): true | false [tags](/reference/resource-configs/tags): | [] - [pre-hook](/reference/resource-configs/pre-hook-post-hook): | [] - [post-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [pre_hook](/reference/resource-configs/pre-hook-post-hook): | [] + [post_hook](/reference/resource-configs/pre-hook-post-hook): | [] [database](/reference/resource-configs/database): [schema](/reference/resource-properties/schema): [alias](/reference/resource-configs/alias): diff --git a/website/docs/reference/model-properties.md b/website/docs/reference/model-properties.md index 46fb0ca3bad..9ec0c667360 100644 --- a/website/docs/reference/model-properties.md +++ b/website/docs/reference/model-properties.md @@ -38,9 +38,15 @@ models: - - ... # declare additional data tests [tags](/reference/resource-configs/tags): [] + + # only required in conjunction with time_spine key + granularity: <[any supported time granularity](/docs/build/dimensions?dimension=time_gran)> - name: ... # declare properties of additional columns + [time_spine](/docs/build/metricflow-time-spine): + standard_granularity_column: + [versions](/reference/resource-properties/versions): - [v](/reference/resource-properties/versions#v): # required [defined_in](/reference/resource-properties/versions#defined-in): @@ -74,7 +80,3 @@ models: - diff --git a/website/docs/reference/node-selection/defer.md b/website/docs/reference/node-selection/defer.md index 1653bf7e04c..863494de12e 100644 --- a/website/docs/reference/node-selection/defer.md +++ b/website/docs/reference/node-selection/defer.md @@ -31,7 +31,7 @@ dbt test --models [...] --defer --state path/to/artifacts When the `--defer` flag is provided, dbt will resolve `ref` calls differently depending on two criteria: 1. Is the referenced node included in the model selection criteria of the current run? -2. Does the reference node exist as a database object in the current environment? +2. Does the referenced node exist as a database object in the current environment? If the answer to both is **no**—a node is not included _and_ it does not exist as a database object in the current environment—references to it will use the other namespace instead, provided by the state manifest. @@ -71,8 +71,6 @@ group by 1 I want to test my changes. Nothing exists in my development schema, `dev_alice`. -### test - +### test + I also have a `relationships` test that establishes referential integrity between `model_a` and `model_b`: @@ -223,4 +223,5 @@ dbt will check to see if `dev_alice.model_a` exists. 
If it doesn't exist, dbt wi ## Related docs - [Using defer in dbt Cloud](/docs/cloud/about-cloud-develop-defer) +- [on_configuration_change](/reference/resource-configs/on_configuration_change) diff --git a/website/docs/reference/node-selection/methods.md b/website/docs/reference/node-selection/methods.md index 37f50f734e7..7587a9fd2b1 100644 --- a/website/docs/reference/node-selection/methods.md +++ b/website/docs/reference/node-selection/methods.md @@ -44,13 +44,8 @@ Use the `resource_type` method to select nodes of a particular type (`model`, `t ```bash dbt build --select "resource_type:exposure" # build all resources upstream of exposures -dbt list --select "resource_type:test" # list all tests in your project -``` - -Note: This method doesn't work for sources, so use the [`--resource-type`](/reference/commands/list) option of the list command instead: - - ```bash -dbt list --resource-type source +dbt list --select "resource_type:test" # list all tests in your project +dbt list --select "resource_type:source" # list all sources in your project ``` ### The "path" method @@ -315,10 +310,6 @@ dbt list --select "+semantic_model:orders" # list your semantic model named "or ``` ### The "saved_query" method - -Supported in v1.7 or newer. - - The `saved_query` method selects [saved queries](/docs/build/saved-queries). @@ -327,8 +318,6 @@ dbt list --select "saved_query:*" # list all saved queries dbt list --select "+saved_query:orders_saved_query" # list your saved query named "orders_saved_query" and all upstream resources ``` - - ### The "unit_test" method diff --git a/website/docs/reference/node-selection/state-comparison-caveats.md b/website/docs/reference/node-selection/state-comparison-caveats.md index b0abde03aa0..adaf35bd710 100644 --- a/website/docs/reference/node-selection/state-comparison-caveats.md +++ b/website/docs/reference/node-selection/state-comparison-caveats.md @@ -2,6 +2,8 @@ title: "Caveats to state comparison" --- +import StateModified from '/snippets/_state-modified-compare.md'; + The [`state:` selection method](/reference/node-selection/methods#the-state-method) is a powerful feature, with a lot of underlying complexity. Below are a handful of considerations when setting up automated jobs that leverage state comparison. ### Seeds @@ -44,12 +46,21 @@ dbt test -s "state:modified" --exclude "test_name:relationships" ### False positives + + +To reduce false positives during `state:modified` selection due to env-aware logic, you can set the `state_modified_compare_more_unrendered_values` [behavior flag](/reference/global-configs/behavior-changes#behavior-change-flags) to `True`. + + + + + + State comparison works by identifying discrepancies between two manifests. Those discrepancies could be the result of: 1. Changes made to a project in development -2. Env-aware logic that causes different behavior based on the `target`, env vars, etc. +2. Env-aware logic that causes different behavior based on the `target`, env vars, etc., which can be avoided if you upgrade to dbt Core 1.9 and set the `state_modified_compare_more_unrendered_values` [behavior flag](/reference/global-configs/behavior-changes#behavior-change-flags) to `True`. -State comparison detects env-aware config in `dbt_project.yml`. This target-based config registers as a modification: +State comparison detects env-aware config in `dbt_project.yml`. 
This target-based config won't register as a modification: @@ -73,6 +84,7 @@ That means the following config—functionally identical to the snippet above— materialized = ('table' if target.name == 'prod' else 'view') ) }} ``` + ### Final note diff --git a/website/docs/reference/programmatic-invocations.md b/website/docs/reference/programmatic-invocations.md index 09e41b1789f..61250e6debb 100644 --- a/website/docs/reference/programmatic-invocations.md +++ b/website/docs/reference/programmatic-invocations.md @@ -25,9 +25,9 @@ for r in res.result: ## Parallel execution not supported -[`dbt-core`](https://pypi.org/project/dbt-core/) doesn't support [safe parallel execution](/reference/dbt-commands#parallel-execution) for multiple invocations in the same process. This means it's not safe to run multiple dbt commands at the same time. It's officially discouraged and requires a wrapping process to handle sub-processes. This is because: +[`dbt-core`](https://pypi.org/project/dbt-core/) doesn't support [safe parallel execution](/reference/dbt-commands#parallel-execution) for multiple invocations in the same process. This means it's not safe to run multiple dbt commands concurrently. It's officially discouraged and requires a wrapping process to handle sub-processes. This is because: -- Running simultaneous commands can unexpectedly interact with the data platform. For example, running `dbt run` and `dbt build` for the same models simultaneously could lead to unpredictable results. +- Running concurrent commands can unexpectedly interact with the data platform. For example, running `dbt run` and `dbt build` for the same models simultaneously could lead to unpredictable results. - Each `dbt-core` command interacts with global Python variables. To ensure safe operation, commands need to be executed in separate processes, which can be achieved using methods like spawning processes or using tools like Celery. To run [safe parallel execution](/reference/dbt-commands#available-commands), you can use the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) or [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud), both of which does that additional work to manage concurrency (multiple processes) on your behalf. diff --git a/website/docs/reference/project-configs/docs-paths.md b/website/docs/reference/project-configs/docs-paths.md index 51ff5c5ccca..5481c19c9fd 100644 --- a/website/docs/reference/project-configs/docs-paths.md +++ b/website/docs/reference/project-configs/docs-paths.md @@ -17,8 +17,18 @@ Optionally specify a custom list of directories where [docs blocks](/docs/build/ ## Default -By default, dbt will search in all resource paths for docs blocks (i.e. the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [macro-paths](/reference/project-configs/macro-paths) and [snapshot-paths](/reference/project-configs/snapshot-paths)). If this option is configured, dbt will _only_ look in the specified directory for docs blocks. 
+ + +By default, dbt will search in all resource paths for docs blocks (for example, the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [test-paths](/reference/project-configs/test-paths), [macro-paths](/reference/project-configs/macro-paths), and [snapshot-paths](/reference/project-configs/snapshot-paths)). If this option is configured, dbt will _only_ look in the specified directory for docs blocks. + + + + + +By default, dbt will search in all resource paths for docs blocks (i.e. the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [macro-paths](/reference/project-configs/macro-paths), and [snapshot-paths](/reference/project-configs/snapshot-paths)). If this option is configured, dbt will _only_ look in the specified directory for docs blocks. + + ## Example diff --git a/website/docs/reference/project-configs/on-run-start-on-run-end.md b/website/docs/reference/project-configs/on-run-start-on-run-end.md index e1a3d7b761a..74557839f11 100644 --- a/website/docs/reference/project-configs/on-run-start-on-run-end.md +++ b/website/docs/reference/project-configs/on-run-start-on-run-end.md @@ -20,7 +20,7 @@ on-run-end: sql-statement | [sql-statement] A SQL statement (or list of SQL statements) to be run at the start or end of the following commands: -`on-run-start` and `on-run-end` hooks can also call macros that return SQL statements +`on-run-start` and `on-run-end` hooks can also [call macros](#call-a-macro-to-grant-privileges) that return SQL statements. ## Usage notes * The `on-run-end` hook has additional jinja variables available in the context — check out the [docs](/reference/dbt-jinja-functions/on-run-end-context). diff --git a/website/docs/reference/project-configs/require-dbt-version.md b/website/docs/reference/project-configs/require-dbt-version.md index 42dc49c4546..97b42e036ec 100644 --- a/website/docs/reference/project-configs/require-dbt-version.md +++ b/website/docs/reference/project-configs/require-dbt-version.md @@ -93,7 +93,7 @@ In the following example, the project will only run with dbt v1.5: ```yml -require-dbt-version: 1.5 +require-dbt-version: "1.5.0" ``` diff --git a/website/docs/reference/project-configs/snapshot-paths.md b/website/docs/reference/project-configs/snapshot-paths.md index 81b2759609d..8319833f1e6 100644 --- a/website/docs/reference/project-configs/snapshot-paths.md +++ b/website/docs/reference/project-configs/snapshot-paths.md @@ -12,7 +12,16 @@ snapshot-paths: [directorypath] ## Definition -Optionally specify a custom list of directories where [snapshots](/docs/build/snapshots) are located. Note that you cannot co-locate models and snapshots. + +Optionally specify a custom list of directories where [snapshots](/docs/build/snapshots) are located. + + +In [Versionless](/docs/dbt-versions/versionless-cloud) and on dbt v1.9 and higher, you can co-locate your snapshots with models if they are [defined using the latest YAML syntax](/docs/build/snapshots). + + + +Note that you cannot co-locate models and snapshots. However, in [Versionless](/docs/dbt-versions/versionless-cloud) and on dbt v1.9 and higher, you can co-locate your snapshots with models if they are [defined using the latest YAML syntax](/docs/build/snapshots). 
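For instance, a project that keeps its snapshots in a directory named `archives` (the directory name is only illustrative) would configure:

```yaml
# dbt_project.yml
snapshot-paths: ["archives"]
```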
+ ## Default By default, dbt will search for snapshots in the `snapshots` directory, i.e. `snapshot-paths: ["snapshots"]` diff --git a/website/docs/reference/project-configs/test-paths.md b/website/docs/reference/project-configs/test-paths.md index 59f17db05eb..6749a07d23d 100644 --- a/website/docs/reference/project-configs/test-paths.md +++ b/website/docs/reference/project-configs/test-paths.md @@ -13,7 +13,7 @@ test-paths: [directorypath] ## Definition -Optionally specify a custom list of directories where [singular tests](/docs/build/data-tests) are located. +Optionally specify a custom list of directories where [singular tests](/docs/build/data-tests#singular-data-tests) and [custom generic tests](/docs/build/data-tests#generic-data-tests) are located. ## Default diff --git a/website/docs/reference/resource-configs/access.md b/website/docs/reference/resource-configs/access.md index 0f67a454344..c73e09dd639 100644 --- a/website/docs/reference/resource-configs/access.md +++ b/website/docs/reference/resource-configs/access.md @@ -15,14 +15,6 @@ models: - - -Access modifiers may be applied to models one-by-one in YAML properties. In v1.5 and v1.6, you are unable to configure `access` for multiple models at once. Upgrade to v1.7 for additional configuration options. A group or subfolder contains models with varying access levels, so when you designate a model with `access: public`, make sure you intend for this behavior. - - - - - You can apply access modifiers in config files, including the `dbt_project.yml`, or to models one-by-one in `properties.yml`. Applying access configs to a subfolder modifies the default for all models in that subfolder, so make sure you intend for this behavior. When setting individual model access, a group or subfolder might contain a variety of access levels, so when you designate a model with `access: public` make sure you intend for this behavior. There are multiple approaches to configuring access: @@ -83,8 +75,6 @@ There are multiple approaches to configuring access: ``` - - After you define `access`, rerun a production job to apply the change. ## Definition diff --git a/website/docs/reference/resource-configs/athena-configs.md b/website/docs/reference/resource-configs/athena-configs.md new file mode 100644 index 00000000000..f871ede9fab --- /dev/null +++ b/website/docs/reference/resource-configs/athena-configs.md @@ -0,0 +1,552 @@ +--- +title: "Amazon Athena configurations" +description: "Reference article for the Amazon Athena adapter for dbt Core and dbt Cloud." +id: "athena-configs" +--- + +## Models + +### Table configuration + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `external_location` | None | The full S3 path to where the table is saved. It only works with incremental models. It doesn't work with Hive tables with `ha` set to `true`. | +| `partitioned_by` | None | An array list of columns by which the table will be partitioned. Currently limited to 100 partitions. | +| `bucketed_by` | None | An array list of the columns to bucket data. Ignored if using Iceberg. | +| `bucket_count` | None | The number of buckets for bucketing your data. This parameter is ignored if using Iceberg. | +| `table_type` | Hive | The type of table. Supports `hive` or `iceberg`. | +| `ha` | False | Build the table using the high-availability method. Only available for Hive tables. | +| `format` | Parquet | The data format for the table. Supports `ORC`, `PARQUET`, `AVRO`, `JSON`, and `TEXTFILE`. 
| +| `write_compression` | None | The compression type for any storage format that allows compressions. | +| `field_delimeter` | None | Specify the custom field delimiter to use when the format is set to `TEXTFIRE`. | +| `table_properties` | N/A | The table properties to add to the table. This is only for Iceberg. | +| `native_drop` | N/A | Relation drop operations will be performed with SQL, not direct Glue API calls. No S3 calls will be made to manage data in S3. Data in S3 will only be cleared up for Iceberg tables. See the [AWS docs](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-managing-tables.html) for more info. Iceberg DROP TABLE operations may timeout if they take longer than 60 seconds.| +| `seed_by_insert` | False | Creates seeds using an SQL insert statement. Large seed files can't exceed the Athena 262144 bytes limit. | +| `force_batch` | False | Run the table creation directly in batch insert mode. Useful when the standard table creation fails due to partition limitation. | +| `unique_tmp_table_suffix` | False | Replace the "__dbt_tmp table" suffix with a unique UUID for incremental models using insert overwrite on Hive tables. | +| `temp_schema` | None | Defines a schema to hold temporary create statements used in incremental model runs. Scheme will be created in the models target database if it does not exist. | +| `lf_tags_config` | None | [AWS Lake Formation](#aws-lake-formation-integration) tags to associate with the table and columns. Existing tags will be removed.
* `enabled` (`default=False`) whether LF tags management is enabled for a model
* `tags` dictionary with tags and their values to assign to the model
* `tags_columns` dictionary with a tag key, value and list of columns they must be assigned to | +| `lf_inherited_tags` | None | List of the Lake Formation tag keys that are to be inherited from the database level and shouldn't be removed during the assignment of those defined in `ls_tags_config`. | +| `lf_grants` | None | Lake Formation grants config for `data_cell` filters. | + +#### Configuration examples + + + + + + + +```sql +{{ + config( + materialized='incremental', + incremental_strategy='append', + on_schema_change='append_new_columns', + table_type='iceberg', + schema='test_schema', + lf_tags_config={ + 'enabled': true, + 'tags': { + 'tag1': 'value1', + 'tag2': 'value2' + }, + 'tags_columns': { + 'tag1': { + 'value1': ['column1', 'column2'], + 'value2': ['column3', 'column4'] + } + }, + 'inherited_tags': ['tag1', 'tag2'] + } + ) +}} +``` + + + + + + + + +```yaml + +lf_tags_config: + enabled: true + tags: + tag1: value1 + tag2: value2 + tags_columns: + tag1: + value1: [ column1, column2 ] + inherited_tags: [ tag1, tag2 ] +``` + + + + + + + +```python +lf_grants={ + 'data_cell_filters': { + 'enabled': True | False, + 'filters': { + 'filter_name': { + 'row_filter': '', + 'principals': ['principal_arn1', 'principal_arn2'] + } + } + } + } +``` + + + + + +There are some limitations and recommendations that should be considered: + +- `lf_tags` and `lf_tags_columns` configs support only attaching lf tags to corresponding resources. +- We recommend managing LF Tags permissions somewhere outside dbt. For example, [terraform](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lakeformation_permissions) or [aws cdk](https://docs.aws.amazon.com/cdk/api/v1/docs/aws-lakeformation-readme.html). +- `data_cell_filters` management can't be automated outside dbt because the filter can't be attached to the table, which doesn't exist. Once you `enable` this config, dbt will set all filters and their permissions during every dbt run. Such an approach keeps the actual state of row-level security configuration after every dbt run and applies changes if they occur: drop, create, and update filters and their permissions. +- Any tags listed in `lf_inherited_tags` should be strictly inherited from the database level and never overridden at the table and column level. +- Currently, `dbt-athena` does not differentiate between an inherited tag association and an override it made previously. + - For example, If a `lf_tags_config` value overrides an inherited tag in one run, and that override is removed before a subsequent run, the prior override will linger and no longer be encoded anywhere (for example, Terraform where the inherited value is configured nor in the DBT project where the override previously existed but now is gone). + + +### Table location + +The saved location of a table is determined in precedence by the following conditions: + +1. If `external_location` is defined, that value is used. +2. If `s3_data_dir` is defined, the path is determined by that and `s3_data_naming`. +3. If `s3_data_dir` is not defined, data is stored under `s3_staging_dir/tables/`. 
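As a sketch of the first condition, an incremental model could pin its storage location explicitly; the model name and S3 path here are hypothetical:

```yaml
# models/staging/properties.yml (illustrative names only)
models:
  - name: stg_orders
    config:
      materialized: incremental
      external_location: "s3://my-data-bucket/dbt/stg_orders/"
```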
+ +The following options are available for `s3_data_naming`: + +- `unique`: `{s3_data_dir}/{uuid4()}/` +- `table`: `{s3_data_dir}/{table}/` +- `table_unique`: `{s3_data_dir}/{table}/{uuid4()}/` +- `schema_table`: `{s3_data_dir}/{schema}/{table}/` +- `s3_data_naming=schema_table_unique`: `{s3_data_dir}/{schema}/{table}/{uuid4()}/` + +To set the `s3_data_naming` globally in the target profile, overwrite the value in the table config, or set up the value for groups of the models in dbt_project.yml. + +Note: If you're using a workgroup with a default output location configured, `s3_data_naming` ignores any configured buckets and uses the location configured in the workgroup. + +### Incremental models + +The following [incremental models](https://docs.getdbt.com/docs/build/incremental-models) strategies are supported: + +- `insert_overwrite` (default): The insert-overwrite strategy deletes the overlapping partitions from the destination table and then inserts the new records from the source. This strategy depends on the `partitioned_by` keyword! dbt will fall back to the `append` strategy if no partitions are defined. +- `append`: Insert new records without updating, deleting or overwriting any existing data. There might be duplicate data (great for log or historical data). +- `merge`: Conditionally updates, deletes, or inserts rows into an Iceberg table. Used in combination with `unique_key`.It is only available when using Iceberg. + + +### On schema change + +The `on_schema_change` option reflects changes of the schema in incremental models. The values you can set this to are: + +- `ignore` (default) +- `fail` +- `append_new_columns` +- `sync_all_columns` + +To learn more, refer to [What if the columns of my incremental model change](/docs/build/incremental-models#what-if-the-columns-of-my-incremental-model-change). + +### Iceberg + +The adapter supports table materialization for Iceberg. + +For example: + +```sql +{{ config( + materialized='table', + table_type='iceberg', + format='parquet', + partitioned_by=['bucket(user_id, 5)'], + table_properties={ + 'optimize_rewrite_delete_file_threshold': '2' + } +) }} + +select 'A' as user_id, + 'pi' as name, + 'active' as status, + 17.89 as cost, + 1 as quantity, + 100000000 as quantity_big, + current_date as my_date +``` + +Iceberg supports bucketing as hidden partitions. Use the `partitioned_by` config to add specific bucketing +conditions. + +Iceberg supports the `PARQUET`, `AVRO` and `ORC` table formats for data . + +The following are the supported strategies for using Iceberg incrementally: + +- `append`: New records are appended to the table (this can lead to duplicates). +- `merge`: Perform an update and insert (and optional delete) where new and existing records are added. This is only available with Athena engine version 3. + - `unique_key`(required): Columns that define a unique source and target table record. + - `incremental_predicates` (optional): The SQL conditions that enable custom join clauses in the merge statement. This helps improve performance via predicate pushdown on target tables. + - `delete_condition` (optional): SQL condition that identifies records that should be deleted. + - `update_condition` (optional): SQL condition that identifies records that should be updated. + - `insert_condition` (optional): SQL condition that identifies records that should be inserted. 
+ +`incremental_predicates`, `delete_condition`, `update_condition` and `insert_condition` can include any column of the incremental table (`src`) or the final table (`target`). Column names must be prefixed by either `src` or `target` to prevent a `Column is ambiguous` error. + + + + + +```sql +{{ config( + materialized='incremental', + table_type='iceberg', + incremental_strategy='merge', + unique_key='user_id', + incremental_predicates=["src.quantity > 1", "target.my_date >= now() - interval '4' year"], + delete_condition="src.status != 'active' and target.my_date < now() - interval '2' year", + format='parquet' +) }} + +select 'A' as user_id, + 'pi' as name, + 'active' as status, + 17.89 as cost, + 1 as quantity, + 100000000 as quantity_big, + current_date as my_date +``` + + + + + +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key=['id'], + update_condition='target.id > 1', + schema='sandbox' + ) +}} + +{% if is_incremental() %} + +select * from ( + values + (1, 'v1-updated') + , (2, 'v2-updated') +) as t (id, value) + +{% else %} + +select * from ( + values + (-1, 'v-1') + , (0, 'v0') + , (1, 'v1') + , (2, 'v2') +) as t (id, value) + +{% endif %} +``` + + + + + +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key=['id'], + insert_condition='target.status != 0', + schema='sandbox' + ) +}} + +select * from ( + values + (1, 0) + , (2, 1) +) as t (id, status) + +``` + + + + + +### High availability (HA) table + +The current implementation of table materialization can lead to downtime, as the target table is dropped and re-created. For less destructive behavior, you can use the `ha` config on your `table` materialized models. It leverages the table versions feature of the glue catalog, which creates a temporary table and swaps the target table to the location of the temporary table. This materialization is only available for `table_type=hive` and requires using unique locations. For Iceberg, high availability is the default. + +By default, the materialization keeps the last 4 table versions,but you can change it by setting `versions_to_keep`. + +```sql +{{ config( + materialized='table', + ha=true, + format='parquet', + table_type='hive', + partitioned_by=['status'], + s3_data_naming='table_unique' +) }} + +select 'a' as user_id, + 'pi' as user_name, + 'active' as status +union all +select 'b' as user_id, + 'sh' as user_name, + 'disabled' as status +``` + + +#### HA known issues + +- There could be a little downtime when swapping from a table with partitions to a table without (and the other way around). If higher performance is needed, consider bucketing instead of partitions. +- By default, Glue "duplicates" the versions internally, so the last two versions of a table point to the same location. +- It's recommended to set `versions_to_keep` >= 4, as this will avoid having the older location removed. + +### Update glue data catalog + +You can persist your column and model level descriptions to the Glue Data Catalog as [glue table properties](https://docs.aws.amazon.com/glue/latest/dg/tables-described.html#table-properties) and [column parameters](https://docs.aws.amazon.com/glue/latest/webapi/API_Column.html). To enable this, set the configuration to `true` as shown in the following example. By default, documentation persistence is disabled, but it can be enabled for specific resources or groups of resources as needed. 
+ + +For example: + +```yaml +models: + - name: test_deduplicate + description: another value + config: + persist_docs: + relation: true + columns: true + meta: + test: value + columns: + - name: id + meta: + primary_key: true +``` + +Refer to [persist_docs](https://docs.getdbt.com/reference/resource-configs/persist_docs) for more details. + +## Snapshots + +The adapter supports snapshot materialization. It supports both the timestamp and check strategies. To create a snapshot, create a snapshot file in the `snapshots` directory. You'll need to create this directory if it doesn't already exist. + +### Timestamp strategy + + +Refer to [Timestamp strategy](/docs/build/snapshots#timestamp-strategy-recommended) for details on how to use it. + + +### Check strategy + +Refer to [Check strategy](/docs/build/snapshots#check-strategy) for details on how to use it. + +### Hard deletes + +The materialization also supports invalidating hard deletes. For usage details, refer to [Hard deletes](/docs/build/snapshots#hard-deletes-opt-in). + +### Snapshots known issues + +- Incremental Iceberg models - Sync all columns on schema change. Columns used for partitioning can't be removed. From a dbt perspective, the only way is to fully refresh the incremental model. +- Tables, schemas and database names should only be lowercase +- To avoid potential conflicts, make sure [`dbt-athena-adapter`](https://github.com/Tomme/dbt-athena) is not installed in the target environment. +- Snapshot does not support dropping columns from the source table. If you drop a column, make sure to drop the column from the snapshot as well. Another workaround is to NULL the column in the snapshot definition to preserve the history. + +## AWS Lake Formation integration + +The following describes how the adapter implements the AWS Lake Formation tag management: + +- [Enable](#table-configuration) LF tags management with the `lf_tags_config` parameter. By default, it's disabled. +- Once enabled, LF tags are updated on every dbt run. +- First, all lf-tags for columns are removed to avoid inheritance issues. +- Then, all redundant lf-tags are removed from tables and actual tags from table configs are applied. +- Finally, lf-tags for columns are applied. + +It's important to understand the following points: + +- dbt doesn't manage `lf-tags` for databases +- dbt doesn't manage Lake Formation permissions + +That's why it's important to take care of this yourself or use an automation tool such as terraform and AWS CDK. For more details, refer to: + +* [terraform aws_lakeformation_permissions](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lakeformation_permissions) +* [terraform aws_lakeformation_resource_lf_tags](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lakeformation_resource_lf_tags) + +## Python models + +The adapter supports Python models using [`spark`](https://docs.aws.amazon.com/athena/latest/ug/notebooks-spark.html). + +### Prerequisites + +- A Spark-enabled workgroup created in Athena. +- Spark execution role granted access to Athena, Glue and S3. +- The Spark workgroup is added to the `~/.dbt/profiles.yml` file and the profile to be used + is referenced in `dbt_project.yml`. + +### Spark-specific table configuration + +| Configuration | Default | Description | +|---------------|---------|--------------| +| `timeout` | 43200 | Time out in seconds for each Python model execution. Defaults to 12 hours/43200 seconds. 
| +| `spark_encryption` | False | When set to `true,` it encrypts data stored locally by Spark and in transit between Spark nodes. | +| `spark_cross_account_catalog` | False | When using the Spark Athena workgroup, queries can only be made against catalogs on the same AWS account by default. Setting this parameter to true will enable querying external catalogs if you want to query another catalog on an external AWS account.

Use the syntax `external_catalog_id/database.table` to access the external table on the external catalog (For example, `999999999999/mydatabase.cloudfront_logs` where 999999999999 is the external catalog ID).| +| `spark_requester_pays` | False | When set to true, if an Amazon S3 bucket is configured as `requester pays`, the user account running the query is charged for data access and data transfer fees associated with the query. | + + +### Spark notes + +- A session is created for each unique engine configuration defined in the models that are part of the invocation. +A session's idle timeout is set to 10 minutes. Within the timeout period, if a new calculation (Spark Python model) is ready for execution and the engine configuration matches, the process will reuse the same session. +- The number of Python models running simultaneously depends on the `threads`. The number of sessions created for the entire run depends on the number of unique engine configurations and the availability of sessions to maintain thread concurrency. +- For Iceberg tables, it's recommended to use the `table_properties` configuration to set the `format_version` to `2`. This helps maintain compatibility between the Iceberg tables Trino created and those Spark created. + +### Example models + + + + + +```python +import pandas as pd + + +def model(dbt, session): + dbt.config(materialized="table") + + model_df = pd.DataFrame({"A": [1, 2, 3, 4]}) + + return model_df +``` + + + + + +```python +def model(dbt, spark_session): + dbt.config(materialized="table") + + data = [(1,), (2,), (3,), (4,)] + + df = spark_session.createDataFrame(data, ["A"]) + + return df +``` + + + + +```python +def model(dbt, spark_session): + dbt.config(materialized="incremental") + df = dbt.ref("model") + + if dbt.is_incremental: + max_from_this = ( + f"select max(run_date) from {dbt.this.schema}.{dbt.this.identifier}" + ) + df = df.filter(df.run_date >= spark_session.sql(max_from_this).collect()[0][0]) + + return df +``` + + + + + +```python +def model(dbt, spark_session): + dbt.config( + materialized="table", + engine_config={ + "CoordinatorDpuSize": 1, + "MaxConcurrentDpus": 3, + "DefaultExecutorDpuSize": 1 + }, + spark_encryption=True, + spark_cross_account_catalog=True, + spark_requester_pays=True + polling_interval=15, + timeout=120, + ) + + data = [(1,), (2,), (3,), (4,)] + + df = spark_session.createDataFrame(data, ["A"]) + + return df +``` + + + + + +Using imported external python files: + +```python +def model(dbt, spark_session): + dbt.config( + materialized="incremental", + incremental_strategy="merge", + unique_key="num", + ) + sc = spark_session.sparkContext + sc.addPyFile("s3://athena-dbt/test/file1.py") + sc.addPyFile("s3://athena-dbt/test/file2.py") + + def func(iterator): + from file2 import transform + + return [transform(i) for i in iterator] + + from pyspark.sql.functions import udf + from pyspark.sql.functions import col + + udf_with_import = udf(func) + + data = [(1, "a"), (2, "b"), (3, "c")] + cols = ["num", "alpha"] + df = spark_session.createDataFrame(data, cols) + + return df.withColumn("udf_test_col", udf_with_import(col("alpha"))) +``` + + + + + +### Known issues in Python models + +- Python models can't [reference Athena SQL views](https://docs.aws.amazon.com/athena/latest/ug/notebooks-spark.html). +- You can use third-party Python libraries; however, they must be [included in the pre-installed list][pre-installed list] or [imported manually][imported manually]. 
+- Python models can only reference or write to tables with names matching the regular expression: `^[0-9a-zA-Z_]+$`. Spark doesn't support dashes or special characters, even though Athena supports them. +- Incremental models don't fully utilize Spark capabilities. They depend partially on existing SQL-based logic that runs on Trino. +- Snapshot materializations are not supported. +- Spark can only reference tables within the same catalog. +- For tables created outside of the dbt tool, be sure to populate the location field, or dbt will throw an error when creating the table. + + +[pre-installed list]: https://docs.aws.amazon.com/athena/latest/ug/notebooks-spark-preinstalled-python-libraries.html +[imported manually]: https://docs.aws.amazon.com/athena/latest/ug/notebooks-import-files-libraries.html + +## Contracts + +The adapter partly supports contract definitions: + +- `data_type` is supported but needs to be adjusted for complex types. Types must be specified entirely (for example, `array`) even though they won't be checked. Indeed, as dbt recommends, we only compare the broader type (array, map, int, varchar). The complete definition is used to check that the data types defined in Athena are ok (pre-flight check). +- The adapter does not support the constraints since Athena has no constraint concept. + diff --git a/website/docs/reference/resource-configs/bigquery-configs.md b/website/docs/reference/resource-configs/bigquery-configs.md index a6f3036ede8..9dd39c936b6 100644 --- a/website/docs/reference/resource-configs/bigquery-configs.md +++ b/website/docs/reference/resource-configs/bigquery-configs.md @@ -21,7 +21,7 @@ This will allow you to read and write from multiple BigQuery projects. Same for ### Partition clause -BigQuery supports the use of a [partition by](https://cloud.google.com/bigquery/docs/data-definition-language#specifying_table_partitioning_options) clause to easily partition a by a column or expression. This option can help decrease latency and cost when querying large tables. Note that partition pruning [only works](https://cloud.google.com/bigquery/docs/querying-partitioned-tables#pruning_limiting_partitions) when partitions are filtered using literal values (so selecting partitions using a won't improve performance). +BigQuery supports the use of a [partition by](https://cloud.google.com/bigquery/docs/data-definition-language#specifying_table_partitioning_options) clause to easily partition a by a column or expression. This option can help decrease latency and cost when querying large tables. Note that partition pruning [only works](https://cloud.google.com/bigquery/docs/querying-partitioned-tables#use_a_constant_filter_expression) when partitions are filtered using literal values (so selecting partitions using a won't improve performance). The `partition_by` config can be supplied as a dictionary with the following format: @@ -265,7 +265,7 @@ If your model has `partition_by` configured, you may optionally specify two addi -### Clustering Clause +### Clustering clause BigQuery tables can be [clustered](https://cloud.google.com/bigquery/docs/clustered-tables) to colocate related data. @@ -286,7 +286,7 @@ select * from ... -Clustering on a multiple columns: +Clustering on multiple columns: @@ -303,11 +303,11 @@ select * from ... -## Managing KMS Encryption +## Managing KMS encryption [Customer managed encryption keys](https://cloud.google.com/bigquery/docs/customer-managed-encryption) can be configured for BigQuery tables using the `kms_key_name` model configuration. 
-### Using KMS Encryption +### Using KMS encryption To specify the KMS key name for a model (or a group of models), use the `kms_key_name` model configuration. The following example sets the `kms_key_name` for all of the models in the `encrypted/` directory of your dbt project. @@ -328,7 +328,7 @@ models: -## Labels and Tags +## Labels and tags ### Specifying labels @@ -373,8 +373,6 @@ models: - - ### Specifying tags @@ -434,7 +432,7 @@ The `incremental_strategy` config can be set to one of two values: ### Performance and cost The operations performed by dbt while building a BigQuery incremental model can -be made cheaper and faster by using [clustering keys](#clustering-keys) in your +be made cheaper and faster by using a [clustering clause](#clustering-clause) in your model configuration. See [this guide](https://discourse.getdbt.com/t/benchmarking-incremental-strategies-on-bigquery/981) for more information on performance tuning for BigQuery incremental models. **Note:** These performance and cost benefits are applicable to incremental models @@ -673,7 +671,7 @@ select ... -## Authorized Views +## Authorized views If the `grant_access_to` config is specified for a model materialized as a view, dbt will grant the view model access to select from the list of datasets @@ -712,8 +710,6 @@ models: Views with this configuration will be able to select from objects in `project_1.dataset_1` and `project_2.dataset_2`, even when they are located elsewhere and queried by users who do not otherwise have access to `project_1.dataset_1` and `project_2.dataset_2`. - - ## Materialized views The BigQuery adapter supports [materialized views](https://cloud.google.com/bigquery/docs/materialized-views-intro) @@ -896,10 +892,6 @@ As with most data platforms, there are limitations associated with materialized Find more information about materialized view limitations in Google's BigQuery [docs](https://cloud.google.com/bigquery/docs/materialized-views-intro#limitations). - - - - ## Python models The BigQuery adapter supports Python models with the following additional configuration parameters: @@ -916,4 +908,3 @@ By default, this is set to `True` to support the default `intermediate_format` o ### The `intermediate_format` parameter The `intermediate_format` parameter specifies which file format to use when writing records to a table. The default is `parquet`. - diff --git a/website/docs/reference/resource-configs/check_cols.md b/website/docs/reference/resource-configs/check_cols.md index bd187409379..f7c6b85d372 100644 --- a/website/docs/reference/resource-configs/check_cols.md +++ b/website/docs/reference/resource-configs/check_cols.md @@ -3,6 +3,31 @@ resource_types: [snapshots] description: "Read this guide to understand the check_cols configuration in dbt." datatype: "[column_name] | all" --- + + + + + ```yml + snapshots: + - name: snapshot_name + relation: source('my_source', 'my_table') + config: + schema: string + unique_key: column_name_or_expression + strategy: check + check_cols: + - column_name + ``` + + + + + + +import SnapshotYaml from '/snippets/_snapshot-yaml-spec.md'; + + + ```jinja2 @@ -14,7 +39,7 @@ datatype: "[column_name] | all" ``` - + @@ -42,6 +67,29 @@ No default is provided. 
 ### Check a list of columns for changes
 
+
+
+
+```yaml
+snapshots:
+  - name: orders_snapshot_check
+    relation: source('jaffle_shop', 'orders')
+    config:
+      schema: snapshots
+      unique_key: id
+      strategy: check
+      check_cols:
+        - status
+        - is_cancelled
+```
+
+To select from this snapshot in a downstream model: `select * from {{ ref('orders_snapshot_check') }}`
+
+
+
+
 ```sql
 {% snapshot orders_snapshot_check %}
 
@@ -58,8 +106,32 @@ No default is provided.
 
 {% endsnapshot %}
 ```
+
+
 ### Check all columns for changes
 
+
+
+
+```yaml
+snapshots:
+  - name: orders_snapshot_check
+    relation: source('jaffle_shop', 'orders')
+    config:
+      schema: snapshots
+      unique_key: id
+      strategy: check
+      check_cols:
+        - all
+```
+
+To select from this snapshot in a downstream model: `select * from {{ ref('orders_snapshot_check') }}`
+
+
+
+
 ```sql
 {% snapshot orders_snapshot_check %}
 
@@ -75,3 +147,4 @@ No default is provided.
 
 {% endsnapshot %}
 ```
+
diff --git a/website/docs/reference/resource-configs/contract.md b/website/docs/reference/resource-configs/contract.md
index 5904a6cc69d..fb25076b0d9 100644
--- a/website/docs/reference/resource-configs/contract.md
+++ b/website/docs/reference/resource-configs/contract.md
@@ -2,7 +2,7 @@
 resource_types: [models]
 description: "When the contract configuration is enforced, dbt will ensure that your model's returned dataset exactly matches the attributes you have defined in yaml, such as name and data_type, as well as any additional constraints supported by the data platform."
 datatype: "{}"
-default_value: {contract: false}
+default_value: {enforced: false}
 id: "contract"
 ---
@@ -16,14 +16,6 @@ This is to ensure that the people querying your model downstream—both inside a
 
 ## Data type aliasing
 
-
-
-The `data_type` defined in your YAML file must match a data type your data platform recognizes. dbt does not do any type aliasing itself. If your data platform recognizes both `int` and `integer` as corresponding to the same type, then they will return a match.
-
-
-
-
-
 dbt uses built-in type aliasing for the `data_type` defined in your YAML. For example, you can specify `string` in your contract, and on Postgres/Redshift, dbt will convert it to `text`. If dbt doesn't recognize the `data_type` name among its known aliases, it will pass it through as-is. This is enabled by default, but you can opt-out by setting `alias_types` to `false`.
 
 Example for disabling:
 
@@ -42,7 +34,6 @@ models:
 ```
 
-
## Size, precision, and scale diff --git a/website/docs/reference/resource-configs/databricks-configs.md b/website/docs/reference/resource-configs/databricks-configs.md index ff580d2d8cf..f807b1c0d88 100644 --- a/website/docs/reference/resource-configs/databricks-configs.md +++ b/website/docs/reference/resource-configs/databricks-configs.md @@ -7,22 +7,7 @@ id: "databricks-configs" When materializing a model as `table`, you may include several optional configs that are specific to the dbt-databricks plugin, in addition to the standard [model configs](/reference/model-configs). - - - -| Option | Description | Required? | Model Support | Example | -|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------|---------------|--------------------------| -| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | SQL, Python | `delta` | -| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | SQL, Python | `/mnt/root` | -| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | SQL, Python | `date_day` | -| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | SQL | `date_day` | -| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | SQL, Python | `country_code` | -| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | SQL, Python | `8` | -| tblproperties | [Tblproperties](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-tblproperties.html) to be set on the created table | Optional | SQL | `{'this.is.my.key': 12}` | - - - - + | Option | Description | Required? | Model Support | Example | @@ -34,13 +19,14 @@ When materializing a model as `table`, you may include several optional configs | clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | SQL, Python | `country_code` | | buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | SQL, Python | `8` | | tblproperties | [Tblproperties](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-tblproperties.html) to be set on the created table | Optional | SQL, Python* | `{'this.is.my.key': 12}` | +| compression | Set the compression algorithm. | Optional | SQL, Python | `zstd` | \* Beginning in 1.7.12, we have added tblproperties to Python models via an alter statement that runs after table creation. We do not yet have a PySpark API to set tblproperties at table creation, so this feature is primarily to allow users to anotate their python-derived tables with tblproperties. - + 1.8 introduces support for [Tags](https://docs.databricks.com/en/data-governance/unity-catalog/tags.html) at the table level, in addition to all table configuration supported in 1.7. 
@@ -49,11 +35,12 @@ We do not yet have a PySpark API to set tblproperties at table creation, so this
 | file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | SQL, Python | `delta` |
 | location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | SQL, Python | `/mnt/root` |
 | partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | SQL, Python | `date_day` |
-| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | SQL | `date_day` |
+| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | SQL, Python | `date_day` |
 | clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | SQL, Python | `country_code` |
 | buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | SQL, Python | `8` |
 | tblproperties | [Tblproperties](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-tblproperties.html) to be set on the created table | Optional | SQL, Python* | `{'this.is.my.key': 12}` |
-| databricks_tags | [Tags](https://docs.databricks.com/en/data-governance/unity-catalog/tags.html) to be set on the created table | Optional | SQL+, Python+ | `{'my_tag': 'my_value'}` |
+| databricks_tags | [Tags](https://docs.databricks.com/en/data-governance/unity-catalog/tags.html) to be set on the created table | Optional | SQL+, Python+ | `{'my_tag': 'my_value'}` |
+| compression | Set the compression algorithm. | Optional | SQL, Python | `zstd` |
 
 \* Beginning in 1.7.12, we have added tblproperties to Python models via an alter statement that runs after table creation.
 We do not yet have a PySpark API to set tblproperties at table creation, so this feature is primarily to allow users to annotate their python-derived tables with tblproperties.
@@ -62,6 +49,131 @@ We do not yet have a PySpark API to set tblproperties at table creation, so this
 
+
+
+dbt Core v1.9 and Versionless dbt Cloud support `table_format: iceberg`, in addition to all previous table configurations supported in 1.8.
+
+| Option | Description | Required? | Model Support | Example |
+|---------------------|-----------------------------|-------------------------------------------|-----------------|--------------------------|
+| table_format | Whether or not to provision [Iceberg](https://docs.databricks.com/en/delta/uniform.html) compatibility for the materialization | Optional | SQL, Python | `iceberg` |
+| file_format+ | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | SQL, Python | `delta` |
+| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | SQL, Python | `/mnt/root` |
+| partition_by | Partition the created table by the specified columns. A directory is created for each partition. 
| Optional | SQL, Python | `date_day` | +| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | SQL, Python | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | SQL, Python | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | SQL, Python | `8` | +| tblproperties | [Tblproperties](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-tblproperties.html) to be set on the created table | Optional | SQL, Python* | `{'this.is.my.key': 12}` | +| databricks_tags | [Tags](https://docs.databricks.com/en/data-governance/unity-catalog/tags.html) to be set on the created table | Optional | SQL++, Python++ | `{'my_tag': 'my_value'}` | +| compression | Set the compression algorithm. | Optional | SQL, Python | `zstd` | + +\* We do not yet have a PySpark API to set tblproperties at table creation, so this feature is primarily to allow users to anotate their python-derived tables with tblproperties. +\+ When `table_format` is `iceberg`, `file_format` must be `delta`. +\++ `databricks_tags` are currently only supported at the table level, and applied via `ALTER` statements. + + + + + +### Python submission methods + +In dbt v1.9 and higher, or in [Versionless](/docs/dbt-versions/versionless-cloud) dbt Cloud, you can use these four options for `submission_method`: + +* `all_purpose_cluster`: Executes the python model either directly using the [command api](https://docs.databricks.com/api/workspace/commandexecution) or by uploading a notebook and creating a one-off job run +* `job_cluster`: Creates a new job cluster to execute an uploaded notebook as a one-off job run +* `serverless_cluster`: Uses a [serverless cluster](https://docs.databricks.com/en/jobs/run-serverless-jobs.html) to execute an uploaded notebook as a one-off job run +* `workflow_job`: Creates/updates a reusable workflow and uploaded notebook, for execution on all-purpose, job, or serverless clusters. + :::caution + This approach gives you maximum flexibility, but will create persistent artifacts in Databricks (the workflow) that users could run outside of dbt. + ::: + +We are currently in a transitionary period where there is a disconnect between old submission methods (which were grouped by compute), and the logically distinct submission methods (command, job run, workflow). 
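To ground this, here is a minimal sketch of choosing a submission method for a single Python model in a properties file; the model name and package are hypothetical:

```yaml
models:
  - name: my_python_model
    config:
      submission_method: serverless_cluster
      packages: ["scikit-learn"]
```

The options described below control how that submission actually happens.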
+ +As such, the supported config matrix is somewhat complicated: + +| Config | Use | Default | `all_purpose_cluster`* | `job_cluster` | `serverless_cluster` | `workflow_job` | +| --------------------- | -------------------------------------------------------------------- | ------------------ | ---------------------- | ------------- | -------------------- | -------------- | +| `create_notebook` | if false, use Command API, otherwise upload notebook and use job run | `false` | ✅ | ❌ | ❌ | ❌ | +| `timeout` | maximum time to wait for command/job to run | `0` (No timeout) | ✅ | ✅ | ✅ | ✅ | +| `job_cluster_config` | configures a [new cluster](https://docs.databricks.com/api/workspace/jobs/submit#tasks-new_cluster) for running the model | `{}` | ❌ | ✅ | ❌ | ✅ | +| `access_control_list` | directly configures [access control](https://docs.databricks.com/api/workspace/jobs/submit#access_control_list) for the job | `{}` | ✅ | ✅ | ✅ | ✅ | +| `packages` | list of packages to install on the executing cluster | `[]` | ✅ | ✅ | ✅ | ✅ | +| `index_url` | url to install `packages` from | `None` (uses pypi) | ✅ | ✅ | ✅ | ✅ | +| `additional_libs` | directly configures [libraries](https://docs.databricks.com/api/workspace/jobs/submit#tasks-libraries) | `[]` | ✅ | ✅ | ✅ | ✅ | +| `python_job_config` | additional configuration for jobs/workflows (see table below) | `{}` | ✅ | ✅ | ✅ | ✅ | +| `cluster_id` | id of existing all purpose cluster to execute against | `None` | ✅ | ❌ | ❌ | ✅ | +| `http_path` | path to existing all purpose cluster to execute against | `None` | ✅ | ❌ | ❌ | ❌ | + +\* Only `timeout` and `cluster_id`/`http_path` are supported when `create_notebook` is false + +With the introduction of the `workflow_job` submission method, we chose to segregate further configuration of the python model submission under a top level configuration named `python_job_config`. This keeps configuration options for jobs and workflows namespaced in such a way that they do not interfere with other model config, allowing us to be much more flexible with what is supported for job execution. + +The support matrix for this feature is divided into `workflow_job` and all others (assuming `all_purpose_cluster` with `create_notebook`==true). 
+Each config option listed must be nested under `python_job_config`: + +| Config | Use | Default | `workflow_job` | All others | +| -------------------------- | ----------------------------------------------------------------------------------------------------------------------- | ------- | -------------- | ---------- | +| `name` | The name to give (or used to look up) the created workflow | `None` | ✅ | ❌ | +| `grants` | A simplified way to specify access control for the workflow | `{}` | ✅ | ✅ | +| `existing_job_id` | Id to use to look up the created workflow (in place of `name`) | `None` | ✅ | ❌ | +| `post_hook_tasks` | [Tasks](https://docs.databricks.com/api/workspace/jobs/create#tasks) to include after the model notebook execution | `[]` | ✅ | ❌ | +| `additional_task_settings` | Additional [task config](https://docs.databricks.com/api/workspace/jobs/create#tasks) to include in the model task | `{}` | ✅ | ❌ | +| [Other job run settings](https://docs.databricks.com/api/workspace/jobs/submit) | Config will be copied into the request, outside of the model task | `None` | ❌ | ✅ | +| [Other workflow settings](https://docs.databricks.com/api/workspace/jobs/create) | Config will be copied into the request, outside of the model task | `None` | ✅ | ❌ | + +This example uses the new configuration options in the previous table: + + + +```yaml +models: + - name: my_model + config: + submission_method: workflow_job + + # Define a job cluster to create for running this workflow + # Alternately, could specify cluster_id to use an existing cluster, or provide neither to use a serverless cluster + job_cluster_config: + spark_version: "15.3.x-scala2.12" + node_type_id: "rd-fleet.2xlarge" + runtime_engine: "{{ var('job_cluster_defaults.runtime_engine') }}" + data_security_mode: "{{ var('job_cluster_defaults.data_security_mode') }}" + autoscale: { "min_workers": 1, "max_workers": 4 } + + python_job_config: + # These settings are passed in, as is, to the request + email_notifications: { on_failure: ["me@example.com"] } + max_retries: 2 + + name: my_workflow_name + + # Override settings for your model's dbt task. For instance, you can + # change the task key + additional_task_settings: { "task_key": "my_dbt_task" } + + # Define tasks to run before/after the model + # This example assumes you have already uploaded a notebook to /my_notebook_path to perform optimize and vacuum + post_hook_tasks: + [ + { + "depends_on": [{ "task_key": "my_dbt_task" }], + "task_key": "OPTIMIZE_AND_VACUUM", + "notebook_task": + { "notebook_path": "/my_notebook_path", "source": "WORKSPACE" }, + }, + ] + + # Simplified structure, rather than having to specify permission separately for each user + grants: + view: [{ "group_name": "marketing-team" }] + run: [{ "user_name": "other_user@example.com" }] + manage: [] +``` + + + + + + ## Incremental models dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-strategy). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of four values: @@ -72,6 +184,23 @@ dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs Each of these strategies has its pros and cons, which we'll discuss below. As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block. + + + + +## Incremental models + +dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-strategy). 
This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of five values:
+ - **`append`**: Insert new records without updating or overwriting any existing data.
+ - **`insert_overwrite`**: If `partition_by` is specified, overwrite partitions in the table with new data. If no `partition_by` is specified, overwrite the entire table with new data.
+ - **`merge`** (default; Delta and Hudi file format only): Match records based on a `unique_key`, updating old records, and inserting new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.)
+ - **`replace_where`** (Delta file format only): Match records based on `incremental_predicates`, replacing all records that match the predicates from the existing table with records matching the predicates from the new data. (If no `incremental_predicates` are specified, all new data is inserted, similar to `append`.)
+ - **`microbatch`** (Delta file format only): Implements the [microbatch strategy](/docs/build/incremental-microbatch) using `replace_where` with predicates generated based on `event_time`.
+
+Each of these strategies has its pros and cons, which we'll discuss below. As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block.
+
+
 
 ### The `append` strategy
 
 Following the `append` strategy, dbt will perform an `insert into` statement with all new data. The appeal of this strategy is that it is straightforward and functional across all platforms, file types, connection methods, and Apache Spark versions. However, this strategy _cannot_ update, overwrite, or delete existing data, so it is likely to insert duplicate records for many data sources.
@@ -218,7 +347,7 @@ The `merge` incremental strategy requires:
 - Databricks Runtime 5.1 and above for delta file format
 - Apache Spark for hudi file format
 
-dbt will run an [atomic `merge` statement](https://docs.databricks.com/spark/latest/spark-sql/language-manual/merge-into.html) which looks nearly identical to the default merge behavior on Snowflake and BigQuery. If a `unique_key` is specified (recommended), dbt will update old records with values from new records that match on the key column. If a `unique_key` is not specified, dbt will forgo match criteria and simply insert all new records (similar to `append` strategy).
+The Databricks adapter will run an [atomic `merge` statement](https://docs.databricks.com/spark/latest/spark-sql/language-manual/merge-into.html) similar to the default merge behavior on Snowflake and BigQuery. If a `unique_key` is specified (recommended), dbt will update old records with values from new records that match on the key column. If a `unique_key` is not specified, dbt will forgo match criteria and simply insert all new records (similar to `append` strategy).
 
 Specifying `merge` as the incremental strategy is optional since it's the default strategy used when none is specified.
 
@@ -299,6 +428,123 @@ merge into analytics.merge_incremental as DBT_INTERNAL_DEST
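For reference, a minimal sketch of a merge-based incremental model configured from a properties file (the model name and `unique_key` column are hypothetical):

```yaml
models:
  - name: merge_incremental
    config:
      materialized: incremental
      file_format: delta
      incremental_strategy: merge
      unique_key: user_id
```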
+ + +Beginning with 1.9, `merge` behavior can be modified with the following additional configuration options: + +- `target_alias`, `source_alias`: Aliases for the target and source to allow you to describe your merge conditions more naturally. These default to `DBT_INTERNAL_DEST` and `DBT_INTERNAL_SOURCE`, respectively. +- `skip_matched_step`: If set to `true`, the 'matched' clause of the merge statement will not be included. +- `skip_not_matched_step`: If set to `true`, the 'not matched' clause will not be included. +- `matched_condition`: Condition to apply to the `WHEN MATCHED` clause. You should use the `target_alias` and `source_alias` to write a conditional expression, such as `DBT_INTERNAL_DEST.col1 = hash(DBT_INTERNAL_SOURCE.col2, DBT_INTERNAL_SOURCE.col3)`. This condition further restricts the matched set of rows. +- `not_matched_condition`: Condition to apply to the `WHEN NOT MATCHED [BY TARGET]` clause. This condition further restricts the set of rows in the target that do not match the source that will be inserted into the merged table. +- `not_matched_by_source_condition`: Condition to apply to the further filter `WHEN NOT MATCHED BY SOURCE` clause. Only used in conjunction with `not_matched_by_source_action: delete`. +- `not_matched_by_source_action`: If set to `delete`, a `DELETE` clause is added to the merge statement for `WHEN NOT MATCHED BY SOURCE`. +- `merge_with_schema_evolution`: If set to `true`, the merge statement includes the `WITH SCHEMA EVOLUTION` clause. + +For more details on the meaning of each merge clause, please see [the Databricks documentation](https://docs.databricks.com/en/sql/language-manual/delta-merge-into.html). + +The following is an example demonstrating the use of these new options: + + + + + + +```sql +{{ config( + materialized = 'incremental', + unique_key = 'id', + incremental_strategy='merge', + target_alias='t', + source_alias='s', + matched_condition='t.tech_change_ts < s.tech_change_ts', + not_matched_condition='s.attr1 IS NOT NULL', + not_matched_by_source_condition='t.tech_change_ts < current_timestamp()', + not_matched_by_source_action='delete', + merge_with_schema_evolution=true +) }} + +select + id, + attr1, + attr2, + tech_change_ts +from + {{ ref('source_table') }} as s +``` + + + + + + + +```sql +create temporary view merge_incremental__dbt_tmp as + + select + id, + attr1, + attr2, + tech_change_ts + from upstream.source_table +; + +merge + with schema evolution +into + target_table as t +using ( + select + id, + attr1, + attr2, + tech_change_ts + from + source_table as s +) +on + t.id <=> s.id +when matched + and t.tech_change_ts < s.tech_change_ts + then update set + id = s.id, + attr1 = s.attr1, + attr2 = s.attr2, + tech_change_ts = s.tech_change_ts + +when not matched + and s.attr1 IS NOT NULL + then insert ( + id, + attr1, + attr2, + tech_change_ts + ) values ( + s.id, + s.attr1, + s.attr2, + s.tech_change_ts + ) + +when not matched by source + and t.tech_change_ts < current_timestamp() + then delete +``` + + + + + + + + ### The `replace_where` strategy The `replace_where` incremental strategy requires: @@ -388,7 +634,83 @@ insert into analytics.replace_where_incremental - + + +### The `microbatch` strategy + +The Databricks adapter implements the `microbatch` strategy using `replace_where`. Note the requirements and caution statements for `replace_where` above. For more information about this strategy, see the [microbatch reference page](/docs/build/incremental-microbatch). 
+
+In the following example, the upstream table `events` has been annotated with an `event_time` column called `ts` in its schema file.
+
+
+
+
+
+
+```sql
+-- Use 'date' as the grain for this microbatch table
+{{ config(
+    materialized='incremental',
+    file_format='delta',
+    incremental_strategy = 'microbatch',
+    event_time='date'
+) }}
+
+with new_events as (
+
+    select * from {{ ref('events') }}
+
+)
+
+select
+    user_id,
+    date,
+    count(*) as visits
+
+from new_events
+group by 1, 2
+```
+
+
+
+
+
+
+
+```sql
+create temporary view replace_where__dbt_tmp as
+
+    with new_events as (
+
+        select * from (select * from analytics.events where ts >= '2024-10-01' and ts < '2024-10-02')
+
+    )
+
+    select
+        user_id,
+        date,
+        count(*) as visits
+    from new_events
+    group by 1, 2
+;
+
+insert into analytics.replace_where_incremental
+    replace where CAST(date as TIMESTAMP) >= '2024-10-01' and CAST(date as TIMESTAMP) < '2024-10-02'
+    table `replace_where__dbt_tmp`
+```
+
+
+
+
+
+
+
 ## Selecting compute per model
 
@@ -410,31 +732,31 @@ To take advantage of this capability, you will need to add compute blocks to you
+ When you specify your `databricks_compute` for a python model, you are currently only specifying which compute to use when running the model-specific SQL. -If you wish to use a different compute for executing the python itself, you must specify an alternate `http_path` in the config for the model. Please note that declaring a separate SQL compute and a python compute for your python dbt models is optional. If you wish to do this: +If you wish to use a different compute for executing the python itself, you must specify an alternate compute in the config for the model. +For example: @@ -572,8 +900,6 @@ def model(dbt, session): If your default compute is a SQL Warehouse, you will need to specify an all purpose cluster `http_path` in this way. - - ## Persisting model descriptions Relation-level docs persistence is supported in dbt v0.17.0. For more @@ -776,9 +1102,14 @@ These properties are sent directly to Databricks without validation in dbt, so b One application of this feature is making `delta` tables compatible with `iceberg` readers using the [Universal Format](https://docs.databricks.com/en/delta/uniform.html). - +```sql +{{ config( + tblproperties={ + 'delta.enableIcebergCompatV2' = 'true' + 'delta.universalFormat.enabledFormats' = 'iceberg' + } + ) }} +``` `tblproperties` can be specified for python models, but they will be applied via an `ALTER` statement after table creation. This is due to a limitation in PySpark. - - diff --git a/website/docs/reference/resource-configs/enabled.md b/website/docs/reference/resource-configs/enabled.md index febf1e50c88..b74d7250907 100644 --- a/website/docs/reference/resource-configs/enabled.md +++ b/website/docs/reference/resource-configs/enabled.md @@ -230,14 +230,6 @@ exposures: - - -Support for disabling semantic models has been added in dbt Core v1.7 - - - - - ```yaml @@ -259,20 +251,10 @@ semantic_models: - - - - -Support for disabling saved queries has been added in dbt Core v1.7. - - - - - ```yaml @@ -294,8 +276,6 @@ saved_queries: - - diff --git a/website/docs/reference/resource-configs/fal-configs.md b/website/docs/reference/resource-configs/fal-configs.md deleted file mode 100644 index 85befeccdb4..00000000000 --- a/website/docs/reference/resource-configs/fal-configs.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -title: "fal configurations" -id: "fal-configs" ---- - -:::info Adapter no longer maintained -The [`dbt-fal` adapter](https://github.com/fal-ai/dbt-fal) is no longer actively maintained. This means although the adapter is still operational, there is no further development or bug fixes planned and it may not be compatible with future versions of dbt. `dbt-fal` was test until dbt v1.5. - -Documentation for `dbt-fal` are kept for reference purposes only and will eventually be removed from the site in the future. -::: - -## Setting the `db_profile` - -The fal profile configuration needs the `db_profile` property set to the profile configuring your database for SQL models. - -fal will wrap around adapter and just handle Python models while letting all the SQL -needs to the underlying database adapter. - -fal will inherit the `threads` configuration from the `db_profile` unless explicitly specified. - -Example: - - - -```yaml -jaffle_shop: - target: dev_with_fal - outputs: - dev_with_fal: - type: fal - db_profile: dev_pg # This points to your main adapter - dev_pg: - type: postgres - ... 
-``` - - - -## Using `fal_environment` model configuration - -By creating a `fal_project.yml` in the same location as your `dbt_project.yml` and adding environment definitions in there: - - - -```yaml -environments: - - name: clustering - type: conda - packages: - - kmodes==0.12.2 - - - name: predict - type: venv - requirements: - - prophet -``` - - - -You can now reference any of these environments in your dbt Python models: - - - -```py -def model(dbt, fal): - dbt.config({ - "fal_environment": "clustering" - }) - - import pandas as pd - # kmodes is available because of the `fal_environment` being used - from kmodes.kmodes import KModes - - df: pd.DataFrame = dbt.ref("order_detailed") - df_train = df[["size", "is_vegan", "is_vegetarian", "is_keto", "shape"]] - - km_2 = KModes(n_clusters=3, init="Huang") - km_2.fit_predict(df_train) - df["cluster_label"] = km_2.labels_ - - return df -``` - - diff --git a/website/docs/reference/resource-configs/firebolt-configs.md b/website/docs/reference/resource-configs/firebolt-configs.md index 394823e33de..0ab14354003 100644 --- a/website/docs/reference/resource-configs/firebolt-configs.md +++ b/website/docs/reference/resource-configs/firebolt-configs.md @@ -38,8 +38,8 @@ models: +table_type: fact +primary_index: [ , ... ] +indexes: - - type: aggregating - key_column: [ , ... ] + - index_type: aggregating + key_columns: [ , ... ] aggregation: [ , ... ] ... ``` @@ -58,8 +58,8 @@ models: table_type: fact primary_index: [ , ... ] indexes: - - type: aggregating - key_column: [ , ... ] + - index_type: aggregating + key_columns: [ , ... ] aggregation: [ , ... ] ... ``` @@ -77,9 +77,9 @@ models: primary_index = [ "", ... ], indexes = [ { - type = "aggregating" - key_column = [ "", ... ], - aggregation = [ "", ... ], + "index_type": "aggregating" + "key_columns": [ "", ... ], + "aggregation": [ "", ... ], }, ... ] @@ -99,8 +99,8 @@ models: | `table_type` | Whether the materialized table will be a [fact or dimension](https://docs.firebolt.io/godocs/Overview/working-with-tables/working-with-tables.html#fact-and-dimension-tables) table. | | `primary_index` | Sets the primary index for the fact table using the inputted list of column names from the model. Required for fact tables. | | `indexes` | A list of aggregating indexes to create on the fact table. | -| `type` | Specifies that the index is an [aggregating index](https://docs.firebolt.io/godocs/Guides/working-with-indexes/using-aggregating-indexes.html). Should be set to `aggregating`. | -| `key_column` | Sets the grouping of the aggregating index using the inputted list of column names from the model. | +| `index_type` | Specifies that the index is an [aggregating index](https://docs.firebolt.io/godocs/Guides/working-with-indexes/using-aggregating-indexes.html). Should be set to `aggregating`. | +| `key_columns` | Sets the grouping of the aggregating index using the inputted list of column names from the model. | | `aggregation` | Sets the aggregations on the aggregating index using the inputted list of SQL agg expressions. 
| @@ -113,9 +113,9 @@ models: primary_index = "id", indexes = [ { - type: "aggregating", - key_column: "order_id", - aggregation: ["COUNT(DISTINCT status)", "AVG(customer_id)"] + "index_type": "aggregating", + "key_columns": "order_id", + "aggregation": ["COUNT(DISTINCT status)", "AVG(customer_id)"] } ] ) }} diff --git a/website/docs/reference/resource-configs/full_refresh.md b/website/docs/reference/resource-configs/full_refresh.md index 26a2364a8c6..c874fe7a396 100644 --- a/website/docs/reference/resource-configs/full_refresh.md +++ b/website/docs/reference/resource-configs/full_refresh.md @@ -1,6 +1,6 @@ --- resource_types: [models, seeds] -description: "Full_Refresh - Read this in-depth guide to learn about configurations in dbt." +description: "Setting the full_refresh config to false prevents a model or seed from being rebuilt, even when the `--full-refresh` flag is included in an invocation." datatype: boolean --- @@ -85,3 +85,6 @@ This logic is encoded in the [`should_full_refresh()`](https://github.com/dbt-la ## Recommendation Set `full_refresh: false` for models of especially large datasets, which you would _never_ want dbt to fully drop and recreate. + +## Reference docs +* [on_configuration_change](/reference/resource-configs/on_configuration_change) diff --git a/website/docs/reference/resource-configs/grants.md b/website/docs/reference/resource-configs/grants.md index 6e960115ea1..99b61ef2413 100644 --- a/website/docs/reference/resource-configs/grants.md +++ b/website/docs/reference/resource-configs/grants.md @@ -249,13 +249,14 @@ models:
-* Granting to / revoking from is only fully supported for Redshift users (not groups or roles). +* Granting to / revoking from is only fully supported for Redshift users (not [groups](https://docs.aws.amazon.com/redshift/latest/dg/r_Groups.html) or [roles](https://docs.aws.amazon.com/redshift/latest/dg/r_roles-managing.html)).
* dbt accounts for the [`copy_grants` configuration](/reference/resource-configs/snowflake-configs#copying-grants) when calculating which grants need to be added or removed. +* Granting to / revoking from is only fully supported for Snowflake roles (not [database roles](https://docs.snowflake.com/user-guide/security-access-control-overview#types-of-roles)).
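For reference, a minimal sketch of the `grants` config itself, with hypothetical role names; the platform notes above determine which grantees are fully supported:

```yaml
models:
  - name: customers
    config:
      grants:
        select: ['reporter', 'bi_role']
```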
diff --git a/website/docs/reference/resource-configs/group.md b/website/docs/reference/resource-configs/group.md index 717d7de89f5..cd0ad2683f5 100644 --- a/website/docs/reference/resource-configs/group.md +++ b/website/docs/reference/resource-configs/group.md @@ -218,14 +218,6 @@ metrics: - - -Support for grouping semantic models has been added in dbt Core v1.7. - - - - - ```yaml @@ -247,20 +239,10 @@ semantic_models: - - - - -Support for grouping saved queries has been added in dbt Core v1.7. - - - - - ```yaml @@ -282,8 +264,6 @@ saved_queries: - - diff --git a/website/docs/reference/resource-configs/invalidate_hard_deletes.md b/website/docs/reference/resource-configs/invalidate_hard_deletes.md index ba5b37c5d71..bdaec7e33a9 100644 --- a/website/docs/reference/resource-configs/invalidate_hard_deletes.md +++ b/website/docs/reference/resource-configs/invalidate_hard_deletes.md @@ -4,6 +4,31 @@ description: "Invalidate_hard_deletes - Read this in-depth guide to learn about datatype: column_name --- + + + + + +```yaml +snapshots: + - name: snapshot + relation: source('my_source', 'my_table') + [config](/reference/snapshot-configs): + strategy: timestamp + invalidate_hard_deletes: true | false +``` + + + + + + + + +import SnapshotYaml from '/snippets/_snapshot-yaml-spec.md'; + + + ```jinja2 @@ -17,6 +42,7 @@ datatype: column_name ``` + @@ -39,6 +65,26 @@ By default the feature is disabled. ## Example + + + +```yaml +snapshots: + - name: orders_snapshot + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + database: analytics + unique_key: id + strategy: timestamp + updated_at: updated_at + invalidate_hard_deletes: true + ``` + + + + + ```sql @@ -60,3 +106,4 @@ By default the feature is disabled. ``` + diff --git a/website/docs/reference/resource-configs/materialized.md b/website/docs/reference/resource-configs/materialized.md index 43ee88130e4..5f7d00df46a 100644 --- a/website/docs/reference/resource-configs/materialized.md +++ b/website/docs/reference/resource-configs/materialized.md @@ -81,3 +81,14 @@ select ... You can also configure [custom materializations](/guides/create-new-materializations?step=1) in dbt. Custom materializations are a powerful way to extend dbt's functionality to meet your specific needs. +## Creation Precedence + +Materializations are implemented following this "drop through" life cycle: + +1. If a model does not exist with the provided path, create the new model. +2. If a model exists, but has a different type, drop the existing model and create the new model. +3. If [`--full-refresh`](/reference/resource-configs/full_refresh) is supplied, replace the existing model regardless of configuration changes and the [`on_configuration_change`](/reference/resource-configs/on_configuration_change) setting. +4. If there are no configuration changes, perform the default action for that type (e.g. apply refresh for a materialized view). +5. Determine whether to apply the configuration changes according to the `on_configuration_change` setting. + + diff --git a/website/docs/reference/resource-configs/meta.md b/website/docs/reference/resource-configs/meta.md index 2bcccdd4141..53a4f77184e 100644 --- a/website/docs/reference/resource-configs/meta.md +++ b/website/docs/reference/resource-configs/meta.md @@ -179,14 +179,6 @@ exposures: - - -Support for grouping semantic models was added in dbt Core v1.7 - - - - - ```yml @@ -201,8 +193,6 @@ semantic_models: The `meta` config can also be defined under the `semantic-models` config block in `dbt_project.yml`. 
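For example, a minimal project-level sketch (the key/value pair is hypothetical):

```yaml
semantic-models:
  my_project:
    +meta:
      owner: "analytics-team"
```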
See [configs and properties](/reference/configs-and-properties) for details. - - @@ -249,14 +239,6 @@ metrics: - - -Support for saved queries has been added in dbt Core v1.7. - - - - - ```yml @@ -268,8 +250,6 @@ saved_queries: - - diff --git a/website/docs/reference/resource-configs/on_configuration_change.md b/website/docs/reference/resource-configs/on_configuration_change.md index eb85c998405..8e9846ae8cd 100644 --- a/website/docs/reference/resource-configs/on_configuration_change.md +++ b/website/docs/reference/resource-configs/on_configuration_change.md @@ -81,6 +81,6 @@ models: Materializations are implemented following this "drop through" life cycle: 1. If a model does not exist with the provided path, create the new model. 2. If a model exists, but has a different type, drop the existing model and create the new model. -3. If `--full-refresh` is supplied, replace the existing model regardless of configuration changes and the `on_configuration_change` setting. +3. If [`--full-refresh`](/reference/resource-configs/full_refresh) is supplied, replace the existing model regardless of configuration changes and the `on_configuration_change` setting. 4. If there are no configuration changes, perform the default action for that type (e.g. apply refresh for a materialized view). 5. Determine whether to apply the configuration changes according to the `on_configuration_change` setting. diff --git a/website/docs/reference/resource-configs/postgres-configs.md b/website/docs/reference/resource-configs/postgres-configs.md index 07cfc938f1c..f2bf90a93c0 100644 --- a/website/docs/reference/resource-configs/postgres-configs.md +++ b/website/docs/reference/resource-configs/postgres-configs.md @@ -185,20 +185,3 @@ It's worth noting that, unlike tables, dbt monitors this parameter for changes a This happens via a `DROP/CREATE` of the indexes, which can be thought of as an `ALTER` of the materialized view. Learn more about these parameters in Postgres's [docs](https://www.postgresql.org/docs/current/sql-creatematerializedview.html). - - - -### Limitations - -#### Changing materialization to and from "materialized_view" - -Swapping an already materialized model to a materialized view, and vice versa, is not supported. -The workaround is to manually drop the existing materialization in the data warehouse prior to calling `dbt run`. -Running with `--full-refresh` flag will not work to drop the existing table or view and create the materialized view (and vice versa). -This would only need to be done once as the existing object would then be a materialized view. - -For example,`my_model`, has already been materialized as a table in the underlying data platform via `dbt run`. -If the user changes the model's config to `materialized="materialized_view"`, they will get an error. -The solution is to execute `DROP TABLE my_model` on the data warehouse before trying the model again. - - diff --git a/website/docs/reference/resource-configs/pre-hook-post-hook.md b/website/docs/reference/resource-configs/pre-hook-post-hook.md index bf4375c9490..bd01a7be840 100644 --- a/website/docs/reference/resource-configs/pre-hook-post-hook.md +++ b/website/docs/reference/resource-configs/pre-hook-post-hook.md @@ -45,6 +45,18 @@ select ... 
``` + + + + +```yml +models: + - name: [] + config: + [pre_hook](/reference/resource-configs/pre-hook-post-hook): | [] + [post_hook](/reference/resource-configs/pre-hook-post-hook): | [] +``` + @@ -66,6 +78,18 @@ seeds: + + +```yml +seeds: + - name: [] + config: + [pre_hook](/reference/resource-configs/pre-hook-post-hook): | [] + [post_hook](/reference/resource-configs/pre-hook-post-hook): | [] +``` + + + @@ -85,6 +109,8 @@ snapshots: + + ```sql @@ -100,6 +126,19 @@ select ... ``` + + + + + +```yml +snapshots: + - name: [] + [config](/reference/resource-properties/config): + [pre_hook](/reference/resource-configs/pre-hook-post-hook): | [] + [post_hook](/reference/resource-configs/pre-hook-post-hook): | [] +``` + @@ -115,6 +154,10 @@ Pre- and post-hooks can also call macros that return SQL statements. If your mac dbt aims to provide all the boilerplate SQL you need (DDL, DML, and DCL) via out-of-the-box functionality, which you can configure quickly and concisely. In some cases, there may be SQL that you want or need to run, specific to functionality in your data platform, which dbt does not (yet) offer as a built-in feature. In those cases, you can write the exact SQL you need, using dbt's compilation context, and pass it into a `pre-` or `post-` hook to run before or after your model, seed, or snapshot. +import SQLCompilationError from '/snippets/_render-method.md'; + + + ## Examples diff --git a/website/docs/reference/resource-configs/redshift-configs.md b/website/docs/reference/resource-configs/redshift-configs.md index e7149ae484e..b033cd6267e 100644 --- a/website/docs/reference/resource-configs/redshift-configs.md +++ b/website/docs/reference/resource-configs/redshift-configs.md @@ -230,21 +230,6 @@ As with most data platforms, there are limitations associated with materialized Find more information about materialized view limitations in Redshift's [docs](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-create-sql-command.html#mv_CREATE_MATERIALIZED_VIEW-limitations). - - -#### Changing materialization from "materialized_view" to "table" or "view" - -Swapping a materialized view to a table or view is not supported. -You must manually drop the existing materialized view in the data warehouse before calling `dbt run`. -Normally, re-running with the `--full-refresh` flag would resolve this, but not in this case. -This would only need to be done once as the existing object would then be a materialized view. - -For example, assume that a materialized view, `my_mv.sql`, has already been materialized to the underlying data platform via `dbt run`. -If the user changes the model's config to `materialized="table"`, they will get an error. -The workaround is to execute `DROP MATERIALIZED VIEW my_mv CASCADE` on the data warehouse before trying the model again. - - - ## Unit test limitations diff --git a/website/docs/reference/resource-configs/schema.md b/website/docs/reference/resource-configs/schema.md index 57a357767cb..1e2ff47729c 100644 --- a/website/docs/reference/resource-configs/schema.md +++ b/website/docs/reference/resource-configs/schema.md @@ -76,6 +76,15 @@ This results in the generated relation being located in the `snapshots` schema s + + + +```yml +saved-queries: + +schema: metrics +``` + + Customize the schema for storing test results in your `dbt_project.yml` file. 
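A minimal sketch of that configuration, assuming you also store failures (the schema suffix is just an example):

```yaml
tests:
  +store_failures: true
  +schema: test_failures
```

By default, dbt appends the custom value as a suffix to your target schema.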
diff --git a/website/docs/reference/resource-configs/snapshot_meta_column_names.md b/website/docs/reference/resource-configs/snapshot_meta_column_names.md new file mode 100644 index 00000000000..46aba7886d0 --- /dev/null +++ b/website/docs/reference/resource-configs/snapshot_meta_column_names.md @@ -0,0 +1,109 @@ +--- +resource_types: [snapshots] +description: "Snapshot meta column names" +datatype: "{}" +default_value: {"dbt_valid_from": "dbt_valid_from", "dbt_valid_to": "dbt_valid_to", "dbt_scd_id": "dbt_scd_id", "dbt_updated_at": "dbt_updated_at"} +id: "snapshot_meta_column_names" +--- + +Starting in 1.9 or with [versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) dbt Cloud. + + + +```yaml +snapshots: + - name: + config: + snapshot_meta_column_names: + dbt_valid_from: + dbt_valid_to: + dbt_scd_id: + dbt_updated_at: + +``` + + + + + +```jinja2 +{{ + config( + snapshot_meta_column_names={ + "dbt_valid_from": "", + "dbt_valid_to": "", + "dbt_scd_id": "", + "dbt_updated_at": "", + } + ) +}} + +``` + + + + + +```yml +snapshots: + [](/reference/resource-configs/resource-path): + +snapshot_meta_column_names: + dbt_valid_from: + dbt_valid_to: + dbt_scd_id: + dbt_updated_at: + +``` + + + +## Description + +In order to align with an organization's naming conventions, the `snapshot_meta_column_names` config can be used to customize the names of the [metadata columns](/docs/build/snapshots#snapshot-meta-fields) within each snapshot. + +## Default + +By default, dbt snapshots use the following column names to track change history using [Type 2 slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) records: + +| Field | Meaning | Notes | +| -------------- | ------- | ----- | +| `dbt_valid_from` | The timestamp when this snapshot row was first inserted and became valid. | The value is affected by the [`strategy`](/reference/resource-configs/strategy). | +| `dbt_valid_to` | The timestamp when this row is no longer valid. | | +| `dbt_scd_id` | A unique key generated for each snapshot row. | This is used internally by dbt. | +| `dbt_updated_at` | The `updated_at` timestamp of the source record when this snapshot row was inserted. | This is used internally by dbt. | + +However, these column names can be customized using the `snapshot_meta_column_names` config. + +:::warning + +To avoid any unintentional data modification, dbt will **not** automatically apply any column renames. So if a user applies `snapshot_meta_column_names` config for a snapshot without updating the pre-existing table, they will get an error. We recommend either only using these settings for net-new snapshots, or arranging an update of pre-existing tables prior to committing a column name change. + +::: + +## Example + + + +```yaml +snapshots: + - name: orders_snapshot + relation: ref("orders") + config: + unique_key: id + strategy: check + check_cols: all + snapshot_meta_column_names: + dbt_valid_from: start_date + dbt_valid_to: end_date + dbt_scd_id: scd_id + dbt_updated_at: modified_date +``` + + + +The resulting snapshot table contains the configured meta column names: + +| id | scd_id | modified_date | start_date | end_date | +| -- | -------------------- | -------------------- | -------------------- | -------------------- | +| 1 | 60a1f1dbdf899a4dd... | 2024-10-02 ... | 2024-10-02 ... | 2024-10-02 ... | +| 2 | b1885d098f8bcff51... | 2024-10-02 ... | 2024-10-02 ... 
| | diff --git a/website/docs/reference/resource-configs/snapshot_name.md b/website/docs/reference/resource-configs/snapshot_name.md index bb4826a116b..62480ac3f84 100644 --- a/website/docs/reference/resource-configs/snapshot_name.md +++ b/website/docs/reference/resource-configs/snapshot_name.md @@ -2,6 +2,27 @@ description: "Snapshot-name - Read this in-depth guide to learn about configurations in dbt." --- + + + +```yaml +snapshots: + - name: snapshot_name + relation: source('my_source', 'my_table') + config: + schema: string + database: string + unique_key: column_name_or_expression + strategy: timestamp | check + updated_at: column_name # Required if strategy is 'timestamp' + +``` + + + + + + ```jinja2 @@ -13,9 +34,15 @@ description: "Snapshot-name - Read this in-depth guide to learn about configurat +import SnapshotYaml from '/snippets/_snapshot-yaml-spec.md'; + + + + + ## Description -The name of a snapshot, as defined in the `{% snapshot %}` block header. This name is used when selecting from a snapshot using the [`ref` function](/reference/dbt-jinja-functions/ref) +The name of a snapshot, which is used when selecting from a snapshot using the [`ref` function](/reference/dbt-jinja-functions/ref) This name must not conflict with the name of any other "refable" resource (models, seeds, other snapshots) defined in this project or package. @@ -24,6 +51,26 @@ The name does not need to match the file name. As a result, snapshot filenames d ## Examples ### Name a snapshot `order_snapshot` + + + + +```yaml +snapshots: + - name: order_snapshot + relation: source('my_source', 'my_table') + config: + schema: string + database: string + unique_key: column_name_or_expression + strategy: timestamp | check + updated_at: column_name # Required if strategy is 'timestamp' +``` + + + + + ```jinja2 @@ -35,6 +82,7 @@ The name does not need to match the file name. As a result, snapshot filenames d + To select from this snapshot in a downstream model: diff --git a/website/docs/reference/resource-configs/snowflake-configs.md b/website/docs/reference/resource-configs/snowflake-configs.md index a59bc8dee00..b95b79241ba 100644 --- a/website/docs/reference/resource-configs/snowflake-configs.md +++ b/website/docs/reference/resource-configs/snowflake-configs.md @@ -9,295 +9,333 @@ To-do: - use the reference doc structure for this article / split into separate articles ---> -## Transient tables + -Snowflake supports the creation of [transient tables](https://docs.snowflake.net/manuals/user-guide/tables-temp-transient.html). Snowflake does not preserve a history for these tables, which can result in a measurable reduction of your Snowflake storage costs. Transient tables participate in time travel to a limited degree with a retention period of 1 day by default with no fail-safe period. Weigh these tradeoffs when deciding whether or not to configure your dbt models as `transient`. **By default, all Snowflake tables created by dbt are `transient`.** +## Iceberg table format -### Configuring transient tables in dbt_project.yml +The dbt-snowflake adapter supports the Iceberg table format. It is available for three of the Snowflake materializations: -A whole folder (or package) can be configured to be transient (or not) by adding a line to the `dbt_project.yml` file. This config works just like all of the [model configs](/reference/model-configs) defined in `dbt_project.yml`. 
+- [Table](/docs/build/materializations#table) +- [Incremental](/docs/build/materializations#incremental) +- [Dynamic](#dynamic-tables) + +For now, to create Iceberg tables, you must implement a [behavior flag](/reference/global-configs/behavior-changes) due to performance impact related to using Iceberg tables. Snowflake does not support `is_iceberg` on the `Show Objects` query, which dbt depends on for metadata. + +To use Iceberg, set the `enable_iceberg_materializations` flag to `True` in your dbt_project.yml: ```yaml -name: my_project -... +flags: + enable_iceberg_materializations: True -models: - +transient: false - my_project: - ... ``` -### Configuring transience for a specific model -A specific model can be configured to be transient by setting the `transient` model config to `true`. +The following configurations are supported. +For more information, check out the Snowflake reference for [`CREATE ICEBERG TABLE` (Snowflake as the catalog)](https://docs.snowflake.com/en/sql-reference/sql/create-iceberg-table-snowflake). - +| Field | Type | Required | Description | Sample input | Note | +| --------------------- | ------ | -------- | -------------------------------------------------------------------------------------------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Table Format | String | Yes | Configures the objects table format. | `iceberg` | `iceberg` is the only accepted value. | +| External volume | String | Yes(*) | Specifies the identifier (name) of the external volume where Snowflake writes the Iceberg table's metadata and data files. | `my_s3_bucket` | *You don't need to specify this if the account, database, or schema already has an associated external volume. [More info](https://docs.snowflake.com/en/sql-reference/sql/create-iceberg-table-snowflake#:~:text=Snowflake%20Table%20Structures.-,external_volume) | +| Base location Subpath | String | No | An optional suffix to add to the `base_location` path that dbt automatically specifies. | `jaffle_marketing_folder` | We recommend that you do not specify this. Modifying this parameter results in a new Iceberg table. See [Base Location](#base-location) for more info. | + +### Example configuration + +To configure an Iceberg table materialization in dbt, refer to the example configuration: + + ```sql -{{ config(materialized='table', transient=true) }} -select * from ... +{{ + config( + materialized = "table", + table_format="iceberg", + external_volume="s3_iceberg_snow", + ) +}} + +select * from {{ ref('raw_orders') }} + ``` -## Query tags - -[Query tags](https://docs.snowflake.com/en/sql-reference/parameters.html#query-tag) are a Snowflake -parameter that can be quite useful later on when searching in the [QUERY_HISTORY view](https://docs.snowflake.com/en/sql-reference/account-usage/query_history.html). +### Base location -dbt supports setting a default query tag for the duration of its Snowflake connections in -[your profile](/docs/core/connect-data-platform/snowflake-setup). You can set more precise values (and override the default) for subsets of models by setting -a `query_tag` model config or by overriding the default `set_query_tag` macro: +Snowflake's `CREATE ICEBERG TABLE` DDL requires that a `base_location` be provided. 
dbt defines this parameter on the user's behalf to streamline usage and enforce basic isolation of table data within the `EXTERNAL VOLUME`. The default behavior in dbt is to provide a `base_location` string of the form: `_dbt/{SCHEMA_NAME}/{MODEL_NAME}` - +#### Base Location Subpath +We recommend using dbt's auto-generated `base_location`. However, if you need to customize the resulting `base_location`, dbt allows users to configure a `base_location_subpath`. When specified, the subpath concatenates to the end of the previously described pattern for `base_location` string generation. -```yaml -models: - [](/reference/resource-configs/resource-path): - +query_tag: dbt_special +For example, `config(base_location_subpath="prod")` will generate a `base_location` of the form `_dbt/{SCHEMA_NAME}/{MODEL_NAME}/prod/`. -``` +A theoretical (but not recommended) use case is re-using an `EXTERNAL VOLUME` while maintaining isolation across development and production environments. We recommend against this as storage permissions should configured on the external volume and underlying storage, not paths that any analytics engineer can modify. - +#### Rationale - +dbt manages `base_location` on behalf of users to enforce best practices. With Snowflake-managed Iceberg format tables, the user owns and maintains the data storage of the tables in an external storage solution (the declared `external volume`). The `base_ location` parameter declares where to write the data within the external volume. The Snowflake Iceberg catalog keeps track of your Iceberg table regardless of where the data lives within the `external volume` declared and the `base_location` provided. However, Snowflake permits passing anything into the `base_location` field, including an empty string, even reusing the same path across multiple tables. This behavior could result in future technical debt because it will limit the ability to: -```sql -{{ config( - query_tag = 'dbt_special' -) }} +- Navigate the underlying object store (S3/Azure blob) +- Read Iceberg tables via an object-store integration +- Grant schema-specific access to tables via object store +- Use a crawler pointed at the tables within the external volume to build a new catalog with another tool -select ... +To maintain best practices, we enforce an input. Currently, we do not support overriding the default `base location` input but will consider it based on user feedback. -``` - -In this example, you can set up a query tag to be applied to every query with the model's name. - -```sql +In summary, dbt-snowflake does not support arbitrary definition of `base_location` for Iceberg tables. Instead, dbt, by default, writes your tables within a `_dbt/{SCHEMA_NAME}/{TABLE_NAME}` prefix to ensure easier object-store observability and auditability. - {% macro set_query_tag() -%} - {% set new_query_tag = model.name %} - {% if new_query_tag %} - {% set original_query_tag = get_current_query_tag() %} - {{ log("Setting query_tag to '" ~ new_query_tag ~ "'. Will reset to '" ~ original_query_tag ~ "' after materialization.") }} - {% do run_query("alter session set query_tag = '{}'".format(new_query_tag)) %} - {{ return(original_query_tag)}} - {% endif %} - {{ return(none)}} -{% endmacro %} +### Limitations -``` +There are some limitations to the implementation you need to be aware of: -**Note:** query tags are set at the _session_ level. At the start of each model , if the model has a custom `query_tag` configured, dbt will run `alter session set query_tag` to set the new value. 
At the end of the materialization, dbt will run another `alter` statement to reset the tag to its default value. As such, build failures midway through a materialization may result in subsequent queries running with an incorrect tag. +- Using Iceberg tables with dbt, the result is that your query is materialized in Iceberg. However, often, dbt creates intermediary objects as temporary and transient tables for certain materializations, such as incremental ones. It is not possible to configure these temporary objects also to be Iceberg-formatted. You may see non-Iceberg tables created in the logs to support specific materializations, but they will be dropped after usage. +- You cannot incrementally update a preexisting incremental model to be an Iceberg table. To do so, you must fully rebuild the table with the `--full-refresh` flag. - + -## Merge behavior (incremental models) +## Dynamic tables -The [`incremental_strategy` config](/docs/build/incremental-strategy) controls how dbt builds incremental models. By default, dbt will use a [merge statement](https://docs.snowflake.net/manuals/sql-reference/sql/merge.html) on Snowflake to refresh incremental tables. +The Snowflake adapter supports [dynamic tables](https://docs.snowflake.com/en/user-guide/dynamic-tables-about). +This materialization is specific to Snowflake, which means that any model configuration that +would normally come along for the ride from `dbt-core` (e.g. as with a `view`) may not be available +for dynamic tables. This gap will decrease in future patches and versions. +While this materialization is specific to Snowflake, it very much follows the implementation +of [materialized views](/docs/build/materializations#Materialized-View). +In particular, dynamic tables have access to the `on_configuration_change` setting. +Dynamic tables are supported with the following configuration parameters: -Snowflake's `merge` statement fails with a "nondeterministic merge" error if the `unique_key` specified in your model config is not actually unique. If you encounter this error, you can instruct dbt to use a two-step incremental approach by setting the `incremental_strategy` config for your model to `delete+insert`. + -## Configuring table clustering +| Parameter | Type | Required | Default | Change Monitoring Support | +|--------------------|------------|----------|-------------|---------------------------| +| [`on_configuration_change`](/reference/resource-configs/on_configuration_change) | `` | no | `apply` | n/a | +| [`target_lag`](#target-lag) | `` | yes | | alter | +| [`snowflake_warehouse`](#configuring-virtual-warehouses) | `` | yes | | alter | + -dbt supports [table clustering](https://docs.snowflake.net/manuals/user-guide/tables-clustering-keys.html) on Snowflake. To control clustering for a or incremental model, use the `cluster_by` config. When this configuration is applied, dbt will do two things: + -1. It will implicitly order the table results by the specified `cluster_by` fields -2. 
It will add the specified clustering keys to the target table +| Parameter | Type | Required | Default | Change Monitoring Support | +|--------------------|------------|----------|-------------|---------------------------| +| [`on_configuration_change`](/reference/resource-configs/on_configuration_change) | `` | no | `apply` | n/a | +| [`target_lag`](#target-lag) | `` | yes | | alter | +| [`snowflake_warehouse`](#configuring-virtual-warehouses) | `` | yes | | alter | +| [`refresh_mode`](#refresh-mode) | `` | no | `AUTO` | refresh | +| [`initialize`](#initialize) | `` | no | `ON_CREATE` | n/a | -By using the specified `cluster_by` fields to order the table, dbt minimizes the amount of work required by Snowflake's automatic clustering functionality. If an incremental model is configured to use table clustering, then dbt will also order the staged dataset before merging it into the destination table. As such, the dbt-managed table should always be in a mostly clustered state. + -### Using cluster_by + -The `cluster_by` config accepts either a string, or a list of strings to use as clustering keys. The following example will create a sessions table that is clustered by the `session_start` column. + - + -```sql -{{ - config( - materialized='table', - cluster_by=['session_start'] - ) -}} + -select - session_id, - min(event_time) as session_start, - max(event_time) as session_end, - count(*) as count_pageviews +```yaml +models: + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)[materialized](/reference/resource-configs/materialized): dynamic_table + [+](/reference/resource-configs/plus-prefix)[on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail + [+](/reference/resource-configs/plus-prefix)[target_lag](#target-lag): downstream | + [+](/reference/resource-configs/plus-prefix)[snowflake_warehouse](#configuring-virtual-warehouses): -from {{ source('snowplow', 'event') }} -group by 1 ``` -The code above will be compiled to SQL that looks (approximately) like this: + -```sql -create or replace table my_database.my_schema.my_table as ( - select * from ( - select - session_id, - min(event_time) as session_start, - max(event_time) as session_end, - count(*) as count_pageviews + - from {{ source('snowplow', 'event') }} - group by 1 - ) + - -- this order by is added by dbt in order to create the - -- table in an already-clustered manner. - order by session_start +```yaml +version: 2 -); +models: + - name: [] + config: + [materialized](/reference/resource-configs/materialized): dynamic_table + [on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail + [target_lag](#target-lag): downstream | + [snowflake_warehouse](#configuring-virtual-warehouses): - alter table my_database.my_schema.my_table cluster by (session_start); ``` -### Automatic clustering + -Automatic clustering is [enabled by default in Snowflake today](https://docs.snowflake.com/en/user-guide/tables-auto-reclustering.html), no action is needed to make use of it. Though there is an `automatic_clustering` config, it has no effect except for accounts with (deprecated) manual clustering enabled. + -If [manual clustering is still enabled for your account](https://docs.snowflake.com/en/user-guide/tables-clustering-manual.html), you can use the `automatic_clustering` config to control whether or not automatic clustering is enabled for dbt models. 
When `automatic_clustering` is set to `true`, dbt will run an `alter table resume recluster` query after building the target table. -The `automatic_clustering` config can be specified in the `dbt_project.yml` file, or in a model `config()` block. + - + + +```jinja + +{{ config( + [materialized](/reference/resource-configs/materialized)="dynamic_table", + [on_configuration_change](/reference/resource-configs/on_configuration_change)="apply" | "continue" | "fail", + [target_lag](#target-lag)="downstream" | " seconds | minutes | hours | days", + [snowflake_warehouse](#configuring-virtual-warehouses)="", + +) }} -```yaml -models: - +automatic_clustering: true ``` -## Configuring virtual warehouses + -The default warehouse that dbt uses can be configured in your [Profile](/docs/core/connect-data-platform/profiles.yml) for Snowflake connections. To override the warehouse that is used for specific models (or groups of models), use the `snowflake_warehouse` model configuration. This configuration can be used to specify a larger warehouse for certain models in order to control Snowflake costs and project build times. + + + + + - - + { label: 'Project file', value: 'project-yaml', }, + { label: 'Property file', value: 'property-yaml', }, + { label: 'Config block', value: 'config', }, + ] +}> -The example config below changes the warehouse for a group of models with a config argument in the yml. + ```yaml -name: my_project -version: 1.0.0 - -... - models: - +snowflake_warehouse: "EXTRA_SMALL" # use the `EXTRA_SMALL` warehouse for all models in the project... - my_project: - clickstream: - +snowflake_warehouse: "EXTRA_LARGE" # ...except for the models in the `clickstream` folder, which will use the `EXTRA_LARGE` warehouse. + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)[materialized](/reference/resource-configs/materialized): dynamic_table + [+](/reference/resource-configs/plus-prefix)[on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail + [+](/reference/resource-configs/plus-prefix)[target_lag](#target-lag): downstream | + [+](/reference/resource-configs/plus-prefix)[snowflake_warehouse](#configuring-virtual-warehouses): + [+](/reference/resource-configs/plus-prefix)[refresh_mode](#refresh-mode): AUTO | FULL | INCREMENTAL + [+](/reference/resource-configs/plus-prefix)[initialize](#initialize): ON_CREATE | ON_SCHEDULE -snapshots: - +snowflake_warehouse: "EXTRA_LARGE" # all Snapshot models are configured to use the `EXTRA_LARGE` warehouse. ``` + - -The example config below changes the warehouse for a single model with a config() block in the sql model. 
+ - + -```sql -{{ - config( - materialized='table', - snowflake_warehouse='EXTRA_LARGE' - ) -}} +```yaml +version: 2 -with +models: + - name: [] + config: + [materialized](/reference/resource-configs/materialized): dynamic_table + [on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail + [target_lag](#target-lag): downstream | + [snowflake_warehouse](#configuring-virtual-warehouses): + [refresh_mode](#refresh-mode): AUTO | FULL | INCREMENTAL + [initialize](#initialize): ON_CREATE | ON_SCHEDULE -aggregated_page_events as ( +``` - select - session_id, - min(event_time) as session_start, - max(event_time) as session_end, - count(*) as count_page_views - from {{ source('snowplow', 'event') }} - group by 1 + -), + -index_sessions as ( - select - *, - row_number() over ( - partition by session_id - order by session_start - ) as page_view_in_session_index - from aggregated_page_events + -) + + +```jinja + +{{ config( + [materialized](/reference/resource-configs/materialized)="dynamic_table", + [on_configuration_change](/reference/resource-configs/on_configuration_change)="apply" | "continue" | "fail", + [target_lag](#target-lag)="downstream" | " seconds | minutes | hours | days", + [snowflake_warehouse](#configuring-virtual-warehouses)="", + [refresh_mode](#refresh-mode)="AUTO" | "FULL" | "INCREMENTAL", + [initialize](#initialize)="ON_CREATE" | "ON_SCHEDULE", + +) }} -select * from index_sessions ``` + + -## Copying grants + -When the `copy_grants` config is set to `true`, dbt will add the `copy grants` qualifier when rebuilding tables and views. The default value is `false`. +Learn more about these parameters in Snowflake's [docs](https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table): - +### Target lag -```yaml -models: - +copy_grants: true -``` +Snowflake allows two configuration scenarios for scheduling automatic refreshes: +- **Time-based** — Provide a value of the form ` { seconds | minutes | hours | days }`. For example, if the dynamic table needs to be updated every 30 minutes, use `target_lag='30 minutes'`. +- **Downstream** — Applicable when the dynamic table is referenced by other dynamic tables. In this scenario, `target_lag='downstream'` allows for refreshes to be controlled at the target, instead of at each layer. - +Learn more about `target_lag` in Snowflake's [docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-refresh#understanding-target-lag). Please note that Snowflake supports a target lag of 1 minute or longer. -## Secure views + -To create a Snowflake [secure view](https://docs.snowflake.net/manuals/user-guide/views-secure.html), use the `secure` config for view models. Secure views can be used to limit access to sensitive data. Note: secure views may incur a performance penalty, so you should only use them if you need them. +### Refresh mode -The following example configures the models in the `sensitive/` folder to be configured as secure views. +Snowflake allows three options for refresh mode: +- **AUTO** — Enforces an incremental refresh of the dynamic table by default. If the `CREATE DYNAMIC TABLE` statement does not support the incremental refresh mode, the dynamic table is automatically created with the full refresh mode. +- **FULL** — Enforces a full refresh of the dynamic table, even if the dynamic table can be incrementally refreshed. +- **INCREMENTAL** — Enforces an incremental refresh of the dynamic table. 
If the query that underlies the dynamic table can’t perform an incremental refresh, dynamic table creation fails and displays an error message. - +Learn more about `refresh_mode` in [Snowflake's docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-refresh). -```yaml -name: my_project -version: 1.0.0 +### Initialize -models: - my_project: - sensitive: - +materialized: view - +secure: true -``` +Snowflake allows two options for initialize: +- **ON_CREATE** — Refreshes the dynamic table synchronously at creation. If this refresh fails, dynamic table creation fails and displays an error message. +- **ON_SCHEDULE** — Refreshes the dynamic table at the next scheduled refresh. - +Learn more about `initialize` in [Snowflake's docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-refresh). + + + +### Limitations + +As with materialized views on most data platforms, there are limitations associated with dynamic tables. Some worth noting include: + +- Dynamic table SQL has a [limited feature set](https://docs.snowflake.com/en/user-guide/dynamic-tables-tasks-create#query-constructs-not-currently-supported-in-dynamic-tables). +- Dynamic table SQL cannot be updated; the dynamic table must go through a `--full-refresh` (DROP/CREATE). +- Dynamic tables cannot be downstream from: materialized views, external tables, streams. +- Dynamic tables cannot reference a view that is downstream from another dynamic table. + +Find more information about dynamic table limitations in Snowflake's [docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-tasks-create#dynamic-table-limitations-and-supported-functions). + +For dbt limitations, these dbt features are not supported: +- [Model contracts](/docs/collaborate/govern/model-contracts) +- [Copy grants configuration](/reference/resource-configs/snowflake-configs#copying-grants) ## Temporary tables @@ -338,270 +376,297 @@ In the configuration format for the model SQL file: -## Dynamic tables -The Snowflake adapter supports [dynamic tables](https://docs.snowflake.com/en/user-guide/dynamic-tables-about). -This materialization is specific to Snowflake, which means that any model configuration that -would normally come along for the ride from `dbt-core` (e.g. as with a `view`) may not be available -for dynamic tables. This gap will decrease in future patches and versions. -While this materialization is specific to Snowflake, it very much follows the implementation -of [materialized views](/docs/build/materializations#Materialized-View). -In particular, dynamic tables have access to the `on_configuration_change` setting. -Dynamic tables are supported with the following configuration parameters: +## Transient tables - +Snowflake supports the creation of [transient tables](https://docs.snowflake.net/manuals/user-guide/tables-temp-transient.html). Snowflake does not preserve a history for these tables, which can result in a measurable reduction of your Snowflake storage costs. Transient tables participate in time travel to a limited degree with a retention period of 1 day by default with no fail-safe period. Weigh these tradeoffs when deciding whether or not to configure your dbt models as `transient`. 
**By default, all Snowflake tables created by dbt are `transient`.** -| Parameter | Type | Required | Default | Change Monitoring Support | -|--------------------|------------|----------|-------------|---------------------------| -| [`on_configuration_change`](/reference/resource-configs/on_configuration_change) | `` | no | `apply` | n/a | -| [`target_lag`](#target-lag) | `` | yes | | alter | -| [`snowflake_warehouse`](#configuring-virtual-warehouses) | `` | yes | | alter | - +### Configuring transient tables in dbt_project.yml - +A whole folder (or package) can be configured to be transient (or not) by adding a line to the `dbt_project.yml` file. This config works just like all of the [model configs](/reference/model-configs) defined in `dbt_project.yml`. -| Parameter | Type | Required | Default | Change Monitoring Support | -|--------------------|------------|----------|-------------|---------------------------| -| [`on_configuration_change`](/reference/resource-configs/on_configuration_change) | `` | no | `apply` | n/a | -| [`target_lag`](#target-lag) | `` | yes | | alter | -| [`snowflake_warehouse`](#configuring-virtual-warehouses) | `` | yes | | alter | -| [`refresh_mode`](#refresh-mode) | `` | no | `AUTO` | refresh | -| [`initialize`](#initialize) | `` | no | `ON_CREATE` | n/a | + - +```yaml +name: my_project - +... - +models: + +transient: false + my_project: + ... +``` - + + +### Configuring transience for a specific model + +A specific model can be configured to be transient by setting the `transient` model config to `true`. + + + +```sql +{{ config(materialized='table', transient=true) }} + +select * from ... +``` + + + +## Query tags + +[Query tags](https://docs.snowflake.com/en/sql-reference/parameters.html#query-tag) are a Snowflake +parameter that can be quite useful later on when searching in the [QUERY_HISTORY view](https://docs.snowflake.com/en/sql-reference/account-usage/query_history.html). + +dbt supports setting a default query tag for the duration of its Snowflake connections in +[your profile](/docs/core/connect-data-platform/snowflake-setup). You can set more precise values (and override the default) for subsets of models by setting +a `query_tag` model config or by overriding the default `set_query_tag` macro: ```yaml models: [](/reference/resource-configs/resource-path): - [+](/reference/resource-configs/plus-prefix)[materialized](/reference/resource-configs/materialized): dynamic_table - [+](/reference/resource-configs/plus-prefix)[on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail - [+](/reference/resource-configs/plus-prefix)[target_lag](#target-lag): downstream | - [+](/reference/resource-configs/plus-prefix)[snowflake_warehouse](#configuring-virtual-warehouses): + +query_tag: dbt_special ``` - - + - +```sql +{{ config( + query_tag = 'dbt_special' +) }} - +select ... -```yaml -version: 2 +``` + +In this example, you can set up a query tag to be applied to every query with the model's name. + +```sql -models: - - name: [] - config: - [materialized](/reference/resource-configs/materialized): dynamic_table - [on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail - [target_lag](#target-lag): downstream | - [snowflake_warehouse](#configuring-virtual-warehouses): + {% macro set_query_tag() -%} + {% set new_query_tag = model.name %} + {% if new_query_tag %} + {% set original_query_tag = get_current_query_tag() %} + {{ log("Setting query_tag to '" ~ new_query_tag ~ "'. 
Will reset to '" ~ original_query_tag ~ "' after materialization.") }} + {% do run_query("alter session set query_tag = '{}'".format(new_query_tag)) %} + {{ return(original_query_tag)}} + {% endif %} + {{ return(none)}} +{% endmacro %} ``` +**Note:** query tags are set at the _session_ level. At the start of each model , if the model has a custom `query_tag` configured, dbt will run `alter session set query_tag` to set the new value. At the end of the materialization, dbt will run another `alter` statement to reset the tag to its default value. As such, build failures midway through a materialization may result in subsequent queries running with an incorrect tag. + - +## Merge behavior (incremental models) +The [`incremental_strategy` config](/docs/build/incremental-strategy) controls how dbt builds incremental models. By default, dbt will use a [merge statement](https://docs.snowflake.net/manuals/sql-reference/sql/merge.html) on Snowflake to refresh incremental tables. - +Snowflake's `merge` statement fails with a "nondeterministic merge" error if the `unique_key` specified in your model config is not actually unique. If you encounter this error, you can instruct dbt to use a two-step incremental approach by setting the `incremental_strategy` config for your model to `delete+insert`. - +## Configuring table clustering -```jinja +dbt supports [table clustering](https://docs.snowflake.net/manuals/user-guide/tables-clustering-keys.html) on Snowflake. To control clustering for a or incremental model, use the `cluster_by` config. When this configuration is applied, dbt will do two things: -{{ config( - [materialized](/reference/resource-configs/materialized)="dynamic_table", - [on_configuration_change](/reference/resource-configs/on_configuration_change)="apply" | "continue" | "fail", - [target_lag](#target-lag)="downstream" | " seconds | minutes | hours | days", - [snowflake_warehouse](#configuring-virtual-warehouses)="", +1. It will implicitly order the table results by the specified `cluster_by` fields +2. It will add the specified clustering keys to the target table -) }} +By using the specified `cluster_by` fields to order the table, dbt minimizes the amount of work required by Snowflake's automatic clustering functionality. If an incremental model is configured to use table clustering, then dbt will also order the staged dataset before merging it into the destination table. As such, the dbt-managed table should always be in a mostly clustered state. -``` +### Using cluster_by - +The `cluster_by` config accepts either a string, or a list of strings to use as clustering keys. The following example will create a sessions table that is clustered by the `session_start` column. 
- + - +```sql +{{ + config( + materialized='table', + cluster_by=['session_start'] + ) +}} - +select + session_id, + min(event_time) as session_start, + max(event_time) as session_end, + count(*) as count_pageviews - +from {{ source('snowplow', 'event') }} +group by 1 +``` - + - +The code above will be compiled to SQL that looks (approximately) like this: - +```sql +create or replace table my_database.my_schema.my_table as ( -```yaml -models: - [](/reference/resource-configs/resource-path): - [+](/reference/resource-configs/plus-prefix)[materialized](/reference/resource-configs/materialized): dynamic_table - [+](/reference/resource-configs/plus-prefix)[on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail - [+](/reference/resource-configs/plus-prefix)[target_lag](#target-lag): downstream | - [+](/reference/resource-configs/plus-prefix)[snowflake_warehouse](#configuring-virtual-warehouses): - [+](/reference/resource-configs/plus-prefix)[refresh_mode](#refresh-mode): AUTO | FULL | INCREMENTAL - [+](/reference/resource-configs/plus-prefix)[initialize](#initialize): ON_CREATE | ON_SCHEDULE + select * from ( + select + session_id, + min(event_time) as session_start, + max(event_time) as session_end, + count(*) as count_pageviews + + from {{ source('snowplow', 'event') }} + group by 1 + ) + + -- this order by is added by dbt in order to create the + -- table in an already-clustered manner. + order by session_start + +); + alter table my_database.my_schema.my_table cluster by (session_start); ``` - +### Automatic clustering - +Automatic clustering is [enabled by default in Snowflake today](https://docs.snowflake.com/en/user-guide/tables-auto-reclustering.html), no action is needed to make use of it. Though there is an `automatic_clustering` config, it has no effect except for accounts with (deprecated) manual clustering enabled. +If [manual clustering is still enabled for your account](https://docs.snowflake.com/en/user-guide/tables-clustering-manual.html), you can use the `automatic_clustering` config to control whether or not automatic clustering is enabled for dbt models. When `automatic_clustering` is set to `true`, dbt will run an `alter table
resume recluster` query after building the target table. - +The `automatic_clustering` config can be specified in the `dbt_project.yml` file, or in a model `config()` block. - + ```yaml -version: 2 - models: - - name: [] - config: - [materialized](/reference/resource-configs/materialized): dynamic_table - [on_configuration_change](/reference/resource-configs/on_configuration_change): apply | continue | fail - [target_lag](#target-lag): downstream | - [snowflake_warehouse](#configuring-virtual-warehouses): - [refresh_mode](#refresh-mode): AUTO | FULL | INCREMENTAL - [initialize](#initialize): ON_CREATE | ON_SCHEDULE - + +automatic_clustering: true ``` - +## Configuring virtual warehouses +The default warehouse that dbt uses can be configured in your [Profile](/docs/core/connect-data-platform/profiles.yml) for Snowflake connections. To override the warehouse that is used for specific models (or groups of models), use the `snowflake_warehouse` model configuration. This configuration can be used to specify a larger warehouse for certain models in order to control Snowflake costs and project build times. - + - + -```jinja +The example config below changes the warehouse for a group of models with a config argument in the yml. -{{ config( - [materialized](/reference/resource-configs/materialized)="dynamic_table", - [on_configuration_change](/reference/resource-configs/on_configuration_change)="apply" | "continue" | "fail", - [target_lag](#target-lag)="downstream" | " seconds | minutes | hours | days", - [snowflake_warehouse](#configuring-virtual-warehouses)="", - [refresh_mode](#refresh-mode)="AUTO" | "FULL" | "INCREMENTAL", - [initialize](#initialize)="ON_CREATE" | "ON_SCHEDULE", + -) }} +```yaml +name: my_project +version: 1.0.0 + +... + +models: + +snowflake_warehouse: "EXTRA_SMALL" # use the `EXTRA_SMALL` warehouse for all models in the project... + my_project: + clickstream: + +snowflake_warehouse: "EXTRA_LARGE" # ...except for the models in the `clickstream` folder, which will use the `EXTRA_LARGE` warehouse. +snapshots: + +snowflake_warehouse: "EXTRA_LARGE" # all Snapshot models are configured to use the `EXTRA_LARGE` warehouse. ``` - - - - - -Learn more about these parameters in Snowflake's [docs](https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table): + -### Target lag +The example config below changes the warehouse for a single model with a config() block in the sql model. -Snowflake allows two configuration scenarios for scheduling automatic refreshes: -- **Time-based** — Provide a value of the form ` { seconds | minutes | hours | days }`. For example, if the dynamic table needs to be updated every 30 minutes, use `target_lag='30 minutes'`. -- **Downstream** — Applicable when the dynamic table is referenced by other dynamic tables. In this scenario, `target_lag='downstream'` allows for refreshes to be controlled at the target, instead of at each layer. + -Learn more about `target_lag` in Snowflake's [docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-refresh#understanding-target-lag). +```sql +{{ + config( + materialized='table', + snowflake_warehouse='EXTRA_LARGE' + ) +}} - +with -### Refresh mode +aggregated_page_events as ( -Snowflake allows three options for refresh mode: -- **AUTO** — Enforces an incremental refresh of the dynamic table by default. If the `CREATE DYNAMIC TABLE` statement does not support the incremental refresh mode, the dynamic table is automatically created with the full refresh mode. 
-- **FULL** — Enforces a full refresh of the dynamic table, even if the dynamic table can be incrementally refreshed. -- **INCREMENTAL** — Enforces an incremental refresh of the dynamic table. If the query that underlies the dynamic table can’t perform an incremental refresh, dynamic table creation fails and displays an error message. + select + session_id, + min(event_time) as session_start, + max(event_time) as session_end, + count(*) as count_page_views + from {{ source('snowplow', 'event') }} + group by 1 -Learn more about `refresh_mode` in [Snowflake's docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-refresh). +), -### Initialize +index_sessions as ( -Snowflake allows two options for initialize: -- **ON_CREATE** — Refreshes the dynamic table synchronously at creation. If this refresh fails, dynamic table creation fails and displays an error message. -- **ON_SCHEDULE** — Refreshes the dynamic table at the next scheduled refresh. + select + *, + row_number() over ( + partition by session_id + order by session_start + ) as page_view_in_session_index + from aggregated_page_events -Learn more about `initialize` in [Snowflake's docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-refresh). +) - +select * from index_sessions +``` -### Limitations + + + -As with materialized views on most data platforms, there are limitations associated with dynamic tables. Some worth noting include: +## Copying grants -- Dynamic table SQL has a [limited feature set](https://docs.snowflake.com/en/user-guide/dynamic-tables-tasks-create#query-constructs-not-currently-supported-in-dynamic-tables). -- Dynamic table SQL cannot be updated; the dynamic table must go through a `--full-refresh` (DROP/CREATE). -- Dynamic tables cannot be downstream from: materialized views, external tables, streams. -- Dynamic tables cannot reference a view that is downstream from another dynamic table. +When the `copy_grants` config is set to `true`, dbt will add the `copy grants` qualifier when rebuilding tables and views. The default value is `false`. -Find more information about dynamic table limitations in Snowflake's [docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-tasks-create#dynamic-table-limitations-and-supported-functions). + -For dbt limitations, these dbt features are not supported: -- [Model contracts](/docs/collaborate/govern/model-contracts) -- [Copy grants configuration](/reference/resource-configs/snowflake-configs#copying-grants) +```yaml +models: + +copy_grants: true +``` - + -#### Changing materialization to and from "dynamic_table" +## Secure views -Version `1.6.x` does not support altering the materialization from a non-dynamic table be a dynamic table and vice versa. -Re-running with the `--full-refresh` does not resolve this either. -The workaround is manually dropping the existing model in the warehouse prior to calling `dbt run`. -This only needs to be done once for the conversion. +To create a Snowflake [secure view](https://docs.snowflake.net/manuals/user-guide/views-secure.html), use the `secure` config for view models. Secure views can be used to limit access to sensitive data. Note: secure views may incur a performance penalty, so you should only use them if you need them. -For example, assume for the example model below, `my_model`, has already been materialized to the underlying data platform via `dbt run`. -If the model config is updated to `materialized="dynamic_table"`, dbt will return an error. 
-The workaround is to execute `DROP TABLE my_model` on the data warehouse before trying the model again. +The following example configures the models in the `sensitive/` folder to be configured as secure views. - + ```yaml +name: my_project +version: 1.0.0 -{{ config( - materialized="table" # or any model type (e.g. view, incremental) -) }} - +models: + my_project: + sensitive: + +materialized: view + +secure: true ``` - ## Source freshness known limitation diff --git a/website/docs/reference/resource-configs/strategy.md b/website/docs/reference/resource-configs/strategy.md index f1493c7485f..e2b2cac1c59 100644 --- a/website/docs/reference/resource-configs/strategy.md +++ b/website/docs/reference/resource-configs/strategy.md @@ -4,6 +4,13 @@ description: "Strategy - Read this in-depth guide to learn about configurations datatype: timestamp | check --- + + +import SnapshotYaml from '/snippets/_snapshot-yaml-spec.md'; + + + + + + + + + ```yaml + snapshots: + - [name: snapshot_name](/reference/resource-configs/snapshot_name): + relation: source('my_source', 'my_table') + config: + strategy: timestamp + updated_at: column_name + ``` + + + + + ```jinja2 @@ -30,6 +54,7 @@ select ... ``` + @@ -47,6 +72,22 @@ snapshots: + + + + + ```yaml + snapshots: + - [name: snapshot_name](/reference/resource-configs/snapshot_name): + relation: source('my_source', 'my_table') + config: + strategy: check + check_cols: [column_name] | "all" + ``` + + + + ```jinja2 @@ -62,6 +103,7 @@ snapshots: ``` + @@ -88,7 +130,25 @@ This is a **required configuration**. There is no default value. ## Examples ### Use the timestamp strategy + + + +```yaml +snapshots: + - name: orders_snapshot_timestamp + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + strategy: timestamp + unique_key: id + updated_at: updated_at + +``` + + + + ```sql @@ -109,9 +169,31 @@ This is a **required configuration**. There is no default value. ``` + -### Use the check_cols strategy +### Use the check strategy + + + + +```yaml +snapshots: + - name: orders_snapshot_check + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + unique_key: id + strategy: check + check_cols: + - status + - is_cancelled + +``` + + + + ```sql {% snapshot orders_snapshot_check %} @@ -129,6 +211,7 @@ This is a **required configuration**. There is no default value. {% endsnapshot %} ``` + ### Advanced: define and use custom snapshot strategy Behind the scenes, snapshot strategies are implemented as macros, named `snapshot__strategy` @@ -140,6 +223,23 @@ It's possible to implement your own snapshot strategy by adding a macro with the 1. Create a macro named `snapshot_timestamp_with_deletes_strategy`. Use the existing code as a guide and adjust as needed. 2. 
Use this strategy via the `strategy` configuration: + + + +```yaml +snapshots: + - name: my_custom_snapshot + relation: source('my_source', 'my_table') + config: + strategy: timestamp_with_deletes + updated_at: updated_at_column + unique_key: id +``` + + + + + ```jinja2 @@ -155,3 +255,4 @@ It's possible to implement your own snapshot strategy by adding a macro with the ``` + diff --git a/website/docs/reference/resource-configs/target_schema.md b/website/docs/reference/resource-configs/target_schema.md index 893686a7513..ffa95df9be7 100644 --- a/website/docs/reference/resource-configs/target_schema.md +++ b/website/docs/reference/resource-configs/target_schema.md @@ -4,9 +4,9 @@ description: "Target_schema - Read this in-depth guide to learn about configurat datatype: string --- -:::note +:::info -For [versionless](/docs/dbt-versions/core-upgrade/upgrading-to-v1.8#versionless) dbt Cloud accounts and dbt Core v1.9+, this functionality is no longer required. Use the [schema](/reference/resource-configs/schema) config as an alternative to define a custom schema while still respecting the `generate_schema_name` macro. +For [versionless](/docs/dbt-versions/core-upgrade/upgrading-to-v1.8#versionless) dbt Cloud accounts and dbt Core v1.9+, this configuration is no longer required. Use the [schema](/reference/resource-configs/schema) config as an alternative to define a custom schema while still respecting the `generate_schema_name` macro. ::: @@ -33,12 +33,14 @@ snapshots: ## Description -The schema that dbt should build a [snapshot](/docs/build/snapshots) into. Snapshots build into the same `target_schema`, no matter who is running them. +The schema that dbt should build a [snapshot](/docs/build/snapshots) into. When `target_schema` is provided, snapshots build into the same `target_schema`, no matter who is running them. On **BigQuery**, this is analogous to a `dataset`. ## Default -This is a **required** parameter, no default is provided. + +This is a required parameter, no default is provided. +For versionless dbt Cloud accounts and dbt Core v1.9+, this is not a required parameter. ## Examples ### Build all snapshots in a schema named `snapshots` @@ -53,38 +55,10 @@ snapshots: -### Use a target-aware schema -Use the [`{{ target }}` variable](/reference/dbt-jinja-functions/target) to change which schema a snapshot is built in. - -Note: consider whether this use-case is right for you, as downstream `refs` will select from the `dev` version of a snapshot, which can make it hard to validate models that depend on snapshots (see above [FAQ](#faqs)) - - - -```yml -snapshots: - +target_schema: "{% if target.name == 'prod' %}snapshots{% else %}{{ target.schema }}{% endif %}" - -``` - - + ### Use the same schema-naming behavior as models -Leverage the [`generate_schema_name` macro](/docs/build/custom-schemas) to build snapshots in schemas that follow the same naming behavior as your models. - -Notes: -* This macro is not available when configuring from the `dbt_project.yml` file, so must be configured in a snapshot config block. -* Consider whether this use-case is right for you, as downstream `refs` will select from the `dev` version of a snapshot, which can make it hard to validate models that depend on snapshots (see above [FAQ](#faqs)) - +For native support of environment-aware snapshots, upgrade to dbt Core version 1.9+ and remove any existing `target_schema` configuration. 
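As a rough sketch of that replacement (the source, column, and schema names below are placeholders, not part of this PR), a snapshot defined in YAML on dbt Core v1.9+ or versionless dbt Cloud can use the `schema` config where `target_schema` used to go:

```yaml
snapshots:
  - name: orders_snapshot
    relation: source('jaffle_shop', 'orders')
    config:
      schema: snapshots        # replaces target_schema and still respects generate_schema_name
      unique_key: id
      strategy: timestamp
      updated_at: updated_at
```

Because `schema` is resolved through the `generate_schema_name` macro, the snapshot builds into an environment-aware schema without any Jinja conditionals on `target.name`.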
- - -```sql -{{ - config( - target_schema=generate_schema_name('snapshots') - ) -}} -``` - - + \ No newline at end of file diff --git a/website/docs/reference/resource-configs/unique_key.md b/website/docs/reference/resource-configs/unique_key.md index 9ad3417fd5e..996e7148292 100644 --- a/website/docs/reference/resource-configs/unique_key.md +++ b/website/docs/reference/resource-configs/unique_key.md @@ -4,6 +4,29 @@ description: "Unique_key - Read this in-depth guide to learn about configuration datatype: column_name_or_expression --- + + + + + +```yaml +snapshots: + - name: orders_snapshot + relation: source('my_source', 'my_table') + [config](/reference/snapshot-configs): + unique_key: id + +``` + + + + + + +import SnapshotYaml from '/snippets/_snapshot-yaml-spec.md'; + + + ```jinja2 @@ -12,8 +35,8 @@ datatype: column_name_or_expression ) }} ``` - + @@ -29,6 +52,8 @@ snapshots: ## Description A column name or expression that is unique for the inputs of a snapshot. dbt uses this to match records between a result set and an existing snapshot, so that changes can be captured correctly. +In Versionless and dbt v1.9 and later, [snapshots](/docs/build/snapshots) are defined and configured in YAML files within your `snapshots/` directory. The `unique_key` is specified within the `config` block of your snapshot YAML file. + :::caution Providing a non-unique key will result in unexpected snapshot results. dbt **will not** test the uniqueness of this key, consider [testing](/blog/primary-key-testing#how-to-test-primary-keys-with-dbt) the source data to ensure that this key is indeed unique. @@ -41,6 +66,26 @@ This is a **required parameter**. No default is provided. ## Examples ### Use an `id` column as a unique key + + + + + +```yaml +snapshots: + - name: orders_snapshot + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + unique_key: id + strategy: timestamp + updated_at: updated_at + +``` + + + + ```jinja2 @@ -55,7 +100,9 @@ This is a **required parameter**. No default is provided. You can also write this in yaml. This might be a good idea if multiple snapshots share the same `unique_key` (though we prefer to apply this configuration in a config block, as above). + +You can also specify configurations in your `dbt_project.yml` file if multiple snapshots share the same `unique_key`: ```yml @@ -70,6 +117,25 @@ snapshots: ### Use a combination of two columns as a unique key This configuration accepts a valid column expression. As such, you can concatenate two columns together as a unique key if required. It's a good idea to use a separator (e.g. `'-'`) to ensure uniqueness. 
+ + + + +```yaml +snapshots: + - name: transaction_items_snapshot + relation: source('erp', 'transactions') + config: + schema: snapshots + unique_key: "transaction_id || '-' || line_item_id" + strategy: timestamp + updated_at: updated_at + +``` + + + + @@ -93,10 +159,45 @@ from {{ source('erp', 'transactions') }} ``` + Though, it's probably a better idea to construct this column in your query and use that as the `unique_key`: + + + + +```yaml +snapshots: + - name: transaction_items_snapshot + relation: {{ ref('transaction_items_ephemeral') }} + config: + schema: snapshots + unique_key: id + strategy: timestamp + updated_at: updated_at +``` + + + + +```sql +{{ config(materialized='ephemeral') }} + +select + transaction_id || '-' || line_item_id as id, + * +from {{ source('erp', 'transactions') }} + +``` + + + +In this example, we create an ephemeral model `transaction_items_ephemeral` that creates an `id` column that can be used as the `unique_key` our snapshot configuration. + + + ```jinja2 @@ -121,3 +222,4 @@ from {{ source('erp', 'transactions') }} ``` + diff --git a/website/docs/reference/resource-configs/updated_at.md b/website/docs/reference/resource-configs/updated_at.md index 896405bf063..09122859e43 100644 --- a/website/docs/reference/resource-configs/updated_at.md +++ b/website/docs/reference/resource-configs/updated_at.md @@ -3,6 +3,29 @@ resource_types: [snapshots] description: "Updated_at - Read this in-depth guide to learn about configurations in dbt." datatype: column_name --- + + + + + + +```yaml +snapshots: + - name: snapshot + relation: source('my_source', 'my_table') + [config](/reference/snapshot-configs): + strategy: timestamp + updated_at: column_name +``` + + + + + +import SnapshotYaml from '/snippets/_snapshot-yaml-spec.md'; + + + ```jinja2 @@ -14,6 +37,7 @@ datatype: column_name ``` + @@ -27,6 +51,16 @@ snapshots: + + +:::caution + +You will get a warning if the data type of the `updated_at` column does not match the adapter-configured default. + +::: + + + ## Description A column within the results of your snapshot query that represents when the record row was last updated. @@ -39,6 +73,25 @@ No default is provided. ## Examples ### Use a column name `updated_at` + + + + +```yaml +snapshots: + - name: orders_snapshot + relation: source('jaffle_shop', 'orders') + config: + schema: snapshots + unique_key: id + strategy: timestamp + updated_at: updated_at + +``` + + + + ```sql @@ -61,12 +114,55 @@ select * from {{ source('jaffle_shop', 'orders') }} ``` + ### Coalesce two columns to create a reliable `updated_at` column Consider a data source that only has an `updated_at` column filled in when a record is updated (so a `null` value indicates that the record hasn't been updated after it was created). Since the `updated_at` configuration only takes a column name, rather than an expression, you should update your snapshot query to include the coalesced column. + + + +1. Create an staging model to perform the transformation. + In your `models/` directory, create a SQL file that configures an staging model to coalesce the `updated_at` and `created_at` columns into a new column `updated_at_for_snapshot`. + + + + ```sql + select * coalesce (updated_at, created_at) as updated_at_for_snapshot + from {{ source('jaffle_shop', 'orders') }} + + ``` + + +2. Define the snapshot configuration in a YAML file. + In your `snapshots/` directory, create a YAML file that defines your snapshot and references the `updated_at_for_snapshot` staging model you just created. 
+ + + + ```yaml + snapshots: + - name: orders_snapshot + relation: ref('staging_orders') + config: + schema: snapshots + unique_key: id + strategy: timestamp + updated_at: updated_at_for_snapshot + + ``` + + +3. Run `dbt snapshot` to execute the snapshot. + +Alternatively, you can also create an ephemeral model to performs the required transformations. Then, you reference this model in your snapshot's `relation` key. + + + + + + ```sql @@ -93,3 +189,4 @@ from {{ source('jaffle_shop', 'orders') }} ``` + diff --git a/website/docs/reference/resource-properties/config.md b/website/docs/reference/resource-properties/config.md index 8190c7dd8ca..1e1867dda04 100644 --- a/website/docs/reference/resource-properties/config.md +++ b/website/docs/reference/resource-properties/config.md @@ -170,14 +170,6 @@ exposures: - - -Support for the `config` property on `semantic_models` was added in dbt Core v1.7 - - - - - ```yml @@ -193,20 +185,10 @@ semantic_models: - - - - -Support for the `config` property on `saved queries` was added in dbt Core v1.7. - - - - - ```yml @@ -226,8 +208,6 @@ saved-queries: - - diff --git a/website/docs/reference/resource-properties/constraints.md b/website/docs/reference/resource-properties/constraints.md index b8111ef0adb..63582974040 100644 --- a/website/docs/reference/resource-properties/constraints.md +++ b/website/docs/reference/resource-properties/constraints.md @@ -15,13 +15,21 @@ Constraints require the declaration and enforcement of a model [contract](/refer Constraints may be defined for a single column, or at the model level for one or more columns. As a general rule, we recommend defining single-column constraints directly on those columns. -If you are defining multiple `primary_key` constraints for a single model, those _must_ be defined at the model level. Defining multiple `primary_key` constraints at the column level is not supported. +If you define multiple `primary_key` constraints for a single model, those _must_ be defined at the model level. Defining multiple `primary_key` constraints at the column level is not supported. The structure of a constraint is: - `type` (required): one of `not_null`, `unique`, `primary_key`, `foreign_key`, `check`, `custom` - `expression`: Free text input to qualify the constraint. Required for certain constraint types, and optional for others. - `name` (optional): Human-friendly name for this constraint. Supported by some data platforms. -- `columns` (model-level only): List of column names to apply the constraint over +- `columns` (model-level only): List of column names to apply the constraint over. + + + +Foreign key constraints accept two additional inputs: +- `to`: A relation input, likely `ref()`, indicating the referenced table. +- `to_columns`: A list of column(s) in that table containing the corresponding primary or unique key. + +This syntax for defining foreign keys uses `ref`, meaning it will capture dependencies and works across different environments. It's available in [dbt Cloud Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) and versions of dbt Core starting with v1.9. @@ -31,42 +39,88 @@ models: # required config: - contract: - enforced: true + contract: {enforced: true} + + # model-level constraints + constraints: + - type: primary_key + columns: [first_column, second_column, ...] + - type: foreign_key # multi_column + columns: [first_column, second_column, ...] + to: ref('other_model_name') + to_columns: [other_model_first_column, other_model_second_columns, ...] 
+ - type: check + columns: [first_column, second_column, ...] + expression: "first_column != second_column" + name: human_friendly_name + - type: ... + + columns: + - name: first_column + data_type: string + + # column-level constraints + constraints: + - type: not_null + - type: unique + - type: foreign_key + to: ref('other_model_name') + to_columns: other_model_column + - type: ... +``` + + + + + + +In older versions of dbt Core, when defining a `foreign_key` constraint, you need to manually specify the referenced table in the `expression` field. You can use `{{ target }}` variables to make this expression environment-aware, but the dependency between this model and the referenced table is not captured. Starting in dbt Core v1.9, you can specify the referenced table using the `ref()` function. + + + +```yml +models: + - name: + + # required + config: + contract: {enforced: true} # model-level constraints constraints: - type: primary_key - columns: [FIRST_COLUMN, SECOND_COLUMN, ...] - - type: FOREIGN_KEY # multi_column - columns: [FIRST_COLUMN, SECOND_COLUMN, ...] - expression: "OTHER_MODEL_SCHEMA.OTHER_MODEL_NAME (OTHER_MODEL_FIRST_COLUMN, OTHER_MODEL_SECOND_COLUMN, ...)" + columns: [first_column, second_column, ...] + - type: foreign_key # multi_column + columns: [first_column, second_column, ...] + expression: "{{ target.schema }}.other_model_name (other_model_first_column, other_model_second_column, ...)" - type: check - columns: [FIRST_COLUMN, SECOND_COLUMN, ...] - expression: "FIRST_COLUMN != SECOND_COLUMN" - name: HUMAN_FRIENDLY_NAME + columns: [first_column, second_column, ...] + expression: "first_column != second_column" + name: human_friendly_name - type: ... columns: - - name: FIRST_COLUMN - data_type: DATA_TYPE + - name: first_column + data_type: string # column-level constraints constraints: - type: not_null - type: unique - type: foreign_key - expression: OTHER_MODEL_SCHEMA.OTHER_MODEL_NAME (OTHER_MODEL_COLUMN) + expression: "{{ target.schema }}.other_model_name (other_model_column)" - type: ... ``` + + ## Platform-specific support In transactional databases, it is possible to define "constraints" on the allowed values of certain columns, stricter than just the data type of those values. For example, Postgres supports and enforces all the constraints in the ANSI SQL standard (`not null`, `unique`, `primary key`, `foreign key`), plus a flexible row-level `check` constraint that evaluates to a boolean expression. -Most analytical data platforms support and enforce a `not null` constraint, but they either do not support or do not enforce the rest. It is sometimes still desirable to add an "informational" constraint, knowing it is _not_ enforced, for the purpose of integrating with legacy data catalog or entity-relation diagram tools ([dbt-core#3295](https://github.com/dbt-labs/dbt-core/issues/3295)). +Most analytical data platforms support and enforce a `not null` constraint, but they either do not support or do not enforce the rest. It is sometimes still desirable to add an "informational" constraint, knowing it is _not_ enforced, for the purpose of integrating with legacy data catalog or entity-relation diagram tools ([dbt-core#3295](https://github.com/dbt-labs/dbt-core/issues/3295)). Some data platforms can optionally use primary or foreign key constraints for query optimization if you specify an additional keyword. 
To that end, there are two optional fields you can specify on any filter: - `warn_unenforced: False` to skip warning on constraints that are supported, but not enforced, by this data platform. The constraint will be included in templated DDL. @@ -234,7 +288,7 @@ select Snowflake suppports four types of constraints: `unique`, `not null`, `primary key`, and `foreign key`. It is important to note that only the `not null` (and the `not null` property of `primary key`) are actually checked at present. -The rest of the constraints are purely metadata, not verified when inserting data. +The rest of the constraints are purely metadata, not verified when inserting data. Although Snowflake does not validate `unique`, `primary`, or `foreign_key` constraints, you may optionally instruct Snowflake to use them for query optimization by specifying [`rely`](https://docs.snowflake.com/en/user-guide/join-elimination) in the constraint `expression` field. Currently, Snowflake doesn't support the `check` syntax and dbt will skip the `check` config and raise a warning message if it is set on some models in the dbt project. @@ -518,3 +572,73 @@ alter table schema_name.my_model add constraint 472394792387497234 check (id > 0 + +## Custom constraints + +In dbt Cloud and dbt Core, you can use custom constraints on models for the advanced configuration of tables. Different data warehouses support different syntax and capabilities. + +Custom constraints allow you to add configuration to specific columns. For example: + + - Set [masking policies](https://docs.snowflake.com/en/user-guide/security-column-intro#what-are-masking-policies) in Snowflake when using a Create Table As Select (CTAS). + + - Other data warehouses (such as [Databricks](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html) and [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#column_name_and_column_schema) have their own set of parameters that can be set for columns in their CTAS statements. + + +You can implement constraints in a couple of different ways: + + + +Here's an example of how to implement tag-based masking policies with contracts and constraints using the following syntax: + + + +```yaml + +models: + - name: my_model + config: + contract: + enforced: true + materialized: table + columns: + - name: id + data_type: int + constraints: + - type: custom + expression: "tag (my_tag = 'my_value')" # A custom SQL expression used to enforce a specific constraint on a column. + +``` + + + +Using this syntax requires configuring all the columns and their types as it’s the only way to send a create or replace ` mytable as ...`. It’s not possible to do it with just a partial list of columns. This means making sure the columns and constraints fields are fully defined. + +To generate a YAML with all the columns, you can use `generate_model_yaml` from [dbt-codegen](https://github.com/dbt-labs/dbt-codegen/tree/0.12.1/?tab=readme-ov-file#generate_model_yaml-source). 
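To make the "fully defined columns" requirement above concrete, here is a hedged sketch (the extra column names, data types, and the tag value are illustrative only) of a contract where every column is declared but only one carries the custom masking constraint:

```yaml
models:
  - name: my_model
    config:
      contract:
        enforced: true
      materialized: table
    columns:
      - name: id
        data_type: int
        constraints:
          - type: custom
            expression: "tag (my_tag = 'my_value')"  # masking policy applied through a tag
      - name: customer_name      # every remaining column must still be listed with its type
        data_type: string
      - name: order_total
        data_type: float
```

Scaffolding the column list with `generate_model_yaml` from dbt-codegen, as mentioned above, and then adding the constraints is usually the quickest path to a fully specified contract like this.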
+ + + + +Alternatively, you can add a masking policy without tags: + + + +```yaml + +models: + - name: my_model + config: + contract: + enforced: true + materialized: table + columns: + - name: id + data_type: int + constraints: + - type: custom + expression: "masking policy my_policy" + +``` + + + + diff --git a/website/docs/reference/resource-properties/deprecation_date.md b/website/docs/reference/resource-properties/deprecation_date.md index be76ccb07f6..501fdc30237 100644 --- a/website/docs/reference/resource-properties/deprecation_date.md +++ b/website/docs/reference/resource-properties/deprecation_date.md @@ -53,11 +53,11 @@ Additionally, [`WARN_ERROR_OPTIONS`](/reference/global-configs/warnings) gives a |--------------------------------|----------------------------------------------------|------------------------| | `DeprecatedModel` | Parsing a project that defines a deprecated model | Producer | | `DeprecatedReference` | Referencing a model with a past deprecation date | Producer and consumers | -| `UpcomingDeprecationReference` | Referencing a model with a future deprecation date | Producer and consumers | +| `UpcomingReferenceDeprecation` | Referencing a model with a future deprecation date | Producer and consumers | -** Example ** +**Example** -Example output for an `UpcomingDeprecationReference` warning: +Example output for an `UpcomingReferenceDeprecation` warning: ``` $ dbt parse 15:48:14 Running with dbt=1.6.0 diff --git a/website/docs/reference/resource-properties/description.md b/website/docs/reference/resource-properties/description.md index ce0c7c42074..cf7b2b29a5a 100644 --- a/website/docs/reference/resource-properties/description.md +++ b/website/docs/reference/resource-properties/description.md @@ -1,7 +1,7 @@ --- resource_types: all datatype: markdown_string - +description: "This guide explains how to use the description key to add YAML descriptions to dbt resources (models, sources, seeds) using markdown and Jinja for better documentation." --- @@ -145,6 +146,32 @@ macros: + + + + + + +```yml +version: 2 + +data_tests: + - name: data_test_name + description: markdown_string + +``` + + + + + + + +The `description` property is available for generic and singular data tests beginning in dbt v1.9. + + + + diff --git a/website/docs/reference/resource-properties/freshness.md b/website/docs/reference/resource-properties/freshness.md index 03037e7b681..d68dee4fade 100644 --- a/website/docs/reference/resource-properties/freshness.md +++ b/website/docs/reference/resource-properties/freshness.md @@ -37,8 +37,6 @@ A freshness block is used to define the acceptable amount of time between the mo In the `freshness` block, one or both of `warn_after` and `error_after` can be provided. If neither is provided, then dbt will not calculate freshness snapshots for the tables in this source. - - In most cases, the `loaded_at_field` is required. Some adapters support calculating source freshness from the warehouse metadata tables and can exclude the `loaded_at_field`. If a source has a `freshness:` block, dbt will attempt to calculate freshness for that source: @@ -62,29 +60,9 @@ To exclude a source from freshness calculations, you have two options: - Don't add a `freshness:` block. - Explicitly set `freshness: null`. - - - - -Additionally, the `loaded_at_field` is required to calculate freshness for a table. If a `loaded_at_field` is not provided, then dbt will not calculate freshness for the table. 
- -Freshness blocks are applied hierarchically: -- A `freshness` and `loaded_at_field` property added to a source will be applied to all tables defined in that source -- A `freshness` and `loaded_at_field` property added to a source _table_ will override any properties applied to the source. - -This is useful when all of the tables in a source have the same `loaded_at_field`, as is often the case. - - ## loaded_at_field - -(Optional on adapters that support pulling freshness from warehouse metadata tables, required otherwise.) - - - -(Required) - - +Optional on adapters that support pulling freshness from warehouse metadata tables, required otherwise.

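For illustration, a source-level freshness configuration might look like the sketch below (source, table, and column names are hypothetical); on adapters that can calculate freshness from warehouse metadata, the `loaded_at_field` line can be dropped:

```yaml
version: 2

sources:
  - name: jaffle_shop                  # hypothetical source
    freshness:
      warn_after: {count: 12, period: hour}
      error_after: {count: 24, period: hour}
    loaded_at_field: _etl_loaded_at    # hypothetical timestamp column; optional on metadata-capable adapters
    tables:
      - name: orders                   # inherits the source-level freshness settings
```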
A column name (or expression) that returns a timestamp indicating freshness. If using a date field, you may have to cast it to a timestamp: diff --git a/website/docs/reference/resource-properties/identifier.md b/website/docs/reference/resource-properties/identifier.md index c15f7f18438..f11b3cd3ded 100644 --- a/website/docs/reference/resource-properties/identifier.md +++ b/website/docs/reference/resource-properties/identifier.md @@ -25,7 +25,7 @@ The name as stored in the database. This parameter is useful if you want to use a source table name that differs from the table name in the database. ## Default -By default, dbt will use the table's `name:` parameter as the identifier. +By default, dbt will use the table's `name` parameter as the identifier. ## Examples ### Use a simpler name for a source table than the one in your database diff --git a/website/docs/reference/resource-properties/quote.md b/website/docs/reference/resource-properties/quote.md index 50bf4c08c40..41d6fc29e5d 100644 --- a/website/docs/reference/resource-properties/quote.md +++ b/website/docs/reference/resource-properties/quote.md @@ -130,7 +130,7 @@ select user_group as "group" ``` Without setting `quote: true`: -- Schema tests applied to this column may fail due to invalid SQL +- [Data tests](/docs/build/data-tests) applied to this column may fail due to invalid SQL - Documentation may not render correctly, e.g. `group` and `"group"` may not be matched as the same column name. ## Example diff --git a/website/docs/reference/resource-properties/schema.md b/website/docs/reference/resource-properties/schema.md index 157a9ffc0a2..017d93e3235 100644 --- a/website/docs/reference/resource-properties/schema.md +++ b/website/docs/reference/resource-properties/schema.md @@ -28,12 +28,14 @@ The schema name as stored in the database. This parameter is useful if you want to use a source name that differs from the schema name. -#### BigQuery terminology +:::info BigQuery terminology -If you're using BigQuery, use the _dataset_ name as the `schema:` property. +If you're using BigQuery, use the _dataset_ name as the `schema` property. + +::: ## Default -By default, dbt will use the source's `name:` parameter as the schema name. +By default, dbt will use the source's `name` parameter as the schema name. ## Examples ### Use a simpler name for a source schema than the one in your database diff --git a/website/docs/reference/resource-properties/unit-testing-versions.md b/website/docs/reference/resource-properties/unit-testing-versions.md index 67236317650..39ef241c122 100644 --- a/website/docs/reference/resource-properties/unit-testing-versions.md +++ b/website/docs/reference/resource-properties/unit-testing-versions.md @@ -17,18 +17,18 @@ unit_tests: unit_tests: - name: test_is_valid_email_address model: my_model - versions: - include: - - 2 + versions: + include: + - 2 ... # my test_is_valid_email_address unit test will run on all versions EXCEPT 1 of my_model unit_tests: - name: test_is_valid_email_address model: my_model - versions: - exclude: - - 1 + versions: + exclude: + - 1 ... 
-``` \ No newline at end of file +``` diff --git a/website/docs/reference/seed-configs.md b/website/docs/reference/seed-configs.md index dd733795eef..5d5c39071d6 100644 --- a/website/docs/reference/seed-configs.md +++ b/website/docs/reference/seed-configs.md @@ -113,8 +113,8 @@ seeds: config: [enabled](/reference/resource-configs/enabled): true | false [tags](/reference/resource-configs/tags): | [] - [pre-hook](/reference/resource-configs/pre-hook-post-hook): | [] - [post-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [pre_hook](/reference/resource-configs/pre-hook-post-hook): | [] + [post_hook](/reference/resource-configs/pre-hook-post-hook): | [] [database](/reference/resource-configs/database): [schema](/reference/resource-properties/schema): [alias](/reference/resource-configs/alias): diff --git a/website/docs/reference/snapshot-configs.md b/website/docs/reference/snapshot-configs.md index 5afe429cfb4..144ecafde9d 100644 --- a/website/docs/reference/snapshot-configs.md +++ b/website/docs/reference/snapshot-configs.md @@ -24,15 +24,24 @@ Parts of a snapshot: + + +import SnapshotYaml from '/snippets/_snapshot-yaml-spec.md'; + + + + + + @@ -48,7 +57,7 @@ snapshots: [+](/reference/resource-configs/plus-prefix)[strategy](/reference/resource-configs/strategy): timestamp | check [+](/reference/resource-configs/plus-prefix)[updated_at](/reference/resource-configs/updated_at): [+](/reference/resource-configs/plus-prefix)[check_cols](/reference/resource-configs/check_cols): [] | all - + [+](/reference/resource-configs/plus-prefix)[invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) : true | false ```
@@ -69,7 +78,8 @@ snapshots: [+](/reference/resource-configs/plus-prefix)[strategy](/reference/resource-configs/strategy): timestamp | check [+](/reference/resource-configs/plus-prefix)[updated_at](/reference/resource-configs/updated_at): [+](/reference/resource-configs/plus-prefix)[check_cols](/reference/resource-configs/check_cols): [] | all - + [+](/reference/resource-configs/plus-prefix)[snapshot_meta_column_names](/reference/resource-configs/snapshot_meta_column_names): {} + [+](/reference/resource-configs/plus-prefix)[invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) : true | false ``` @@ -80,45 +90,60 @@ snapshots: -**Note:** Required snapshot properties _will not_ work when defined in `config` YAML blocks. We recommend that you define these in `dbt_project.yml` or a `config()` block within the snapshot `.sql` file. - - + - +**Note:** Required snapshot properties _will not_ work when only defined in `config` YAML blocks. We recommend that you define these in `dbt_project.yml` or a `config()` block within the snapshot `.sql` file or upgrade to v1.9. - + -```jinja + + +Refer to [configuring snapshots](/docs/build/snapshots#configuring-snapshots) for the available configurations. -{{ config( - [target_schema](/reference/resource-configs/target_schema)="", - [target_database](/reference/resource-configs/target_database)="", - [unique_key](/reference/resource-configs/unique_key)="", - [strategy](/reference/resource-configs/strategy)="timestamp" | "check", - [updated_at](/reference/resource-configs/updated_at)="", - [check_cols](/reference/resource-configs/check_cols)=[""] | "all" -) }} + +```yml +snapshots: + - name: + config: + [database](/reference/resource-configs/database): + [schema](/reference/resource-configs/schema): + [unique_key](/reference/resource-configs/unique_key): + [strategy](/reference/resource-configs/strategy): timestamp | check + [updated_at](/reference/resource-configs/updated_at): + [check_cols](/reference/resource-configs/check_cols): [] | all + [snapshot_meta_column_names](/reference/resource-configs/snapshot_meta_column_names): {} + [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) : true | false ``` + + + + + +Configurations can be applied to snapshots using the [YAML syntax](/docs/build/snapshots), available in Versionless and dbt v1.9 and higher, in the `snapshot` directory file. 
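For instance, a filled-in version of the spec above could look like the following sketch (file, source, and column names are hypothetical):

```yaml
# snapshots/orders_snapshot.yml
snapshots:
  - name: orders_snapshot
    relation: source('jaffle_shop', 'orders')
    config:
      unique_key: id
      strategy: timestamp
      updated_at: updated_at
```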
+ + + + + ```jinja {{ config( - [schema](/reference/resource-configs/schema)="", - [database](/reference/resource-configs/database)="", - [alias](/reference/resource-configs/alias)="", + [target_schema](/reference/resource-configs/target_schema)="", + [target_database](/reference/resource-configs/target_database)="", [unique_key](/reference/resource-configs/unique_key)="", [strategy](/reference/resource-configs/strategy)="timestamp" | "check", [updated_at](/reference/resource-configs/updated_at)="", [check_cols](/reference/resource-configs/check_cols)=[""] | "all" + [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) : true | false ) }} ``` - @@ -135,7 +160,7 @@ snapshots: defaultValue="project-yaml" values={[ { label: 'Project file', value: 'project-yaml', }, - { label: 'Property file', value: 'property-yaml', }, + { label: 'YAML file', value: 'property-yaml', }, { label: 'Config block', value: 'config', }, ] }> @@ -160,6 +185,8 @@ snapshots: + + ```yaml @@ -178,11 +205,42 @@ snapshots: ``` + + + + + + +```yaml +version: 2 + +snapshots: + - name: [] + relation: source('my_source', 'my_table') + config: + [enabled](/reference/resource-configs/enabled): true | false + [tags](/reference/resource-configs/tags): | [] + [alias](/reference/resource-configs/alias): + [pre-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [post-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [persist_docs](/reference/resource-configs/persist_docs): {} + [grants](/reference/resource-configs/grants): {} +``` + + + + + +Configurations can be applied to snapshots using [YAML syntax](/docs/build/snapshots), available in Versionless and dbt v1.9 and higher, in the `snapshot` directory file. + + + + ```jinja @@ -198,106 +256,140 @@ snapshots: ``` + + - ## Configuring snapshots -Snapshots can be configured in one of three ways: +Snapshots can be configured in multiple ways: -1. Using a `config` block within a snapshot -2. Using a `config` [resource property](/reference/model-properties) in a `.yml` file -3. From the `dbt_project.yml` file, under the `snapshots:` key. To apply a configuration to a snapshot, or directory of snapshots, define the resource path as nested dictionary keys. - -Snapshot configurations are applied hierarchically in the order above. - -### Examples -#### Apply configurations to all snapshots -To apply a configuration to all snapshots, including those in any installed [packages](/docs/build/packages), nest the configuration directly under the `snapshots` key: - - - -```yml - -snapshots: - +unique_key: id -``` - - + +1. Defined in YAML files using a `config` [resource property](/reference/model-properties), typically in your [snapshots directory](/reference/project-configs/snapshot-paths) (available in [Versionless](/docs/dbt-versions/versionless-cloud) or and dbt Core v1.9 and higher). +2. From the `dbt_project.yml` file, under the `snapshots:` key. To apply a configuration to a snapshot, or directory of snapshots, define the resource path as nested dictionary keys. + -#### Apply configurations to all snapshots in your project -To apply a configuration to all snapshots in your project only (for example, _excluding_ any snapshots in installed packages), provide your project name as part of the resource path. + -For a project named `jaffle_shop`: +1. 
Defined in YAML files using a `config` [resource property](/reference/model-properties), typically in your [snapshots directory](/reference/project-configs/snapshot-paths) (available in [Versionless](/docs/dbt-versions/versionless-cloud) or and dbt Core v1.9 and higher). +2. Using a `config` block within a snapshot defined in Jinja SQL +3. From the `dbt_project.yml` file, under the `snapshots:` key. To apply a configuration to a snapshot, or directory of snapshots, define the resource path as nested dictionary keys. - +Note that in Versionless and dbt v1.9 and later, snapshots are defined in an updated syntax using a YAML file within your `snapshots/` directory (as defined by the [`snapshot-paths` config](/reference/project-configs/snapshot-paths)). For faster and more efficient management, consider the updated snapshot YAML syntax, [available in Versionless](/docs/dbt-versions/versionless-cloud) or [dbt Core v1.9 and later](/docs/dbt-versions/core). -```yml + -snapshots: - jaffle_shop: - +unique_key: id -``` +Snapshot configurations are applied hierarchically in the order above with higher taking precedence. - +### Examples +The following examples demonstrate how to configure snapshots using the `dbt_project.yml` file, a `config` block within a snapshot, and a `.yml` file. -Similarly, you can use the name of an installed package to configure snapshots in that package. +- #### Apply configurations to all snapshots + To apply a configuration to all snapshots, including those in any installed [packages](/docs/build/packages), nest the configuration directly under the `snapshots` key: -#### Apply configurations to one snapshot only + -We recommend using `config` blocks if you need to apply a configuration to one snapshot only. + ```yml - + snapshots: + +unique_key: id + ``` -```sql -{% snapshot orders_snapshot %} - {{ - config( - unique_key='id', - strategy='timestamp', - updated_at='updated_at' - ) - }} - -- Pro-Tip: Use sources in snapshots! - select * from {{ source('jaffle_shop', 'orders') }} -{% endsnapshot %} -``` + - +- #### Apply configurations to all snapshots in your project + To apply a configuration to all snapshots in your project only (for example, _excluding_ any snapshots in installed packages), provide your project name as part of the resource path. -You can also use the full resource path (including the project name, and subdirectories) to configure an individual snapshot from your `dbt_project.yml` file. + For a project named `jaffle_shop`: -For a project named `jaffle_shop`, with a snapshot file within the `snapshots/postgres_app/` directory, where the snapshot is named `orders_snapshot` (as above), this would look like: + - + ```yml -```yml -snapshots: - jaffle_shop: - postgres_app: - orders_snapshot: + snapshots: + jaffle_shop: +unique_key: id - +strategy: timestamp - +updated_at: updated_at -``` - - - -You can also define some common configs in a snapshot's `config` block. We don't recommend this for a snapshot's required configuration, however. - - - -```yml -version: 2 - -snapshots: - - name: orders_snapshot - config: - persist_docs: - relation: true - columns: true -``` - - + ``` + + + + Similarly, you can use the name of an installed package to configure snapshots in that package. + +- #### Apply configurations to one snapshot only + + + Use `config` blocks if you need to apply a configuration to one snapshot only. 
+ + + + ```sql + {% snapshot orders_snapshot %} + {{ + config( + unique_key='id', + strategy='timestamp', + updated_at='updated_at' + ) + }} + -- Pro-Tip: Use sources in snapshots! + select * from {{ source('jaffle_shop', 'orders') }} + {% endsnapshot %} + ``` + + + + + + + + ```yaml + snapshots: + - name: orders_snapshot + relation: source('jaffle_shop', 'orders') + config: + unique_key: id + strategy: timestamp + updated_at: updated_at + persist_docs: + relation: true + columns: true + ``` + + Pro-tip: Use sources in snapshots: `select * from {{ source('jaffle_shop', 'orders') }}` + + + You can also use the full resource path (including the project name, and subdirectories) to configure an individual snapshot from your `dbt_project.yml` file. + + For a project named `jaffle_shop`, with a snapshot file within the `snapshots/postgres_app/` directory, where the snapshot is named `orders_snapshot` (as above), this would look like: + + + + ```yml + snapshots: + jaffle_shop: + postgres_app: + orders_snapshot: + +unique_key: id + +strategy: timestamp + +updated_at: updated_at + ``` + + + + You can also define some common configs in a snapshot's `config` block. We don't recommend this for a snapshot's required configuration, however. + + + + ```yml + version: 2 + + snapshots: + - name: orders_snapshot + +persist_docs: + relation: true + columns: true + ``` + + diff --git a/website/docs/reference/snapshot-properties.md b/website/docs/reference/snapshot-properties.md index 49769af8f6d..d940a9f344c 100644 --- a/website/docs/reference/snapshot-properties.md +++ b/website/docs/reference/snapshot-properties.md @@ -3,12 +3,62 @@ title: Snapshot properties description: "Read this guide to learn about using source properties in dbt." --- + + +In Versionless and dbt v1.9 and later, snapshots are defined and configured in YAML files within your `snapshots/` directory (as defined by the [`snapshot-paths` config](/reference/project-configs/snapshot-paths)). Snapshot properties are declared within these YAML files, allowing you to define both the snapshot configurations and properties in one place. + + + + + Snapshots properties can be declared in `.yml` files in: -- your `snapshots/` directory (as defined by the [`snapshot-paths` config](/reference/project-configs/snapshot-paths)) +- your `snapshots/` directory (as defined by the [`snapshot-paths` config](/reference/project-configs/snapshot-paths)). - your `models/` directory (as defined by the [`model-paths` config](/reference/project-configs/model-paths)) +Note, in Versionless and dbt v1.9 and later, snapshots are defined in an updated syntax using a YAML file within your `snapshots/` directory (as defined by the [`snapshot-paths` config](/reference/project-configs/snapshot-paths)). For faster and more efficient management, consider the updated snapshot YAML syntax, [available in Versionless](/docs/dbt-versions/versionless-cloud) or [dbt Core v1.9 and later](/docs/dbt-versions/core). + + + We recommend that you put them in the `snapshots/` directory. You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `snapshots/` or `models/` directory. 
+ + + + +```yml +version: 2 + +snapshots: + - name: + [description](/reference/resource-properties/description): + [meta](/reference/resource-configs/meta): {} + [docs](/reference/resource-configs/docs): + show: true | false + node_color: # Use name (such as node_color: purple) or hex code with quotes (such as node_color: "#cd7f32") + [config](/reference/resource-properties/config): + [](/reference/snapshot-configs): + [tests](/reference/resource-properties/data-tests): + - + - ... + columns: + - name: + [description](/reference/resource-properties/description): + [meta](/reference/resource-configs/meta): {} + [quote](/reference/resource-properties/quote): true | false + [tags](/reference/resource-configs/tags): [] + [tests](/reference/resource-properties/data-tests): + - + - ... # declare additional tests + - ... # declare properties of additional columns + + - name: ... # declare properties of additional snapshots + +``` + + + + + ```yml @@ -41,3 +91,4 @@ snapshots: ``` + diff --git a/website/docs/sql-reference/aggregate-functions/sql-sum.md b/website/docs/sql-reference/aggregate-functions/sql-sum.md index 494a3863ad3..8216e3f790b 100644 --- a/website/docs/sql-reference/aggregate-functions/sql-sum.md +++ b/website/docs/sql-reference/aggregate-functions/sql-sum.md @@ -11,7 +11,7 @@ slug: /sql-reference/sum The SQL SUM function is handy and ever-present in data work. Let’s unpack what it is, how to use it, and why it's valuable. -Jumping into it, the SUM aggregate function allows you to calculate the sum of a numeric column or across a set of rows for a column. Ultimately, the SUM function is incredibly useful for calculating meaningful business metrics, such as Lifetime Value (LTV), and creating key numeric fields in [`fct_` and `dim_` models](/terms/dimensional-modeling). +Jumping into it, the SUM aggregate function allows you to calculate the sum of a numeric column or across a set of rows for a column. Ultimately, the SUM function is incredibly useful for calculating meaningful business metrics, such as Lifetime Value (LTV), and creating key numeric fields in [`fct_` and `dim_` models](https://www.getdbt.com/blog/guide-to-dimensional-modeling). ## How to use the SUM function in a query diff --git a/website/docs/terms/cte.md b/website/docs/terms/cte.md deleted file mode 100644 index 87ef31abc8e..00000000000 --- a/website/docs/terms/cte.md +++ /dev/null @@ -1,192 +0,0 @@ ---- -id: cte -title: CTE in SQL -description: A CTE is a temporary result set that can be used in a SQL query. You can think of a CTE as a separate, smaller query within the larger query you’re building up. -displayText: CTE -hoverSnippet: A Common Table Expression (CTE) is a temporary result set that can be used in a SQL query. You can use CTEs to break up complex queries into simpler blocks of code that can connect and build on each other. ---- - - - CTE in SQL: Quite possibly the best thing to happen to SQL - - -In a formal sense, a Common Table Expression (CTE), is a temporary result set that can be used in a SQL query. You can use CTEs to break up complex queries into simpler blocks of code that can connect and build on each other. In a less formal, more human-sense, you can think of a CTE as a separate, smaller query within the larger query you’re building up. Creating a CTE is essentially like making a temporary that you can access throughout the rest of the query you are writing. - -There are two-types of CTEs: recursive and non-recursive. This glossary focuses on non-recursive CTEs. 
- -## Why you should care about CTEs - -Have you ever read through a query and thought: - -- “What does this part of the query do?” -- “What are all the sources referenced in this query? Why did I reference this dependency?” -- “My query is not producing the results I expect and I’m not sure which part of the query is causing that.” - -These thoughts often arise when we’ve written SQL queries and models that utilize complex business logic, references and joins multiple upstream dependencies, and are not outputting expected results. In a nutshell, these thoughts can occur often when you’re trying to write data models! - -How can you make these complexities in your code more digestible and usable? CTEs to the rescue! - -## CTE Syntax: How it works - -To use CTEs, you begin by defining your first CTE using the `WITH` statement followed by a `SELECT` statement. - -Let’s break down this example involving a `rename_columns` CTE below: - -```sql -with rename_columns as ( - - select - - id as customer_id, - lower(first_name) as customer_first_name, - lower(last_name) as customer_last_initial - - from {{ ref('raw_customers') }} - -) - -select * from rename_columns -``` - -In this query above, you first create a CTE called `rename_columns` where you conduct a -simple `SELECT` statement that renames and lower cases some columns from a `raw_customers` /model. The final `select * from rename_columns` selects all results from the `rename_columns` CTE. - -While you shouldn't always think of CTEs as having classical arguments like SQL functions, you’ve got to call the necessary inputs for CTEs something. - -- CTE_EXPRESSION_NAME: This is the name of the CTE you can reference in other CTEs or SELECT statements. In our example, `rename_columns` is the CTE_EXPRESSION_NAME. **If you are using multiple CTEs in one query, it’s important to note that each CTE_EXPRESSION_NAME must be unique.** -- CTE_QUERY: This is the `SELECT` statement whose result set is produced by the CTE. In our example above, the `select … from {{ ref('raw_customers') }}` is the CTE_QUERY. The CTE_QUERY is framed by parenthesis. - -## When to use CTEs - -The primary motivation to implement CTEs in your code is to simplify the complexity of your queries and increase your code’s readability. There are other great benefits to using CTEs in your queries which we’ll outline below. - -### Simplification - -When people talk about how CTEs can simplify your queries, they specifically mean how CTEs can help simplify the structure, readability, and debugging process of your code. - -#### Establish Structure - -In leveraging CTEs, you can break complex code into smaller segments, ultimately helping provide structure to your code. At dbt Labs, we often like to use the [import, logical, and final structure](/guides/refactoring-legacy-sql?step=5#implement-cte-groupings) for CTEs which creates a predictable and organized structure to your dbt models. - -#### Easily identify dependencies - -When you import all of your dependencies as CTEs in the beginning of your query/model, you can automatically see which models, tables, or views your model relies on. - -#### Clearly label code blocks - -Utilizing the CTE_EXPRESSION_NAME, you can title what your CTE is accomplishing. This provides greater insight into what each block of code is performing and can help contextualize why that code is needed. This is incredibly helpful for both the developer who writes the query and the future developer who may inherit it. 
- -#### Test and debug more easily - -When queries are long, involve multiple joins, and/or complex business logic, it can be hard to understand why your query is not outputting the result you expect. By breaking your query into CTEs, you can separately test that each CTE is working properly. Using the process of elimination of your CTEs, you can more easily identify the root cause. - -### Substitution for a view - -Oftentimes you want to reference data in a query that could, or may have existed at one point, as a view. Instead of worrying about the view actually existing, you can leverage CTEs to create the temporary result you would want from the view. - -### Support reusability - -Using CTEs, you can reference the same resulting set multiple times in one query without having to duplicate your work by referencing the CTE_EXPRESSION_NAME in your from statement. - -## CTE example - -Time to dive into an example using CTEs! For this example, you'll be using the data from our [jaffle_shop demo dbt](https://github.com/dbt-labs/jaffle_shop) project. In the `jaffle_shop`, you have three tables: one for customers, orders, and payments. - -In this query, you're creating three CTEs to ultimately allow you to segment buyers by how many times they’ve purchased. - -```sql -with import_orders as ( - - select * from {{ ref('orders') }} - -), -aggregate_orders as ( - - select - - customer_id, - count(order_id) as count_orders - - from import_orders - where status not in ('returned', 'return pending') - group by 1 - -), -segment_users as ( - - select - - *, - case - when count_orders >= 3 then 'super_buyer' - when count_orders <3 and count_orders >= 2 then - 'regular_buyer' - else 'single_buyer' - end as buyer_type - - from aggregate_orders - -) -select * from segment_users -``` - -Let’s break this query down a bit: - -1. In the first `import_orders` CTE, you are simply importing the `orders` table which holds the data I’m interested in creating the customer segment off of. Note that this first CTE starts with a `WITH` statement and no following CTEs begin with a `WITH` statement. -2. The second `aggregate_orders` CTE utilizes the `import_orders` CTE to get a count of orders per user with a filter applied. -3. The last `segment_users` CTE builds off of the `aggregate_orders` by selecting the `customer_id`, `count_orders`, and creating your `buyer_type` segment. Note that the final `segment_users` CTE does not have a comma after its closing parenthesis. -4. The final `select * from segment_users` statement simply selects all results from the `segment_users` CTE. - -Your results from running this query look a little like this: - -| USER_ID | COUNT_ORDERS | BUYER_TYPE | -|---|---|---| -| 3 | 3 | super_buyer | -| 64 | 1 | single_buyer | -| 94 | 2 | regular_buyer | - -:::tip Tip -If you are finding yourself using the same code for a certain CTE across multiple -queries or models, that’s probably a good sign that CTE should be its own [model](https://docs.getdbt.com/docs/build/models) or view. -::: - -## CTE vs Subquery - -A is a nested query that can oftentimes be used in place of a CTE. Subqueries have different syntax than CTEs, but often have similar use cases. This content won’t go too deep into subqueries here, but it'll highlight some of the main differences between CTEs and subqueries below. 
- -| CTE | Subquery | -|---|---| -| Typically more readable since CTEs can be used to give structure to your query | Typically less readable, especially if there are many nested queries | -| Allows for recursiveness | Does not allow for recursiveness | -| CTEs must have unique CTE_EXPRESSION_NAMES when used in a query | Subqueries don’t always have to be explicitly named | -| CTEs cannot be used in a `WHERE` clause | Subqueries can be used in a `WHERE` clause | - -## Data warehouse support for CTEs - -CTEs are likely to be supported across most, if not all, [modern data warehouses](https://blog.getdbt.com/future-of-the-modern-data-stack/). Please use this table to see more information about using CTEs in your specific . - -| Data Warehouse | Support CTEs? | -|---|---| -|[Snowflake](https://docs.snowflake.com/en/user-guide/queries-cte.html) | :white_check_mark: | -|[Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_WITH_clause.html) | :white_check_mark: | -|[Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax) | :white_check_mark: | -|[Databricks](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-qry-select-cte.html) | :white_check_mark: | -|[Postgres](https://www.postgresqltutorial.com/postgresql-cte/) | :white_check_mark: | - -## Conclusion - -CTEs are essentially temporary views that can be used throughout a query. They are a great way to give your SQL more structure and readability, and offer simplified ways to debug your code. You can leverage appropriately named CTEs to easily identify upstream dependencies and code functionality. CTEs also support recursiveness and reusability in the same query. Overall, CTEs can be an effective way to level-up your SQL to be more organized and understandable. - -## Further Reading - -If you’re interested in reading more about CTE best practices, check out some of our favorite content around model refactoring and style: - -- [Refactoring Legacy SQL to dbt](/guides/refactoring-legacy-sql?step=5#implement-cte-groupings) -- [dbt Labs Style Guide](https://docs.getdbt.com/best-practices/how-we-style/0-how-we-style-our-dbt-projects) -- [Modular Data Modeling Technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) - -Want to know why dbt Labs loves CTEs? Check out the following pieces: - -- [Why we use so many CTEs](https://discourse.getdbt.com/t/why-the-fishtown-sql-style-guide-uses-so-many-ctes/1091) -- [CTEs are Passthroughs](https://discourse.getdbt.com/t/ctes-are-passthroughs-some-research/155) - diff --git a/website/docs/terms/dag.md b/website/docs/terms/dag.md deleted file mode 100644 index 93e2956ebb3..00000000000 --- a/website/docs/terms/dag.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -id: dag -title: DAG -description: A DAG is a Directed Acyclic Graph, a type of graph whose nodes are directionally related to each other and don’t form a directional closed loop. -displayText: DAG -hoverSnippet: A DAG is a Directed Acyclic Graph, a type of graph whose nodes are directionally related to each other and don’t form a directional closed loop. ---- - - - What is a DAG and why is it important? - dbt Labs - - -A DAG is a **D**irected **A**cyclic **G**raph, a type of graph whose nodes are directionally related to each other and don’t form a directional closed loop. In the practice of analytics engineering, DAGs are often used to visually represent the relationships between your data models. 
- -While the concept of a DAG originated in mathematics and gained popularity in computational work, DAGs have found a home in the modern data world. They offer a great way to visualize data pipelines and lineage, and they offer an easy way to understand dependencies between data models. - -## DAG use cases and best practices - -DAGs are an effective tool to help you understand relationships between your data models and areas of improvement for your overall [data transformations](https://www.getdbt.com/analytics-engineering/transformation/). - -### Unpacking relationships and data lineage - -Can you look at one of your data models today and quickly identify all the upstream and downstream models? If you can’t, that’s probably a good sign to start building or looking at your existing DAG. - -:::tip Upstream or downstream? - -How do you know if a model is upstream or downstream from the model you’re currently looking at? Upstream models are models that must be performed prior to the current model. In simple terms, the current model depends on upstream models in order to exist. Downstream relationships are the outputs from your current model. In a visual DAG, such as the dbt Lineage Graph, upstream models are to the left of your selected model and downstream models are to the right of your selected model. Ever confused? Use the arrows that create the directedness of a DAG to understand the direction of movement. - -::: - -One of the great things about DAGs is that they are *visual*. You can clearly identify the nodes that connect to each other and follow the lines of directions. When looking at a DAG, you should be able to identify where your data sources are going and where that data is potentially being referenced. - -Take this mini-DAG for an example: - - - -What can you learn from this DAG? Immediately, you may notice a handful of things: - -- `stg_users`and `stg_user_groups` models are the parent models for `int_users` -- A join is happening between `stg_users` and `stg_user_groups` to form the `int_users` model -- `stg_orgs` and `int_users` are the parent models for `dim_users` -- `dim_users` is at the end of the DAG and is therefore downstream from a total of four different models - -Within 10 seconds of looking at this DAG, you can quickly unpack some of the most important elements about a project: dependencies and data lineage. Obviously, this is a simplified version of DAGs you may see in real life, but the practice of identifying relationships and data flows remains very much the same, regardless of the size of the DAG. - -What happens if `stg_user_groups` just up and disappears one day? How would you know which models are potentially impacted by this change? Look at your DAG and understand model dependencies to mitigate downstream impacts. - -### Auditing projects - -A potentially bold statement, but there is no such thing as a perfect DAG. DAGs are special in-part because they are unique to your business, data, and data models. There’s usually always room for improvement, whether that means making a CTE into its own view or performing a join earlier upstream, and your DAG can be an effective way to diagnose inefficient data models and relationships. - -You can additionally use your DAG to help identify bottlenecks, long-running data models that severely impact the performance of your data pipeline. 
Bottlenecks can happen for multiple reasons: -- Expensive joins -- Extensive filtering or [use of window functions](https://docs.getdbt.com/blog/how-we-shaved-90-minutes-off-model) -- Complex logic stored in views -- Good old large volumes of data - -...to name just a few. Understanding the factors impacting model performance can help you decide on [refactoring approaches](https://learn.getdbt.com/courses/refactoring-sql-for-modularity), [changing model materialization](https://docs.getdbt.com/blog/how-we-shaved-90-minutes-off-model#attempt-2-moving-to-an-incremental-model)s, replacing multiple joins with surrogate keys, or other methods. - - - -### Modular data modeling best practices - -See the DAG above? It follows a more traditional approach to data modeling where new data models are often built from raw sources instead of relying on intermediary and reusable data models. This type of project does not scale with team or data growth. As a result, analytics engineers tend to aim to have their DAGs not look like this. - -Instead, there are some key elements that can help you create a more streamlined DAG and [modular data models](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/): - -- Leveraging [staging, intermediate, and mart layers](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) to create layers of distinction between sources and transformed data -- Abstracting code that’s used across multiple models to its own model -- Joining on surrogate keys versus on multiple values - -These are only a few examples of some best practices to help you organize your data models, business logic, and DAG. - -:::tip Is your DAG keeping up with best practices? - -Instead of manually auditing your DAG for best practices, the [dbt project evaluator package](https://github.com/dbt-labs/dbt-project-evaluator) can help audit your project and find areas of improvement. - -::: - -## dbt and DAGs - -The marketing team at dbt Labs would be upset with us if we told you we think dbt actually stood for “dag build tool,” but one of the key elements of dbt is its ability to generate documentation and infer relationships between models. And one of the hallmark features of [dbt Docs](https://docs.getdbt.com/docs/build/documentation) is the Lineage Graph (DAG) of your dbt project. - -Whether you’re using dbt Core or Cloud, dbt docs and the Lineage Graph are available to all dbt developers. The Lineage Graph in dbt Docs can show a model or source’s entire lineage, all within a visual frame. Clicking within a model, you can view the Lineage Graph and adjust selectors to only show certain models within the DAG. Analyzing the DAG here is a great way to diagnose potential inefficiencies or lack of modularity in your dbt project. - - - -The DAG is also [available in the dbt Cloud IDE](https://www.getdbt.com/blog/on-dags-hierarchies-and-ides/), so you and your team can refer to your lineage while you build your models. - -:::tip Leverage exposures - -One of the newer features of dbt is [exposures](https://docs.getdbt.com/docs/build/exposures), which allow you to define downstream use of your data models outside of your dbt project *within your dbt project*. What does this mean? This means you can add key dashboards, machine learning or data science pipelines, reverse ETL syncs, or other downstream use cases to your dbt project’s DAG. 
- -This level of interconnectivity and transparency can help boost data governance (who has access to and who [owns](https://docs.getdbt.com/reference/resource-configs/meta#designate-a-model-owner) this data) and transparency (what are the data sources and models affecting your key reports). - -::: - -## Conclusion - -A Directed acyclic graph (DAG) is a visual representation of your data models and their connection to each other. The key components of a DAG are that nodes (sources/models/exposures) are directionally linked and don’t form acyclic loops. Overall, DAGs are an effective tool for understanding data lineage, dependencies, and areas of improvement in your data models. - -> *Get started with [dbt today](https://www.getdbt.com/signup/) to start building your own DAG!* - -## Further reading - -Ready to restructure (or create your first) DAG? Check out some of the resources below to better understand data modularity, data lineage, and how dbt helps bring it all together: - -- [Data modeling techniques for more modularity](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) -- [How we structure our dbt projects](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) -- [How to audit your DAG](https://www.youtube.com/watch?v=5W6VrnHVkCA) -- [Refactoring legacy SQL to dbt](/guides/refactoring-legacy-sql) diff --git a/website/docs/terms/data-catalog.md b/website/docs/terms/data-catalog.md deleted file mode 100644 index 59bb983767a..00000000000 --- a/website/docs/terms/data-catalog.md +++ /dev/null @@ -1,87 +0,0 @@ ---- -id: data-catalog -title: Data catalog -description: A data catalog is an inventory of data assets from different parts of the data stack within an organization. This catalog can display metadata, lineage, and business definitions from your different data sources. -displayText: data catalog -hoverSnippet: A data catalog is an inventory of data assets from different parts of the data stack within an organization. This catalog can display metadata, lineage, and business definitions from your different data sources. ---- - - - Data catalog: a centralized place for data about your data - - -A data catalog is an inventory of data assets from different parts of the data stack within an organization. It helps to connect different parts of your stack by showing how your data relates to one another, all within one central location. A catalog can display metadata, lineage, and business definitions from your different data sources and allows for easy collaboration and governance. Data catalogs allow analytics engineers to properly document all parts of their data stack, making for easy ownership. - -## Features of a data catalog - -Data catalogs are known for 6 features that make them so powerful for both data teams and business users: - -- Access to metadata -- Business glossary functionality -- Built-in data lineage -- Support collaboration -- Enhanced data governance -- Varied integrations - -### Metadata - -Metadata is the data that describes data; this refers to characteristics like who created the dataset, its size, the databases and schemas it contains, and when it was last updated. It tells you where you can find a particular data source in terms of where it lives within your . Those properties help an analytics engineer fully understand the data and its intricacies before working with it. 
- -### Business glossary - -Business glossaries within a data catalog are helpful for understanding how a dataset and its columns relate back to their specific organization. They often contain information about the business description and purpose of a dataset or model, and they display the business definitions of columns and logic within a dataset. Business glossaries are particularly helpful for knowing which dataset and column to use in your logic when writing models or defining metrics. - -### Data lineage - -Data lineage provides a holistic view of how data moves through an organization, where it is transformed, protected, and consumed. Lineage shows the relationships between datasets and models across different platforms within your data stack. Lineage is particularly helpful for understanding dependencies between datasets. Depending on the tool you use, catalogs can show it on a dataset or even column level. This way, when you are updating any process within your stack, you can do so with these dependencies in mind. - -### Collaboration - -Data catalogs make collaboration between data and business teams easy. It allows you to see who owns what datasets from both a technical and organizational perspective. Business teams know who to talk to when data is incorrect or not fresh, while data teams know who to turn to for questions on business context. You can also know things like when a data was last updated, how it's used, or to get more context on your warehouse. - -### Data governance - -Data governance allows you to control who can access which domains within a catalog or specific datasets. Most data catalogs allow you to organize your data based on a specific area of the business and then limit user access to only their area of expertise. Catalogs also help with governing which datasets meet the data quality standards required for utilization. Many tools display a quality score and let you know when a dataset hasn’t been run in a while or is deprecated, preventing users from using unreliable data sources. - -Data catalogs can also be used to identify specific datasets and columns that contain PII data. This way, teams can have a better understanding of who should and should not have access to sensitive data. Having better clarity over sensitive data will help your business stay compliant and ensure its properly protecting customer information. - -### Integrations - -Data catalogs are compatible with many other tools in your modern data stack. They typically allow the documentation of your data warehouse, transformation tool, and business intelligence tool to all sit in one central location. This helps to build transparency across the stack and creates a single source of truth for the organization to depend on. - -## Types of data catalogs - -Like most data tools, there are two different types: an open-source data catalog and an enterprise data catalog. Let’s talk about the differences between these and the pros and cons of each. - -### Open source - -Open source data catalogs are free for you to use and often provide a great level of flexibility. You can build a custom solution that meets your exact needs and security requirements. But because they are free, you will be expected to manage the entire platform and set it up. Oftentimes, it’s not as simple as plugging in your various credentials to each tool in your modern data stack. 
It requires careful reading through the provided documentation and setting up each tool on its own, which often requires a certain threshold of technical skill. This makes for a typically more intense and potentially lengthy set-up process because there may not be experienced people to help you along the way and walk you through the steps. - -Open source tools also require maintenance. Oftentimes, settings will change in the connected platforms and it's up to your team to ensure compatibility and fix any breaking changes. - -In addition, with open source tools, you often need to host them yourself on a cloud provider of choice if your catalog will see broad use across the team. Depending on what you prefer, you may have to deploy multiple microservices on a platform like AWS or Azure. - -Lastly, you want to keep in mind your end user: Is this data catalog meant to be utilized by the data team or the larger business teams? Business users may have a harder time navigating an open source tool because it’s usually not as easy as logging in with an account. It requires more technical expertise to use effectively. If a business user has trouble with the catalog, it could cause a potential lag in important processes. - -### Enterprise data catalog software - -Enterprise data catalogs are different from open source in that they are completely managed by the company that creates them. You pay a fee to use them and are paying for the ongoing support, quick set-up process, and the minimal maintenance that comes with it. You are typically walked through it with a dedicated resource, and the integrations with external tools can be smoother because the vendor has teams dedicated to maintaining those relationships. - -The biggest things to keep in mind with enterprise data catalogs is your budget, use cases, and greater data culture. Can your organization afford to pay for a data catalog tool? While they require fewer engineering resources to maintain, they do come with a price tag. When considering if it is worth spending your money on an enterprise tool, make sure you consider where your business and data teams stand. Is your business at a place where it respects the initiatives put in place by the data team? Are the initiatives big enough where having one would make sense? - -Does the data team fully understand the data and its lineage? If they don’t, it's probably too early to put this in front of business users. You want to make sure they are set up for success when being trained to use a new tool. - -Do you have sophisticated data models and sources that the business knows how to use? If not, it may be worth focusing on building out the right metrics and models to be used first. - -Is the culture data-driven? If business users are caught up in their own processes like Google spreadsheets, they may not even utilize a catalog. You don’t want to pay for a tool that is too sophisticated for where the business and data teams currently stand. Don’t rush the process. 
-### Data catalog tools -Data teams may choose to use third-party tools with data cataloging capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), [Metaphor](https://support.metaphor.io/hc/en-us/articles/9302185081627), [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud), [Stemma](https://docs.stemma.ai/docs/stemma/getting-started/what-we-need-from-you/dbt-integration/), [Castor](https://docs.castordoc.com/integrations/dbt), and others. These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data cataloging and lineage capabilities. - -## Conclusion - -Data catalogs are a valuable asset to any data team and business as a whole. They allow people within an organization to find the data that they need when they need it and understand its quality or sensitivity. This makes communication across teams more seamless, preventing problems that impact the business in the long run. Weigh your options in terms of whether to go with open source or enterprise, trusting that the decision you land on will be best for your organization. - -## Additional reading - -- [Why both data cataloging?](https://www.getdbt.com/analytics-engineering/transformation/data-catalog/) -- [Glossary: Data warehouse](/terms/data-warehouse) diff --git a/website/docs/terms/data-extraction.md b/website/docs/terms/data-extraction.md deleted file mode 100644 index 52148a35421..00000000000 --- a/website/docs/terms/data-extraction.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -id: data-extraction -title: Data extraction -description: Data extraction is the process by which data is retrieved from multiple sources, often varying in volume and structure. -displayText: data extraction -hoverSnippet: Data extraction is the process by which data is retrieved from multiple sources, often varying in volume and structure. ---- - - - Data extraction: The first step of the ELT process - - -There is no data work without data. So how do data teams get the data they need? Data extraction is the process by which data is retrieved from multiple sources, often varying in volume and structure. Most data extraction processes are followed by a loading process, where that extracted data is loaded into a central . - -To actually extract said data, teams will often rely on various [ETL tools](https://docs.getdbt.com/terms/elt#elt-tools) or create custom scripts to call API endpoints. Other times data teams may be forced to do some hacky work like manually making and dropping a CSV into an S3 bucket. Definitely a rarity. But not unheard of. - -Come take a deep dive into data extraction, the process that allows us data folk to actually play with, well, said data. - -## Data extraction process: How does it work? - -There are two primary ways modern data teams are using to extract data: tools and custom scripts. - -- **Extraction via ETL tools**: SaaS ETL tools like Fivetran, Airbyte, and more, allow data folks to select connectors or data sources and sync their required data directly to their data warehouses. These platforms reduce the need to write custom API calls to data sources and instead allow data folks to worry more on transforming their data when it hits their data warehouse. 
-- **Extraction via custom scripts**: It’s probably inevitable, but at one point, you’re likely to find yourself hacking together a Python script to make API calls to a data source that doesn’t have a connector in an ETL tool. But let’s be real: while this is intimidating, it isn’t the end of the world. Writing and maintaining custom scripts for extracting data from data source APIs is not the most fun and there are real concerns (API limits, access tokens, lack of documentation, changing APIs, writing to external storage or directly to your data warehouse) to look out for, but gear up, read up on some basic curl requests and Python, and you got this. - -These two methods above are for automated extraction, processes that you only need to run once (in theory) to get the data you need on a regular basis. For non-automated processes, such as one-time extractions or uploads to your data warehouse, data folks can upload their data to external storage, such as S3 buckets, to load to your data warehouse, or leverage [dbt seeds](/docs/build/seeds). - -## Commonly extracted data - -Obviously, the type of business you work for and the systems your team uses will determine the data you extract. However, there are usually some common sources that data teams will extract for business users: -- Ad platforms such as Facebook Ads, Google Ads, or Pinterest Ads -- Accounting softwares like Netsuite -- Sales CRMs such as Salesforce or HubSpot -- Backend application databases -- Customer service SaaS products like Zendesk or Kustomer - -The data that is typically extracted and loaded in your data warehouse is data that business users will need for baseline reporting, OKR measurement, or other analytics. - -:::tip Don’t fix what’s not broken -As we just said, there are usually common data sources that data teams will extract from, regardless of business. Instead of writing transformations for these tables and data sources, leverage [dbt packages](https://hub.getdbt.com/) to save yourself some carpal tunnel and use the work someone else has already done for you. -::: - -## Data extraction tools - -If you’re not writing your own extraction scripts, you’re likely using an [ELT tool](https://docs.getdbt.com/terms/elt#elt-tools) to help you extract and load your various data sources into your data warehouse. Below, you’ll find some commonly used tools to help you do just that. - -| Tool | Description | Open source option? | -|:---:|:---:|:---:| -| Airbyte | Airbyte is an open-source and cloud service that allows teams to create data extraction and load pipelines. | ✅ | -| Stitch by Talend | Stitch (part of Talend) is another SaaS product that has many data connectors to extract data and load it into data warehouses. | ❌ | -| Fivetran/HVR | Fivetran is a SaaS company that helps data teams extract, load, and perform some transformation on their data. Fivetran easily integrates with modern data warehouses and dbt. They also offer transformations that leverage dbt Core. | ❌ | -| Funnel | Funnel is another product that can extract and load data. Funnel’s data connectors are primarily focused around marketing data sources. | ❌ | - -## Data extraction challenges to look out for - -There are definitely some considerable considerations in data extraction, mainly around costs and viability. - -- **Cadence and costs**: How often does your data need to be synced or refreshed? How often will your stakeholders really be looking at the data? 
There can be considerable costs to hitting API endpoints or retrieving data via ELT tools depending on the cadence you set for your data extractions. Talk to your stakeholders, understand when folks would leverage fresher data, and run some basic cost-benefit analyses to understand the cadence that works for your data extractions. -- **Viability**: Can you even extract the data your stakeholders need? As analytics engineers, your initial reaction is to check if an ETL tool has an existing connector for it. If it doesn’t, you may have to whip up a script to call the API (if there is one). If there is no API available, well, then it’s time to put on your creativity hat and get hacky! -- **PII concerns**: Oftentimes, data teams may be interested in masking PII data before it even hits their data warehouse. This would involve masking or removing the PII data immediately after extraction and immediately prior to loading the data into your data warehouse. For folks that want to mask PII, but are okay with masking it once it’s in their data warehouse, data teams can create masking policies using dbt packages. -- **Data accuracy**: This is less of a concern for data extracted via ETL tools or custom scripts, but for internal sources, such as static CSV files manually input by someone on your marketing team, you’re going to want to ensure that data is accurate (ideally before it hits your data warehouse). Not the end of the world if it does, but more of a nuisance than anything and something to look out for. - -:::tip Testing your data sources -Using dbt, data folks can run automated tests on their raw data that is loaded into their data warehouse via [sources](https://docs.getdbt.com/docs/build/sources). -::: - -## Conclusion - -Having no data extraction is the equivalent of a conductor not having an orchestra at their disposal: sad. Overall, data extraction in analytics engineering is the process of extracting data, usually via an automated ETL tool or script, for data sources that will later be loaded into a central data warehouse. There are some considerations to look at prior to the data extraction process, such as cost, viability, and PII concerns. - -## Further reading - -Ready to take a deeper dive into all things data extraction, ELT and dbt? Check out some of our favorite resources below: - -- [Glossary: ELT](https://docs.getdbt.com/terms/elt) -- [Glossary: ETL](https://docs.getdbt.com/terms/etl) -- [Four questions to help accurately scope analytics engineering projects](https://www.getdbt.com/blog/4-questions-to-help-you-more-accurately-scope-analytics-engineering-projects/) -- [Five principles that will keep your data warehouse organized](https://www.getdbt.com/blog/five-principles-that-will-keep-your-data-warehouse-organized/) diff --git a/website/docs/terms/data-lake.md b/website/docs/terms/data-lake.md deleted file mode 100644 index e1b75a616b9..00000000000 --- a/website/docs/terms/data-lake.md +++ /dev/null @@ -1,112 +0,0 @@ ---- -id: data-lake -title: Data lake -description: A data lake is a data management system used for storing large amounts of data in in its raw, native form as files. -displayText: data lake -hoverSnippet: A data lake is a data management system used for storing large amounts of data in in its raw, native form as files. Data lakes can store any type of data—structured, semi-structured, unstructured—in one centralized place. 
----
-
-
- Data lake: an integral addition to the MDS
-
-
-A data lake is a data management system used for storing large amounts of data in its raw, native form as files. Data lakes can store any type of data—structured, semi-structured, unstructured—in one centralized place. Several common data file formats that are widely being used today include CSV, JSON, XML, Parquet, and Avro. This makes the data lake a cost-effective and flexible storage container in contrast to the data warehouse, where data must be in a structured and tabular format. The primary use case of a data lake in many organizations is to serve as an initial staging area before data is ready to be transformed and aggregated in a data warehouse.
-
-## How do data lakes provide value?
-
-In the past, some organizations couldn’t store all their data in one centralized place because databases and data warehouses could only store structured, relational data. On top of that, data storage was once cost-prohibitive, hence data teams would have to filter and transform data volumes to smaller sizes first to be able to store them. These challenges have been addressed by cloud data lakes; they allow for scalability, flexibility, and cost savings—all of which are handled by the cloud platform itself.
-
-### Scalability
-
-Data lakes allow you to scale your storage up or down depending on how much data you need to store at a particular point in time. You no longer have to know and calculate upfront how much storage capacity you need because of the advent of cloud data lakes. In the past, setting up a new data lake involved considerable hardware configuration tasks. Now, all of this can be achieved in a few steps by hitting a few buttons on your web browser or by typing a few lines of code on your computer.
-
-### Flexibility
-
-At times, a data team might know data from a new source could be useful, but they might not know how it will be used yet. Data lakes offer a place to store this data without needing to build a use case for structuring or shaping it first. This is different from the approach that data warehouses take, which are optimized to store and analyze relational, structured data. In addition to the data lake’s ability to store raw, uncurated data, the advent of data lake query engines (ex. Athena, Dremio, Starburst, etc.) means that data analysts and data scientists can now perform exploratory data analysis (EDA) on top of a data lake using this layer of abstraction, without having to bring it into the data warehouse first.
-
-### Cost-effectiveness
-
-The rise of the data lake coincided with the cloud computing revolution. Data teams no longer had to worry about making massive upfront hardware investments for data storage. Instead, you pay a usage-based fee dependent on how much data you store and how many compute queries you run.
-
-### Modern approaches
-
-As mentioned earlier, storing data in the past was an expensive endeavor, therefore organizations had to curate and think through what type of data they brought into their data warehouse. This approach is called ETL (Extract-Transform-Load), where only transformed data ultimately gets stored and analyzed in a data warehouse or data lake.
-
-The ability to store tons of data in a cost-efficient and flexible way in the data lake gave rise to a new approach to processing data, a technique that aligns with the modern practice of analytics engineering—the Extract-Load-Transform (ELT) approach.
In this new process, data is immediately loaded to the destination data lake upon extraction from the source. The benefit of this approach is that it allows for flexibility and exploration of new business use cases which may or may not be known initially when data is ingested.
-
-## What are the drawbacks and challenges when using a data lake?
-
-For all of the advantages listed above, such as cost-effectiveness and flexibility, data lakes also come with several drawbacks and challenges.
-
-### Inability to do fast reporting
-
-Query performance and speed are one capability area where data warehouses typically trump data lakes. While structuring data first may seem inflexible and rigid at times, it is the right approach to implement when you have analyses and reports that you want to run frequently. The following are several query performance techniques that can only be applied to data warehouses:
-
-| Performance technique | Description | Practical scenario |
-|:---:|:---:|:---:|
-| Columnar storage | Data is physically organized in columns in a data warehouse rather than rows (in a database) or files (in a data lake) | Most analysis and reports require pulling only a subset of columns from an entire table. Columnar storage makes your queries run faster by retrieving only the relevant columns for your analysis |
-| Query caching | When a query is executed, the result is temporarily stored for a period of time | When someone executes a query on a table, the results of that table will be made available right away to the next user who executes the same query, significantly reducing computation time |
-| Data partitioning | Grouping similar data together based on selected table dimensions | Many organizations partition their tables based on a dimension that includes a date field. The reason for this is that most analyses only require pulling data on a rolling two-year period. If you want to calculate year-to-date sales this year and compare it to the same period last year, partitioning your data based on date will make your queries run faster by eliminating the need to scan through the entire table first |
-
-That being said, if you have a massive data set with a still undefined use case you want to explore, storing it in a data lake first and exploring it with an ad hoc query engine is the recommended approach.
-
-### Lack of fine-grained access control
-
-It is difficult to enforce fine-grained access control on your data when it's in its raw form. Fine-grained access control pertains to granting permissions to a particular subset of your data set by restricting access to certain rows and columns. These two concepts are known as column-level security and row-level security:
-
-- **Column-level security**: A bank may want to anonymize columns that contain personally identifiable information (PII) such as credit card numbers, social security numbers, and so on. To achieve this, analytics engineers use a variety of encryption functions available in their data warehouse.
-- **Row-level security**: Imagine a retailer with a massive table containing millions of rows of sales transactions across all 50 states in the US. These companies may want to dynamically enforce limited querying permissions for end-users based on which state they’re in. For example, when an analyst based in California starts querying the table, the data set would pre-filter itself to only show sales data from California, even if the analyst attempts to query the entire table.
This type of row-level data governance is typically better suited for data warehouses than data lakes. - -## Data lake use cases - -Organizations use data lakes for many different reasons. Most of these reasons ultimately tie back to the three primary benefits of cost-effectiveness, scalability, and flexibility summarized earlier. Below are common use cases that data lakes are able to achieve: - -### Data archiving and storage - -Data lakes can support cost-effective archiving of historical data that is no longer being actively used. Most organizations have data retention and lifecycle policies that indicate how business data should be stored and analyzed, where it is typically organized into three tiers: Hot, Warm, and Cold storage. As an example, a company may state that the past two years’ worth of data belongs in the hot tier, data from three to five years ago are in the warm tier, and anything beyond that in the cold tier. - -| Storage tier | Access pattern | Description | -|:---:|:---:|:---:| -| Hot | Data that is being used often | This is primarily the level in which data warehouses lie. At this level, data is highly structured and optimized for reporting and analytics. Data lakes may also lie at this tier to support machine learning and exploratory data analysis use cases | -| Warm | Data that is infrequently accessed | At this level, data is infrequently accessed and stored at a lower cost than in the hot tier. On some occasions, data may need to be transitioned back to the hot tier which cloud computing companies allow you to do with relative ease | -| Cold | Data stored for archiving purposes | Data in this tier is rarely accessed. Typically, cold data must be retained for regulatory and compliance purposes on a long-term basis, if not indefinitely. | - -### Data science and machine learning - -Because of a data lake’s ability to store any type of data format, it lends itself well to advanced analytics use cases, especially those that require the use of semi-structured and unstructured data that data warehouses traditionally don’t support. Some examples include: - -- **Sentiment analysis**: This is a technique that uses statistics and natural language processing (NLP) algorithms to determine the emotional meaning of communications. Organizations use sentiment analysis to evaluate customer reviews, call center interactions, social media posts, and other related content, all of which require the use of unstructured data sources (e.g. free-form text, audio recordings) -- **Predictive maintenance**: This is a common use case in the field of manufacturing, mining, and other heavy industries. Organizations take advantage of a data lake’s ability to store machine logs, sensor and telemetry data to predict the probability of a piece of equipment failing before it happens. This enables the company to make proactive actions to service the equipment, thus preventing defects and maximizing resource utilization. - -### Exploratory data analysis (EDA) - -Because you don’t need to impose a formal structure for how data is organized in a data lake, you can perform preliminary data exploration on that data, such as calculate summary statistics, discover anomalies and outliers, and plot data visualizations to derive preliminary insights. Commonly referred to as EDA, this is typically conducted as an initial step before formalizing a data science or machine learning use case. - -## Data lake vs. 
data warehouse
-
-| | Data lake | Data warehouse |
-|---|---|---|
-| Types of data | Structured, Semi-Structured, Unstructured | Structured |
-| Data stored in | Folders and files in raw format | Schemas and tabular data format |
-| Schema/schema definition | Store data in its raw format, transform the data later | Must know upfront |
-| Intended users | Data engineers, analytics engineers, data analysts, data scientists | Analytics engineers, data analysts, business analysts |
-| Common use cases | Data archiving and storage, data science and machine learning, exploratory data analysis | Business intelligence, dashboarding, reporting and analytics |
-
-## Data platforms that support data lake workloads
-
-| Data platform | Description |
-|:---:|:---:|
-| Cloudera | Cloudera Open Data Lakehouse is a platform that provides data lake flexibility and data warehouse performance in a single platform. |
-| Databricks | Databricks is a cloud-based collaborative data science, data engineering, and data analytics platform that brings the best of data warehouses and data lakes into a single unified platform. |
-| Dremio | Dremio is the data lakehouse platform built for SQL and built on open source technologies that both data engineers and data analysts love. Dremio powers BI dashboards and analytics directly on data lake storage. |
-| Snowflake | Snowflake is a fully-managed platform for data warehousing, data lakes, data engineering, data science, and data application development. |
-
-## Conclusion
-
-The data lake is the younger data management platform compared to its data warehouse counterpart. Because of its unique ability to hold large amounts of data in its native, raw format, it has allowed organizations to store all their data in a centralized place, even if they sometimes don’t have a definitive use case for the data yet. In addition, it serves as a great buffer and landing zone for data before it is ultimately transformed and aggregated in a data warehouse. Lastly, it has unlocked a world of new possibilities by enabling organizations to build data science and machine learning use cases on top of it. The data lake is an integral pillar in the Modern Data Stack and the practice of analytics engineering.
-
-## Additional reading
-- [Glossary: Data warehouse](/terms/data-warehouse)
-- [Glossary: ETL](/terms/etl)
-- [Glossary: ELT](/terms/elt)
-- [Glossary: EDW](/terms/edw)
diff --git a/website/docs/terms/data-lineage.md b/website/docs/terms/data-lineage.md
deleted file mode 100644
index 42217db40d8..00000000000
--- a/website/docs/terms/data-lineage.md
+++ /dev/null
@@ -1,116 +0,0 @@
----
-id: data-lineage
-title: What is data lineage?
-description: Data lineage provides a holistic view of how data moves through an organization, where it’s transformed and consumed.
-displayText: data lineage
-hoverSnippet: Data lineage provides a holistic view of how data moves through an organization, where it’s transformed and consumed.
----
-
-
- What is data lineage? And how do you get started?
-
-
-Data lineage provides a holistic view of how data moves through an organization, where it’s transformed and consumed. Overall, data lineage is a fundamental concept to understand in the practice of analytics engineering and modern data work.
- -At a high level, a data lineage system typically provides data teams and consumers with one or both of the following resources: - -- A visual graph (DAG) of sequential workflows at the data set or column level -- A data catalog of data asset origins, owners, definitions, and policies - -This holistic view of the data pipeline allows data teams to build, troubleshoot, and analyze workflows more efficiently. It also enables business users to understand the origins of reporting data and provides a means for data discovery. - -We’ll unpack why data lineage is important, how it works in the context of analytics engineering, and where some existing challenges still exist for data lineage. - -## Why is data lineage important? - -As a data landscape grows in size and complexity, the benefits of data lineage become more apparent. For data teams, the three main advantages of data lineage include reducing root-cause analysis headaches, minimizing unexpected downstream headaches when making upstream changes, and empowering business users. - -### Root cause analysis - -It happens: dashboards and reporting fall victim to data pipeline breaks. Data teams quickly need to diagnose what’s wrong, fix where things may be broken, and provide up-to-date numbers to their end business users. But when these breaks happen (and they surely do) how can teams quickly identify the root cause of the problem? - -If data teams have some form of data lineage in place, they can more easily identify the root cause of the broken pipeline or data quality issue. By backing out into the data models, sources, and pipelines powering a dashboard a report, data teams can understand all the upstream elements impacting that work and see where the issues lie. - -Will a data lineage or a DAG solve your breaking pipelines? Definitely not. Will it potentially make your life easier to find problems in your data work? Heck yes. - -### Downstream impacts on upstream changes - -You may have been here—your backend engineering team drops the `customers` table to create a newer, more accurate `users` table. The only bad thing is…[they forgot to tell the data team about the change](https://docs.getdbt.com/blog/when-backend-devs-spark-joy). - -When you have a data lineage system, you can visually see which downstream models, nodes, and exposures are impacted by big upstream changes such as source or model renaming or removals. Referring to your DAG or data lineage system before any significant change to your analytics work is a great way to help prevent accidental downstream issues. - -### Value to business users - -While data lineage makes it easier for data teams to manage pipelines, stakeholders and leaders also benefit from data lineage, primarily around promoting data transparency into the data pipelines. - -**Shared data literacy** - -New hires, existing team members, and internal data practitioners can independently explore a holistic view of the data pipeline with a data lineage system. For data teams using a DAG to encapsulate their data work, business users have a clear visual representation of how data flows from different sources to the dashboards they consume in their BI tool, providing an increased level of transparency in data work. At the end of the day, the added visibility makes it easier for everyone to be on the same page. 
-
-**Pipeline cleanliness**
-
-A visual graph (DAG) of how data flows through various workflows makes it easy to identify redundant loads of source system data or workflows that produce identical reporting insights.
-
-Spotlighting redundant data models can help trim down on WET (write every time/write everything twice) code, non-performant joins, and ultimately help promote reusability, modularity, and standardization within a data pipeline.
-
-Overall, data lineage and data-driven business go hand-in-hand. A data lineage system allows data teams to be more organized and efficient, business users to be more confident, and data pipelines to be more modular.
-
-## How does data lineage work?
-
-In the greater data world, you may often hear of data lineage systems based on tagging, pattern matching, or parsing. In analytics engineering, however, you’ll often see data lineage implemented in a DAG or through third-party tooling that integrates into your data pipeline.
-
-### DAGs (directed acyclic graphs)
-
-If you use a transformation tool such as dbt that automatically infers relationships between data sources and models, a DAG automatically populates to show you the lineage that exists for your [data transformations](https://www.getdbt.com/analytics-engineering/transformation/).
-
-
-
-Your DAG is used to visually show upstream dependencies, the nodes that must come before a current model, and downstream relationships, the work that is impacted by the current model. DAGs are also directional—they show a defined flow of movement and never form cycles or loops.
-
-Ultimately, DAGs are an effective way to see relationships between data sources, models, and dashboards. DAGs are also a great way to see visual bottlenecks, or inefficiencies in your data work (see image below for a DAG with...many bottlenecks). Data teams can additionally add [meta fields](https://docs.getdbt.com/reference/resource-configs/meta) and documentation to nodes in the DAG to add an additional layer of governance to their dbt project.
-
-
-
-:::tip Automatic > Manual
-
-DAGs shouldn’t be dependent on manual updates. Instead, your DAG should be automatically inferred and created with your data transformation and pipelines. Leverage tools such as dbt to build your own version-controlled DAG as you develop your data models.
-
-:::
-
-### Third-party tooling
-
-Data teams may also choose to use third-party tools with lineage capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), [Datafold](https://www.datafold.com/column-level-lineage), Metaphor, [Monte Carlo](https://docs.getmontecarlo.com/docs/dbt-cloud), [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud), or [Stemma](https://docs.stemma.ai/docs/stemma/getting-started/what-we-need-from-you/dbt-integration/). These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data lineage capabilities such as column-level or business logic-level lineage.
-
-## Data lineage challenges
-
-The biggest challenges around data lineage become more apparent as your data, systems, and business questions grow.
-
-### Data lineage challenge #1: Scaling data pipelines
-
-As dbt projects scale with data and organization growth, the number of sources, models, macros, seeds, and [exposures](https://docs.getdbt.com/docs/build/exposures) invariably grows.
And with an increasing number of nodes in your DAG, it can become harder to audit your DAG for WET code or inefficiencies. - -Working with dbt projects with thousands of models and nodes can feel overwhelming, but remember: your DAG and data lineage are meant to help you, not be your enemy. Tackle DAG audits in chunks, document all models, and [leverage strong structure conventions](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview). - -:::tip dbt project evaluator - -Is your DAG keeping up with best practices? Instead of manually auditing your DAG, the [dbt project evaluator package](https://github.com/dbt-labs/dbt-project-evaluator) can help audit your project and find areas of improvement. - -::: - -### Data lineage challenge #2: Column-level lineage - -Complex workflows also add to the difficulties a data lineage system will encounter. For example, consider the challenges in describing a data source's movement through a pipeline as it's filtered, pivoted, and joined with other tables. These challenges increase when the granularity of the data lineage shifts from the table to the column level. - -As data lineage graphs mature and grow, it becomes clear that column- or field-level lineage is often a needed layer of specificity that is not typically built in to data lineage systems. Learn more about the [column-level lineage](/docs/collaborate/column-level-lineage) feature in [dbt Explorer](https://www.getdbt.com/product/dbt-explorer) and how it can help you gain insights. - -## Conclusion - -Data lineage is the holistic overview of how data moves through an organization or system, and is typically represented by a DAG. Analytics engineering practitioners use their DAG and data lineage to unpack root causes in broken pipelines, audit their models for inefficiencies, and promote greater transparency in their data work to business users. Overall, using your data lineage and DAG to know when your data is transformed and where it’s consumed is the foundation for good analytics work. - -## Further reading - -DAGs, data lineage, and root cause analysis…tell me more! Check out some of our favorite resources of writing modular models, DRY code, and data modeling best practices: - -- [Glossary: DRY](https://docs.getdbt.com/terms/dry) -- [Data techniques for modularity](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) -- [How we structure our dbt projects](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) diff --git a/website/docs/terms/data-warehouse.md b/website/docs/terms/data-warehouse.md deleted file mode 100644 index cf6f5de3d20..00000000000 --- a/website/docs/terms/data-warehouse.md +++ /dev/null @@ -1,89 +0,0 @@ ---- -id: data-warehouse -title: Data warehouse -description: How have data warehouses evolved over the last 40 years? Explore the nuanced changes in use case since Inmon first coined the term. -displayText: data warehouse -hoverSnippet: A data warehouse is a data management system used for data storage and computing that allows for analytics activities such as transforming and sharing data. ---- - - - Data warehouses in the modern data stack - dbt Labs - - -A data warehouse is a data management system used for data storage and computing that allows for analytics activities such as transforming and sharing data. It helps businesses to capture and store data from external sources. 
Analytics engineers and data analysts use it to query datasets using SQL, helping to transform them into powerful data models and reports. Data warehouses are the central source of truth for any modern data stack. Data is ingested, transformed, and shared to other tools from the warehouse. - -There are two main types of data warehouses — on-prem warehouses and cloud warehouses. An on-prem data warehouse is a physical location where companies need to maintain hardware and software in order to store data. A cloud data warehouse is available anywhere and doesn’t include a physical location that you need to access. In this arrangement, you pay to use the storage space and compute power that is provided and maintained by another company. - -## History of data warehouses - -While data has been stored throughout history, it wasn’t until the 1980s that technology began to accelerate and the first official data warehouse was created. It was an on-prem warehouse consisting of a lot of computer processing and storage towers, taking up a lot of space. As you can imagine, this caused a lot of problems. It not only took up a lot of physical space, but employees had to maintain the hardware and software of these warehouses. This quickly became expensive and unrealistic for smaller companies without the budget or space. - -When Amazon began scaling their on-prem data warehouses to support their business, they noticed an opportunity to sell compute capacity to other businesses in order to save costs. This is when Redshift, Amazon’s cloud data warehouse product, came to be. Shortly after, other tech giants like Google and Microsoft who were also building data infrastructure followed suit. - -Now, you can be anywhere and access the power of an online warehouse. You no longer need to maintain the infrastructure yourself but can pay a company to do this for you. This is cheaper and allows for faster data capabilities. - - -## Why businesses need data warehouses - -Data warehouses were once unrealistic due to the costs associated with them. Now that cloud warehouses make them available to nearly everyone, they have a plethora of benefits to offer businesses. Cloud warehouses allow for scalability, availability, cost savings, and increased security- all of which are handled by the provider themself. - -### Scalability - -Data warehouses allow you to scale computing up or down depending on how fast you need your transformations to run and how much you are willing to spend. You can turn computing resources on or off as well in order to save on costs. - -### Availability - -Data warehouses are always available. While latency may vary based on source and destination locations, your data can be accessed anywhere, at any time. This is ideal for the remote culture that we are currently living in, where anyone can work from anywhere. - -### Cost savings - -Because you no longer need to maintain all of the infrastructure, you can save on costs related to maintenance. Because the data warehouse companies manage so much data, they are able to unlock cost-savings that you wouldn’t be able to otherwise. - -### Security - -Data warehouses offer advanced security features that ensure your data is always secure. They often directly handle certain compliance strategies needed with healthcare and financial data, eliminating the need for you to do this yourself. They also have features such as roles and users which help you control who has access to your data. But we will get into this more later. 
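-
-As a quick preview before we get there, here's a minimal sketch of what those roles and grants can look like. The syntax below is Snowflake-flavored, and the role, user, database, and schema names are made up purely for illustration:
-
-```sql
---- Create a role for analysts and a user who will use it
-CREATE ROLE analyst;
-CREATE USER jane DEFAULT_ROLE = analyst;
-GRANT ROLE analyst TO USER jane;
-
---- Let the role read reporting tables, and nothing else
-GRANT USAGE ON DATABASE analytics TO ROLE analyst;
-GRANT USAGE ON SCHEMA analytics.marts TO ROLE analyst;
-GRANT SELECT ON ALL TABLES IN SCHEMA analytics.marts TO ROLE analyst;
-```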
-
-## Potential business use cases
-
-Businesses can leverage data warehouses for many different reasons. Most of these reasons end up saving time and money for the business, whether directly or indirectly.
-
-### Consolidating all of your data in one place
-
-First, a data warehouse acts as a single source of truth for all of your data. Rather than having all of your data spread across different platforms, it is available to you in one place. This allows you to standardize all of your core metrics and data definitions, rather than depending on the metrics calculated by platforms like Google and Facebook. If you find that different metrics don’t align across platforms, a data warehouse acts as a dependable source for the right metric. Rather than relying on external platforms, you now have one that centralizes all of your data.
-
-Not to mention, you will save your analytics engineer and data analyst a few headaches. They would otherwise have to manually pull needed data from various sources. Not having a single source of truth decreases your data quality, wastes your data team’s precious time, and makes it difficult to combine data from different sources.
-
-### Ability to control who has access and the type of access they have
-
-Data warehouses have extensive security features which allow you to control who has access to what. You can give someone as little or as much permission as you wish. Warehouses give you the ability to create users and assign them roles. Each role has its own set of permissions governing which databases and tables it can see. Then, you can also choose who is allowed to query those tables or even update and delete them.
-
-When anyone in your organization can easily access your data, bad things can happen. You risk the potential of important data being deleted, incorrectly edited, or inappropriately accessed. Data warehouse users, roles, policies, and security measures can help ensure data is in the right hands of the right people.
-
-### Fast reporting
-
-Because all of your data is located in the same place, it allows for faster reporting compared to pulling data from many different sources. A central location allows you to quickly access and query millions of rows of data, allowing transformations and reporting to be done much faster.
-
-## Data platforms that support data warehousing workloads
-
-| **Data platform** | **Description** |
-|---|---|
-| Snowflake | Snowflake is a fully-managed platform for data warehousing, data lakes, data engineering, data science, and data application development. |
-| Databricks | Databricks is a cloud-based collaborative data science, data engineering, and data analytics platform that combines the best of data warehouses and data lakes into a lakehouse architecture. |
-| Google BigQuery | Google BigQuery is a serverless, highly scalable data warehouse that comes with a built-in query engine. |
-| Amazon Redshift | Amazon Redshift is a fully-managed petabyte-scale cloud-based data warehouse designed for large scale data set storage and analysis. |
-| Postgres | PostgreSQL is an advanced, enterprise-class open source relational database that supports both SQL (relational) and JSON (non-relational) querying. |
-
-## Data warehouse vs data lake
-
-A data lake is a system where you store, process, and query unstructured, semi-structured, and structured data at almost any scale. The main difference between a data warehouse and a data lake is the type of data stored and the way it is stored.
Data warehouses contain structured data that is meant to organize data for analytics use. Data lakes can contain pretty much any kind of data—structured or unstructured—and data is usually left in its raw format until it's ready to use. Compare that to data warehouses, whose primary goal is to be a place for data teams to store both raw and transformed, usable data. - -## Conclusion - -Data warehouses have come a long way [in the last 40 years](https://www.getdbt.com/blog/future-of-the-modern-data-stack/). They began as a physical location with huge costs associated with them to a system available to anyone, anywhere at an affordable cost. They have the power to centralize all of your business’s data, allowing for faster analytics operations, standardized KPIs, and a single source of truth. All businesses need a data warehouse in order to operate quickly and efficiently with data that they can rely on. The question isn’t whether or not you need a data warehouse, but which data warehouse you should choose. Make a list of the key features needed for your business and use that to assess the options above. - -## Additional reading - -- [Operational analytics](https://www.getdbt.com/analytics-engineering/use-cases/operational-analytics/) -- [Glossary: ETL](https://docs.getdbt.com/terms/etl/) -- [Glossary: ELT](https://docs.getdbt.com/terms/elt/) - diff --git a/website/docs/terms/data-wrangling.md b/website/docs/terms/data-wrangling.md deleted file mode 100644 index 46a14a25949..00000000000 --- a/website/docs/terms/data-wrangling.md +++ /dev/null @@ -1,166 +0,0 @@ ---- -id: data-wrangling -title: Data wrangling -description: Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data. -displayText: data wrangling -hoverSnippet: Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data. ---- - - - Data wrangling: the workflow that bred analytics engineers - - -Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. For analytics engineers, you may know this better by the name of data cleaning. In data science or machine learning, "wrangling" often refers to prepping the data for model creation. - -The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data in a downstream model without worrying about basic cleaning like renaming, datatype casting, etc. Data wrangling acts as preparation for the development of [intermediate, fct/dim, or mart data models](/best-practices/how-we-structure/1-guide-overview) that form the base layer that other data work can be built off of. Analytics engineers tend to do data wrangling work in the staging layer as a first transformation step after loading the data. This eliminates a foundational step done by an analytics engineer or analyst when building a downstream data model or dashboard. - -## Data wrangling steps - -The data wrangling *structured* process includes data discovery, structuring, cleaning, enriching, validating, and publishing. While this is the general workflow, there isn't one definitive workflow. 
This will vary depending on the transformation tool you’re using and your specific use case.
-
-### Data discovery
-
-Data discovery involves getting to know the data that you are working with. This involves looking at key statistical measures of your dataset. Some of these include:
-
-- Row count
-- Number of columns
-- Column data types
-- Distribution of column values
-- Number of duplicate rows
-- Number of nulls
-
-Oftentimes, data warehouses have a preview capability so data team members can easily see a table’s makeup (column name, type, row count, etc.), but functions such as `SUM()` and `COUNT()` will come in handy for finding these values. You can use the `GROUP BY` statement with these functions to find the counts of certain rows for different categories of data. In addition, you’ll want to identify primary keys, check for duplicates of primary keys, and ensure every row of data has a column that can act as a primary key!
-
-### Structuring
-
-Structuring your data is a type of transformation that involves reformatting and reorganizing your data so that it is stored in a way that makes the values usable. This could mean rearranging how the data is displayed in columns and rows. Chances are you are using an ELT tool to ingest your data, so the data is likely in a tabular format and you won’t need to do that much restructuring. If your data is structured, you really only need to worry about nested data types such as JSON data. When structuring your data, you want to ask yourself these questions:
-
-- Is your data in the format you need to perform analysis on it? Does your data need to be potentially unnested? *Should you nest or objectize columns together?*
-- Do the column names and values look correct for your use case?
-
-If your data is not in a format that is usable, you can look into different solutions such as pivoting or using different functions to unpack lists and JSON files so that they are in a tabular format. Pivoting is helpful because it allows you to change the way your dataset is structured by rearranging the way columns, rows, and their values are displayed. dbt has a [pre-built macro](https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/pivot.sql) that makes pivoting less of a headache and more of a breeze.
-
-### Cleaning
-
-The cleaning stage involves using different functions so that the values in your data tables are usable in your models and reports. The majority of the work done in staging models is this type of cleaning, which includes:
-
-- Datatype casting
-- Lower/upper casing string values
-- Converting timestamps
-- Aliasing/column renaming
-- Removing appropriate duplicates or nulls you found in the discovery process
-- Eliminating unnecessary characters or spaces from values
-
-Certain cleaning steps, like removing rows with null values, are helpful to do at the beginning of the process because removing nulls and duplicates from the start can increase the performance of your downstream models. In the cleaning step, it’s important to follow a standard for your transformations. This means you should be following a consistent naming convention for your columns (especially for your primary keys) and casting to the same timezone and datatypes throughout your models. Examples include making sure all dates are in UTC time rather than source timezone-specific, all strings are in either lower or upper case, etc.
-
-:::tip dbt to the rescue!
-If you're struggling to do all the cleaning on your own, remember that dbt packages ([dbt expectations](https://github.com/calogica/dbt-expectations), [dbt_utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/), and [re_data](https://www.getre.io/)) and their macros are also available to help you clean up your data.
-:::
-
-### Enriching
-
-Enriching your data means enhancing it by supplementing incomplete or missing data. This could involve basic case or coalesce statements that use an already existing column in order to produce a new column. It could also look like joining an already existing date column with a date table that contains more extensive information about a certain date. Keep in mind that you don’t want to go overboard with enriching or joining here—you only want to add what will be repeatedly used in modeling and analysis.
-
-:::tip Python for enrichment?
-With the new capability of [Python in dbt](/docs/build/python-models), will folks start using Python to help enrich their data? Only time will tell, but we’re eager to hear how you want to be using Python in dbt. Please join the [#dbt-core-python-models channel](https://www.getdbt.com/community/join-the-community/) to join in on the discussions happening around them.
-:::
-
-### Validating
-
-Validating data is the process of ensuring that the changes you just made to a dataset during your transformation are accurate. At this stage, you may be asking yourself:
-- Are the primary keys still unique? Are there the same number of primary keys in this transformed table as in my upstream sources?
-- Has the relationship with the upstream table(s) changed at all, or is it still 1-1? If not, is that expected behavior?
-- Has the distribution of column values changed at all? Are column values even correct?
-- Did I select the correct columns I want present at this stage?
-
-To answer these questions, you'll likely find yourself looking for and counting nulls, rows, duplicates, and primary keys. You'll likely reference upstream models regularly in this phase to ensure your transformation code is accurate and doing what you intended it to do.
-
-Validation is always a little manual, but [dbt tests, macros, and packages](#validating-1) can help make your data validation a little easier 😉 .
-
-### Publishing
-
-The last step of the data wrangling process is publishing. In analytics engineering, we typically refer to this as “pushing to production”. This essentially means that you are making the data models available to use in downstream data models, dashboards, and reports. This additionally means pushing the code changes for these staging models to the main branch in your git repository. For non-ephemeral models, the process of publishing could be as simple as running a query as a view, creating a table in your production schema, or running dbt Cloud in production for table recreation.
-
-CI/CD jobs are often used as part of the publishing process to test and lint code before it is pushed to production. This helps to ensure changes made are actually reliable and safe to merge. CI/CD is a best practice because it allows data models to be updated quickly and efficiently, ensuring no downstream models are impacted.
-
-When pushing to production, you want to make sure these data models are accessible by those building the models and reports. This may mean you have to play around with users, roles, and permissions in your data warehouse. Your transformation tool should have read access to these tables.
Additionally, you could use dbt grants to apply these permissions directly at build time.
-
-## Data wrangling benefits
-
-Why should you spend all of that time doing relatively tedious and repetitive work? Well, there are a number of benefits that can make the slog worth it. Those benefits include:
-
-- Increased data quality
-- Increased data usability/modularity
-- More standardization
-- Deeper understanding of data
-- Potential performance improvements on downstream models
-
-### Increased data quality
-
-Data wrangling increases the overall quality of your code and the data it produces. Because the cleaning is already done and validated, you don’t have to worry about someone forgetting to clean or standardize a dataset downstream and using messy or inconsistent data.
-
-### Increased data usability/modularity
-
-Because data is wrangled once when it is ingested into the data warehouse, analytics engineers don’t need to constantly reclean and transform source data from its origin and can instead follow DRY practices. Wrangled data allows them to use clean and modular models repeatedly throughout their work.
-
-### Standardization
-
-When data is wrangled, it is matched against a set of standards that your data team establishes and then applies to all datasets. It ultimately creates consistent staging layers for analytics engineers to build their intermediate, fct/dim, and mart models. Data team members don’t need to worry about upholding standards in downstream models because this is already done when the data is first ingested.
-
-### Deeper understanding of data
-
-By first wrangling or cleaning data, you get to learn about the data’s intricacies in the process. Though manual, this process allows you to find issues in the data and understand them deeply before using them in downstream processes. This minimizes potential problems that can go unnoticed because you’ve already explored and validated the datasets. It also helps you understand how tables can be joined together downstream.
-
-Additionally, this initial data exploration and transformation helps you collaborate better with [backend application developers](https://docs.getdbt.com/blog/when-backend-devs-spark-joy) or data engineers to work on formatting the raw data in a format that is most appropriate for analytics work.
-
-### Potential performance improvements on downstream models
-
-Lastly, data wrangling allows for potential performance improvements in downstream models. Because you’ve cleaned the data and potentially removed duplicates and null values, models will be quicker to run.
-
-## Data wrangling in SQL
-
-SQL is the most common language for data wrangling. While you can wrangle data using other languages, such as Python, SQL is the most common (and straightforward) language used for data wrangling and transformation in relational databases. Let’s look at some of the most common SQL functions for each of the data wrangling steps.
-
-### SQL cleaning
-
-- `CAST` is commonly used to cast values in a column to a specified data type.
-
-- `CONVERT_TZ` can be used to convert values in a column to a specific timezone.
-
-- `LOWER`/`UPPER` is used to uppercase or lowercase string values.
-
-- `TRIM` can remove leading or trailing characters in strings, making string functions easier to use downstream or more consistent across tables.
-
-- `REPLACE` replaces a specified character in column values.
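-
-As a rough illustration, here is how a few of these functions might come together in a single cleaning query. The table and column names below are made up, and the exact string and timezone functions available will vary by data warehouse:
-
-```sql
-select
-    cast(order_id as integer) as order_id,          -- CAST to enforce a datatype
-    trim(lower(customer_email)) as customer_email,  -- TRIM + LOWER to standardize strings
-    replace(phone_number, '-', '') as phone_number, -- REPLACE to strip unwanted characters
-    upper(state_code) as state_code                 -- UPPER for consistent casing
-from raw_orders
-```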
-
-You can also use custom-built macros, such as those from a dbt package called [re_data](https://hub.getdbt.com/re-data/re_data/latest/), to clean columns using SQL.
-
-### Enriching
-
-Enriching data using SQL can often involve the use of functions, such as:
-
-- `CASE` statements allow you to replace values using “when-then” statements. They end with an “else” statement to catch the values that don’t fall in any of the “when-then” statements.
-- `IFNULL` replaces any null values in a column with whatever value you specify.
-- `COALESCE` returns the first non-null value from a list or column that you give it. This function is useful for replacing null values with one that you specify or coalescing multiple column values together.
-
-### Structuring
-
-Pivot tables come in handy when restructuring your data. You can use them to make your column names your values and vice versa. dbt has a [macro](https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/pivot.sql) built out that allows you to completely customize and pivot your tables without having to write overly complicated code.
-
-For nested data types such as JSON, you’ll want to check out the JSON parsing and extraction functions of your data warehouse to help work with this data.
-
-### Validating
-
-dbt offers [generic data tests](/docs/build/data-tests#more-generic-data-tests) in every dbt project that allow you to validate accepted, unique, and null values. They also allow you to validate the relationships between tables and that the primary key is unique.
-
-If you can’t find what you need with the generic tests, you can download an additional dbt testing package called [dbt_expectations](https://hub.getdbt.com/calogica/dbt_expectations/0.1.2/) that dives even deeper into how you can test the values in your columns. This package has useful data tests like `expect_column_values_to_be_in_type_list`, `expect_column_values_to_be_between`, and `expect_column_value_lengths_to_equal`.
-
-## Conclusion
-
-You could argue that data wrangling is one of the most important parts of an analytics engineer's job. It increases data quality, makes your data usable, standardizes it, increases your understanding, and improves performance. None of this would be possible without the data discovery, structuring, cleaning, enriching, validating, and publishing steps that make up the wrangling process.
-
-## Further reading
-
-- [Our favorite SQL functions](https://www.getdbt.com/sql-foundations/top-sql-functions/)
-- [Glossary: Data warehouse](/terms/data-warehouse)
-- [Glossary: Primary key](/terms/primary-key)
-- [Glossary: JSON](/terms/json)
diff --git a/website/docs/terms/dataframe.md b/website/docs/terms/dataframe.md
deleted file mode 100644
index e91b5d59cf6..00000000000
--- a/website/docs/terms/dataframe.md
+++ /dev/null
@@ -1,107 +0,0 @@
----
-id: dataframe
-title: DataFrame
-description: A DataFrame is a way of storing and manipulating tabular data in Python. They gained popularity first as a part of R and then as a part of pandas.
-displayText: dataframe
-hoverSnippet: A DataFrame is a two-dimensional data structure (rows and columns). It's the most common way of representing and interacting with large datasets in Python.
----
-
- What is a DataFrame in Python? - dbt Labs
-
-
-A DataFrame is a way of storing and manipulating tabular data in Python. DataFrames are often likened to tables with columns and rows that you could find in any Google Sheet or Excel workbook.
-
-A DataFrame entry in an analytics engineering glossary…what is happening?
You’re reading this right. While SQL is the go-to programming language for most analytics engineering work, there are inevitable situations where you’ll find yourself writing some Python and using DataFrames.
-
-While DataFrames are also used in other languages for data processing, such as R and Scala, the focus of this glossary page will be on Python DataFrames, their use cases, and their relation to analytics engineering work.
-
-## How DataFrames work
-
-DataFrames have a long history ([going back to 1990](https://towardsdatascience.com/preventing-the-death-of-the-dataframe-8bca1c0f83c8#:~:text=The%20earliest%20%E2%80%9Cdataframe%E2%80%9D%2C%20originally,Hastie%20in%201992%20%5B1%5D)!), but gained popularity first as a part of R and then as a part of [pandas](https://pandas.pydata.org/), an open source Python library of useful data analysis and manipulation tools. To work with DataFrames in Python, folks typically need to import the pandas library in the beginning of their script, `.py` file, or Python notebook with the conventional `import pandas as pd`.
-
-One of the strengths of DataFrames lies in their ability to take data in its original form (ex. array, list, JSON, parquet, dictionary) and form a tabular (rows and columns) format out of it. Once this data is in a tabular format, you can apply functions and packages to that data to clean, transform, and enrich it.
-
-Below is an example creation of a Python DataFrame from a list and some light enrichment on it:
-
-```python
-import pandas as pd
-
-def is_credit_card_purchase(x):
-    if x == 'credit_card':
-        return True
-    else:
-        return False
-
-jaffle_shop_orders = [[1, 1, 'credit_card', 1000], [2, 2, 'credit_card', 2000], [3, 3, 'coupon', 100]]
-orders_df = pd.DataFrame(jaffle_shop_orders, columns=['unique_id', 'order_id', 'payment_method', 'amount'])
-orders_df.set_index(['unique_id'], inplace=True)
-orders_df['is_credit_card'] = orders_df['payment_method'].apply(is_credit_card_purchase)
-
-print(orders_df)
-```
-
-This script will return an `orders_df` DataFrame that looks like this:
-
-| unique_id | order_id | payment_method | amount | is_credit_card |
-|---|---|---|---|---|
-| 1 | 1 | credit_card | 1000 | True
-| 2 | 2 | credit_card | 2000 | True
-| 3 | 3 | coupon | 100 | False
-
-:::info A note on Python flavors
-If you’re running Python in Snowflake via Snowpark, you would typically be working with [Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/python/working-with-dataframes.html) or pandas DataFrames. Folks running Python on Google BigQuery or Databricks can use either pandas or [PySpark DataFrames](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html). There might be slight syntax differences between the different Python flavors of Snowpark, PySpark, and pandas, but much of the functionality remains the same.
-:::
-
-It’s also possible and common practice to string together a number of DataFrame transformations. For example, if `df` represents a DataFrame containing one row per person living in the Eastern United States over the last decade, you can calculate the number of people living in Philadelphia each year (shown here with PySpark-style syntax):
-
-```python
-from pyspark.sql.functions import count
-philly_population_by_year = (
-    df.filter("city == 'Philadelphia'")        # keep only Philadelphia residents
-      .groupBy("year")                         # one group per year
-      .agg(count("name").alias("population"))  # count people in each year
-)
-```
-
-In most distributed frameworks, these transformations are evaluated "lazily."
Rather than performing each transformation, calculating its results, and storing those results, the framework develops a *plan* for how it *will* perform those calculations. When you want to *act* on the transformed DataFrame—see the top 10 results, or write it back to a table in the database—then the framework's optimizer calculates the most efficient way to deliver those results, based on all the steps you have defined.
-
-If you're familiar with SQL, you can think of a DataFrame like a `select` statement, and each new DataFrame operation as a separate CTE.
-
-You can write a long SQL query containing many complex CTEs. When you run the query with `limit 10` to see a sample of its results, or create that query as a table in the database (what dbt does when it runs your model), the data warehouse optimizes your query and produces the results in the most efficient way possible.
-
-## DataFrame use cases
-
-You could probably write hundreds of pages on DataFrame use cases and examples, but at their core, DataFrames, *in the context of analytics engineering*, are often used to manipulate data outside of SQL capabilities, work with data during API extraction, and leverage data science and machine learning.
-
-### Enrichment and manipulation of data outside of SQL capabilities
-
-Let’s just say it: there are a lot of things you can do in Python that you could also do in SQL and vice versa, but Python packages typically win out when it comes to data enrichment. A typical use case for Python DataFrames is the ability to apply Python libraries or functions to data in the DataFrame.
-
-In practice, this could look like applying an [IP parser](https://pypi.org/project/ipparser/) to an IP address column, using a package to determine whether a [date falls on a holiday](/docs/build/python-models#using-pypi-packages), or leveraging [numpy](https://numpy.org/) for performant and complex mathematical computations.
-
-:::tip dbt x Python DataFrames
-dbt supports the use of beta [Python models in dbt](/docs/build/python-models). What does this mean exactly? This means that Python-defined data transformations can be created and used in a dbt project in the same vein as a classic dbt SQL model. These Python models are incredibly new and the team is eagerly looking for feedback on how folks want to use and ritualize them.
-:::
-
-### Manipulation of data during extraction and loading scripts
-
-It’s not the most pleasant of experiences, but as an analytics engineer, you’re going to find yourself writing a hacky Python script at one point to extract data from a system or API that doesn’t have an innate connector in an [ETL tool](https://docs.getdbt.com/terms/elt#elt-tools).
-
-As you unpack and unnest the JSON received from these API endpoints, you’ll likely use DataFrames to make your data (and life) a little easier to work with. We won’t go into great depth here since this probably won’t happen too often in your career as an analytics engineer, but it’s beneficial to understand the basics of DataFrames and working with [requests, JSON, and DataFrames](https://stackoverflow.com/questions/42518864/convert-json-data-from-request-into-pandas-dataframe).
-
-### Data science and machine learning
-
-If SQL is an analytics engineer’s oven, Python is a data scientist's stovetop. Data scientists and machine learning engineers often use Python and DataFrames to perform exploratory analysis, feature engineering and data preparation, and the application of models and algorithms on datasets.
Understanding and using DataFrames is step 1 (of many steps) to becoming a data person that can create meaningful data science and machine learning models. - -All this data science and machine learning talk…“But, I’m an analytics engineer,” you say adamantly. One of the great, beautiful, and sometimes frustrating qualities about analytics engineers is their jack-of-all-trades-ness. You can transform data in your sleep, talk ROI and CPAs all day with your VP of marketing, and use git like you studied computer science in college—what can’t you do?? You’ve probably experimented with a predictive analytics model, some light forecasting, or sentiment analysis at one point in your data journey. You may not be interested in making the conversion to full-fledged data scientists or machine learning engineer, but enjoy a challenge from time to time. - -There’s a reason data warehouses and platforms like Snowflake, BigQuery, and Databricks are providing support for Python: because folks are asking for it. There are endless use cases for Python and DataFrames that fall outside of data science and machine learning work, but as you start working and feeling more comfortable in Python, you may be tempted to start experimenting with these different forms of data work. And the world’s your oyster, right? - -## Conclusion - -A DataFrame is a tabular data storage format in Python that is widely used across different roles in the data world. Since a DataFrame stores data in rows and columns, similar to how analytics engineers manipulate tables stored in data warehouses, data folks can transform, engineer, and enrich data in DataFrames using Python and Python packages. Analytics engineers may find themselves using DataFrames when they’re extracting data via APIs, enriching data with third-party packages, or experimenting with data science and machine learning models. - -## Further reading - -Are you ready to dip your toes in DataFrames, Python, and dbt? Check out some of the resources below to learn more about how dbt is embracing Python: - -- [Python models in dbt](/docs/build/python-models) -- #beta-feedback-python-models Slack channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/) -- [Best practices for developing Python models in dbt discussion](https://github.com/dbt-labs/docs.getdbt.com/discussions/1811) \ No newline at end of file diff --git a/website/docs/terms/ddl.md b/website/docs/terms/ddl.md deleted file mode 100644 index c4324e75fa9..00000000000 --- a/website/docs/terms/ddl.md +++ /dev/null @@ -1,128 +0,0 @@ ---- -id: ddl -title: DDL -description: Data Definition Language (DDL) is a group of SQL statements that you can execute to manage database objects, including tables, views, and more. -displayText: DDL -hoverSnippet: Data Definition Language (DDL) is a group of SQL statements that you can execute to manage database objects, including tables, views, and more. ---- - - - What is Data Definition Language (DDL) in SQL? - - -Data Definition Language (DDL) is a group of SQL statements that you can execute to manage database objects, including tables, views, and more. Using DDL statements, you can perform powerful commands in your database such as creating, modifying, and dropping objects. DDL commands are usually executed in a SQL browser or stored procedure. - -DDL is contrasted with Data Manipulation Language (DML) which is the SQL that is used to actually access and manipulate data in database objects. 
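As a rough sketch of that contrast (the table and column names here are just illustrative placeholders, not from any particular project):

```sql
-- DDL: changes the structure of a database object
ALTER TABLE customers ADD COLUMN last_initial varchar(1);

-- DML: reads or changes the data stored inside that object
UPDATE customers SET last_initial = 'W' WHERE first_name = 'Henry';
```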
The majority of data analysts will rarely execute DDL commands and will do the majority of their work creating DML statements to model and analyze data. - -:::note Note -Data folks don’t typically write DDL [since dbt will do it for them](https://docs.getdbt.com/docs/about/overview#:~:text=dbt%20allows%20analysts%20avoid%20writing,dbt%20takes%20care%20of%20materialization.). -::: - -To be honest, DDL is definitely some of the drier content that exists out there in the greater data world. However, because DDL commands are often uncompromising and should be used with caution, it’s incredibly important to understand how they work and when they should be used. We hope you can use this page to learn about the basics, strengths, and limitations of DDL statements. - -## Types of DDL Statements - -DDL statements are used to create, drop, and manipulate objects in your database. They are often, but not always, unforgiving and irreversible. “With great power comes great responsibility,” is usually the first thing I think of before I execute a DDL command. We’ll highlight some of the primary DDL commands that are used by analytics engineers below. - -:::important Important -The syntax for DDL commands can be pretty database-specific. We are trying to make this glossary page as generic as possible, but please use the “Further Reading” section to see the specifics on how the following DDL commands would be implemented in your database of interest! -::: - -### ALTER - -Using the `ALTER` DDL command, you can change an object in your database that already exists. By "change", we specifically mean you can: - -- Add, remove, and rename columns in views and tables -- Rename a view or table -- Modify the structure of a view or table -- And more! - -The generic syntax to use the ALTER command is as follows: - -```sql -ALTER <database_object> <object_name> <action>; -``` - -To alter a table’s column, you may do something like this: - -```sql -ALTER TABLE customers RENAME COLUMN last_name TO last_initial; -``` - -In this example, you rename the `last_name` column [in jaffle_shop’s](https://github.com/dbt-labs/jaffle_shop) `customers` table to be called `last_initial`. - -### DROP - -The `DROP` command. Probably the most high-stakes DDL statement one can execute. One that should be used with the *utmost* of care. At its core, an executed `DROP` statement will remove that object from the database. You can drop tables, views, schemas, databases, users, functions, and more. - -Some data warehouses such as Snowflake allow you to add restrictions to `DROP` statements to caution you about the impact of dropping a table, view, or schema before it’s actually dropped. In practice, we recommend you never drop raw source tables as they are often your baseline of truth. Your database user also usually needs the correct permissions to drop database objects. - -The syntax to use the `DROP` command is as follows: - -```sql -DROP <database_object> <object_name>; -``` - -You can drop your `customers` table like this: - -```sql -DROP TABLE customers; -``` - -### CREATE - -With the `CREATE` statement, you can create new objects in your data warehouse. The most common objects created with this statement are tables, schemas, views, and functions. Unlike `DROP`, `ALTER`, and `TRUNCATE` commands, there’s little risk with running `CREATE` statements since you can always drop what you create. - -Creating tables and views with the `CREATE` command requires a strong understanding of how you want the data structured, including column names and data types.
Using the `CREATE` command to establish tables and views can be laborious and repetitive, especially if the schema objects contain many columns, but is an effective way to create new objects in a database. After you create a table, you can use DML `INSERT` statements and/or a transformation tool such as dbt to actually get data in it. - -The generic syntax to use the `CREATE` command is as follows: - -```sql -CREATE <database_object> <object_name>; -``` - -Creating a table using the `CREATE` statement may look something like this: - -```sql -CREATE TABLE prod.jaffle_shop.jaffles ( - id varchar(255), - jaffle_name varchar(255), - created_at timestamp, - ingredients_list varchar(255), - is_active boolean -); -``` - -Note that you had to explicitly define column names and column data types here. *You must have a strong understanding of your data’s structure when using the CREATE command for tables and views.* - -### TRUNCATE - -The `TRUNCATE` command will remove all rows from a table while maintaining the underlying table structure. The `TRUNCATE` command is only applicable for table objects in a database. Unlike `DROP` statements, `TRUNCATE` statements don’t remove the actual table from the database, just the data stored in it. - -The syntax to use the `TRUNCATE` command is as follows: - -```sql -TRUNCATE TABLE <table_name>
; -``` - -You can truncate your jaffle_shop’s `payments` table by executing this statement: - -```sql -TRUNCATE TABLE payments; -``` - -Previously, this table was 113 rows. After executing this statement, the table is still in your database, but now has zero rows. - -## Conclusion - -DDL statements allow you to remove, edit, and add database objects. Some of the most common DDL statements you’ll execute include `CREATE`, `DROP`, `COMMENT`, `ALTER`, and more. DDL commands are typically executed in a SQL browser or stored procedure. Ultimately, DDL commands are all-powerful and potentially high-risk and should be used with the greatest of care. In the case of DDL, **do not** throw caution to the wind… - -## Further reading - -For database-specific DDL resources, check out the following: - -- [DDL commands in Snowflake](https://docs.snowflake.com/en/sql-reference/sql-ddl-summary.html) -- [SQL commands in Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/c_SQL_commands.html) (contains DDL) -- [DDL statements in Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language) -- [DDL statements in Databricks](https://docs.databricks.com/sql/language-manual/index.html#ddl-statements) -- [DDL in Amazon Athena](https://docs.aws.amazon.com/athena/latest/ug/language-reference.html) diff --git a/website/docs/terms/deploying.md b/website/docs/terms/deploying.md deleted file mode 100644 index 53e59658142..00000000000 --- a/website/docs/terms/deploying.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -id: deploying -title: Deploying -description: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. -displayText: Deploying -hoverSnippet: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. ---- - -Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. For more details, refer to [Deploy dbt jobs](/docs/deploy/deployments). - - - diff --git a/website/docs/terms/dimensional-modeling.md b/website/docs/terms/dimensional-modeling.md deleted file mode 100644 index de88f7c318d..00000000000 --- a/website/docs/terms/dimensional-modeling.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -id: dimensional-modeling -title: Dimensional modeling -description: Dimensional modeling is a data modeling technique where you break data up into “facts” and “dimensions” to organize and describe entities in your data warehouse -displayText: dimensional modeling -hoverSnippet: Dimensional modeling is a data modeling technique where you break data up into “facts” and “dimensions” to organize and describe entities within your data warehouse. ---- - - - Dimensional modeling: An essential concept in data modeling - - -Dimensional modeling is a data modeling technique where you break data up into “facts” and “dimensions” to organize and describe entities within your data warehouse. The result is a staging layer in the data warehouse that cleans and organizes the data into the business end of the warehouse that is more accessible to data consumers. - -By breaking your data down into clearly defined and organized entities, your consumers can make sense of what that data is, what it’s used for, and how to join it with new or additional data. 
Ultimately, using dimensional modeling for your data can help create the appropriate layer of models to expose in an end business intelligence (BI) tool. - -There are a few different methodologies for dimensional modeling that have evolved over the years. The big hitters are the Kimball methodology and the Inmon methodology. Ralph Kimball’s work formed much of the foundation for how data teams approached data management and data modeling. Here, we’ll focus on dimensional modeling from Kimball’s perspective—why it exists, where it drives value for teams, and how it’s evolved in recent years. - -## What are we trying to do here? - -Let’s take a step back for a second and ask ourselves: why should you read this glossary page? What are you trying to accomplish with dimensional modeling and data modeling in general? Why have you taken up this rewarding, but challenging career? Why are *you* here? - -This may come as a surprise to you, but we’re not trying to build a top-notch foundation for analytics—we’re actually trying to build a bakery. - -Not the answer you expected? Well, let’s open up our minds a bit and explore this analogy. - -If you run a bakery (and we’d be interested in seeing the data person + baker venn diagram), you may not realize you’re doing a form of dimensional modeling. What’s the final output from a bakery? It’s that glittering, glass display of delicious-looking cupcakes, cakes, cookies, and everything in between. But a cupcake just didn’t magically appear in the display case! Raw ingredients went through a rigorous process of preparation, mixing, melting, and baking before they got there. - -Just as eating raw flour isn’t that appetizing, neither is deriving insights from raw data since it rarely has a nice structure that makes it poised for analytics. There’s some considerable work that’s needed to organize data and make it usable for business users. - -This is where dimensional modeling comes into play; it’s a method that can help data folks create meaningful entities (cupcakes and cookies) to live inside their [data mart](https://docs.getdbt.com/best-practices/how-we-structure/4-marts) (your glass display) and eventually use for business intelligence purposes (eating said cookies). - -So I guess we take it back—you’re not just trying to build a bakery, you’re also trying to build a top-notch foundation for meaningful analytics. Dimensional modeling can be a method to get you part of the way there. - -## Facts vs. dimensions - -The ultimate goal of dimensional modeling is to be able to categorize your data into their fact or dimension models, making them the key components to understand. So what are these components? - -### Facts - -A fact is a collection of information that typically refers to an action, event, or result of a business process. As such, people typically liken facts to verbs. In terms of a real business, some facts may look like account creations, payments, or emails sent. - -It’s important to note that fact tables act as a historical record of those actions. You should almost never overwrite that data when it needs updating. Instead, you add new data as additional rows onto that table. - -For many businesses, marketing and finance teams need to understand all the touchpoints leading up to a sale or conversion. 
A fact table for a scenario like this might look like a `fct_account_touchpoints` table: - -| **unique_id** | **touchpoint_id** | **account_id** | **touchpoint_name** | **touchpoint_created_at_utc** | -|---|---|---|---|---| -| 23534 | 34 | 325611 | fall_convention_2020 | 2022-01-30 00:11:26 | -| 12312 | 29 | 325611 | demo_1 | 2022-05-29 01:42:07 | -| 66782 | 67 | 325611 | demo_2 | 2022-06-25 04:10:32 | -| 85311 | 15 | 105697 | fall_convention_2020 | 2022-05-29 06:13:45 | - -Accounts may have many touchpoints, and this table acts as a true log of events leading up to an account conversion. - -This table is great and all for helping you understand what might have led to a conversion or account creation, but what if business users need additional context on these accounts or touchpoints? That’s where dimensions come into play. - -### Dimensions -A dimension is a collection of data that describes who or what took action or was affected by the action. Dimensions are typically likened to nouns. They add context to the stored events in fact tables. In terms of a business, some dimensions may look like users, accounts, customers, and invoices. - -A noun can take multiple actions or be affected by multiple actions. It’s important to call out: a noun doesn’t become a new thing whenever it does something. As such, when updating dimension tables, you should overwrite that data instead of duplicating it, like you would in a fact table. - -Following the example from above, a dimension table for this business would look like a `dim_accounts` table with some descriptors: - -| account_id | account_created_at_utc | account_name | account_status | billing_address | -|---|---|---|---|---| -| 325611 | 2022-06-29 12:11:43 | Not a Pyramid Scheme | active | 9999 Snake Oil Rd, Los Angeles, CA | -| 234332 | 2019-01-03 07:34:50 | Charlie’s Angels’ Chocolate Factory | inactive | 123 Wonka Way, Indianapolis, IN | -| 105697 | 2020-12-11 11:50:22 | Baggins Thievery | active | The Shire | - -In this table, each account only has one row. If an account’s name or status were to be updated, new values would overwrite existing records versus appending new rows. - -:::tip Snapshots -For dimension tables you want to keep track of changes to, folks can leverage [dbt snapshots](/docs/build/snapshots). -::: - -### Facts and dimensions at play with each other -Cool, you think you’ve got some facts and dimensions that can be used to qualify your business. There’s one big consideration left to think about: how do these facts and dimensions interact with each other? - -![Image depicting how facts and dimensions join together to create analytics ready datasets](/img/docs/terms/dimensional-modeling/fact-star.png) - -Before cloud data warehouses, there were two dominant design options, star schemas and snowflake schemas, that were used to concretely separate out the lines between fact and dimension tables. - -- In a star schema, there’s one central fact table that can join to relevant dimension tables. -- A snowflake schema is simply an extension of a star schema; dimension tables link to other dimension tables, forming a snowflake-esque shape. - -It sounds really nice to have this clean setup with star or snowflake schemas. Almost as if it’s too good to be true (and it very well could be). - -The development of cheap cloud storage, BI tools great at handling joins, the evolution of SQL capabilities, and data analysts with growing skill sets have changed the way data folks look at dimensional modeling and star schemas.
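To make the join concrete, here is a hedged sketch that stitches the example fact and dimension tables above into one wider, analytics-ready result (the column selection is illustrative, not a prescribed pattern):

```sql
select
    touchpoints.touchpoint_name,
    touchpoints.touchpoint_created_at_utc,
    accounts.account_name,
    accounts.account_status
from fct_account_touchpoints as touchpoints
left join dim_accounts as accounts
    on touchpoints.account_id = accounts.account_id
```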
Wide tables consisting of fact and dimension tables joined together are now a competitive option for data teams. - -Below, we’ll dig more into the design process of dimensional modeling, wide tables, and the beautiful ambiguity of it all. - -## The dimensional modeling design process - -According to the Kimball Group, the official(™) four-step design process is (1) selecting a business process to analyze, (2) declaring the grain, (3) identifying the dimensions, and (4) identifying the facts. That makes dimensional modeling sound really easy, but in reality, it’s packed full of nuance. - -Coming back down to planet Earth, your design process is how you make decisions about: - -- Whether something should be a fact or a dimension -- Whether you should keep fact and dimension tables separate or create wide, joined tables - -This is something that data philosophers and thinkers could debate long after we’re all gone, but let’s explore some of the major questions to hold you over in the meantime. - -### Should this entity be a fact or dimension? - -Time to put on your consultant hat because that dreaded answer is coming: it depends. This is what makes dimensional modeling a challenge! - -Kimball would say that a fact must be numeric. The inconvenient truth is: an entity can be viewed as a fact or a dimension depending on the analysis you are trying to run. - -:::note Birds of a feather -If you ran a clinic, you would probably have a log of appointments by patient. At first, you could think of appointments as facts—they are, after all, events that happen and patients can have multiple appointments—and patients as dimensions. But what if your business team really cared about the appointment data itself—how well it went, when it happened, the duration of the visit? You could, in this scenario, make the case for treating this appointments table as a dimension table. If you cared more about looking at your data at a patient level, it probably makes sense to keep appointments as facts and patients as dimensions. All this is to say that there’s inherent complexity in dimensional modeling, and it’s up to you to draw those lines and build those models. -::: - -So then, how do you know which is which if there aren’t any hard rules!? Life is a gray area, my friend. Get used to it. - -A general rule of thumb: go with your gut! If something feels like it should be a fact to meet your stakeholders' needs, then it’s a fact. If it feels like a dimension, it’s a dimension. The world is your oyster. If you find that you made the wrong decision down the road, (it’s usually) no big deal. You can remodel that data. Just remember: you’re not a surgeon. No one will die if you mess up (hopefully). So, just go with what feels right because you’re the expert on your data 👉😎👉 - -Also, this is why we have data teams. Dimensional modeling and data modeling are usually a collaborative effort; working with folks on your team to understand the data and stakeholder wants will ultimately lead to some rad data marts. - -### Should I make a wide table or keep them separate? - -Yet again, it depends. Don’t roll your eyes. Strap in for a quick history lesson because the answer to this harkens back to the very inception of dimensional modeling. - -Back in the day before cloud technology adoption was accessible and prolific, storing data was expensive and joining data was relatively cheap. Dimensional modeling came about as a solution to these issues.
Separating collections of data into smaller, individual tables (star schema-esque) made the data cheaper to store and easier to understand. So, individual tables were the thing to do back then. - -Things are different today. Cloud storage costs have gotten really inexpensive. Instead, computing is the primary cost driver. Now, keeping all of your tables separate can be expensive because every time you join those tables, you’re spending usage credits. - -Should you just add everything to one, wide table? No. One table will never rule them all. Knowing whether something should be its own fact table or get added on to an existing table generally comes down to understanding who will be your primary end consumers. - -For end business users who are writing their own SQL, feel comfortable performing joins, or use a tool that joins tables for them, keeping your data as separate fact and dimension tables is pretty on-par. In this setup, these users have the freedom and flexibility to join and explore as they please. - -If your end data consumers are less comfortable with SQL and your BI tool doesn’t handle joins well, you should consider joining several fact and dimension tables into wide tables. Another consideration: these wide, heavily joined tables can tend to wind up pretty specialized and specific to business departments. Would these types of wide tables be helpful for you, your data team, and your business users? Well, that’s for you to unpack. - -## Advantages and disadvantages of dimensional modeling - -The benefits and drawbacks of dimensional modeling are pretty straightforward. Generally, the main advantages can be boiled down to: - -* **More accessibility**: Since the output of good dimensional modeling is a [data mart](https://docs.getdbt.com/best-practices/how-we-structure/4-marts), the tables created are easier to understand and more accessible to end consumers. -* **More flexibility**: Easy to slice, dice, filter, and view your data in whatever way suits your purpose. -* **Performance**: Fact and dimension models are typically materialized as tables or [incremental models](https://docs.getdbt.com/docs/build/incremental-models). Since these often form the core understanding of a business, they are queried often. Materializing them as tables allows them to be more performant in downstream BI platforms. - -The disadvantages include: -* **Navigating ambiguity**: You need to rely on your understanding of your data and stakeholder wants to model your data in a comprehensible and useful way. What you know about your data and what people really need out of the data are two of the most fundamental and difficult things to understand and balance as a data person. -* **Utility limited by your BI tool**: Some BI tools don’t handle joins well, which can make queries from separated fact and dimensional tables painful. Other tools have long query times, which can make querying from ultra-wide tables not fun. - -## Conclusion - -Dimensional data modeling is a data modeling technique that allows you to organize your data into distinct entities that can be mixed and matched in many ways. That can give your stakeholders a lot of flexibility. [While the exact methodologies have changed](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/)—and will continue to, the philosophical principle of having tables that are sources of truth and tables that describe them will continue to be important in the work of analytics engineering practitioners. 
- - -## Additional Reading - -Dimensional modeling is a tough, complex, and opinionated topic in the data world. Below you’ll find some additional resources that may help you identify the data modeling approach that works best for you, your data team, and your end business users: - - - -* [Modular data modeling techniques](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) -* [Stakeholder-friendly model naming conventions](https://docs.getdbt.com/blog/stakeholder-friendly-model-names/) -* [How we structure our dbt projects guide](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) diff --git a/website/docs/terms/dml.md b/website/docs/terms/dml.md deleted file mode 100644 index 54fe69e845e..00000000000 --- a/website/docs/terms/dml.md +++ /dev/null @@ -1,107 +0,0 @@ ---- -id: dml -title: DML -description: Data Manipulation Language (DML) is a class of SQL statements that are used to query, edit, add and delete row-level data from database tables or views. -displayText: DML -hoverSnippet: Data Manipulation Language (DML) is a class of SQL statements that are used to query, edit, add and delete row-level data from database tables or views. The main DML statements are SELECT, INSERT, DELETE, and UPDATE. ---- - - - DML: The SQL statements that make the data world go 'round - - -Data Manipulation Language (DML) is a class of SQL statements that are used to query, edit, add and delete row-level data from database tables or views. The main DML statements are `SELECT`, `INSERT`, `DELETE`, and `UPDATE`. - -DML is contrasted with Data Definition Language (DDL) which is a series of SQL statements that you can use to edit and manipulate the *structure* of databases and the objects in them. - -Similar to DDL, DML can be a *tad* bit boring. However, DML statements are what allows analysts and analytics engineers to do their work. We hope you can use this glossary to understand when and why DML statements are used and how they may contrast with similar DDL commands. - - -## Types of DML Statements - -The primary DML statements are `SELECT`, `INSERT`, `DELETE`, and `UPDATE`. With the exception of `SELECT` statements, all of the others are only applicable to data within tables in a database. The primary difference between `SELECT` and all the other DML statements is its impact to row-level data: - -- To *change* the actual data that lives in tables, use `INSERT`, `DELETE`, and `UPDATE` statements -- To *access* the data in databse object, use `SELECT` statements - -:::important Important -For the most part, the syntax for DML statements are pretty universal across [Supported Data Platforms](https://docs.getdbt.com/docs/supported-data-platforms) including Google Bigquery, Databricks, Postgres, Amazon Redshift, and Snowflake. Regardless, please use the “Further Reading” section to see the specifics on how the following DML statements would be implemented in your database of interest! -::: - -### SELECT - -Ah, our favorite of DML statements! This is the SQL we all know and love (most of the time). Because the `SELECT` statement allows you to access and manipulate data that exists in database objects, it makes it the true powerhouse in data analysis and analytics engineering. - -You write `SELECT` statements to create queries that build data models and perform robust analysis. With `SELECT` statements, you can join different views and tables, qualify data by setting filters, apply functions and operators on the data, and more. 
`SELECT` statements, unlike `INSERT`, `DELETE`, and `UPDATE`, don’t actually change the row-level value stored in the tables/views. Instead, you write `SELECT` statements to express the business logic needed to perform analysis. - -All `SELECT` statements need three elements: a `SELECT` clause in the beginning, the actual field selection and manipulation, and a `FROM` statement which is specifying which database object you’re trying to access. - -Here’s an example `SELECT` statement: - -```sql -select - - payment_method, - sum(amount) AS amount - -from {{ ref('raw_payments') }} -group by 1 -``` - -In this example, your selection of the `payment_method` column and summation of the `amount` column is the meat of your query. The `from {{ ref('raw_payments') }}` specifies the actual table you want to do the selecting from. - -### INSERT - -Using the `INSERT` DML command, you can add rows to a table that exists in your database. To be honest, data folks are rarely inserting data into tables manually with the `INSERT` command. Instead, data team members will most often use data that’s already been inserted by an tool or other data ingestion process. - -You can insert a record [in jaffle_shop’s](https://github.com/dbt-labs/jaffle_shop) `raw_customers` table like this: - -```sql -INSERT INTO raw_customers VALUES (101, 'Kira', 'F.'); -``` - -As you can see from this example, you clearly set all the column values that exist in your `raw_customers` table. For `INSERT` statements, you can explicitly specify the values you want to insert or use a query result to set the column values. - -### DELETE - -The `DELETE` command will remove rows in an existing table in your database. In practice, you will usually specify a `WHERE` clause with your `DELETE` statement to only remove specific rows from a table. But, you shouldn't really ever delete rows from tables. Instead, you should apply filters on queries themselves to remove rows from your modeling or analysis. - -For the most part, if you wanted to remove all existing rows in a table, but keep the underlying table structure, you would use the `TRUNCATE` DDL command. If you wanted to remove all rows and drop the entire table, you could use the `DROP` DDL command. - -You can delete the record for any Henry W. in jaffle_shop’s `customers` table by executing this statement: - -```sql -DELETE FROM customers WHERE first_name = 'Henry' AND last_name = 'W.'; -``` - -### UPDATE - -With the `UPDATE` statement, you can change the actual data in existing rows in a table. Unlike the `ALTER` DDL command that changes the underlying structure or naming of database objects, the `UPDATE` statement will alter the actual row-level data. You can qualify an `UPDATE` command with a `WHERE` statement to change the values of columns of only specific rows. - -You can manually update the status column of an order in your orders table like this: - -```sql -UPDATE orders SET status = 'returned' WHERE order_id = 7; -``` - -:::tip Tip -The `UPDATE` statement is often compared to the `MERGE` statement. With `MERGE` statements, you can insert, update, *and* delete records in a single command. Merges are often utilized when there is data between two tables that needs to be reconciled or updated. You'll see merges most commonly executed when a source table is updated and a downstream table needs to be updated as a result of this change. 
Learn more about [how dbt uses merges in incremental models here](https://docs.getdbt.com/docs/build/incremental-overview#how-incremental-models-work-in-dbt). -::: - -## Conclusion - -DML statements allow you to query, edit, add, and remove data stored in database objects. The primary DML commands are `SELECT`, `INSERT`, `DELETE`, and `UPDATE`. Using DML statements, you can perform powerful actions on the actual data stored in your system. You'll typically see DML `SELECT` statements written in data models to conduct data analysis or create new tables and views. In many ways, DML is the air that us data folks breathe! - -## Further reading - -For more resources on why people who use dbt don’t write DML, check out the following: - -- [Why not write DML](/faqs/Project/why-not-write-dml) -- [SQL dialect](/faqs/Models/sql-dialect) - -For database-specific DML documents, please check out the resources below: - -- [DML in Snowflake](https://docs.snowflake.com/en/sql-reference/sql-dml.html) -- [Updating tables with DML commands in Redshift](https://docs.aws.amazon.com/redshift/latest/dg/t_Updating_tables_with_DML_commands.html) -- [DML in Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-manipulation-language) -- [Delta Lake DML for Databricks](https://databricks.com/blog/2020/09/29/diving-into-delta-lake-dml-internals-update-delete-merge.html) diff --git a/website/docs/terms/dry.md b/website/docs/terms/dry.md deleted file mode 100644 index 04b83642a08..00000000000 --- a/website/docs/terms/dry.md +++ /dev/null @@ -1,97 +0,0 @@ ---- -id: dry -title: DRY -description: DRY is a software development principle that stands for “Don’t Repeat Yourself.” Living by this principle means that your aim is to reduce repetitive patterns and code. -displayText: DRY -hoverSnippet: DRY is a software development principle that stands for “Don’t Repeat Yourself.” Living by this principle means that your aim is to reduce repetitive patterns and duplicate code and logic in favor of modular and referenceable code. ---- - - - What is DRY? Hint: It makes for great code - dbt Labs - - -DRY is a software development principle that stands for “Don’t Repeat Yourself.” Living by this principle means that your aim is to reduce repetitive patterns and duplicate code and logic in favor of modular and referenceable code. - -The DRY code principle was originally made with software engineering in mind and coined by Andy Hunt and Dave Thomas in their book, _The Pragmatic Programmer_. They believed that “every piece of knowledge must have a single, unambiguous, authoritative representation within a system.” As the field of analytics engineering and [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) develops, there’s a growing need to adopt [software engineering best practices](https://www.getdbt.com/product/what-is-dbt/), including writing DRY code. - -## Why write DRY code? - -DRY code is one of the practices that makes a good developer, a great developer. Solving a problem by any means is great to a point, but eventually, you need to be able to write code that's maintainable by people other than yourself and scalable as system load increases. That's the essence of DRY code. - -But what's so great about being DRY as a bone anyway, when you can be WET? - -### Don’t be WET - -WET, which stands for “Write Everything Twice,” is the opposite of DRY. It's a tongue-in-cheek reference to code that doesn’t exactly meet the DRY standard. 
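As a quick, made-up sketch of the difference (the model and column names are illustrative): the WET approach copies the same logic into every model that needs it, while the DRY approach defines it once and references it downstream:

```sql
-- WET: this CASE statement gets copy-pasted into every model that needs it
select
    order_id,
    case when payment_method = 'credit_card' then true else false end as is_credit_card
from {{ ref('raw_payments') }}

-- DRY: the logic lives in one staging model, and downstream models simply reference it
select
    order_id,
    is_credit_card
from {{ ref('stg_payments') }}
```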
In a practical sense, WET code typically involves the repeated _writing_ of the same code throughout a project, whereas DRY code would represent the repeated _reference_ of that code. - -Well, how would you know if your code isn't DRY enough? That’s kind of subjective and will vary by the norms set within your organization. That said, a good rule of thumb is [the Rule of Three](https://en.wikipedia.org/wiki/Rule_of_three_(writing)#:~:text=The%20rule%20of%20three%20is,or%20effective%20than%20other%20numbers.). This rule states that the _third_ time you encounter a certain pattern, you should probably abstract it into some reusable unit. - -There is, of course, a tradeoff between simplicity and conciseness in code. The more abstractions you create, the harder it can be for others to understand and maintain your code without proper documentation. So, the moral of the story is: DRY code is great as long as you [write great documentation.](https://docs.getdbt.com/docs/build/documentation) - -### Save time & energy - -DRY code means you get to write duplicate code less often. You're saving lots of time writing the same thing over and over. Not only that, but you're saving your cognitive energy for bigger problems you'll end up needing to solve, instead of wasting that time and energy on tedious syntax. - -Sure, you might have to frontload some of your cognitive energy to create a good abstraction. But in the long run, it'll save you a lot of headaches. Especially if you're building something complex and one typo can be your undoing. - -### Create more consistent definitions - -Let's go back to what Andy and Dave said in _The Pragmatic Programmer_: “Every piece of knowledge must have a single, unambiguous, authoritative representation within a system.” As a data person, the words “single” and “unambiguous” might have stood out to you. - -Most teams have essential business logic that defines the successes and failures of a business. For a subscription-based DTC company, this could be [monthly recurring revenue (MRR)](https://www.getdbt.com/blog/modeling-subscription-revenue/) and for a SaaS product, this could look like customer lifetime value (CLV). Standardizing the SQL that generates those metrics is essential to creating consistent definitions and values. - -By writing DRY definitions for key business logic and metrics that are referenced throughout a dbt project and/or BI (business intelligence) tool, data teams can create those single, unambiguous, and authoritative representations for their essential transformations. Gone are the days of 15 different definitions and values for churn, and in are the days of standardization and DRYness. - -:::important dbt Semantic Layer, powered by MetricFlow - -The [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), powered by [MetricFlow](/docs/build/about-metricflow), simplifies the process of defining and using critical business metrics, like revenue in the modeling layer (your dbt project). By centralizing metric definitions, data teams can ensure consistent self-service access to these metrics in downstream data tools and applications. The dbt Semantic Layer eliminates duplicate coding by allowing data teams to define metrics on top of existing models and automatically handles data joins. - -::: - -## Tools to help you write DRY code - -Let’s just say it: Writing DRY code is easier said than done. For classical software engineers, there’s a ton of resources out there to help them write DRY code. 
In the world of data transformation, there are also some tools and methodologies that can help folks in [the field of analytics engineering](https://www.getdbt.com/what-is-analytics-engineering/) write more DRY and [modular code](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/). - - -### Common Table Expressions (CTEs) - -CTEs are a great way to help you write more DRY code in your data analysis and dbt models. In a formal sense, a CTE is a temporary result set that can be used in a query. In a much more human and practical sense, we like to think of CTEs as separate, smaller queries within the larger query you’re building up. Essentially, you can use CTEs to break up complex queries into simpler blocks of code that are easier to debug and can connect and build off of each other. - -If you’re referencing a specific query, perhaps for aggregations that join back to an unaggregated view, a CTE can simply be referenced throughout a query by its CTE_EXPRESSION_NAME. - - -### View materializations - -View [materializations](https://docs.getdbt.com/docs/build/materializations) are also extremely useful for abstracting code that might otherwise be repeated often. A view is a defined passthrough SQL query that can be run against a database. Unlike a table, it doesn’t store data, but it defines the logic that you need to use to fetch the underlying data. - -If you’re referencing the same query, CTE, or block of code throughout multiple data models, that’s probably a good sign that code should be its own view. - -For example, you might define a SQL view to count new users created in a day: - -```sql - select - created_date, - count(distinct(user_id)) as new_users - from {{ ref('users') }} - group by created_date -``` - -While this is a simple query, writing this logic every time you need it would be super tedious. And what if the `user_id` field changed to a new name? If you’d written this in a WET way, you’d have to find every instance of this code and make the change to the new field versus just updating it once in the code for the view. - -To make any subsequent references to this view DRY-er, you simply reference the view in your data model or query. - -### dbt macros and packages - -dbt also supports the use of [macros](/docs/build/jinja-macros) and [packages](https://docs.getdbt.com/docs/build/packages) to help data folks write DRY code in their dbt projects. Macros are Jinja-supported functions that can be reused and applied throughout a dbt project. Packages are libraries of dbt code, typically models, macros, and/or tests, that can be referenced and used in a dbt project. They are a great way to use transformations for common data sources (like [ad platforms](https://hub.getdbt.com/dbt-labs/facebook_ads/latest/)) or use more [custom tests for your data models](https://hub.getdbt.com/calogica/dbt_expectations/0.1.2/) _without having to write out the code yourself_. At the end of the day, is there really anything more DRY than that? - -## Conclusion - -DRY code is a principle that you should always be striving for. It saves you time and energy. It makes your code more maintainable and extensible. And potentially most importantly, it’s the fine line that can help transform you from a good analytics engineer to a great one.
- -## Further reading - -* [Data modeling technique for more modularity](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) -* [Why we use so many CTEs](https://docs.getdbt.com/docs/best-practices) -* [Glossary: CTE](https://docs.getdbt.com/terms/cte) -* [Glossary: Materialization](https://docs.getdbt.com/terms/materialization) -* [Glossary: View](https://docs.getdbt.com/terms/view) diff --git a/website/docs/terms/edw.md b/website/docs/terms/edw.md deleted file mode 100644 index 1ac0f37ee47..00000000000 --- a/website/docs/terms/edw.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -id: edw -title: EDW -description: The primary difference between an EDW and a regular data warehouse is, well, semantics and perspective. An EDW like any other data warehouse, is a collection of databases that centralize a business's data -displayText: EDW -hoverSnippet: An Enterprise Data Warehouse (EDW), like any other data warehouse, is a collection of databases that centralize a business's information from multiple sources and applications. ---- - - - What does an EDW (Enterprise Data Warehouse) really mean? - - -An Enterprise Data Warehouse (EDW), like any other , is a collection of databases that centralize a business's information from multiple sources and applications. The primary difference between an EDW and a regular data warehouse is, well, semantics and perspective. - -The data stored in an EDW comes from all different functions of a company—marketing, finance, engineering, product, and more. The primary goal of an EDW is to provide a central and organized home for both the raw and transformed version of this data. EDWs in modern data practices are typically set-up in the cloud, meaning that the servers used to run the warehouse are owned and managed by a cloud provider such as Snowflake, AWS Redshift, or Google BigQuery. - - -## Data warehouse vs enterprise data warehouse - -![](/img/docs/terms/edw/meme.png) - -In an EDW, all departments of an organization store their raw and transformed data in databases within a main warehouse. For organizations that are not calling their data warehouse an EDW and have a more siloed setup, there’s a chance each department each has *their own separate* data warehouse for storage and computation. **But practically, the difference between a data warehouse and an enterprise data warehouse is semantics.** - -Organization size, distribution, data complexity, and business needs can all determine whether a company wants a centralized data warehouse or distributed warehouses per function. Nonetheless, if your organization only has one data warehouse that centrally houses all of your data sources, the distinction isn't really necessary, but *technically* that could be called an EDW. - -In the world of analytics engineering, most teams have one central data warehouse that houses data from all of their different departments and functions. - - -### Why is this distinction necessary? - -One of the main distinctions is in an organization’s users and distribution. If an organization has multiple databases, a central data warehouse is used to create separate entities between raw and source data, staging work, and ready-for-use analytics datasets. In this EDW and classic data warehouse setup, data is accessible across an organization, data teams can create tables that join data from multiple sources, and users can gain enriched perspectives into their data. 
- -If a company has very siloed departments that manage their own data, budgets, and have little need for crossover with other departments or data sources, emphasizing the difference between a central EDW data warehouse and their own data warehouse could be a necessity for budgeting and governance reasons. - -Lastly, the somewhat exponential adoption of cloud data warehouses in the last decade has shifted the terminology from what many people called an EDW to a data warehouse. - - -## Enterprise data warehouse use cases - -There are a variety of reasons why an organization might opt to have an EDW or data warehouse. A centralized and organized data warehouse provide advantages for the following use cases: - -- Create clear partitions between raw, staging, and heavily transformed data -- Standardize data definitions and metrics across multiple data sources -- Connect a BI tool to one central data warehouse and surface that data to users across a business - -### Benefits of an EDW - -Like most other data warehouses, the benefit of an EDW is the ability to store raw and transformed data from multiple sources in one single data warehouse. Users across different departments and data team members embedded in different functions can all have access to the same data. Cloud data warehouses also scale with data and users, making EDWs an appropriate place for organizations to grow their analytics work. - -EDWs also help in building a 360-degree view of the company by combining different sources of information, such as customer feedback, financial records, product inventory, and marketing insights. All of this information can then be organized in data marts, schemas, and tables within one EDW that are eventually exposed to a BI tool. - -In addition, because all of an organization’s data is stored in one place, data teams can provide access to only those who need access to specific schemas and tables. Keeping these access patterns and changes in only one data warehouse will limit the amount of data needed to go through for auditing and other security regulations. - -## Conclusion - -An enterprise data warehouse is, in general, like any other data warehouse; it acts as a central home for multiple departments’ raw and transformed data. An EDW is often composed of multiple databases to store raw, staging, development, and production-ready data. The primary benefits for an EDW are centralization, standardization, and accessibility. You probably have a data warehouse setup like an EDW, you’re likely just not calling it that 😉 - - -## Additional reading -EDW, data warehouse, or something different altogether? 
Check out some of our favorite resources on the fundamentals of data storage and organization: - -- [Glossary: Dimensional modeling](https://docs.getdbt.com/terms/dimensional-modeling) -- [Glossary: Data warehouse](https://docs.getdbt.com/terms/data-warehouse) \ No newline at end of file diff --git a/website/docs/terms/grain.md b/website/docs/terms/grain.md deleted file mode 100644 index 608a5c6391d..00000000000 --- a/website/docs/terms/grain.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -id: grain -title: Data grain -description: Grain is the combination of columns at which records in a table are unique. Ideally, this is captured in a single column or a unique primary key. -displayText: grain -hoverSnippet: Your data's grain is the combination of columns at which records in a table are unique. Ideally, this is captured in a single column and a unique primary key. ---- - - - Data grain: What granularity means in terms of data modeling - - -Grain is the combination of columns at which records in a table are unique. Ideally, this is captured in a single column, a unique primary key, but even then, there is descriptive grain behind that unique id. Let’s look at some examples to better understand this concept. - -| user_id | address | -| --- | --- | -| 1 | 123 Jaffle Ln | -| 2 | 456 Waffle St | -| 3 | 789 Raffle Rd | - -In the above table, each `user_id` is unique. This table is at the *user* *grain*. - -| user_id | address | -| --- | --- | -| 1 | 123 Jaffle Ln | -| 1 | 420 Jaffle Ln | -| 2 | 456 Waffle St | -| 3 | 789 Raffle Rd | - -In the above table, `user_id` is no longer unique. The combination of `user_id` and `address` creates a unique combination, thus this table is at the *user* *address* *grain*. We generally describe the grain conceptually based on the names of the columns that make it unique. A more realistic combination you might see in the wild would be a table that captures the state of all users at midnight every day. The combination of the captured `updated_date` and `user_id` would be unique, meaning our table is at *user per day* grain. - -In both examples listed in the previous paragraph, we’d want to create a surrogate key of some sort from the combination of columns that comprise the grain. This gives our table a primary key, which is crucial for testing and optimization, and always a best practice. Typically, we’ll name this primary key based on the verbal description of the grain. For the latter example, we’d have `user_per_day_id`. This will be more solid and efficient for testing than repeatedly relying on the combination of those two columns. - -Thinking deeply about grain is a crucial part of data modeling. As we design models, we need to consider the entities we’re describing, and what dimensions (time, attributes, etc.) might fan those entities out so they’re no longer unique, as well as how we want to deal with those. Do we need to apply transformations to deduplicate and collapse the grain?
Or do we intentionally want to expand the grain out, like in our *user per day* example? - -There’s no right answer here, we have the power to do either as it meets our needs. The key is just to make sure we have a clear sense of our grain for every model we create, that we’ve captured it in a primary key, and that we’re applying tests to ensure that our primary key column is unique and not null. \ No newline at end of file diff --git a/website/docs/terms/hover-terms.md b/website/docs/terms/hover-terms.md new file mode 100644 index 00000000000..d0b024a941d --- /dev/null +++ b/website/docs/terms/hover-terms.md @@ -0,0 +1,125 @@ +--- +cte: + displayText: CTE + hoverSnippet: A Common Table Expression (CTE) is a temporary result set that can be used in a SQL query. You can use CTEs to break up complex queries into simpler blocks of code that can connect and build on each other. + +dag: + displayText: DAG + hoverSnippet: A DAG is a Directed Acyclic Graph, a type of graph whose nodes are directionally related to each other and don’t form a directional closed loop. + +data-extraction: + displayText: data extraction + hoverSnippet: Data extraction is the process by which data is retrieved from multiple sources, often varying in volume and structure. + +data-lake: + displayText: data lake + hoverSnippet: A data lake is a data management system used for storing large amounts of data in in its raw, native form as files. Data lakes can store any type of data—structured, semi-structured, unstructured—in one centralized place. + +data-lineage: + displayText: data lineage + hoverSnippet: Data lineage provides a holistic view of how data moves through an organization, where it’s transformed and consumed. + +data-warehouse: + displayText: data warehouse + hoverSnippet: A data warehouse is a data management system used for data storage and computing that allows for analytics activities such as transforming and sharing data. + +data-catalog: + displayText: data catalog + hoverSnippet: A data catalog is an inventory of data assets from different parts of the data stack within an organization. This catalog can display metadata, lineage, and business definitions from your different data sources. + +data-wrangling: + displayText: data wrangling + hoverSnippet: Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data. + +dataframe: + displayText: dataframe + hoverSnippet: A DataFrame is a two-dimensional data structure (rows and columns). It's the most common way of representing and interacting with large datasets in Python. + +ddl: + displayText: DDL + hoverSnippet: Data Definition Language (DDL) is a group of SQL statements that you can execute to manage database objects, including tables, views, and more. + +deploying: + displayText: Deploying + hoverSnippet: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. + +dimensional-modeling: + displayText: dimensional modeling + hoverSnippet: Dimensional modeling is a data modeling technique where you break data up into “facts” and “dimensions” to organize and describe entities within your data warehouse. 
+ +dml: + displayText: DML + hoverSnippet: Data Manipulation Language (DML) is a class of SQL statements that are used to query, edit, add and delete row-level data from database tables or views. The main DML statements are SELECT, INSERT, DELETE, and UPDATE. + +dry: + displayText: DRY + hoverSnippet: DRY is a software development principle that stands for “Don’t Repeat Yourself.” Living by this principle means that your aim is to reduce repetitive patterns and duplicate code and logic in favor of modular and referenceable code. + +edw: + displayText: EDW + hoverSnippet: An Enterprise Data Warehouse (EDW), like any other data warehouse, is a collection of databases that centralize a business's information from multiple sources and applications. + +elt: + displayText: ELT + hoverSnippet: Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, loading it into a target data warehouse, and finally transforming it. + +etl: + displayText: ETL + hoverSnippet: Extract, Transform, Load (ETL) is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. + +grain: + displayText: grain + hoverSnippet: Your data's grain is the combination of columns at which records in a table are unique. Ideally, this is captured in a single column and a unique primary key. + +idempotent: + displayText: idempotent + hoverSnippet: Idempotent describes a process that gives you the same result no matter how many times you run it. + +json: + displayText: JSON + hoverSnippet: JSON (JavaScript Object Notation) is a minimal format for semi-structured data used to capture relationships between fields and values. + +materialization: + displayText: materialization + hoverSnippet: The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a data warehouse. + +model: + hoverSnippet: A model is an essential building block of the DAG + displayText: model + +monotonically-increasing: + displayText: monotonically increasing + hoverSnippet: A monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. + +predicate-pushdown: + displayText: Predicate pushdown + hoverSnippet: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query + +primary-key: + displayText: primary key + hoverSnippet: A primary key is a non-null column in a database object that uniquely identifies each row. + +relational-database: + displayText: relational database + hoverSnippet: A relational database provides a structured way to store data into tables consisting of rows and columns. Different tables in a relational database can be joined together using common columns from each table, forming relationships. + +reverse-etl: + displayText: reverse ETL + hoverSnippet: Reverse ETL is the process of getting your transformed data stored in your data warehouse to end business platforms, such as sales CRMs and ad platforms. + +subquery: + displayText: subquery + hoverSnippet: A subquery is a query within another query. Subqueries are often used when you need to process data in multiple steps. + +surrogate-key: + displayText: surrogate key + hoverSnippet: A surrogate key is a unique identifier derived from the data itself. It often takes the form of a hashed value of multiple columns that will create a uniqueness constraint for each row. 
+ +table: + displayText: table + hoverSnippet: In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. + +view: + displayText: view + hoverSnippet: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or data warehouse). +--- diff --git a/website/docs/terms/idempotent.md b/website/docs/terms/idempotent.md deleted file mode 100644 index ea3ef0a099b..00000000000 --- a/website/docs/terms/idempotent.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -id: idempotent -title: Idempotent -description: Idempotent is an adjective to describe a process that gives you the same result no matter how many times you run it. -displayText: idempotent -hoverSnippet: Idempotent describes a process that gives you the same result no matter how many times you run it. ---- - - - What is idempotency and why is the concept important in data? - - -Idempotent is an adjective to describe a process that gives you the same result no matter how many times you run it. - -For a mathematical example, adding 1 changes the results, but multiplying by 1 is idempotent. When you add 1 to a number and then add 1 again, you get different results. If you multiply a number by 1 and multiply by 1 again, you do get the same result. - -A more real-world example of idempotency is the process of saving a file in a word processor. Given the same inputs (i.e. the same document contents), clicking "_Save_" one time will leave your system in the exact same state as clicking "_Save_" five times in a row. - -A non-idempotent version of the "_Save_" button might do something like "Append the paragraph I just wrote to the end of the file". Doing _that_ five times in a row will _not_ leave you in the same state as doing it one time; your most recent paragraph would have duplicates. - -If word processors only gave us non-idempotent "Append paragraph" / "Update paragraph" / "Delete paragraph" operations, then saving our document changes would be a lot more difficult! We'd have to keep track of which paragraphs we previously saved, and either make sure to not save them again or have a process in place to regularly clean up duplicate paragraphs. The implementation of the "_Save_" button in word processors takes the collection of low-level non-idempotent filesystem operations (read/append/overwrite/delete), and systematically runs them in a certain order so that the _user_ doesn't have to deal with the non-idempotency. The user can just focus on writing -- choosing words, editing for clarity, ensuring paragraphs aren't too long, etc. -- and the word processor deals with making sure the words get persisted properly to disk. - -This word processing analogy is very similar to what dbt does for [data transformation](https://www.getdbt.com/analytics-engineering/transformation/): it takes the collection of low-level non-idempotent database operations (`SELECT`/`INSERT`/`UPDATE`/`DELETE` -- collectively known as DML statements), and systematically runs them in a certain order so that analytics engineers don't have to deal with non-idempotency. We can just focus on the data -- [choosing good model and column names](https://docs.getdbt.com/blog/on-the-importance-of-naming), [documenting them](/community/resources/viewpoint#documentation), [ensuring data consumers can understand them](https://docs.getdbt.com/docs/best-practices#consider-the-information-architecture-of-your-data-warehouse), etc. 
-- and [`dbt run`](https://docs.getdbt.com/reference/commands/run) will make sure the database ends up in the right state. diff --git a/website/docs/terms/json.md b/website/docs/terms/json.md deleted file mode 100644 index 652fb58cbe3..00000000000 --- a/website/docs/terms/json.md +++ /dev/null @@ -1,103 +0,0 @@ ---- -id: json -title: JSON -description: JSON (JavaScript Object Notation) is a minimal format for semi-structured data used to capture relationships between fields and values. -displayText: JSON -hoverSnippet: JSON (JavaScript Object Notation) is a minimal format for semi-structured data used to capture relationships between fields and values. ---- - -JSON stands for JavaScript Object Notation. JSON is a minimal format which is great for processing data for applications. It can capture many types of relationships in a concise format and is a commonly used format for semi-structured data. The tables in your contain structured data (as opposed to semi-structured) where for each row, each field typically contains one value. Structured data, or tabular data, is intuitive and easy to read, but semi-structured data offers more flexibility. - -Let’s talk through what that looks like in practice so you can get a better sense of what we mean. - -## JSON syntax example - -When looking at data formatted in JSON, we say that the data is stored in **JSON objects**. These are composed of key-value pairs. JSON objects are enclosed in curly brackets (`{ }`) and each key-value pair is separated by a comma. Here’s an example: - -```json -order = {"customer_id":2947, "order_id":4923, "order_items":"cheesecake"} -``` - -`order` is the JSON object. `"customer_id":2947` is one of the key-value pairs within this JSON object. - -If I wanted to find the `customer_id`, I could return that value with `order["customer_id"]` or `order.customer_id`. It’s easy for us to simply read the `customer_id` just by looking at the JSON object in this example, but what if your JSON object contains hundreds of key-value pairs or complex nesting? Being aware of how to pull information out of JSON is essential if you’re working with it in the wild. - -A key feature of JSON is that it can contain data types that aren’t normally found in relational databases, namely **dictionaries** and **arrays**. Let’s break down what that means and then we’ll look at an example to pull everything together. - -### Dictionaries and arrays in JSON - -JSON inherits its syntax from JavaScript (JS) so dictionaries and arrays are formatted in the same way as they are in JS. Dictionaries are formatted just like JSON objects and consist of key-value pairs. Arrays are lists of values and they’re enclosed in square brackets (`[ ]`) and each value is separated by a comma, like so: - -```json -menu_items = ["cheesecake", "danish", "coffee"] -``` - -Individual values from an array can be called by referencing the location of a value within the array. Arrays are zero-indexed which means that the first item is at position 0 and we count up from there. - -- `menu_items[0]` will return “cheesecake” -- `menu_items[1]` will return “danish” -- `menu_items[2]` will return “coffee” - -Dictionaries and arrays can be nested in JSON objects as well as nested in each other. **Dictionaries and arrays can only be values. They can never be keys.** - -Here’s an example of a JSON object describing a tweet from [Twitter’s developer platform](https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/overview). 
- -```json -tweet = -{ - "created_at": "Thu Apr 06 15:24:15 +0000 2017", - "id_str": "850006245121695744", - "text": "1\/ Today we\u2019re sharing our vision for the future of the Twitter API platform!\nhttps:\/\/t.co\/XweGngmxlP", - "user": { - "id": 2244994945, - "name": "Twitter Dev", - "screen_name": "TwitterDev", - "location": "Internet", - "url": "https:\/\/dev.twitter.com\/", - "description": "Your official source for Twitter Platform news, updates & events. Need technical help? Visit https:\/\/twittercommunity.com\/ \u2328\ufe0f #TapIntoTwitter" - }, - "place": { - }, - "entities": { - "hashtags": [ - ], - "urls": [ - { - "url": "https:\/\/t.co\/XweGngmxlP", - "unwound": { - "url": "https:\/\/cards.twitter.com\/cards\/18ce53wgo4h\/3xo1c", - "title": "Building the Future of the Twitter API Platform" - } - } - ], - "user_mentions": [ - ] - } -} -``` - -Here's a quick quiz to see if you're understanding the file's structure: - -
-**How would you call the user ID?** `tweet['user']['id']`
-
-**How would you call the unwound url?** `tweet['entities']['urls'][0]['unwound']['url']`
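If a tweet like the one above landed in a warehouse column that stores semi-structured data, the same lookups could also be written in SQL. Here's a rough sketch using Snowflake-style path syntax — the `raw.twitter.tweets` table and `raw_tweet` column are hypothetical names used only for illustration:

```sql
-- Snowflake-style semi-structured access (a sketch; relation and column names are hypothetical)
select
    raw_tweet:user:id::number                      as user_id,
    raw_tweet:entities:urls[0]:unwound:url::string as unwound_url
from raw.twitter.tweets
```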
- -As you can see, JSON objects can get complex pretty quickly. - -## Why is JSON important in modern analytics? - -Semi-structured data offers flexibility with the trade-off of being more complex. JSON doesn’t require a pre-defined schema. It allows nesting, values can be different data types, and it lends itself well to changes in the shape of the incoming data. As you can imagine, the above Tweet object would look very different if we tried to restructure it so it could fit into a table. It would be hard to read or we would lose information or both. - -## Use cases for JSON - -JSON is lightweight and often used to transfer data over a network connection. As we’ve seen, data from social media sites are often stored as JSON objects. JSON is also commonly how data from IoT sensors is formatted and you’ll often see JSON when using an API. - -## Conclusion - -The greatest strength of JSON also acts as its weakness—the data it contains informs the shape the object takes, rather than the other way around. Structured data is the bread and butter of analytics work, but a semi-structured format is an alternative option when a tabular format becomes too rigid to describe the relationships between different entities. \ No newline at end of file diff --git a/website/docs/terms/materialization.md b/website/docs/terms/materialization.md deleted file mode 100644 index 328076f1483..00000000000 --- a/website/docs/terms/materialization.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -id: materialization -title: Materialization -description: A materialization is the exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a data warehouse. -displayText: materialization -hoverSnippet: The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a data warehouse. ---- - - - What does materialization mean in the context of dbt? - - -:::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! -::: - -The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a . It's the manner in which the data is represented, and each of those options is defined either canonically (tables, views, incremental), or bespoke. - -It is important to consider the downstream impacts of your materialization choice on query run times and macro capabilities. - diff --git a/website/docs/terms/model.md b/website/docs/terms/model.md deleted file mode 100644 index c589cc196a7..00000000000 --- a/website/docs/terms/model.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -id: model -title: Model -description: A model is an essential building block of the DAG -displayText: model -hoverSnippet: A model is an essential building block of the DAG ---- - -A model is an essential building block of the DAG that lives in a single file and contains logic that transforms data. This logic can be expressed as a SQL `select` statement or a Python dataframe operation. Models can be materialized in the warehouse in different ways — most of these materializations require models to be built in the warehouse. 
\ No newline at end of file diff --git a/website/docs/terms/monotonically-increasing.md b/website/docs/terms/monotonically-increasing.md deleted file mode 100644 index b4e3987995d..00000000000 --- a/website/docs/terms/monotonically-increasing.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -id: monotonically-increasing -title: Monotonically increasing -description: A monotonically increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. -displayText: monotonically increasing -hoverSnippet: A monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. ---- - -Monotonicity means unchanging (think monotone); a monotonic sequence is a sequence where the order of the value of the elements does not change. In other words, a monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example the sequences `[1, 6, 7, 11, 131]` or `[2, 5, 5, 5, 6, 10]`.. - -Monotonically-increasing values often appear in primary keys generated by production systems. In an analytics engineering context, you should avoid generating such values or assuming their existence in your models, because they make it more difficult to create an data model. Instead you should create a which is derived from the unique component(s) of a row. diff --git a/website/docs/terms/predicate-pushdown.md b/website/docs/terms/predicate-pushdown.md deleted file mode 100644 index 8e9bad85b6b..00000000000 --- a/website/docs/terms/predicate-pushdown.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -id: predicate-pushdown -title: predicate pushdown -description: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query -displayText: Predicate pushdown -hoverSnippet: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query ---- - -A predicate pushdown is an expression used to determine what rows in a database apply to a particular query. For example, if you filter in a `WHERE` clause based on a specific dimension value, the database searches to determine what values in the database apply to the query. The optimization known as "predicate pushdown" involves applying this filtering process to the database, leading to enhanced and faster query performance. - diff --git a/website/docs/terms/primary-key.md b/website/docs/terms/primary-key.md deleted file mode 100644 index d67d928a218..00000000000 --- a/website/docs/terms/primary-key.md +++ /dev/null @@ -1,148 +0,0 @@ ---- -id: primary-key -title: Primary key -description: A primary key is a non-null column in a database object that uniquely identifies each row. Primary keys take the form of a natural or surrogate key. -displayText: primary key -hoverSnippet: A primary key is a non-null column in a database object that uniquely identifies each row. ---- - - - Primary key in SQL (AKA Constraints) — dbt Labs - - -A primary key is a non-null column in a database object that uniquely identifies each row. Primary keys take the form of a natural or . It’s important to note that for each or in your database, there must only be one primary key column per database object. 
- -At their core, you create and use these row-level unique identifiers to: - -* Ensure a lack of duplicate rows in your tables -* Identify table grains easily -* Help unpack how tables join together -* Establish a consistent naming system for primary keys across your data models - -One of the great things about data modeling is that there are very few rules to it. You have the flexibility to create the models and columns that are applicable to your business and the SQL you use to accomplish that is pretty much up to you and your team. _Having a primary key in each data model is pretty much the one rule you can’t break._ Without primary keys that are tested for non-nullness and uniqueness, duplicate or null records can slip undetected into your data models and cause counts to be incorrect. These two reasons coupled together can create a sense of distrust in the data and data team. - -Use this glossary page to understand the importance of primary keys, how natural keys and surrogate keys differ, and how support for primary keys varies. - -## Types of primary keys - -Primary keys can be established two ways: naturally or derived through the data in a surrogate key. - -* A **natural key** is a primary key that is innate to the data. Perhaps in tables there’s a unique `id` field in each table that would act as the natural key. You can use documentation like entity relationship diagrams (ERDs) to help understand natural keys in APIs or tables. In a perfect world, all of our primary keys would be natural keys… _but this is an imperfect world!_ -* A **surrogate key** is a hashed value of multiple fields in a dataset that create a uniqueness constraint on that dataset. You’ll essentially need to make a surrogate key in every table that lacks a natural key. An example of this could be a custom table that reports daily performance per `ad_id` from an ad platform. You can derive a surrogate key by hashing the `date` and `ad_id` fields to create a unique value per row. - -A note on primary key data types: natural keys will often take the form of an integer or other numeric value (ex. 45932). Surrogate keys, on the other hand, are usually alphanumeric strings since they are hashed values (ex. ‘62aef884fbe3470ce7d9a92140b09b17’). - -:::tip Tip -dbt supports [packages](https://docs.getdbt.com/docs/build/packages), libraries of open-source macros and data models, to help data teams avoid doing duplicative work. One of these packages, [dbt_utils](https://github.com/dbt-labs/dbt-utils), contains a series of macros that are built to alleviate common struggles in data modeling. The [surrogate_key](https://github.com/dbt-labs/dbt-utils#surrogate_key-source) macro offers a DRY (don’t repeat yourself) solution to creating surrogate keys across different data warehouses in the event that your data doesn’t contain natural keys. -::: - -## Data warehouse support for primary keys - -What do we mean when we say a primary key is supported in a database? What does it mean if primary keys are enforced? - -* **Support**: If a primary key is supported in a database, that means they allow you to explicitly let the system know if a specific field is a primary key. This will happen in the DDL (data definition language) to create the table, like in the example below, or an `ALTER` statement that specifies which field is the primary key. 
-* **Enforcement**: If a database enforces primary keys, that means it would raise an error if one of the constraints on primary keys (uniqueness and non-null) was broken during an `INSERT` or `UPDATE` statement. - -The table below gives an overview of primary key support and enforcement in some of the major data warehouses. Below the table you’ll additionally see a breakdown of some details around primary key implementation for these data warehouses. - -
-| | Supports primary keys? | Fully enforces primary keys? |
-|---|---|---|
-| Snowflake | :white_check_mark: | :x: |
-| Amazon Redshift | :white_check_mark: | :x: |
-| Google BigQuery | :x: | :x: |
-| Databricks | :white_check_mark: | :x: |
-| Postgres | :white_check_mark: | :white_check_mark: |
- -### Snowflake - -Snowflake allows for data folks to explicitly identify primary keys during table creation or using an `ALTER` statement. To see identified primary keys in your database, you can run the `SHOW PRIMARY KEYS` command. It’s important to note, however, that Snowflake primary key indicators are purely descriptive–meaning they don’t enforce either non-nullness or uniqueness requirements. However, Snowflake offers a separate `NOT NULL` constraint that will be enforced for specified fields. - -### Amazon Redshift - -With Redshift, you can specify primary keys constraints on tables, but Redshift won’t provide out-of-the-box primary key enforcement. Similar to Snowflake, Redshift does allow users to add a [`NOT NULL` constraint](https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html) that is actually enforced. - -In general for Redshift, it’s still good practice to define your primary keys (regardless of the lack of uniqueness enforcement) because they can help the [query planner](https://docs.getdbt.com/blog/redshift-configurations-dbt-model-optimizations) more quickly identify uniqueness and foreign key relationships. - -### Google BigQuery - -BigQuery is pretty unique here in that it doesn’t support or enforce primary keys. If your team is on BigQuery, you’ll need to have some [pretty solid data testing](/docs/build/data-tests) in place to ensure your primary key fields are unique and non-null. - -### Databricks - -Databricks’ Delta tables in Unity Catalog provide support for declaring [informational primary keys](https://docs.databricks.com/tables/constraints.html#declare-primary-key-and-foreign-key-relationships). These primary key constraints are not enforced. Databricks currently offers [two enforced constraint](https://docs.databricks.com/tables/constraints.html#enforced-constraints-on-databricks) types: `not-null` and `check`. The `not-null` one is pretty straightforward, but the `check` constraint is more unique to Databricks. With the `check` constraint, you can test that a certain boolean expression executes as `true` for each row in a table. This constraint is more likely to be helpful for ensuring accepted values are met for fields rather than for primary key requirements. - -### Postgres - -Postgres is the true standout here in that it both supports and enforces primary keys! However, you shouldn’t be too surprised about this. One of the primary use cases for Postgres is that it often serves as the home for backend application tables and is usually managed by a [team of backend developers](https://docs.getdbt.com/blog/when-backend-devs-spark-joy). Since these tables often act as a source of truth for many businesses, it’s critical that primary key fields must exist, be non-null, and unique. - -## How to indicate primary keys - -For data warehouses that support primary keys (like Snowflake, Amazon Redshift, and Postgres), you can add a primary key indicator to the column you want to use as a primary key in the DDL to create the table. You may also use an `ALTER` DDL statement to set a column as a primary key if the table is already created. - -In the example below, you can add a new `jaffles` table to the [jaffle_shop](https://github.com/dbt-labs/jaffle_shop) project and make the `id` field the primary key. 
-```sql
-CREATE TABLE prod.jaffle_shop.jaffles (
-    id varchar(255) primary key,
-    jaffle_name varchar(255),
-    created_at timestamp,
-    ingredients_list varchar(255),
-    is_active boolean
-);
-```
-
-:::note Note
-If you don't have a field in your table that would act as a natural primary key, you’ll need to [create a surrogate key](https://docs.getdbt.com/blog/sql-surrogate-keys) for it.
-:::
-
-If your data warehouse doesn’t provide out-of-the-box support and enforcement for primary keys, it’s important to clearly label and put your own constraints on primary key fields. This could look like:
-
-* **Creating a consistent naming convention for your primary keys**: You may see an `id` field or fields prefixed with `pk_` (ex. `pk_order_id`) to identify primary keys. You may also see the primary key be named as the obvious table grain (ex. In the jaffle shop’s `orders` table, the primary key is called `order_id`).
-* **Adding automated [data tests](/docs/build/data-tests) to your data models**: Use a data tool, such as dbt, to create not null and unique tests for your primary key fields.
-
-## Testing primary keys
-
-When we talk about testing our primary keys, we really mean testing their uniqueness and non-nullness. Given that not all modern data warehouses support or enforce primary key constraints, your data team will likely fall under two scenarios:
-
-1. For databases that support primary key enforcement, you should receive failures when your constraints are broken.
-2. For databases that don’t offer support and enforcement of primary keys, you’re going to need to regularly test that primary keys aren’t violating their golden rule of uniqueness and non-nullness. To do this, we recommend implementing a tool like dbt that allows you to define version-controlled and code-based tests on your data models. Using these tests, you should create [not null](https://docs.getdbt.com/reference/resource-properties/tests#not_null) and [unique](https://docs.getdbt.com/reference/resource-properties/tests#unique) tests for every primary key field throughout your dbt project. Other methods for primary key testing may look like writing custom tests or ad hoc queries that check for uniqueness and non-nullness (see the example query below).
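For example, an ad hoc check against a hypothetical `orders` table (the table and column names are illustrative) could be as simple as:

```sql
-- Both columns should return zero if order_id is a valid primary key:
-- count(order_id) skips nulls, and count(distinct order_id) collapses duplicates
select
    count(*) - count(order_id)                 as null_order_ids,
    count(order_id) - count(distinct order_id) as duplicate_order_ids
from prod.jaffle_shop.orders
```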
- -## Further reading - -* [Testing primary keys in dbt](https://docs.getdbt.com/blog/primary-key-testing) -* [Surrogate keys and dbt](https://docs.getdbt.com/blog/sql-surrogate-keys) -* [dbt Constraints Snowflake Labs package](https://hub.getdbt.com/snowflake-labs/dbt_constraints/latest/) diff --git a/website/docs/terms/relational-database.md b/website/docs/terms/relational-database.md deleted file mode 100644 index 8f05e5f4944..00000000000 --- a/website/docs/terms/relational-database.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -id: relational-database -title: Relational database -description: A relational database provides a structured way to store data into tables consisting of rows and columns. Different tables in a relational database can be joined together using common columns from each table, forming relationships. -displayText: relational database -hoverSnippet: A relational database provides a structured way to store data into tables consisting of rows and columns. Different tables in a relational database can be joined together using common columns from each table, forming relationships. ---- - - - Relational database: A way to get order out of data chaos - - -A relational database provides a structured way to store data into tables consisting of rows and columns. Different tables in a relational database can be joined together using common columns from each table, forming relationships. - -Analytics engineers use relational database models to process high volumes of data that, in its rawest form, is too difficult for an end user or analyst to read and comprehend. Thanks to these models, people can easily query, interpret, and derive insight out of data using the accessible SQL. - -Anyone who’s ever managed or modeled data will tell you that data points are only meaningful in relation to each other. The very philosophy behind data management and data analytics has centered on forming a narrative out of seemingly disparate elements. - -At the heart of this notion sits the relational database, which was first introduced by computer scientist E.F. Codd in the year 1970 — 13 years before the internet was even invented! - -## How relational databases work - -The legwork behind relational databases lies in establishing pre-defined relationships between tables, also called “entities”. For example, in the [jaffle_shop](https://github.com/dbt-labs/jaffle_shop) ecommerce store database where customers’ information is stored in a `customers` table and orders information is stored in an `orders` table, a relationship is defined such that each order is attributed to a customer. - -![](/img/docs/terms/relational-database/relation.png) - -The way relationships are defined is via primary keys and foreign keys. - -By definition, a is a column (or combination of columns as a surrogate key) which identifies a unique record. There can be only one primary key per table, and the primary key should be unique and not null. - -On the other hand, a foreign key is a column (or combination of columns) in one table that references the primary key in another table. In the above example, multiple orders can belong to one customer. Assuming that `id` is defined as the primary key for the `customers` table, `user_id` in the `orders` table would be the foreign key. - -In analytics engineering, where the focus is geared towards data modeling and creating a reporting layer for a BI tool, relational databases are a great fit. 
Data modeling defines how the data elements are related to each other, and a well-organized database is the cornerstone of effective data querying. - -## Use cases for relational databases - -Relational databases are best for structured data that can be organized into tables made up of rows and columns. Data teams rely on relational databases for storing transactional data, and also when data querying and data analysis is needed. - -### Transactional processing - -As mentioned earlier, relational databases are a great fit for transaction-oriented systems such as CRM tools, e-commerce platforms, or finance software. Companies tend to use relational databases when transactional consistency is required, as they offer a near failsafe environment for data accuracy and completion. When a transaction consists of several steps, the system treats the steps as a single transaction and assures that the operation follows an ‘all-or-nothing’ scenario, ie: the steps either all survive or all fail. - -### Modeling data and organizing it for analysis - -Relational databases support common data modeling techniques such as , Data Vault, or sometimes hybrid approaches that combine different modeling techniques. Such methodologies allow teams to organize their data into useful data structures. - -A data model is the overarching conceptual layer that organizes data entities and their relationships. The specific physical implementation of that data model including the definitions of data types and constraints constitutes the database schema. - -Having organized data entities also helps analytics engineers and analysts build meaningful queries that derive data in a format and granularity that is otherwise not directly available in the base database. - -Most analytics engineers have to deal with both relational (typically structured data) and non-relational data (typically unstructured data) coming in from multiple sources. The data is then transformed until it ultimately gets modeled into data entities using relational modeling approaches. More on non-relational databases in the following section, but in a nutshell, structured data is data that can be easily stored in a relational database system, while unstructured data is composed of formats that cannot easily (or at all) be broken down into tabular data. Common examples of unstructured data include video files, PDFs, audio files, and social media posts. - -Another popular format is semi-structured data which is inherently difficult to organize into rows and columns, but contains semantic markup that makes it possible to extract the underlying information. Some examples include XML and . - -Relational data warehouses provide relational databases that are specifically optimized for analytical querying rather than transaction processing. Increasingly, data warehouses are providing better support for unstructured data, or data that cannot be stored in relational tables. . - -Even when analytics engineers do not physically enforce relationships at the database level (many modern data warehouses allow for defining relational constraints but do not actually enforce them), they do follow a relational process. This process enables them to still organize the data into logical entities whenever possible, and in order to make sure that the data is not redundant and easily queryable. - -## Relational database vs. non-relational database - -The main difference between a relational and non-relational database is in how they store information. 
Relational databases are well-suited for data that is structured and store values in tables, and non-relational databases store data in a non-tabular form called unstructured data. - -As datasets are becoming dramatically more complex and less structured, the format of the ingested data can sometimes be unpredictable which makes the case for non-relational databases (also called NoSQL). - -NoSQL databases are also typically better suited for granular real-time monitoring. On the other hand, relational databases make it easier to look at transformed and aggregated data, making them a more appropriate fit for reporting and analytics. - -The below table summarizes the main differences between a relational and a non-relational database: - -| | Relational Database | Non-Relational Database | -|---|---|---| -| Data storage | Data is stored in tables. | Data is stored in document files, graph stores, key-value stores, or wide-column stores. | -| Data format | Data is structured. | Data is mainly unstructured. | -| Usage | Mainly used for recording transactions, data modeling, and data analysis. | Mainly used to ingest large volume real-time data streams. | -| Data Integrity | The relationships and constraints defined help ensure higher data integrity. | Non-relational databases do not guarantee data integrity. | -| Scalability | Scalable at a high price tag. | Highly scalable. | - -## Conclusion - -Relational databases store data in a systematic way, and support querying multiple tables together in order to generate business insights. - -Often starting off with unorganized and chaotic data, analytics engineers leverage relational databases to bring structure and consistency to their data. - -Relational databases also have a strong record of transactional consistency. While some companies are racing to embrace non-relational databases in order to handle the big volume of unstructured data, most of their workloads likely remain transactional and analytical in nature which is why relational databases are very common. - -## Further reading - -- [Glossary: Primary key](/terms/primary-key) -- [Glossary: Data warehouse](/terms/data-warehouse) diff --git a/website/docs/terms/subquery.md b/website/docs/terms/subquery.md deleted file mode 100644 index d7aecdd52cc..00000000000 --- a/website/docs/terms/subquery.md +++ /dev/null @@ -1,224 +0,0 @@ ---- -id: subquery -title: Subquery in SQL -description: "A subquery is what the name suggests: a query within another query. The true inception of SQL. Subqueries are often used when you need to process data in several steps." -displayText: subquery -hoverSnippet: A subquery is a query within another query. Subqueries are often used when you need to process data in multiple steps. ---- - - - What is a Subquery in SQL and when are they useful? - dbt Labs - -A subquery is what the name suggests: a query within another query. _The true inception of SQL_. Subqueries are often used when you need to process data in several steps. For the majority of subqueries you’ll see in actual practice, the inner query will execute first and pass its result to the outer query it's nested in. - -Subqueries are usually contrasted with Common Table Expressions (CTEs) as they have similar use cases. Unlike CTEs, which are usually separate `SELECT` statements within a query, subqueries are usually `SELECT` statements nested within a `JOIN`, `FROM`, or `WHERE` statement in a query. - -To be honest, we rarely write subqueries here at dbt Labs since we prefer to use CTEs. 
We find that CTEs, in general, support better query readability, organization, and debugging. However, subqueries are a foundational concept in SQL and still widely used. We hope you can use this glossary to better understand how to use subqueries and how they differ from CTEs. - -## Subquery syntax - -While there are technically several types of subqueries, the general syntax to build them is the same. A subquery usually consists of the following: - -- Enclosing parentheses -- A name -- An actual SELECT statement -- A main query it is nested in via a FROM, WHERE, or JOIN clause - -Let’s take this to an example, using the [sample jaffle_shop dataset](https://github.com/dbt-labs/jaffle_shop). - -```sql -select customer_id, count(order_id) as cnt_orders - from ( - select * from {{ ref('orders') }} - ) all_orders -group by 1 -``` - -Given the elements of subqueries laid out in the beginning, let’s break down this example into its respective parts. - -| Subquery elements | Example | -|---|---| -| Enclosing parentheses | :white_check_mark: | -| Subquery name | `all_orders` | -| `SELECT` statement | `select * from {{ ref('orders') }}` | -| Main query it is nested in | `select customer_id, count(order_id) as cnt_orders from all_orders group by 1` | - -When this query is actually executed, it will start by running the innermost query first. In this case, it would run `select * from {{ ref('orders') }}` first. Then, it would pass those results to the outer query, which is where you grab the count of orders by `customer_id`. - -```note Note -If you want to learn more about what a `ref` is, [check out our documentation on it.](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) -``` - -This is a relatively straightforward example, but should hopefully show you that subqueries start off like most other queries. As you nest more subqueries together, that’s when you unearth the power of subqueries, but also when you start to notice some readability tradeoffs. If you are using subqueries regularly, you'll want to leverage indenting and [strong naming conventions](https://docs.getdbt.com/blog/on-the-importance-of-naming) for your subqueries to clearly distinguish code functionality. - -## Types of subqueries - -In your day-to-day, you won’t normally formalize the names of the different types of subqueries you can write, but when someone uses the term “correlated subquery” at a data conference, you'll want to know what that means! - -### Nested subqueries - -Nested subqueries are subqueries like the one you saw in the first example: a subquery where the inner query is executed first (and once) and passes its result to the main query. The majority of subqueries you will see in the real world are likely to be a nested subquery. These are most useful when you need to process data in multiple steps. - -:::tip Debugging subqueries tip -It’s important to note that since the inner query is executed first in a nested subquery, the inner query must be able to execute by itself. If it’s unable to successfully run independently, it cannot pass results to the outer query. -::: - -### Correlated subqueries - -A correlated subquery is a nested subquery’s counterpart. If nested subqueries execute the inner query first and pass their result to the outer query, correlated subqueries execute the outer query first and pass their result to their inner query. For correlated subqueries, it’s useful to think about how the code is actually executed. - -In a correlated subquery, the outer query will execute row-by-row. 
For each row, that result from the outer query will be passed to the inner query. Compare this to nested queries: in a nested query, the inner query is executed first and only once before being passed to the outer query.
-
-These types of subqueries are most useful when you need to conduct analysis on a row-level.
-
-### Scalar and non-scalar subqueries
-
-Scalar subqueries are queries that only return a single value. More specifically, this means if you execute a scalar subquery, it would return one column value of one specific row. Non-scalar subqueries, however, can return single or multiple rows and may contain multiple columns.
-
-You may want to use a scalar subquery if you’re interested in passing only a single-row value into an outer query. This type of subquery can be useful when you’re trying to remove or update a specific row’s value using a Data Manipulation Language (DML) statement.
-
-## Subquery examples
-
-You may often see subqueries in joins and DML statements. The following sections contain examples for each scenario.
-
-### Subquery in a join
-
-In this example, you want to get the lifetime value per customer using your `raw_orders` and `raw_payments` table. Let’s take a look at how you can do that with a subquery in a join:
-
-```sql
-select
-
-    orders.user_id,
-    sum(all_payments.amount) as lifetime_value
-
-from {{ ref('raw_orders') }} as orders
-left join (
-
-    select
-
-        order_id,
-        amount
-
-    from {{ ref('raw_payments') }}
-
-) all_payments
-on orders.id = all_payments.order_id
-group by 1
-```
-
-Similar to what you saw in the first example, let’s break down the elements of this query.
-
-| Subquery elements | Example |
-|---|---|
-| Enclosing parentheses | :white_check_mark: |
-| Subquery name | `all_payments` |
-| `SELECT` statement | `select order_id, amount from {{ ref('raw_payments') }}` |
-| Main query it is nested in | `select orders.user_id, sum(all_payments.amount) as lifetime_value from {{ ref('raw_orders') }} as orders...` |
-
-In this example, the `all_payments` subquery will execute first. You use the data from this query to join on the `raw_orders` table to calculate lifetime value per user. Unlike the first example, the subquery happens in the join statement. Subqueries can happen in `JOIN`, `FROM`, and `WHERE` clauses.
-
-### Subquery in a DML command
-
-You may also see subqueries used in DML commands. As a refresher, DML commands are a series of SQL statements that you can write to access and manipulate row-level data in database objects. Oftentimes, you’ll want to use a query result in a qualifying `WHERE` clause to only delete, update, or manipulate certain rows of data.
-
-In the following example, you'll attempt to update the status of certain orders based on the payment method used in the `raw_payments` table.
-
-```sql
-UPDATE raw_orders
-set status = 'returned'
-where order_id in (
-    select order_id
-    from raw_payments
-    where payment_method = 'bank_transfer'
-)
-```
-
-## Subquery vs CTE
-
-A subquery is a nested query that can oftentimes be used in place of a CTE. Subqueries have different syntax than CTEs, but often have similar use cases. The content won’t go too deep into CTEs here, but it’ll highlight some of the main differences between CTEs and subqueries below.
- -| CTE | Subquery | -|---|---| -| Typically more readable since CTEs can be used to give structure to your query | Typically less readable, especially if there are many nested queries | -| Reusable in the same query | Must declare the subquery everytime it is used in a query | -| Allows for recursiveness | Does not allow for recursiveness | -| CTEs must have unique CTE_EXPRESSION_NAMES when used in a query | Subqueries don’t always have to be explicitly named | -| CTEs cannot be used in a `WHERE` clause | Subqueries can be used in a `WHERE` clause | - -### Subquery vs CTE example - -The following example demonstrates the similarities and differences between subqueries and CTEs. Using the [first subquery example](#subquery-in-a-join), you can compare how you would perform that query using subquery or a CTE: - - - - -```sql Subquery example -select customer_id, count(order_id) as cnt_orders - from ( - - select * from {{ ref('orders') }} - - ) all_orders -group by 1 -``` - - - -```sql CTE example -with all_orders as ( - -select * from {{ ref('orders') }} - -), -aggregate_orders as ( - - select - - customer_id, - count(order_id) as cnt_orders - - from all_orders - group by 1 - -) -select * from aggregate_orders -``` - - - - -While the code for the query involving CTEs may be longer in lines, it also allows us to explicitly define code functionality using the CTE name. Unlike the subquery example that executes its inner query and then the outer query, the query using CTEs executes moving down the code. - -Again, choosing to use CTEs over subqueries is a personal choice. It may help to write out the same code functionality in a subquery and with CTEs and see what is more understandable to you. - -## Data warehouse support for subqueries - -Subqueries are likely to be supported across most, if not all, modern data warehouses. Please use this table to see more information about using subqueries in your specific data warehouse. - -| Data warehouse | Supports subqueries? | -|---|---| -| [Snowflake](https://docs.snowflake.com/en/user-guide/querying-subqueries.html) | :white_check_mark: | -| [Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_Subquery_examples.html) | :white_check_mark: | -| [Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/subqueries) | :white_check_mark: | -| [Databricks](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-qry-query.html) | :white_check_mark: | -| [Postgres](https://www.postgresqltutorial.com/postgresql-subquery/) | :white_check_mark: | - -## Conclusion - -I’m going to be honest, I was hesitant to start writing the glossary page for SQL subqueries. As someone who has been using CTEs almost exclusively in their data career, I was intimidated by this concept. However, I am excited to say: Subqueries are not as scary as I expected them to be! - -At their core, subqueries are nested queries within a main query. They are often implemented in `FROM`, `WHERE`, and `JOIN` clauses and are used to write code that builds on itself. Despite the fact that subqueries are SQL like any other query, it is important to note that subqueries can struggle in their readability, structure, and debugging process due to their nested nature. Because of these downsides, we recommend leveraging CTEs over subqueries whenever possible. - -I have not been made a subquery convert, but I’m walking away from this a little less intimidated by subqueries and I hope you are too. 
- -## Further reading - -Please check out some of our favorite readings related to subqueries! - -- [Glossary: CTE](https://docs.getdbt.com/terms/cte) -- [On the importance of naming: model naming conventions (Part 1)](https://docs.getdbt.com/blog/on-the-importance-of-naming) diff --git a/website/docs/terms/surrogate-key.md b/website/docs/terms/surrogate-key.md deleted file mode 100644 index a53db3090cd..00000000000 --- a/website/docs/terms/surrogate-key.md +++ /dev/null @@ -1,196 +0,0 @@ ---- -id: surrogate-key -title: Surrogate key -description: A surrogate key is a unique identifier derived from the data itself. It's commonly a hashed value of multiple columns that will create a unique id for each row. -displayText: surrogate key -hoverSnippet: A surrogate key is a unique identifier derived from the data itself. It often takes the form of a hashed value of multiple columns that will create a uniqueness constraint for each row. ---- - - - What is a surrogate key in database table? - dbt Labs - - -A surrogate key is a unique identifier derived from the data itself. It often takes the form of a hashed value of multiple columns that will create a uniqueness constraint for each row. You will need to create a surrogate key for every table that doesn't have a natural . - -Why would you ever need to make a surrogate key? Shouldn’t all tables innately just have a field that uniquely identifies each row? Now that would be too easy… - -Let’s say you have a table with all license plate numbers and the state of the plate. While license plate numbers are unique to their state, there could be duplicate license plate numbers across different states. So by default, there’s no natural key that can uniquely identify each row here. In order to uniquely identify each record in this table, you could create a surrogate key based on the unique combination of license plate number and its state. - -## Surrogate keys, natural keys, and primary keys oh my! - -Primary keys can be established two ways: naturally or derived through the data in a surrogate key. - -* A __natural key__ is a primary key that is innate to the data. Perhaps in some tables there’s a unique `id` field in each table that would act as the natural key. You can use documentation like entity relationship diagrams (ERDs) to help understand natural keys in APIs or backend application database tables. -* A __surrogate key__ is a hashed value of multiple fields in a dataset that create a uniqueness constraint on that dataset. You’ll essentially need to make a surrogate key in every table that lacks a natural key. - -:::note Note -You may also hear about primary keys being a form of a _constraint_ on a database object. Column constraints are specified in the to create or alter a database object. For data warehouses that support the enforcement of primary key constraints, this means that an error would be raised if a field's uniqueness or non-nullness was broken upon an `INSERT` or `UPDATE` statement. Most modern data warehouses don’t support _and_ enforce [primary key constraints](https://docs.getdbt.com/terms/primary-key#Data-warehouse-support-for-primary-keys), so it’s important to have [automated testing](https://docs.getdbt.com/blog/primary-key-testing#how-to-test-primary-keys-with-dbt) in-place to ensure your primary keys are unique and not null. -::: - -## How surrogate keys are created - -In analytics engineering, you can generate surrogate keys using a hashing method of your choice. 
Remember, in order to truly create a uniqueness constraint on a database object, you’ll need to hash the fields together that _make each row unique_; when you generate a correct surrogate key for a dataset, you’re really establishing the true of that dataset. - -Let’s take this to an example. Below, there is a table you pull from an ad platform that collects `calendar_date`, `ad_id`, and some performance columns. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
calendar_datead_idimpressionsspendclicksconversions
2022-05-16212887444523.009432166
2022-05-162143236.4940
2022-05-05212125600117244.561731856
- - -In this state, this table has no natural key that can act as a primary key. You know the grain of this table: this is showing performance for each `ad_id` per `calendar_date`. Therefore, hashing those two fields will create a uniqueness constraint on this table. - -To create a surrogate key for this table using the MD5 function, run the following: - -```sql -select - md5(calendar_date || ad_id) as unique_id, - * -from {{ source('ad_platform', 'custom_daily_report')}} -``` - -After executing this, the table would now have the `unique_id` field now uniquely identifying each row. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
unique_idcalendar_datead_idimpressionsspendclicksconversions
62aef884fbe3470ce7d9a92140b09b172022-05-16212887444523.009432166
ea385f7a5e560ef4d8a78f7d913927e42022-05-162143236.4940
53a33f257d1d4f2446469ac5adad1c0c2022-05-05212125600117244.561731856
- -## Testing surrogate keys - -Amazing, you just made a surrogate key! You can just move on to the next data model, right? No!! It’s critically important to test your surrogate keys for uniqueness and non-null values to ensure that the correct fields were chosen to create the surrogate key. - -In order to test for null and unique values you can utilize code-based data tests like [dbt tests](/docs/build/data-tests), that can check fields for nullness and uniqueness. You can additionally utilize simple SQL queries or unit tests to check if surrogate key count and non-nullness is correct. - -## A note on hashing algorithms - -Depending on your data warehouse, there’s several cryptographic hashing options to create surrogate keys. The primary hashing methods include MD5 or other algorithms, like HASH or SHA. Choosing the appropriate hashing function is dependent on your dataset and what your warehouse supports. - - - - - - - - - - - - - - - - - - - - - - -
-| Hashing algorithm | Bit length | Known collisions? |
-|---|---|---|
-| HASH | 64 bits | Yes, past ~4 billion elements |
-| MD5 | 128 bits | Yes, but incredibly unlikely |
-| SHA256 | 256 bits | No |
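For reference, warehouses typically expose these algorithms as ordinary SQL functions, though function names and signatures vary. Here's a Snowflake-style sketch — the `ad_performance` relation is hypothetical:

```sql
-- Snowflake-style syntax shown as an example; other warehouses name these functions differently
select
    md5(calendar_date::varchar || '-' || ad_id::varchar)       as md5_surrogate_key,
    sha2(calendar_date::varchar || '-' || ad_id::varchar, 256) as sha256_surrogate_key
from ad_performance
```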
- -:::note Note -A collision occurs when two pieces of data that are different end up hashing to the same value. If a collision occurs, a different hashing method should be used. -::: - - -## Why we like surrogate keys - -Let’s keep it brief: surrogate keys allow data folks to quickly understand the grain of the database object and are compatible across many different data warehouses. - - -### Readability - -Because surrogate keys are comprised of the fields that make a uniqueness constraint on the data, you can quickly identify the grain of the data. For example, if you see in your data model that the surrogate key field is created by hashing the `ad_id` and `calendar_date` fields, you can immediately know the true grain of the data. When you clearly understand the grain of a database object, this can make for an easier understanding of how entities join together and fan out. - - -### Compatibility - -Making a surrogate key involves a relatively straightforward usage of SQL: maybe some coalescing, concatenation, and a hashing method. Most, if not all, modern data warehouses support both the ability to concat, coalesce, and hash fields. They may not have the exact same syntax or hashing functions available, but their core functionality is the same. - -:::tip Tip -dbt supports several macros to help data folks write DRY (don’t repeat yourself) code. The [surrogate_key macro](https://github.com/dbt-labs/dbt-utils#surrogate_key-source) helps you create surrogate keys with the MD5 function without having to worry about coalescing potentially null field values. -::: - - -## Performance concerns for surrogate keys - -In the past, you may have seen surrogate keys take the form of integers (ex. 1, 2, 3, 4). These surrogate keys were often limited to 4-bit integers that could be indexed quickly. However, in the practice of analytics engineering, surrogate keys derived from the data often take the form of a hashed string value. Given this form, these surrogate keys are not necessarily optimized for performance for large table scans and complex joins. For large data models (millions, billions, trillions of rows) that have surrogate keys, you should materialize them as tables or [incremental models](https://docs.getdbt.com/docs/build/incremental-models) to help make joining entities more efficient. - -## Conclusion - -Surrogate keys are unique row identifiers that are created by using columns in a database object to create a uniqueness constraint on the data. To create a surrogate key, you will use a cryptographic algorithm usually in the form of the MD5 function to hash together fields that create a uniqueness constraint on the dataset. Ultimately, surrogate keys are a great way to create unique row identifiers for database objects that lack them naturally and allow folks to easily identify the grain of the data. - -## Further reading - -Want to learn more about keys, dbt, and everything in-between? 
Check out the following: - -* [Glossary: Primary keys](https://docs.getdbt.com/terms/primary-key) -* [Generating surrogate keys across warehouses](https://docs.getdbt.com/blog/sql-surrogate-keys) -* [Generating an auto-incrementing ID in dbt](https://discourse.getdbt.com/t/generating-an-auto-incrementing-id-in-dbt/579/2) -* [The most underutilized function in SQL](https://www.getdbt.com/blog/the-most-underutilized-function-in-sql/) diff --git a/website/docs/terms/table.md b/website/docs/terms/table.md deleted file mode 100644 index bfc4e680660..00000000000 --- a/website/docs/terms/table.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -id: table -title: Table -description: "Read this guide to understand how tables work in dbt." -displayText: table -hoverSnippet: In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. ---- - -In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. - -Here is an example of a table: - -| character_id | first_name | last_name | email | -| ------------ | ------------ | --------- | --------------------- | -| 01 | Frodo | Baggins | frodo@lotr.com | -| 02 | Bilbo | Baggins | bilbo@theshire.co.uk | -| 03 | Gandalf | The Grey | greywizard1@gmail.com | - -Tables do use storage in your . The data can be queried directly because you are directly pulling from the raw data itself. If a particular table was created by underlying data, the table will not be automatically updated. - -This table definition applies to most data warehouses, however, there are different flavors of tables for different warehouses. For example, Snowflake has transient and temporary tables that support different features. - -## Why are tables useful? - -Tables are an excellent choice for persisting transformed data in your warehouse at the time of execution. However, if the underlying data used is changed, the table will not reflect the underlying changes. If that is something you need, dbt Labs recommends views. diff --git a/website/docs/terms/view.md b/website/docs/terms/view.md deleted file mode 100644 index 53c122ca9e6..00000000000 --- a/website/docs/terms/view.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -id: view -title: View -description: Read this guide to understand how views work in dbt. -displayText: view -hoverSnippet: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or data warehouse). ---- -:::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! -::: - -A view (as opposed to a ) is a defined passthrough SQL query that can be run against a database (or ). A view doesn’t store data, like a table does, but it defines the logic that you need to fetch the underlying data. - -For example, you might define a SQL view to count new users in a day: - -```sql - select - created_date, - count(distinct(user_id)) as new_users - from users - group by created_date -``` - -But this SQL might get tedious to write over and over again, so instead you could define it as a view called `new_users`, and instead query `select * from new_users`. - -When that `new_users` query runs, the underlying view compiles and runs against the database. 
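As a rough sketch of what that looks like at the database level — the `analytics` schema is an assumed name, and the exact DDL varies by warehouse — the view definition and its usage might be:

```sql
-- Hypothetical DDL the warehouse might run to define the view
create or replace view analytics.new_users as
    select
        created_date,
        count(distinct user_id) as new_users
    from users
    group by created_date;

-- Downstream consumers query the view like a table;
-- the underlying select runs at query time
select * from analytics.new_users;
```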
- -## Tips on using views - -A healthy relationship with views is built on expectations. - -You shouldn’t expect a view in itself to be your final destination in terms of data modeling (they’re slow + often more costly to query than tables, not great for connecting to a downstream process like reporting), but you should trust them to get you from point A to point B. - -## Further reading - -- [Best practices guide on choosing table vs view materializations](/best-practices) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 74aade52e50..a6cb4e40628 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -79,13 +79,13 @@ var siteSettings = { }, announcementBarActive: true, announcementBarLink: - "https://www.getdbt.com/resources/webinars/dbt-cloud-demos-with-experts/?utm_medium=internal&utm_source=docs&utm_campaign=q2-2025_biweekly-demos_aw&utm_content=biweekly-demos____&utm_term=all_all__", + "https://www.getdbt.com/resources/webinars/dbt-cloud-demos-with-experts/?utm_medium=i[…]ly-demos_aw&utm_content=biweekly-demos____&utm_term=all_all__", // Set community spotlight member on homepage // This is the ID for a specific file under docs/community/spotlight - communitySpotlightMember: "meagan-palmer", + communitySpotlightMember: "original-dbt-athena-maintainers", prism: { theme: (() => { - var theme = themes.nightOwl; + var theme = themes.nightOwl; // Add additional rule to nightowl theme in order to change // the color of YAML keys (to be different than values). // There weren't many Prism themes that differentiated @@ -161,10 +161,6 @@ var siteSettings = { label: "Developer blog", to: "/blog", }, - { - label: "Glossary", - to: "/glossary", - }, ], }, { @@ -205,8 +201,15 @@ var siteSettings = { links: [ { html: ` + + diff --git a/website/snippets/_auto-exposures-view.md b/website/snippets/_auto-exposures-view.md new file mode 100644 index 00000000000..d30b47ae21d --- /dev/null +++ b/website/snippets/_auto-exposures-view.md @@ -0,0 +1,20 @@ +## View auto-exposures in dbt Explorer + +After setting up auto-exposures in dbt Cloud, you can view them in dbt Explorer for a richer experience: +1. Navigate to dbt Explorer by clicking on the **Explore** link in the navigation. +2. From the **Overview** page, you can view auto-exposures from a couple of places: + - From the **Exposures** menu item under **Resources**. This menu provides a comprehensive list of all the exposures so you can quickly access and manage them. + + + - Locate directly from within the **File tree** under the **imported_from_tableau** sub-folder. This view integrates exposures seamlessly with your project files, making it easy to find and reference them from your project's structure. + + + - From the **Project lineage** view, which visualizes the dependencies and relationships in your project. Exposures are represented with the Tableau icon, offering an intuitive way to see how they fit into your project's overall data flow. + + + + + + + + diff --git a/website/snippets/_bigquery-dataproc.md b/website/snippets/_bigquery-dataproc.md new file mode 100644 index 00000000000..054ab7cb64d --- /dev/null +++ b/website/snippets/_bigquery-dataproc.md @@ -0,0 +1,3 @@ +To run dbt Python models on GCP, dbt uses companion services, Dataproc and Cloud Storage, that offer tight integrations with BigQuery. 
You may use an existing Dataproc cluster and Cloud Storage bucket, or create new ones: +- https://cloud.google.com/dataproc/docs/guides/create-cluster +- https://cloud.google.com/storage/docs/creating-buckets \ No newline at end of file diff --git a/website/snippets/_cloud-environments-info.md b/website/snippets/_cloud-environments-info.md index 166165be855..6addd6a3a7a 100644 --- a/website/snippets/_cloud-environments-info.md +++ b/website/snippets/_cloud-environments-info.md @@ -82,39 +82,5 @@ If you're developing in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in #### Only the **top-level keys** are accepted in extended attributes This means that if you want to change a specific sub-key value, you must provide the entire top-level key as a JSON block in your resulting YAML. For example, if you want to customize a particular field within a [service account JSON](/docs/core/connect-data-platform/bigquery-setup#service-account-json) for your BigQuery connection (like 'project_id' or 'client_email'), you need to provide an override for the entire top-level `keyfile_json` main key/attribute using extended attributes. Include the sub-fields as a nested JSON block. -### Git repository caching -At the start of every job run, dbt Cloud clones the project's Git repository so it has the latest versions of your project's code and runs `dbt deps` to install your dependencies. -For improved reliability and performance on your job runs, you can enable dbt Cloud to keep a cache of the project's Git repository. So, if there's a third-party outage that causes the cloning operation to fail, dbt Cloud will instead use the cached copy of the repo so your jobs can continue running as scheduled. - -dbt Cloud caches your project's Git repo after each successful run and retains it for 8 days if there are no repo updates. It caches all packages regardless of installation method and does not fetch code outside of the job runs. - -dbt Cloud will use the cached copy of your project's Git repo under these circumstances: - -- Outages from third-party services (for example, the [dbt package hub](https://hub.getdbt.com/)). -- Git authentication fails. -- There are syntax errors in the `packages.yml` file. You can set up and use [continuous integration (CI)](/docs/deploy/continuous-integration) to find these errors sooner. -- If a package doesn't work with the current dbt version. You can set up and use [continuous integration (CI)](/docs/deploy/continuous-integration) to identify this issue sooner. - -To enable Git repository caching, select **Account settings** from the gear menu and enable the **Repository caching** option. - - - -:::note - -This feature is only available on the dbt Cloud Enterprise plan. - -::: - -### Partial parsing - -At the start of every dbt invocation, dbt reads all the files in your project, extracts information, and constructs an internal manifest containing every object (model, source, macro, and so on). Among other things, it uses the `ref()`, `source()`, and `config()` macro calls within models to set properties, infer dependencies, and construct your project's DAG. When dbt finishes parsing your project, it stores the internal manifest in a file called `partial_parse.msgpack`. - -Parsing projects can be time-consuming, especially for large projects with hundreds of models and thousands of files. To reduce the time it takes dbt to parse your project, use the partial parsing feature in dbt Cloud for your environment. 
When enabled, dbt Cloud uses the `partial_parse.msgpack` file to determine which files have changed (if any) since the project was last parsed, and then it parses _only_ the changed files and the files related to those changes. - -Partial parsing in dbt Cloud requires dbt version 1.4 or newer. The feature does have some known limitations. Refer to [Known limitations](/reference/parsing#known-limitations) to learn more about them. - -To enable, select **Account settings** from the gear menu and enable the **Partial parsing** option. - - diff --git a/website/snippets/_core-to-cloud-guide-table.md b/website/snippets/_core-to-cloud-guide-table.md index 415476dace0..8648ec29c32 100644 --- a/website/snippets/_core-to-cloud-guide-table.md +++ b/website/snippets/_core-to-cloud-guide-table.md @@ -3,3 +3,14 @@ | [Move from dbt Core to dbt Cloud: What you need to know](/guides/core-cloud-2) | Understand the considerations and methods needed in your move from dbt Core to dbt Cloud. | Team leads
Admins | | [Move from dbt Core to dbt Cloud: Get started](/guides/core-to-cloud-1?step=1) | Learn the steps needed to move from dbt Core to dbt Cloud. | Developers
Data engineers
Data analysts | | [Move from dbt Core to dbt Cloud: Optimization tips](/guides/core-to-cloud-3) | Learn how to optimize your dbt Cloud experience with common scenarios and useful tips. | Everyone | + +### Why move to dbt Cloud? +If your team is using dbt Core today, you could be reading this guide because: + +- You’ve realized the burden of maintaining that deployment. +- The person who set it up has since left. +- You’re interested in what dbt Cloud could do to better manage the complexity of your dbt deployment, democratize access to more contributors, or improve security and governance practices. + +Moving from dbt Core to dbt Cloud simplifies workflows by providing a fully managed environment that improves collaboration, security, and orchestration. With dbt Cloud, you gain access to features like cross-team collaboration ([dbt Mesh](/best-practices/how-we-mesh/mesh-1-intro)), version management, streamlined CI/CD, [dbt Explorer](/docs/collaborate/explore-projects) for comprehensive insights, and more — making it easier to manage complex dbt deployments and scale your data workflows efficiently. + +It's ideal for teams looking to reduce the burden of maintaining their own infrastructure while enhancing governance and productivity. diff --git a/website/snippets/_enterprise-permissions-table.md b/website/snippets/_enterprise-permissions-table.md index cef68e894f5..688e8911bf4 100644 --- a/website/snippets/_enterprise-permissions-table.md +++ b/website/snippets/_enterprise-permissions-table.md @@ -19,6 +19,7 @@ Account roles enable you to manage the dbt Cloud account and manage the account | Audit logs | R | | | | R | R | | Auth provider | W | | | | W | R | | Billing | W | W | | | | R | +| Connections | W | | | W | | | | Groups | W | | | R | W | R | | Invitations | W | | | W | W | R | | IP restrictions | W | | | | W | R | @@ -34,7 +35,6 @@ Account roles enable you to manage the dbt Cloud account and manage the account |Project-level permission | Account Admin | Billing admin | Project creator | Security admin | Viewer | |:-------------------------|:-------------:|:-------------:|:---------------:|:--------------:|:------:| -| Data platform connections | W | | W | | R | | Environment credentials (deployment) | W | | W | | R | | Custom env. variables | W | | W | | R | | Data platform configurations | W | | W | | R | @@ -43,7 +43,6 @@ Account roles enable you to manage the dbt Cloud account and manage the account | Jobs | W | | W | | R | | Metadata GraphQL API access | R | | R | | R | | Permissions | W | | W | W | R | -| Profile | W | | W | | R | | Projects | W | | W | R | R | | Repositories | W | | W | | R | | Runs | W | | W | | R | @@ -61,6 +60,7 @@ The project roles enable you to work within the projects in various capacities. | Account settings | R | | R | | R | | | | | | | R | | | Auth provider | | | | | | | | | | | | | | | Billing | | | | | | | | | | | | | | +| Connections | R | R | R | R | R | R | | | | | R | R | | | Groups | R | | R | R | R | | | | | | R | R | | | Invitations | W | R | R | R | R | R | | R | | | R | R | | | Licenses | W | R | R | R | R | R | | R | | | | R | | @@ -74,17 +74,15 @@ The project roles enable you to work within the projects in various capacities. |Project-level permission | Admin | Analyst | Database admin | Developer | Git Admin | Job admin | Job runner | Job viewer | Metadata

(Discovery API only) | Semantic Layer | Stakeholder | Team admin | Webhook | |--------------------------|:-----:|:-------:|:--------------:|:---------:|:---------:|:---------:|:-----------:|:-----------:|:--------:|:--------------:|:-----------:|:----------:|:-------:| -| Data platform connections | W | R | W | R | R | R | | | | | R | R | | -| Environment credentials (deployment) | W | W | W | W | R | W | | | | | R | R | | +| Environment credentials (deployment) | W | W | W | W | R | W | | | | | R | R | | | Custom env. variables | W | W | W | W | W | W | | R | | | R | W | | -| Data platform configurations | W | W | W | W | R | W | | | | | R | R | | +| Data platform configurations| W | W | W | W | R | W | | | | | R | R | | | Develop
(IDE or dbt Cloud CLI) | W | W | | W | | | | | | | | | | | Environments | W | R | R | R | R | W | | R | | | R | R | | -| Jobs | W | R | R | W | R | W | R | R | | | R | R | | +| Jobs | W | R | R | R | R | W | R | R | | | R | R | | | Metadata GraphQL API access | R | R | R | R | R | R | | R | R | | R | R | | -| Permissions (Groups & Licenses) | W | | R | R | R | | | | | | | R | | -| Profile (Credentials) | W | R | | R | R | R | | | | | R | | | +| Permissions (Groups & Licenses) | W | | R | R | R | | | | | | | R | | | | | R | | | | Projects | W | W | W | W | W | R | | R | | | R | W | | | Repositories | W | | R | R | W | | | | | | R | R | | -| Runs | W | R | R | W | R | W | W | R | | | R | R | | +| Runs | W | R | R | R | R | W | W | R | | | R | R | | | Semantic Layer config | W | R | W | R | R | R | | | | W | R | R | | diff --git a/website/snippets/_log-relational-cache.md b/website/snippets/_log-relational-cache.md new file mode 100644 index 00000000000..4249030f94e --- /dev/null +++ b/website/snippets/_log-relational-cache.md @@ -0,0 +1,5 @@ +

The `LOG_CACHE_EVENTS` config allows detailed logging for {props.event}, which are disabled by default.

+ +```text +dbt --log-cache-events compile +``` diff --git a/website/snippets/_mesh-cycle-detection.md b/website/snippets/_mesh-cycle-detection.md index 2a48d0a15bd..2b4b17385ba 100644 --- a/website/snippets/_mesh-cycle-detection.md +++ b/website/snippets/_mesh-cycle-detection.md @@ -1,8 +1,5 @@ -Currently, the default behavior for "project" dependencies enforces that these relationships only go in one direction, meaning that the `jaffle_finance` project could not add a new model that depends, on any public models produced by the `jaffle_marketing` project. dbt will check for cycles across projects and raise errors if any are detected. +You can enable bidirectional dependencies across projects so these relationships can go in either direction, meaning that the `jaffle_finance` project can add a new model that depends on any public models produced by the `jaffle_marketing` project, so long as the new dependency doesn't introduce any node-level cycles. dbt checks for cycles across projects and raises errors if any are detected. -However, many teams may want to be able to share data assets back and forth between teams. _We've added support for enabling bidirectional dependencies across projects, currently in beta_. - -To enable this in your account, set the environment variable `DBT_CLOUD_PROJECT_CYCLES_ALLOWED` to `TRUE` in all your dbt Cloud environments. This allows you to create bidirectional dependencies between projects, so long as the new dependency does not introduce any node-level cycles. When setting up projects that depend on each other, it's important to do so in a stepwise fashion. Each project must run and produce public models before the original producer project can take a dependency on the original consumer project. For example, the order of operations would be as follows for a simple two-project setup: @@ -10,5 +7,3 @@ When setting up projects that depend on each other, it's important to do so in a 2. The `project_b` project adds `project_a` as a dependency. 3. The `project_b` project runs in a deployment environment and produces public models. 4. The `project_a` project adds `project_b` as a dependency. - -If you enable this feature and experience any issues, please reach out to [dbt Cloud support](mailto:support@getdbt.com). diff --git a/website/snippets/_new-sl-setup.md b/website/snippets/_new-sl-setup.md index 383a1215f5b..39cd2b22b9a 100644 --- a/website/snippets/_new-sl-setup.md +++ b/website/snippets/_new-sl-setup.md @@ -1,4 +1,6 @@ -You must be part of the Owner group and have the correct [license](/docs/cloud/manage-access/seats-and-users) and [permissions](/docs/cloud/manage-access/self-service-permissions) to set up the Semantic Layer at the environment and project level. +import SLEnvVars from '/snippets/_sl-env-vars.md'; + +You must be part of the Owner group and have the correct [license](/docs/cloud/manage-access/seats-and-users) and [permissions](/docs/cloud/manage-access/enterprise-permissions) to set up the Semantic Layer at the environment and project level. - Enterprise plan: - Developer license with Account Admin permissions, or - Owner with a Developer license, assigned Project Creator, Database Admin, or Admin permissions. @@ -13,32 +15,41 @@ Select the environment where you want to enable the Semantic Layer: 2. On the **Settings** left sidebar, select the specific project you want to enable the Semantic Layer for. 3. In the **Project details** page, navigate to the **Semantic Layer** section. Select **Configure Semantic Layer**. - + 4. 
In the **Set Up Semantic Layer Configuration** page, select the deployment environment you want for the Semantic Layer and click **Save**. This provides administrators with the flexibility to choose the environment where the Semantic Layer will be enabled. -:::tip dbt Cloud Enterprise can skip to [Add more credentials](#4-add-more-credentials) -dbt Cloud Enterprise plans can add multiple credentials and have a different set up. Skip to [Add more credentials](#4-add-more-credentials) for more configuration details. -::: + ### 2. Add a credential and create service tokens -The dbt Semantic Layer uses [service tokens](/docs/dbt-cloud-apis/service-tokens) for authentication which are tied to an underlying data platform credential that you configure. The credential configured is used to execute queries that the Semantic Layer issues against your data platform. This credential controls the physical access to underlying data accessed by the Semantic Layer, and all access policies set in the data platform for this credential will be respected. +The dbt Semantic Layer uses [service tokens](/docs/dbt-cloud-apis/service-tokens) for authentication which are tied to an underlying data platform credential that you configure. The credential configured is used to execute queries that the Semantic Layer issues against your data platform. + +This credential controls the physical access to underlying data accessed by the Semantic Layer, and all access policies set in the data platform for this credential will be respected. + +| Feature | Team plan | Enterprise plan | +| --- | :---: | :---: | +| Service tokens | Can create multiple service tokens linked to one credential. | Can use multiple credentials and link multiple service tokens to each credential. Note that you cannot link a single service token to more than one credential. | +| Credentials per project | One credential per project. | Can [add multiple](#4-add-more-credentials) credentials per project. | +| Link multiple service tokens to a single credential | ✅ | ✅ | -dbt Cloud Enterprise plans can add multiple credentials and map those to service tokens. Refer to [Add more credentials](#4-add-more-credentials) for more information. +*If you're on a Team plan and need to add more credentials, consider upgrading to our [Enterprise plan](https://www.getdbt.com/contact). Enterprise users can refer to [Add more credentials](#4-add-more-credentials) for detailed steps on adding multiple credentials.* -1. In the **Set Up Semantic Layer Configuration** page, enter the credentials specific to your data platform that you want the Semantic Layer to use. +1. After selecting the deployment environment, you should see the **Credentials & service tokens** page. +2. Click the **Add Semantic Layer credential** button. +3. In the **1. Add credentials** section, enter the credentials specific to your data platform that you want the Semantic Layer to use. - Use credentials with minimal privileges. The Semantic Layer requires read access to the schema(s) containing the dbt models used in your semantic models for downstream applications - - Note, environment variables such as `{{env_var('DBT_WAREHOUSE')}`, aren't supported in the dbt Semantic Layer yet. You must use the actual credentials. - + - -2. Create a **Service Token** after you add the credential. - * Enterprise plans: Name and generate a service token on the credential page directly. - * Team plans: You can return to the **Project Details** page and click the **Generate a Service Token** button. -3. 
Name the token and save it. Once the token is generated, you won't be able to view this token again so make sure to record it somewhere safe. + + +4. After adding credentials, scroll to **2. Map new service token**. +5. Name the token and ensure the permission set includes 'Semantic Layer Only' and 'Metadata Only'. +6. Click **Save**. Once the token is generated, you won't be able to view this token again so make sure to record it somewhere safe. :::info -Teams plans can create multiple service tokens that map to one underlying credential. Adding [multiple credentials](#4-add-more-credentials) for tailored access is available for Enterprise plans. +- Team plans can create multiple service tokens that link to a single underlying credential, but each project can only have one credential. +- Enterprise plans can [add multiple credentials](#4-add-more-credentials) and map those to service tokens for tailored access. Book a free live demo to discover the full potential of dbt Cloud Enterprise. ::: @@ -59,24 +70,39 @@ We recommend configuring credentials and service tokens to reflect your teams an Note that: - Admins can link multiple service tokens to a single credential within a project, but each service token can only be linked to one credential per project. - When you send a request through the APIs, the service token of the linked credential will follow access policies of the underlying view and tables used to build your semantic layer requests. -- [Environment variables](/docs/build/environment-variables), like `{{env_var('DBT_WAREHOUSE')}` aren't supported the dbt Semantic Layer yet. You must use the actual credentials instead. +- To add multiple credentials and map them to service tokens: -1. After configuring your environment, on the **Credentials & service tokens** page click the **Add Semantic Layer credential** button to configure the credential for your data platform. -2. On the **Create New Semantic Layer Credential** page, you can create multiple credentials and map them to a service token. -3. In the **Add credentials** section, fill in the data platform's credential fields. We recommend using “read-only” credentials. +1. After configuring your environment, on the **Credentials & service tokens** page, click the **Add Semantic Layer credential** button to create multiple credentials and map them to a service token. +2. In the **1. Add credentials** section, fill in the data platform's credential fields. We recommend using “read-only” credentials. -4. In the **Map new service token** section, map a service token to the credential you configured in the previous step. dbt Cloud automatically selects the service token permission set you need (Semantic Layer Only and Metadata Only). - - To add another service token, click **Add service token** under the **Linked service tokens** section. -5. Click **Save** to link the service token to the credential. Remember to copy and save the service token securely, as it won't be viewable again after generation. - +3. In the **2. Map new service token** section, map a service token to the credential you configured in the previous step. dbt Cloud automatically selects the service token permission set you need (Semantic Layer Only and Metadata Only). + +4. To add another service token during configuration, click **Add Service Token**. +5. You can link more service tokens to the same credential later on in the **Semantic Layer Configuration Details** page. 
To add another service token to an existing Semantic Layer configuration, click **Add service token** under the **Linked service tokens** section. +6. Click **Save** to link the service token to the credential. Remember to copy and save the service token securely, as it won't be viewable again after generation. + + +7. To delete a credential, go back to the **Credentials & service tokens** page. +8. Under **Linked Service Tokens**, click **Edit** and select **Delete Credential** to remove a credential. -6. To delete a credential, go back to the **Semantic Layer & Credential**s page. Select **Delete credential** to remove a credential and click **Save**. - When you delete a credential, any service tokens mapped to that credential in the project will no longer work and will break for any end users. +## Delete configuration +You can delete the entire Semantic Layer configuration for a project. Note that deleting the Semantic Layer configuration will remove all credentials and unlink all service tokens from the project. It will also cause all queries to the Semantic Layer to fail. + +Follow these steps to delete the Semantic Layer configuration for a project: + +1. Navigate to the **Project details** page. +2. In the **Semantic Layer** section, select **Delete Semantic Layer**. +3. Confirm the deletion by clicking **Yes, delete semantic layer** in the confirmation pop-up. + +To re-enable the dbt Semantic Layer setup in the future, you will need to recreate your setup configurations by following the [previous steps](#set-up-dbt-semantic-layer). If your semantic models and metrics are still in your project, no changes are needed. If you've removed them, you'll need to set up the YAML configs again. + + + ## Additional configuration The following are the additional flexible configurations for Semantic Layer credentials. diff --git a/website/snippets/_packages_or_dependencies.md index 8d21768b0bf..a822b9773db 100644 --- a/website/snippets/_packages_or_dependencies.md +++ b/website/snippets/_packages_or_dependencies.md @@ -1,16 +1,23 @@ ## Use cases -Starting from dbt v1.6, we added a new configuration file called `dependencies.yml`. The file can contain both types of dependencies: "package" and "project" dependencies. -- ["Package" dependencies](/docs/build/packages#how-do-i-add-a-package-to-my-project) lets you add source code from someone else's dbt project into your own, like a library. -- ["Project" dependencies](/docs/collaborate/govern/project-dependencies) provide a different way to build on top of someone else's work in dbt. +The following setup will work for every dbt project: + +- Add [any package dependencies](/docs/collaborate/govern/project-dependencies#when-to-use-package-dependencies) to `packages.yml` +- Add [any project dependencies](/docs/collaborate/govern/project-dependencies#when-to-use-project-dependencies) to `dependencies.yml` + +However, you may be able to consolidate both into a single `dependencies.yml` file. Read the following section to learn more. + +#### About packages.yml and dependencies.yml +The `dependencies.yml` file can contain both types of dependencies: "package" and "project" dependencies. +- [Package dependencies](/docs/build/packages#how-do-i-add-a-package-to-my-project) let you add source code from someone else's dbt project into your own, like a library. +- Project dependencies provide a different way to build on top of someone else's work in dbt.
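For illustration, here is a minimal sketch of a `dependencies.yml` that declares both dependency types in one file; the package version and the `jaffle_finance` project name are placeholder assumptions, not values taken from this PR:

```yaml
# dependencies.yml: both dependency types can live in this single file
packages:
  - package: dbt-labs/dbt_utils
    version: 1.1.1            # illustrative version pin

projects:
  - name: jaffle_finance      # an upstream dbt Cloud project whose public models you reference
```

Because `dependencies.yml` is parsed without Jinja, every value here must be a literal, which is what keeps the file static and predictable.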
If your dbt project doesn't require the use of Jinja within the package specifications, you can simply rename your existing `packages.yml` to `dependencies.yml`. However, something to note is if your project's package specifications use Jinja, particularly for scenarios like adding an environment variable or a [Git token method](/docs/build/packages#git-token-method) in a private Git package specification, you should continue using the `packages.yml` file name. -Examine the following tabs to understand the differences and determine when should use to `dependencies.yml` or `packages.yml`. +Use the following toggles to understand the differences and determine when to use `dependencies.yml` or `packages.yml` (or both). Refer to the [FAQs](#faqs) for more info. - - + Project dependencies are designed for the [dbt Mesh](/best-practices/how-we-mesh/mesh-1-intro) and [cross-project reference](/docs/collaborate/govern/project-dependencies#how-to-write-cross-project-ref) workflow: @@ -19,9 +26,9 @@ Project dependencies are designed for the [dbt Mesh](/best-practices/how-we-mesh - Private packages are not supported in `dependencies.yml` because they intentionally don't support Jinja rendering or conditional configuration. This is to maintain static and predictable configuration and ensures compatibility with other services, like dbt Cloud. - Use `dependencies.yml` for organization and maintainability if you're using both [cross-project refs](/docs/collaborate/govern/project-dependencies#how-to-write-cross-project-ref) and [dbt Hub packages](https://hub.getdbt.com/). This reduces the need for multiple YAML files to manage dependencies. - + - + Package dependencies allow you to add source code from someone else's dbt project into your own, like a library: @@ -31,5 +38,5 @@ Package dependencies allow you to add source code from someone else's dbt projec - `packages.yml` supports Jinja rendering for historical reasons, allowing dynamic configurations. This can be useful if you need to insert values, like a [Git token method](/docs/build/packages#git-token-method) from an environment variable, into your package specifications. Currently, to use private git repositories in dbt, you need to use a workaround that involves embedding a git token with Jinja. This is not ideal as it requires extra steps like creating a user and sharing a git token. We're planning to introduce a simpler method soon that won't require Jinja-embedded secret environment variables. For that reason, `dependencies.yml` does not support Jinja. - - + + diff --git a/website/snippets/_privatelink-across-providers.md b/website/snippets/_privatelink-across-providers.md new file mode 100644 index 00000000000..1a9db462b8e --- /dev/null +++ b/website/snippets/_privatelink-across-providers.md @@ -0,0 +1 @@ +PrivateLink endpoints can't connect across cloud providers. For a PrivateLink connection to work, both dbt Cloud and the server (like {props.type}) must be hosted on the same cloud provider. For example, dbt Cloud hosted on AWS cannot connect via PrivateLink to services hosted on Azure, and dbt Cloud hosted on Azure can’t connect via Private Link to services hosted on AWS. 
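Circling back to the `packages.yml` guidance above: the Jinja-based Git token workaround it mentions usually looks roughly like the following sketch, where the repository URL and the `DBT_ENV_SECRET_GIT_CREDENTIAL` variable name are illustrative assumptions rather than values from this PR:

```yaml
# packages.yml: Jinja rendering is supported here, so a secret env var can be embedded
packages:
  - git: "https://{{ env_var('DBT_ENV_SECRET_GIT_CREDENTIAL') }}@github.com/example-org/private-dbt-package.git"
    revision: main            # tag, branch, or commit to install; illustrative
```

Because this relies on Jinja, a specification like this has to stay in `packages.yml` rather than move to `dependencies.yml`.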
diff --git a/website/snippets/_privatelink-cross-zone-load-balancing.md b/website/snippets/_privatelink-cross-zone-load-balancing.md new file mode 100644 index 00000000000..cb879e5602b --- /dev/null +++ b/website/snippets/_privatelink-cross-zone-load-balancing.md @@ -0,0 +1,6 @@ + +:::note Cross-Zone Load Balancing +We highly recommend cross-zone load balancing for your NLB or Target Group; some connections may require it. Cross-zone load balancing may also [improve routing distribution and connection resiliency](https://docs.aws.amazon.com/elasticloadbalancing/latest/userguide/how-elastic-load-balancing-works.html#cross-zone-load-balancing). Note that cross-zone connectivity may incur additional data transfer charges, though this should be minimal for requests from dbt Cloud. + +- [Enabling cross-zone load balancing for a load balancer or target group](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/edit-target-group-attributes.html#target-group-cross-zone) +::: diff --git a/website/snippets/_privatelink-troubleshooting.md b/website/snippets/_privatelink-troubleshooting.md new file mode 100644 index 00000000000..6db6e313e95 --- /dev/null +++ b/website/snippets/_privatelink-troubleshooting.md @@ -0,0 +1,54 @@ +## Troubleshooting + +If the PrivateLink endpoint has been provisioned and configured in dbt Cloud, but connectivity is still failing, check the following in your networking setup to ensure requests and responses can be successfully routed between dbt Cloud and the backing service. + +### Configuration + +Start with the configuration: + + + +The Network Load Balancer (NLB) associated with the VPC Endpoint Service must either not have an associated Security Group or the Security Group must have a rule that allows requests from the appropriate dbt Cloud _private CIDR(s)_. Note that this differs from the static public IPs listed on the dbt Cloud Connection page. dbt Support can provide the correct private CIDR(s) upon request. + - **Note***: To test if this is the issue, temporarily adding an allow rule of `10.0.0.0/8` should allow connectivity until the rule can be refined to a smaller CIDR + + + + + +Check that there is a Listener connected to the NLB that matches the port that dbt Cloud is trying to connect to. This Listener must have a configured action to forward to a Target Group with targets that point to your backing service. At least one (but preferably all) of these targets must be **Healthy**. Unhealthy targets could suggest that the backing service is, in fact, unhealthy or that the service is protected by a Security Group that doesn't allow requests from the NLB. + + + + + +Check that _Cross-zone load balancing_ is enabled for your NLB (check the **Attributes** tab of the NLB in the AWS console). If this is disabled, and the zones that dbt Cloud is connected to are misaligned with the zones where the service is running, requests may not be able to be routed correctly. Enabling cross-zone load balancing will also make the connection more resilient in the case of a failover in a zone outage scenario. Cross-zone connectivity may incur additional data transfer charges, though this should be minimal for requests from dbt Cloud. + + + + + +If all the above check out, it may be possible that requests are not routing correctly within the private network. This could be due to a misconfiguration in the VPCs routing tables or access control lists. 
Review these settings with your network administrator to ensure that requests can be routed from the VPC Endpoint Service to the backing service and that the response can be returned to the VPC Endpoint Service. One way to test this is to create a VPC endpoint in another VPC in your network to test that connectivity is working independent of dbt's connection. + + + +### Monitoring + +To help isolate connection issues over a PrivateLink connection from dbt Cloud, there are a few monitoring sources that can be used to verify request activity. Requests must first be sent to the endpoint to see anything in the monitoring. Contact dbt Support to understand when connection testing occurred or request new connection attempts. Use these times to correlate with activity in the following monitoring sources. + + + +In the AWS Console, navigate to VPC -> Endpoint Services. Select the Endpoint Service being tested and click the **Monitoring** tab. Update the time selection to include when test connection attempts were sent. If there is activity in the _New connections_ and _Bytes processed_ graphs, then requests have been received by the Endpoint Service, suggesting that the dbt endpoint is routing properly. + + + + + +In the AWS Console, navigate to EC2 -> Load Balancers. Select the Network Load Balancer (NLB) being tested and click the **Monitoring** tab. Update the time selection to include when test connection attempts were sent. If there is activity in the _New flow count_ and _Processed bytes_ graphs, then requests have been received by the NLB from the Endpoint Service, suggesting the NLB Listener, Target Group, and Security Group are correctly configured. + + + + + +VPC Flow Logs can provide various helpful information for requests being routed through your VPCs, though they can sometimes be challenging to locate and interpret. Flow logs can be written to either S3 or CloudWatch Logs, so determine the availability of these logs for your VPC and query them accordingly. Flow logs record the Elastic Network Interface (ENI) ID, source and destination IP and port, and whether the request was accepted or rejected by the security group and/or network ACL. This can be useful in understanding if a request arrived at a certain network interface and whether that request was accepted, potentially illuminating overly restrictive rules. For more information on accessing and interpreting VPC Flow Logs, see the related [AWS documentation](https://docs.aws.amazon.com/vpc/latest/userguide/flow-logs.html). + + \ No newline at end of file diff --git a/website/snippets/_python-compatibility-matrix.md b/website/snippets/_python-compatibility-matrix.md index 9ecc6c097ea..5a633c2b2a1 100644 --- a/website/snippets/_python-compatibility-matrix.md +++ b/website/snippets/_python-compatibility-matrix.md @@ -6,4 +6,3 @@ | Python 3.11 | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | Python 3.10 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | Python 3.9 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Python 3.8 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/website/snippets/_render-method.md b/website/snippets/_render-method.md new file mode 100644 index 00000000000..00407a20251 --- /dev/null +++ b/website/snippets/_render-method.md @@ -0,0 +1,17 @@ +#### The render method + +The `.render()` method is generally used to resolve or evaluate Jinja expressions (such as `{{ source(...) }}`) during runtime. + +When using the `--empty flag`, dbt may skip processing `ref()` or `source()` for optimization. 
To avoid compilation errors and to explicitly tell dbt to process a specific relation (`ref()` or `source()`), use the `.render()` method in your model file. For example: + + + + +```Jinja +{{ config( + pre_hook = [ + "alter external table {{ source('sys', 'customers').render() }} refresh" + ] +``` + + diff --git a/website/snippets/_self-service-permissions-table.md b/website/snippets/_self-service-permissions-table.md deleted file mode 100644 index 12f6577ddbd..00000000000 --- a/website/snippets/_self-service-permissions-table.md +++ /dev/null @@ -1,57 +0,0 @@ - -There are 3 roles available for self-service dbt Cloud accounts: - -- **Owner** — Full access to account features. -- **Member** — Robust access to the account with restrictions on features that can alter billing or security. -- **Read-only** — Read-only access to features. - -Key: - -* (W)rite — Create new or modify existing. Includes `send`, `create`, `delete`, `allocate`, `modify`, and `read`. -* (R)ead — Can view but can not create or change any fields. -* No value — No access to the feature. - -Permissions: - -* Account-level permissions — Permissions related to management of the dbt Cloud account. For example, billing and account settings. -* Project-level permissions — Permissions related to the projects in dbt Cloud. For example, repos and access to the IDE or dbt Cloud CLI. - - -#### Account permissions for account roles - -| Account-level permission| Owner | Member | Read-only | -|:-------------------------|:----:|:------:|:---------:| -| Account settings | W | W | | -| Audit logs | R | | | -| Auth provider | W | | | -| Billing | W | | | -| Groups | W | R | R | -| Invitations | W | W | R | -| Licenses | W | R | | -| Members | W | R | R | -| Project (create) | W | W | | -| Public models | R | R | R | -| Service tokens | W | | | -| Webhooks | W | W | | - -#### Project permissions for account roles - -|Project-level permission | Owner | Member | Read-only | -|:------------------------|:-----:|:-------:|:---------:| -| Adapters | W | W | R | -| Connections | W | W | R | -| Credentials | W | W | R | -| Custom env. variables | W | W | R | -| dbt adapters | W | W | | -| Develop (IDE or dbt Cloud CLI)| W | W | | -| Environments | W | W | R | -| Jobs | W | W | R | -| Metadata | R | R | R | -| Permissions | W | R | | -| Profile | W | W | R | -| Projects | W | W | R | -| Repositories | W | W | R | -| Runs | W | W | R | -| Semantic Layer Config | W | W | R | - - diff --git a/website/snippets/_sl-course.md b/website/snippets/_sl-course.md index 6be9ec7e959..1400be91f37 100644 --- a/website/snippets/_sl-course.md +++ b/website/snippets/_sl-course.md @@ -3,7 +3,7 @@ Explore our [dbt Semantic Layer on-demand course](https://learn.getdbt.com/courses/semantic-layer) to learn how to define and query metrics in your dbt project. -Additionally, dive into mini-courses for querying the dbt Semantic Layer in your favorite tools: [Tableau](https://courses.getdbt.com/courses/tableau-querying-the-semantic-layer), [Hex](https://courses.getdbt.com/courses/hex-querying-the-semantic-layer), and [Mode](https://courses.getdbt.com/courses/mode-querying-the-semantic-layer). 
+Additionally, dive into mini-courses for querying the dbt Semantic Layer in your favorite tools: [Tableau](https://courses.getdbt.com/courses/tableau-querying-the-semantic-layer), [Excel](https://learn.getdbt.com/courses/querying-the-semantic-layer-with-excel), [Hex](https://courses.getdbt.com/courses/hex-querying-the-semantic-layer), and [Mode](https://courses.getdbt.com/courses/mode-querying-the-semantic-layer). diff --git a/website/snippets/_sl-env-vars.md b/website/snippets/_sl-env-vars.md new file mode 100644 index 00000000000..eddb3952782 --- /dev/null +++ b/website/snippets/_sl-env-vars.md @@ -0,0 +1,5 @@ +Use [Extended Attributes](/docs/dbt-cloud-environments#extended-attributes) and [Environment Variables](/docs/build/environment-variables) when connecting to the Semantic Layer. If you set a value directly in the Semantic Layer Credentials, it will have a higher priority than Extended Attributes. When using environment variables, the default value for the environment will be used. + +For example, set the warehouse by using `{{env_var('DBT_WAREHOUSE')}}` in your Semantic Layer credentials. + +Similarly, if you set the account value using `{{env_var('DBT_ACCOUNT')}}` in Extended Attributes, dbt will check both the Extended Attributes and the environment variable. diff --git a/website/snippets/_sl-excel-gsheets.md b/website/snippets/_sl-excel-gsheets.md index 6a356b15e94..f6d4678bf6e 100644 --- a/website/snippets/_sl-excel-gsheets.md +++ b/website/snippets/_sl-excel-gsheets.md @@ -4,8 +4,8 @@
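To make the environment-variable snippet above concrete, here is a hedged sketch of extended attributes YAML that defers to environment variables; the attribute names mirror common warehouse profile fields and are assumptions, not part of this PR:

```yaml
# Extended attributes for the environment, resolved from environment variables
account: "{{ env_var('DBT_ACCOUNT') }}"
warehouse: "{{ env_var('DBT_WAREHOUSE') }}"
```

If the same values are also set directly in the Semantic Layer credentials, the snippet notes that those credential values take priority.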

When querying your data with {props.type}:

    -
  • It returns the data to the cell you have clicked on, and each cell where data is requested will have a note attached to it, indicating what has been queried and the timestamp.
  • -
  • {props.bullet_1}
  • +
  • It returns the data to the cell you clicked on.
  • +
  • {props.bullet_1}
  • {props.bullet_2}
@@ -65,38 +65,30 @@
  • For time dimensions, you can use the time range selector to filter on presets or custom options. The time range selector applies only to the primary time dimension (metric_time). For all other time dimensions that aren't metric_time, you can use the "Where" option to apply filters.
  • -#### Querying without headers or columns +#### Other settings -

    If you would like to just query the data values without the headers, you can optionally select the Exclude Column Names box.

    +

    If you would like to just query the data values without the headers, you can optionally select the Exclude column names box.

    +

To return your results and keep any previously selected data below it intact, deselect the Clear trailing rows box. By default, we'll clear all trailing rows if there's stale data.

    - - - diff --git a/website/snippets/_sl-measures-parameters.md b/website/snippets/_sl-measures-parameters.md index 5b0a09a9f33..728d63c6b4f 100644 --- a/website/snippets/_sl-measures-parameters.md +++ b/website/snippets/_sl-measures-parameters.md @@ -2,7 +2,7 @@ | --- | --- | --- | | [`name`](/docs/build/measures#name) | Provide a name for the measure, which must be unique and can't be repeated across all semantic models in your dbt project. | Required | | [`description`](/docs/build/measures#description) | Describes the calculated measure. | Optional | -| [`agg`](/docs/build/measures#aggregation) | dbt supports the following aggregations: `sum`, `max`, `min`, `avg`, `median`, `count_distinct`, `percentile`, and `sum_boolean`. | Required | +| [`agg`](/docs/build/measures#aggregation) | dbt supports the following aggregations: `sum`, `max`, `min`, `average`, `median`, `count_distinct`, `percentile`, and `sum_boolean`. | Required | | [`expr`](/docs/build/measures#expr) | Either reference an existing column in the table or use a SQL expression to create or derive a new one. | Optional | | [`non_additive_dimension`](/docs/build/measures#non-additive-dimensions) | Non-additive dimensions can be specified for measures that cannot be aggregated over certain dimensions, such as bank account balances, to avoid producing incorrect results. | Optional | | `agg_params` | Specific aggregation properties, such as a percentile. | Optional | diff --git a/website/snippets/_sl-partner-links.md b/website/snippets/_sl-partner-links.md index 71daaaa1d0a..7d08323239b 100644 --- a/website/snippets/_sl-partner-links.md +++ b/website/snippets/_sl-partner-links.md @@ -17,11 +17,25 @@ The following tools integrate with the dbt Semantic Layer: icon="google-sheets-logo-icon"/> +
    + + + + +
    +
- @@ -68,9 +82,9 @@ The following tools integrate with the dbt Semantic Layer: - @@ -82,9 +96,9 @@ The following tools integrate with the dbt Semantic Layer: - diff --git a/website/snippets/_sl-run-prod-job.md index 8eb4049efc8..318b8d27cbf 100644 --- a/website/snippets/_sl-run-prod-job.md +++ b/website/snippets/_sl-run-prod-job.md @@ -1,9 +1,22 @@ -Once you’ve committed and merged your metric changes in your dbt project, you can perform a job run in your deployment environment in dbt Cloud to materialize your metrics. The deployment environment is only supported for the dbt Semantic Layer currently. +This section explains how you can perform a job run in your deployment environment in dbt Cloud to materialize and deploy your metrics. Currently, only the deployment environment is supported. -1. In dbt Cloud, create a new [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) or use an existing environment on dbt 1.6 or higher. +1. Once you’ve [defined your semantic models and metrics](/guides/sl-snowflake-qs?step=10), commit and merge your metric changes in your dbt project. +2. In dbt Cloud, create a new [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) or use an existing environment on dbt 1.6 or higher. * Note — Deployment environment is currently supported (_development experience coming soon_) -2. To create a new environment, navigate to **Deploy** in the navigation menu, select **Environments**, and then select **Create new environment**. -3. Fill in your deployment credentials with your Snowflake username and password. You can name the schema anything you want. Click **Save** to create your new production environment. -4. [Create a new deploy job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) that runs in the environment you just created. Go back to the **Deploy** menu, select **Jobs**, select **Create job**, and click **Deploy job**. -5. Set the job to run a `dbt build` and select the **Generate docs on run** checkbox. -6. Run the job and make sure it runs successfully. +3. To create a new environment, navigate to **Deploy** in the navigation menu, select **Environments**, and then select **Create new environment**. +4. Fill in your deployment credentials with your Snowflake username and password. You can name the schema anything you want. Click **Save** to create your new production environment. +5. [Create a new deploy job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) that runs in the environment you just created. Go back to the **Deploy** menu, select **Jobs**, select **Create job**, and click **Deploy job**. +6. Set the job to run the `dbt parse` command to parse your project and generate a [`semantic_manifest.json` artifact](/reference/artifacts/sl-manifest) file. Although running `dbt build` isn't required, you can choose to do so if needed. +7. Run the job by clicking the **Run now** button. Monitor the job's progress in real time through the **Run summary** tab. + + Once the job completes successfully, your dbt project, including the generated documentation, will be fully deployed and available for use in your production environment. If any issues arise, review the logs to diagnose and address any errors. +
    + +What’s happening internally? + +- Merging the code into your main branch allows dbt Cloud to pull those changes and build the definition in the manifest produced by the run.
    +- Re-running the job in the deployment environment helps materialize the models, which the metrics depend on, in the data platform. It also makes sure that the manifest is up to date.
+- The Semantic Layer APIs pull in the most recent manifest and enable your integration to extract metadata from it. + +
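For readers new to this flow, here is a minimal, illustrative sketch (the model, measure, and metric names are invented for this example, not taken from the PR) of the semantic model and metric YAML that `dbt parse` compiles into the manifest the Semantic Layer APIs read:

```yaml
semantic_models:
  - name: orders
    model: ref('orders')            # illustrative model reference
    defaults:
      agg_time_dimension: ordered_at
    entities:
      - name: order_id
        type: primary
    dimensions:
      - name: ordered_at
        type: time
        type_params:
          time_granularity: day
    measures:
      - name: order_total
        agg: sum

metrics:
  - name: order_total
    label: Order total
    type: simple
    type_params:
      measure: order_total
```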
    diff --git a/website/snippets/_sl-test-and-query-metrics.md b/website/snippets/_sl-test-and-query-metrics.md index 0b637550cbb..9d996554b31 100644 --- a/website/snippets/_sl-test-and-query-metrics.md +++ b/website/snippets/_sl-test-and-query-metrics.md @@ -1,6 +1,6 @@ To work with metrics in dbt, you have several tools to validate or run commands. Here's how you can test and query metrics depending on your setup: -- [**dbt Cloud IDE users**](#dbt-cloud-ide-users) — Currently, running MetricFlow commands directly in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) isn't supported, but is coming soon. You can view metrics visually through the DAG in the **Lineage** tab without directly running commands. +- [**dbt Cloud IDE users**](#dbt-cloud-ide-users) — Run [MetricFlow commands](/docs/build/metricflow-commands#metricflow-commands) directly in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) to query/preview metrics. View metrics visually in the **Lineage** tab. - [**dbt Cloud CLI users**](#dbt-cloud-cli-users) — The [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) enables you to run [MetricFlow commands](/docs/build/metricflow-commands#metricflow-commands) to query and preview metrics directly in your command line interface. - **dbt Core users** — Use the MetricFlow CLI for command execution. While this guide focuses on dbt Cloud users, dbt Core users can find detailed MetricFlow CLI setup instructions in the [MetricFlow commands](/docs/build/metricflow-commands#metricflow-commands) page. Note that to use the dbt Semantic Layer, you need to have a [Team or Enterprise account](https://www.getdbt.com/). @@ -8,7 +8,9 @@ Alternatively, you can run commands with SQL client tools like DataGrip, DBeaver ### dbt Cloud IDE users -You can view your metrics in the dbt Cloud IDE by viewing them in the **Lineage** tab. The dbt Cloud IDE **Status button** (located in the bottom right of the editor) displays an **Error** status if there's an error in your metric or semantic model definition. You can click the button to see the specific issue and resolve it. +You can use the `dbt sl` prefix before the command name to execute them in dbt Cloud. For example, to list all metrics, run `dbt sl list metrics`. For a complete list of the MetricFlow commands available in the dbt Cloud IDE, refer to the [MetricFlow commands](/docs/build/metricflow-commands#metricflow-commandss) page. + +The dbt Cloud IDE **Status button** (located in the bottom right of the editor) displays an **Error** status if there's an error in your metric or semantic model definition. You can click the button to see the specific issue and resolve it. Once viewed, make sure you commit and merge your changes in your project. diff --git a/website/snippets/_snapshot-yaml-spec.md b/website/snippets/_snapshot-yaml-spec.md new file mode 100644 index 00000000000..cb1675ce5bd --- /dev/null +++ b/website/snippets/_snapshot-yaml-spec.md @@ -0,0 +1,6 @@ +:::info Use the latest snapshot syntax + +In [dbt Cloud Versionless](/docs/dbt-versions/versionless-cloud) or [dbt Core v1.9 and later](/docs/dbt-versions/core), you can configure snapshots in YAML files using the updated syntax within your `snapshots/` directory (as defined by the [`snapshot-paths` config](/reference/project-configs/snapshot-paths)). + +This syntax allows for faster, more efficient snapshot management. To use it, upgrade to Versionless or dbt v1.9 or newer. 
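As a hedged sketch of that updated YAML snapshot syntax (the source and column names are illustrative assumptions, not values from this PR), a snapshot defined in the `snapshots/` directory might look like:

```yaml
# snapshots/orders_snapshot.yml (dbt Core v1.9+ or dbt Cloud Versionless)
snapshots:
  - name: orders_snapshot
    relation: source('jaffle_shop', 'orders')   # illustrative source relation
    config:
      unique_key: id
      strategy: timestamp
      updated_at: updated_at
```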
+::: diff --git a/website/snippets/_state-modified-compare.md b/website/snippets/_state-modified-compare.md new file mode 100644 index 00000000000..c7bba1c8bdf --- /dev/null +++ b/website/snippets/_state-modified-compare.md @@ -0,0 +1,3 @@ +You need to build the state directory using dbt v1.9 or higher, or [Versionless](/docs/dbt-versions/versionless-cloud) dbt Cloud, and you need to set `state_modified_compare_more_unrendered_values` to `true` within your dbt_project.yml. + +If the state directory was built with an older dbt version or if the `state_modified_compare_more_unrendered_values` behavior change flag was either not set or set to `false`, you need to rebuild the state directory to avoid false positives during state comparison with `state:modified`. diff --git a/website/snippets/_v2-sl-prerequisites.md b/website/snippets/_v2-sl-prerequisites.md index f8108849f4f..68f6e7d10b7 100644 --- a/website/snippets/_v2-sl-prerequisites.md +++ b/website/snippets/_v2-sl-prerequisites.md @@ -1,4 +1,4 @@ -- Have a dbt Cloud Team or Enterprise account. Single-tenant accounts should contact their account representative setup. +- Have a dbt Cloud Team or Enterprise account. Single-tenant accounts should contact their account representative for setup. - Ensure your production and development environments use [dbt version 1.6 or higher](/docs/dbt-versions/upgrade-dbt-version-in-cloud). - Use Snowflake, BigQuery, Databricks, or Redshift. - Create a successful run in the environment where you configure the Semantic Layer. diff --git a/website/snippets/core-versions-table.md b/website/snippets/core-versions-table.md index ebeb7cc031a..743b59c6bb7 100644 --- a/website/snippets/core-versions-table.md +++ b/website/snippets/core-versions-table.md @@ -2,15 +2,19 @@ | dbt Core | Initial release | Support level and end date | |:-------------------------------------------------------------:|:---------------:|:-------------------------------------:| -| [**v1.8**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.8) | May 9 2024 | Active — May 8, 2025 | -| [**v1.7**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.7) | Nov 2, 2023 | Critical — Nov 1, 2024 | -| [**v1.6**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.6) | Jul 31, 2023 | End of Life* ⚠️ | -| [**v1.5**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.5) | Apr 27, 2023 | End of Life* ⚠️ | -| [**v1.4**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.4) | Jan 25, 2023 | End of Life* ⚠️ | -| [**v1.3**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.3) | Oct 12, 2022 | End of Life* ⚠️ | -| [**v1.2**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.2) | Jul 26, 2022 | End of Life* ⚠️ | -| [**v1.1**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.1) | Apr 28, 2022 | End of Life* ⚠️ | -| [**v1.0**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.0) | Dec 3, 2021 | End of Life* ⚠️ | +| [**v1.8**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.8) | May 9 2024 | Active Support — May 8, 2025 | +| [**v1.7**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.7) | Nov 2, 2023 |
    **dbt Core and dbt Cloud Developer & Team customers:** End of Life
    **dbt Cloud Enterprise customers:** Critical Support until further notice 1
    | +| [**v1.6**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.6) | Jul 31, 2023 | End of Life ⚠️ | +| [**v1.5**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.5) | Apr 27, 2023 | End of Life ⚠️ | +| [**v1.4**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.4) | Jan 25, 2023 | End of Life ⚠️ | +| [**v1.3**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.3) | Oct 12, 2022 | End of Life ⚠️ | +| [**v1.2**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.2) | Jul 26, 2022 | End of Life ⚠️ | +| [**v1.1**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.1) | Apr 28, 2022 | End of Life ⚠️ | +| [**v1.0**](/docs/dbt-versions/core-upgrade/Older%20versions/upgrading-to-v1.0) | Dec 3, 2021 | End of Life ⚠️ | | **v0.X** ⛔️ | (Various dates) | Deprecated ⛔️ | Deprecated ⛔️ | -_*All versions of dbt Core since v1.0 are available in dbt Cloud until further notice. Versions that are EOL do not receive any fixes. For the best support, we recommend upgrading to a version released within the past 12 months._ +All functionality in dbt Core since the v1.7 release is available in dbt Cloud, early and continuously, by selecting ["Versionless"](https://docs.getdbt.com/docs/dbt-versions/versionless-cloud). + +1 "Versionless" is now required for the Developer and Teams plans on dbt Cloud. Accounts using older dbt versions will be migrated to "Versionless." + +For customers of dbt Cloud Enterprise, dbt v1.7 will continue to be available as an option while dbt Labs rolls out a mechanism for "extended" upgrades. In the meantime, dbt Labs strongly recommends migrating any environments that are still running on older unsupported versions to "Versionless" dbt or dbt v1.7. diff --git a/website/src/components/blogPostCard/styles.module.css b/website/src/components/blogPostCard/styles.module.css index 388dc8a8c8b..6c97f6060f0 100644 --- a/website/src/components/blogPostCard/styles.module.css +++ b/website/src/components/blogPostCard/styles.module.css @@ -33,7 +33,7 @@ .imageContentContainer { width: 100%; border-radius: var(--border-radius) var(--border-radius) 0px 0px; - min-height: 200px; + min-height: 250px; } diff --git a/website/src/components/communitySpotlightCard/index.js b/website/src/components/communitySpotlightCard/index.js index 122edee8f06..e6f36dfb69b 100644 --- a/website/src/components/communitySpotlightCard/index.js +++ b/website/src/components/communitySpotlightCard/index.js @@ -49,7 +49,8 @@ function CommunitySpotlightCard({ frontMatter, isSpotlightMember = false }) { companyName, organization, socialLinks, - communityAward + communityAward, + communityAwardYear, } = frontMatter // Get meta description text @@ -74,7 +75,7 @@ function CommunitySpotlightCard({ frontMatter, isSpotlightMember = false }) { ) : null} {communityAward ? (
    - Community Award Recipient + Community Award Recipient {communityAwardYear}
    ) : null} {image && ( diff --git a/website/src/components/communitySpotlightList/index.js b/website/src/components/communitySpotlightList/index.js index 8355fd0958b..76cc5de2121 100644 --- a/website/src/components/communitySpotlightList/index.js +++ b/website/src/components/communitySpotlightList/index.js @@ -11,7 +11,7 @@ const communityDescription = "The dbt Community is where analytics engineering l // This date determines where the 'Previously on the Spotlight" text will show. // Any spotlight members with a 'dateCreated' field before this date // will be under the 'Previously..' header. -const currentSpotlightDate = new Date('2024-07-26') +const currentSpotlightDate = new Date('2024-10-30') function CommunitySpotlightList({ spotlightData }) { const { siteConfig } = useDocusaurusContext() diff --git a/website/src/components/hero/index.js b/website/src/components/hero/index.js index e4bef8e234b..073fb518998 100644 --- a/website/src/components/hero/index.js +++ b/website/src/components/hero/index.js @@ -1,7 +1,7 @@ import React from 'react'; import styles from './styles.module.css'; -function Hero({ heading, subheading, showGraphic = false, customStyles = {}, classNames = '', colClassNames = '' }) { +function Hero({ heading, subheading, showGraphic = false, customStyles = {}, classNames = '', colClassNames = '', callToActionsTitle, callToActions = [] }) { return (
    {showGraphic && ( @@ -12,6 +12,22 @@ function Hero({ heading, subheading, showGraphic = false, customStyles = {}, cla
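The 22-line body added to hero/index.js by the second hunk above is not reproduced in this diff. Purely as a hypothetical sketch of how new props like `callToActionsTitle` and `callToActions` could be rendered — assuming the `.callToActionsTitle`/`.callToAction` classes added to hero/styles.module.css below and the `{ title, href, onClick, newTab }` shape passed from quickstartGuideList — and not the PR's actual JSX:

```jsx
// Hypothetical sketch only — not the code added by this PR.
// Assumes CTA objects shaped like { title, href?, onClick?, newTab? } and the
// .callToActions* classes added to hero/styles.module.css in this diff.
import React from "react";
import styles from "./styles.module.css";

function HeroCallToActions({ callToActionsTitle, callToActions = [] }) {
  if (!callToActions.length) return null;
  return (
    <>
      {callToActionsTitle && (
        <span className={styles.callToActionsTitle}>{callToActionsTitle}</span>
      )}
      <div className={styles.callToActions}>
        {callToActions.map((cta, i) => (
          <a
            key={i}
            className={styles.callToAction}
            href={cta.href}
            target={cta.newTab ? "_blank" : undefined}
            rel={cta.newTab ? "noopener noreferrer" : undefined}
            onClick={cta.onClick}
          >
            {cta.title}
          </a>
        ))}
      </div>
    </>
  );
}

export default HeroCallToActions;
```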
    diff --git a/website/src/components/hero/styles.module.css b/website/src/components/hero/styles.module.css index f596b53762a..67d3c8c5d68 100644 --- a/website/src/components/hero/styles.module.css +++ b/website/src/components/hero/styles.module.css @@ -49,3 +49,34 @@ width: 60%; } } + +.callToActionsTitle { + font-weight: bold; + margin-top: 20px; + margin-bottom: 20px; + font-size: 1.25rem; + display: block; +} + +.callToActions { + display: flex; + flex-flow: wrap; + gap: 0.8rem; + justify-content: center; +} + +.callToAction { + outline: #fff solid 1px; + border-radius: 4px; + padding: 0 12px; + color: #fff; + transition: all .2s; + cursor: pointer; +} + +.callToAction:hover, .callToAction:active, .callToAction:focus { + text-decoration: none; + outline: rgb(4, 115, 119) solid 1px; + background-color: rgb(4, 115, 119); + color: #fff; +} diff --git a/website/src/components/quickstartGuideList/index.js b/website/src/components/quickstartGuideList/index.js index 0f4b5764340..2b87ae3e4a1 100644 --- a/website/src/components/quickstartGuideList/index.js +++ b/website/src/components/quickstartGuideList/index.js @@ -61,7 +61,7 @@ function QuickstartList({ quickstartData }) { // Update the URL with the new search parameters history.replace({ search: params.toString() }); -}; + }; // Handle all filters const handleDataFilter = () => { @@ -98,6 +98,30 @@ function QuickstartList({ quickstartData }) { handleDataFilter(); }, [selectedTags, selectedLevel, searchInput]); // Added searchInput to dependency array + // Set the featured guides that will show as CTAs in the hero section + // The value of the tag must match a tag in the frontmatter of the guides in order for the filter to apply after clicking + const heroCTAs = [ + { + title: 'Quickstart guides', + value: 'Quickstart' + }, + { + title: 'Use Jinja to improve your SQL code', + value: 'Jinja' + }, + { + title: 'Orchestration', + value: 'Orchestration' + }, + ]; + + // Function to handle CTA clicks + const handleCallToActionClick = (value) => { + const params = new URLSearchParams(location.search); + params.set('tags', value); + history.replace({ search: params.toString() }); + }; + return ( @@ -111,6 +135,13 @@ function QuickstartList({ quickstartData }) { showGraphic={false} customStyles={{ marginBottom: 0 }} classNames={styles.quickstartHero} + callToActions={heroCTAs.map(guide => ({ + title: guide.title, + href: guide.href, + onClick: () => handleCallToActionClick(guide.value), + newTab: guide.newTab + }))} + callToActionsTitle={'Popular guides'} />
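The comment in the hunk above notes that a CTA's `value` must match a tag in the guides' frontmatter for the filter to apply after clicking. A small illustrative sketch (not part of the PR) of what `handleCallToActionClick` writes to the URL, which the guide list's existing tag filtering then reads:

```js
// Illustrative only: mirrors handleCallToActionClick("Orchestration"),
// starting from a /guides URL with an empty query string (assumed).
const params = new URLSearchParams(""); // stand-in for location.search
params.set("tags", "Orchestration");
console.log(`/guides?${params.toString()}`); // "/guides?tags=Orchestration"
// The component then calls history.replace({ search: params.toString() });
// the selectedTags-driven filtering shown earlier in this file applies the tag
// (how selectedTags syncs back from the URL isn't shown in this hunk).
```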
    @@ -135,7 +166,7 @@ function QuickstartList({ quickstartData }) {
    - ) + ); } export default QuickstartList; diff --git a/website/src/components/quickstartTOC/index.js b/website/src/components/quickstartTOC/index.js index c28d462ceb1..cf9c3fa5e3f 100644 --- a/website/src/components/quickstartTOC/index.js +++ b/website/src/components/quickstartTOC/index.js @@ -82,14 +82,15 @@ function QuickstartTOC() { buttonContainer.classList.add(style.buttonContainer); const prevButton = document.createElement("a"); const nextButton = document.createElement("a"); - + prevButton.innerHTML = ' Back'; prevButton.classList.add(clsx(style.button, style.prevButton)); prevButton.disabled = index === 0; prevButton.addEventListener("click", () => handlePrev(index + 1)); - nextButton.innerHTML = 'Next '; + nextButton.innerHTML = + 'Next '; nextButton.classList.add(clsx(style.button, style.nextButton)); nextButton.disabled = index === stepWrappers.length - 1; nextButton.addEventListener("click", () => handleNext(index + 1)); @@ -108,8 +109,26 @@ function QuickstartTOC() { } }); - const quickstartTitle = document.querySelector("header h1"); - quickstartTitle.classList.add(style.quickstartTitle); + // Get title by frontmatter title + let quickstartTitle = document?.querySelector("header h1"); + + // Get lifecycle badge from markdown content h1 + // Example: # Demo h1 title + const markdownTitleLifeCycleBadge = document?.querySelectorAll( + ".quickstart-container .step-container > h1 > span.lifecycle-badge" + ); + + if (quickstartTitle) { + quickstartTitle.classList.add(style.quickstartTitle); + + // If markdown title also set, check if LifeCycle span + // elements exists and move to quickstartTitle if so. + if (markdownTitleLifeCycleBadge?.length) { + for (let i = 0; i < markdownTitleLifeCycleBadge.length; i++) { + quickstartTitle.appendChild(markdownTitleLifeCycleBadge[i]); + } + } + } } }, [mounted]); diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index 760acb736cb..0c9a1b8b623 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -1,11 +1,10 @@ import React, { useState, useEffect } from 'react' -import Link from '@docusaurus/Link'; import ReactTooltip from "react-tooltip"; import styles from './styles.module.css'; {/* Props: - id: filename of term + id: maps to term in website/docs/terms/hover-terms.md children (optional): to display different text other than displayText property for term */} @@ -20,48 +19,44 @@ export default function Term({ id, children = undefined }) { setPageReady(true) }) - const file = require('../../../docs/terms/' + id + '.md') - if(!file) - return null - - const fm = file.frontMatter - if(!fm) - return null + // Get terms file + const file = require('../../../docs/terms/hover-terms.md') + + // Get term by id + const term = file?.frontMatter?.[id] - const { displayText, hoverSnippet } = fm + // If term not found in file, return children if available or null + if(!term) + return children || null + + // Get properties from front matter + const { displayText, hoverSnippet } = term; + + // If component has children, show children text, + // Else, default to displayText frontmatter field, + // Or filename if displayText not set + const displayValue = children ? children : displayText ? displayText : id return ( <> - {pageReady ? ( + {pageReady && hoverSnippet ? ( <> - + {displayValue} + + - {/* If component has children, show children text, - Else, default to displayText frontmatter field, - Or filename if displayText not set - */} - {children ? children : displayText ? 
displayText : id} - - {hoverSnippet && ( - - {hoverSnippet} - - )} + {hoverSnippet} + ) : ( - {children ? children : displayText ? displayText : id} + {displayValue} )} - ) + ); } diff --git a/website/src/components/term/styles.module.css b/website/src/components/term/styles.module.css index 482e76bda52..8cc86743941 100644 --- a/website/src/components/term/styles.module.css +++ b/website/src/components/term/styles.module.css @@ -1,11 +1,8 @@ .term { - cursor: pointer; position: relative; text-decoration: underline dotted var(--ifm-font-color-base); - color: var(--ifm-font-color-base); } .term:hover { - color: var(--ifm-link-color); text-decoration: underline dotted var(--ifm-link-color); } .termToolTip { diff --git a/website/src/components/variable/index.js b/website/src/components/variable/index.js deleted file mode 100644 index 444f3203327..00000000000 --- a/website/src/components/variable/index.js +++ /dev/null @@ -1,45 +0,0 @@ -import React, { useState, useEffect, useContext } from 'react' -import { dbtVariables } from '../../../dbt-global-variables'; -import VersionContext from '../../stores/VersionContext'; - -export default function Var({ name }) { - if(!name) - return null - - const [variableName, setVariableName] = useState('') - - const { version } = useContext(VersionContext) - - const currentVariable = dbtVariables[name] - if(!currentVariable) - return null - - useEffect(() => { - if(currentVariable?.versions?.length && version) { - {/* - * If versions set for variable - * show correct variable name for current version - * - * Sort by lowest version first - * If this version is greater or equal to the active version - * Show this variable name - * If no match is found, show original variable name - * - */} - const thisVersionVariable = currentVariable.versions - .sort((item1, item2) => (parseFloat(item1.version) > parseFloat(item2.version)) ? 1 : -1) - .find(varVersion => - parseFloat(varVersion.version) >= parseFloat(version) ? true : false - ) - - !thisVersionVariable - ? 
setVariableName(currentVariable.name) - : setVariableName(thisVersionVariable.name) - - } else { - setVariableName(currentVariable.name) - } - }, [version]) - - return { variableName } -} diff --git a/website/src/components/versionBlock/index.js b/website/src/components/versionBlock/index.js index 01d6381c0e1..f7982d6eacf 100644 --- a/website/src/components/versionBlock/index.js +++ b/website/src/components/versionBlock/index.js @@ -1,39 +1,25 @@ import React, { useState, useEffect, useContext } from 'react' import VersionContext from '../../stores/VersionContext'; +import { availableInCurrentVersion } from '../../utils/available-in-current-version'; -export default function VersionBlock({ firstVersion = 0, lastVersion = undefined, children }) { - const { version } = useContext(VersionContext) +export default function VersionBlock({ firstVersion = "0", lastVersion = undefined, children }) { + const { version } = useContext(VersionContext); - const [loading, setLoading] = useState(true) + const [loading, setLoading] = useState(true); // Hide versionBlock components until version ready useEffect(() => { - version && setLoading(false) - }, [version]) + version && setLoading(false); + }, [version]); // Only check version if current version set - if(version) { - const currentVersionVal = parseFloat(version) - const firstVersionVal = parseFloat(firstVersion) - {/* - * If last version set, check if current version greater than last version - * Or if current version less than first version - * If either is true, hide block - * Else, if current version less than first version, hide block - */} - if(lastVersion) { - if((currentVersionVal > parseFloat(lastVersion)) - || (currentVersionVal < firstVersionVal)) - return null - } else { - if(currentVersionVal < firstVersionVal) { - return null - } - - } + if (version) { + if (!availableInCurrentVersion( + version, + firstVersion, + lastVersion + )) return null; } - return loading - ? null - : <>{children} + return loading ? 
null : <>{children}; } diff --git a/website/src/css/custom.css b/website/src/css/custom.css index a4eabddbdca..e240a5dfabf 100644 --- a/website/src/css/custom.css +++ b/website/src/css/custom.css @@ -705,6 +705,10 @@ html[data-theme="dark"] .alert table { color: var(--ifm-color-gray-900) !important; } +.alert blockquote { + color: inherit; +} + .linkout { background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' viewBox='0 0 18 22.5' version='1.1' x='0px' y='0px'%3E%3Ctitle%3Elink%3C/title%3E%3Cdesc%3ECreated with Sketch.%3C/desc%3E%3Cg stroke='none' stroke-width='1' fill='none' fill-rule='evenodd'%3E%3Cpath d='M9.24264069,6.41438508 C8.8521164,6.80490937 8.21895142,6.80490937 7.82842712,6.41438508 C7.43790283,6.02386078 7.43790283,5.39069581 7.82842712,5.00017151 L10.6570258,2.17157288 C12.2191229,0.609475708 14.7517828,0.609475708 16.31388,2.17157288 C17.8759772,3.73367004 17.8759772,6.26632996 16.31388,7.82842712 L13.4854529,10.6568542 C13.0949286,11.0473785 12.4617636,11.0473785 12.0712393,10.6568542 C11.680715,10.26633 11.680715,9.63316498 12.0712393,9.24264069 L14.8996664,6.41421356 C15.680715,5.63316498 15.680715,4.36683502 14.8996664,3.58578644 C14.1186179,2.80473785 12.8522879,2.80473785 12.0712393,3.58578644 L9.24264069,6.41438508 Z M9.2428122,12.0710678 C9.63333649,11.6805435 10.2665015,11.6805435 10.6570258,12.0710678 C11.0475501,12.4615921 11.0475501,13.0947571 10.6570258,13.4852814 L7.82842712,16.31388 C6.26632996,17.8759772 3.73367004,17.8759772 2.17157288,16.31388 C0.609475708,14.7517828 0.609475708,12.2191229 2.17157288,10.6570258 L5,7.82859864 C5.39052429,7.43807435 6.02368927,7.43807435 6.41421356,7.82859864 C6.80473785,8.21912293 6.80473785,8.85228791 6.41421356,9.2428122 L3.58578644,12.0712393 C2.80473785,12.8522879 2.80473785,14.1186179 3.58578644,14.8996664 C4.36683502,15.680715 5.63316498,15.680715 6.41421356,14.8996664 L9.2428122,12.0710678 Z M12.0712393,6.41421356 C12.4617636,6.80473785 12.4617636,7.43790283 12.0712393,7.82842712 L7.82859864,12.0710678 C7.43807435,12.4615921 6.80490937,12.4615921 6.41438508,12.0710678 C6.02386078,11.6805435 6.02386078,11.0473785 6.41438508,10.6568542 L10.6570258,6.41421356 C11.0475501,6.02368927 11.680715,6.02368927 12.0712393,6.41421356 Z' fill='%23000000' fill-rule='nonzero'/%3E%3C/g%3E%3C/svg%3E"); background-size: 2rem 2rem; @@ -2026,6 +2030,13 @@ html[data-theme="dark"] .theme-doc-sidebar-container>div>button.button:hover { width: 100%; } +/* Hide `contentTitle` h1 from content for guides. + This header appears in the upperleft section for guides. 
+*/ +.quickstart-container .step-container > h1 { + display: none; +} + .quickstart-container .step-container .intro { order: -1; } diff --git a/website/src/pages/styles.js b/website/src/pages/styles.js deleted file mode 100644 index 23d13d10813..00000000000 --- a/website/src/pages/styles.js +++ /dev/null @@ -1,176 +0,0 @@ - -import React from 'react'; -import Layout from '@theme/Layout'; -import CodeBlock from '@theme/CodeBlock'; -import Changelog from '@site/src/components/changelog'; -import CloudCore from '@site/src/components/cloudcore'; -import Collapsible from '@site/src/components/collapsible'; -import FAQ from '@site/src/components/faqs'; -import File from '@site/src/components/file'; -import Lightbox from '@site/src/components/lightbox'; -import LoomVideo from '@site/src/components/loom'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import YoutubeVideo from '@site/src/components/youtube'; - -function Styles() { - return ( - -
-   [removed: the body of the deleted styles.js demo page. It noted that these components are baked into the Markdown compilation context, so there is no need to import them in Markdown files, and it rendered example usage for Changelog ("This functionality has changed in dbt v0.16.0"), CloudCore, Collapsible, FAQList, FAQ, File (`password: hunter2`), Lightbox, Markdown Links (pointing to the Links section of the Content Style Guide), LoomVideo, Tabs (`select id from customers` / `select "ID" from customers`), and YoutubeVideo.]
    - -
    -
    -
    - ); -} - -export default Styles; diff --git a/website/src/stores/VersionContext.js b/website/src/stores/VersionContext.js index a87ceb3e429..522994c441b 100644 --- a/website/src/stores/VersionContext.js +++ b/website/src/stores/VersionContext.js @@ -50,10 +50,10 @@ export const VersionContextProvider = ({ value = "", children }) => { const updateVersion = (e) => { if(!e.target) return - - const vRegex = /(?:v)?(\d+(\.\d+)*)/ // Regex that will parse out the version number, even if there is/isn't a 'v' in front of version number and a '(Beta)' afterwards. - const versionValue = e.target.text.match(vRegex)[1] + // Get selected version value from `dbt-version` data attribute + const versionValue = e.target?.dataset?.dbtVersion + versionValue && setVersion(versionValue) window.localStorage.setItem('dbtVersion', versionValue) @@ -66,9 +66,11 @@ export const VersionContextProvider = ({ value = "", children }) => { // Determine isPrerelease status + End of Life date for current version const currentVersion = versions.find(ver => ver.version === version) - if(currentVersion) + if(currentVersion) { context.EOLDate = currentVersion.EOLDate context.isPrerelease = currentVersion?.isPrerelease + context.customDisplay = currentVersion?.customDisplay; + } // Get latest stable release const latestStableRelease = versions.find(ver => !ver?.isPrerelease) diff --git a/website/src/theme/DocItem/Content/index.js b/website/src/theme/DocItem/Content/index.js index 1af386bc018..46ffe88ea6a 100644 --- a/website/src/theme/DocItem/Content/index.js +++ b/website/src/theme/DocItem/Content/index.js @@ -29,12 +29,18 @@ import styles from "./styles.module.css"; function useSyntheticTitle() { const { metadata, frontMatter, contentTitle } = useDoc(); - const shouldRender = - !frontMatter.hide_title && typeof contentTitle === "undefined"; + + const shouldRender = + metadata?.id?.includes("guides/") || + ( + !frontMatter.hide_title && typeof contentTitle === "undefined" + ); + if (!shouldRender) { return null; } - return metadata.title; + + return contentTitle || metadata.title; } export default function DocItemContent({ children }) { const syntheticTitle = useSyntheticTitle(); diff --git a/website/src/theme/DocRoot/Layout/Main/index.js b/website/src/theme/DocRoot/Layout/Main/index.js index 7303e484863..154c3cbfab6 100644 --- a/website/src/theme/DocRoot/Layout/Main/index.js +++ b/website/src/theme/DocRoot/Layout/Main/index.js @@ -43,11 +43,14 @@ export default function DocRootLayoutMain({ latestStableRelease, } = useContext(VersionContext); - const { pageAvailable, firstAvailableVersion } = pageVersionCheck( - dbtVersion, - versionedPages, - currentDocRoute - ); + const { + pageAvailable, + firstAvailableVersion, + lastAvailableVersion + } = pageVersionCheck(dbtVersion, versionedPages, currentDocRoute); + + const hasFirstAvailableVersion = + firstAvailableVersion && firstAvailableVersion !== "0"; // Check whether this version is a isPrerelease, and show banner if so const [PreData, setPreData] = useState({ @@ -71,7 +74,7 @@ export default function DocRootLayoutMain({ } else { setPreData({ showisPrereleaseBanner: true, - isPrereleaseBannerText: `You are currently viewing v${dbtVersion}, which is a prerelease of dbt Core. The latest stable version is v${latestStableRelease}`, + isPrereleaseBannerText: `You are viewing the docs for a prerelease version of dbt Core. There may be features described that are still in development, incomplete, or unstable. 
For the latest generally available features, install the latest stable version`, }); } // If EOLDate not set for version, do not show banner @@ -86,12 +89,12 @@ if (new Date() > new Date(EOLDate)) { setEOLData({ showEOLBanner: true, - EOLBannerText: `This version of dbt Core is no longer supported. No patch releases will be made, even for critical security issues. For better performance, improved security, and new features, you should upgrade to ${latestStableRelease}, the latest stable version.`, + EOLBannerText: `This version of dbt Core is no longer supported. There will be no more patches or security fixes. For improved performance, security, and features, upgrade to the latest stable version. Some dbt Cloud customers might have an extended critical support window. `, }); } else if (new Date() > threeMonths) { setEOLData({ showEOLBanner: true, - EOLBannerText: `This version of dbt Core is nearing the end of its critical support period. For better performance, improved security, and new features, you should upgrade to ${latestStableRelease}, the latest stable version.`, + EOLBannerText: `This version of dbt Core is nearing the end of its critical support period. For improved performance, security, and features, upgrade to the latest stable version.`, }); } else { setEOLData({ @@ -116,21 +119,6 @@ hiddenSidebarContainer && styles.docItemWrapperEnhanced )} > - {!pageAvailable && dbtVersion && firstAvailableVersion && (
    - -

    - Unfortunately, this feature is not available in dbt Core version{" "} - {dbtVersion} -

    -

    - {" "} - You should upgrade to {firstAvailableVersion} or later if you - want to use this feature. -

    -
    -
    - )} {PreData.showisPrereleaseBanner && (
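The version gating in DocRoot/Layout/Main above, along with the `VersionBlock`, `pageVersionCheck`, and `categoryVersionCheck` changes elsewhere in this diff, now routes through the new `availableInCurrentVersion` and `sortVersions` utilities added near the end of the diff, which compare version strings segment by segment instead of with `parseFloat`. A usage sketch of that behavior (import paths assumed, not part of the PR):

```js
// Usage sketch only — paths assume a file under website/src/.
import { sortVersions } from "./utils/sort-versions";
import { availableInCurrentVersion } from "./utils/available-in-current-version";

// Segment-wise sorting keeps "1.10" after "1.9"; parseFloat would read 1.10 as 1.1.
console.log(sortVersions(["1.10", "1.9", "1.9.1"])); // ["1.9", "1.9.1", "1.10"]

// Content gated with firstVersion "1.9" stays visible on v1.10 ...
console.log(availableInCurrentVersion("1.10", "1.9"));       // true
// ... is hidden on earlier versions ...
console.log(availableInCurrentVersion("1.8", "1.9"));        // false
// ... and lastVersion acts as an upper bound.
console.log(availableInCurrentVersion("1.8", "1.6", "1.7")); // false
```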
    diff --git a/website/src/theme/MDXComponents/index.js b/website/src/theme/MDXComponents/index.js index 66ab70a0167..d136222a0ce 100644 --- a/website/src/theme/MDXComponents/index.js +++ b/website/src/theme/MDXComponents/index.js @@ -27,7 +27,6 @@ import Snippet from '@site/src/components/snippet'; import YoutubeVideo from '@site/src/components/youtube'; import WistiaVideo from '@site/src/components/wistia'; import VersionBlock from '@site/src/components/versionBlock'; -import Var from '@site/src/components/variable'; import Term from '@site/src/components/term'; import EventsFeed from '@site/src/components/events'; import { DiscourseFeed, DiscourseHelpFeed } from '@site/src/components/discourse'; @@ -84,7 +83,6 @@ const MDXComponents = { WHCode: WHCode, YoutubeVideo: YoutubeVideo, VersionBlock: VersionBlock, - Var: Var, Term: Term, EventsFeed: EventsFeed, DiscourseFeed: DiscourseFeed, diff --git a/website/src/theme/NavbarItem/DropdownNavbarItem.js b/website/src/theme/NavbarItem/DropdownNavbarItem.js index 8c29daeac21..dc86692dcac 100644 --- a/website/src/theme/NavbarItem/DropdownNavbarItem.js +++ b/website/src/theme/NavbarItem/DropdownNavbarItem.js @@ -78,74 +78,89 @@ function DropdownNavbarItemDesktop({ useEffect(() => { setShowVersionDropdown(true) }, [showVersionDropdown]) - + return (
    + className={clsx("navbar__item", "dropdown", "dropdown--hoverable", { + "dropdown--right": position === "right", + "dropdown--show": showDropdown, + "dropdown--version--hide": !showVersionDropdown, + })} + > e.preventDefault()} onKeyDown={(e) => { - if (e.key === 'Enter') { + if (e.key === "Enter") { e.preventDefault(); setShowDropdown(!showDropdown); } }} - label={className === "nav-versioning" ? `v${versionContext.version} ${versionContext?.isPrerelease ? "(Beta)" : ""}` : props.children ?? props.label} + label={ + className === "nav-versioning" + ? `${versionContext?.customDisplay ? `${versionContext.customDisplay}` : `v${versionContext.version} ${versionContext?.isPrerelease ? "(Beta)" : ""}`}` + : props.children ?? props.label + } > {props.children ?? props.label}
      - {items.map((childItemProps, i) => ( - - {className === "nav-versioning" ? ( -
    • - { - handleVersionMenuClick() - versionContext.updateVersion(e) - } - } - >{childItemProps.label} - {versions.find((version) => (childItemProps.label == version.version))?.isPrerelease && " (Beta)"} -
    • - ) : ( - { - if (i === items.length - 1 && e.key === 'Tab') { - e.preventDefault(); - setShowDropdown(false); - const nextNavbarItem = dropdownRef.current.nextElementSibling; - if (nextNavbarItem) { - const targetItem = - nextNavbarItem instanceof HTMLAnchorElement - ? nextNavbarItem - : // Next item is another dropdown; focus on the inner - // anchor element instead so there's outline - nextNavbarItem.querySelector('a'); - targetItem.focus(); + {items.map((childItemProps, i) => { + const thisVersion = versions.find( + (version) => childItemProps.label == version.version + ); + const versionDisplay = thisVersion?.customDisplay ? thisVersion.customDisplay : `${childItemProps.label} ${thisVersion?.isPrerelease ? " (Beta)" : ""}`; + + return ( + + {className === "nav-versioning" ? ( +
    • + { + handleVersionMenuClick(); + versionContext.updateVersion(e); + }} + > + {versionDisplay} + +
    • + ) : ( + { + if (i === items.length - 1 && e.key === "Tab") { + e.preventDefault(); + setShowDropdown(false); + const nextNavbarItem = + dropdownRef.current.nextElementSibling; + if (nextNavbarItem) { + const targetItem = + nextNavbarItem instanceof HTMLAnchorElement + ? nextNavbarItem + : // Next item is another dropdown; focus on the inner + // anchor element instead so there's outline + nextNavbarItem.querySelector("a"); + targetItem.focus(); + } } - } - }} - activeClassName="dropdown__link--active" - {...childItemProps} - key={i} - /> - )} -
      - ))} + }} + activeClassName="dropdown__link--active" + {...childItemProps} + key={i} + /> + )} +
      + ); + } + )}
    ); @@ -171,42 +186,55 @@ function DropdownNavbarItemMobile({ }, [localPathname, containsActive, setCollapsed]); return (
  • + className={clsx("menu__list-item", { + "menu__list-item--collapsed": collapsed, + })} + > { e.preventDefault(); toggleCollapsed(); }} - label={className === "nav-versioning" ? `v${versionContext.version} ${versionContext.isPrerelease ? "(Beta)" : ""}` : props.children ?? props.label} + label={ + className === "nav-versioning" + ? `${versionContext?.customDisplay ? `${versionContext.customDisplay}` : `v${versionContext.version} ${versionContext?.isPrerelease ? "(Beta)" : ""}`}` + : props.children ?? props.label + } > {props.children ?? props.label} {items.map((childItemProps, i) => { - childItemProps.label = versions.find((version) => (childItemProps.label == version.version))?.isPrerelease ? `${childItemProps.label} (Beta)` : `${childItemProps.label}`; + const thisVersion = versions.find( + (version) => childItemProps.label == version.version + ); + const versionDisplay = thisVersion?.customDisplay + ? thisVersion.customDisplay + : `${childItemProps.label} ${thisVersion?.isPrerelease ? " (Beta)" : ""}`; + return ( versionContext.updateVersion(e) - : onClick + data-dbt-version={childItemProps.label} + onClick={ + className === "nav-versioning" + ? (e) => versionContext.updateVersion(e) + : onClick } activeClassName="menu__link--active" {...childItemProps} + label={versionDisplay} key={i} /> - ) - } - )} + ); + })}
  • ); diff --git a/website/src/utils/available-in-current-version.js b/website/src/utils/available-in-current-version.js new file mode 100644 index 00000000000..ce10fc822a9 --- /dev/null +++ b/website/src/utils/available-in-current-version.js @@ -0,0 +1,49 @@ +import { sortVersions } from "./sort-versions"; + +export const availableInCurrentVersion = ( + currentVersion, + firstVersion = "0", + lastVersion = undefined +) => { + // If `firstVersion` prop set on VersionBlock component without a value, + // it defaults to boolean `true`. This overrides to ensure `firstVersion` is string. + if(typeof firstVersion === "boolean") { + firstVersion = "0" + } + // Do the same to ensure `lastVersion` cannot be a boolean + if (typeof lastVersion === "boolean") { + lastVersion = undefined; + } + + // Get versions sorted from earliest to latest + const sortedVersions = sortVersions([ + currentVersion, + firstVersion, + ...(lastVersion ? [lastVersion] : []), + ]); + // Get index of current version, and first/last version props passed into component + const currentVersionIndex = sortedVersions?.indexOf(currentVersion); + const firstVersionIndex = sortedVersions?.indexOf(firstVersion); + const lastVersionIndex = sortedVersions?.indexOf(lastVersion); + { + /* + * If last version set, check if current version greater than last version + * Or if current version less than first version + * If either is true, hide block + * Else, if current version less than first version, hide block + */ + } + if (lastVersionIndex >= 0) { + if ( + currentVersionIndex > lastVersionIndex || + currentVersionIndex < firstVersionIndex + ) + return false; + } else { + if (currentVersionIndex < firstVersionIndex) { + return false; + } + } + + return true +} diff --git a/website/src/utils/category-version-check.js b/website/src/utils/category-version-check.js index bf75853cd8a..f4f263a1115 100644 --- a/website/src/utils/category-version-check.js +++ b/website/src/utils/category-version-check.js @@ -1,3 +1,5 @@ +import { availableInCurrentVersion } from "./available-in-current-version"; + export default function categoryVersionCheck(version, versionedCategories, category) { let categoryAvailableObj = { categoryAvailable: true @@ -7,30 +9,17 @@ export default function categoryVersionCheck(version, versionedCategories, categ return categoryAvailableObj const itemFound = versionedCategories.find(vcategory => vcategory.category === category) - + if (itemFound) { - const { firstVersion, lastVersion } = itemFound - const currentVersionVal = parseFloat(version) - const firstVersionVal = parseFloat(firstVersion) || 0 - - categoryAvailableObj.firstAvailableVersion = firstVersion - - // Determine if category within version range - if (lastVersion) { - const lastVersionVal = parseFloat(lastVersion) - // If lastVersion set for category, - // check if current version is higher than lastVersion - // or if current version is less than firstVersion - // If true, remove category in sidebar - if (currentVersionVal > lastVersionVal || currentVersionVal < firstVersionVal) { - categoryAvailableObj.categoryAvailable = false - } - } else if (firstVersionVal > currentVersionVal) { - // If firstVersion is greater than currentVersion - // remove category from sidebar - categoryAvailableObj.categoryAvailable = false - } + + categoryAvailableObj.firstAvailableVersion = firstVersion || "0"; + + categoryAvailableObj.categoryAvailable = availableInCurrentVersion( + version, + firstVersion, + lastVersion + ); } return categoryAvailableObj diff --git 
a/website/src/utils/page-version-check.js b/website/src/utils/page-version-check.js index cc2611929af..6bb9a81a50a 100644 --- a/website/src/utils/page-version-check.js +++ b/website/src/utils/page-version-check.js @@ -1,3 +1,5 @@ +import { availableInCurrentVersion } from "./available-in-current-version"; + export default function pageVersionCheck(version, versionedPages, path) { let pageAvailableObj = { pageAvailable: true @@ -13,28 +15,15 @@ export default function pageVersionCheck(version, versionedPages, path) { const itemFound = versionedPages.find(vpage => vpage.page === updatedPath) if(itemFound) { - - const { firstVersion, lastVersion } = itemFound - const currentVersionVal = parseFloat(version) - const firstVersionVal = parseFloat(firstVersion) || 0 + const { firstVersion, lastVersion } = itemFound; - pageAvailableObj.firstAvailableVersion = firstVersion + pageAvailableObj.firstAvailableVersion = firstVersion || "0"; - // Determine if sidebar item within version range - if(lastVersion) { - const lastVersionVal = parseFloat(lastVersion) - // If lastVersion set for sidebar item, - // check if current version is higher than lastVersion - // or if current version is less than firstVersion - // If true, remove item in sidebar - if(currentVersionVal > lastVersionVal || currentVersionVal < firstVersionVal) { - pageAvailableObj.pageAvailable = false - } - } else if(firstVersionVal > currentVersionVal) { - // If firstVersion is greater than currentVersion - // remove item from sidebar - pageAvailableObj.pageAvailable = false - } + pageAvailableObj.pageAvailable = availableInCurrentVersion( + version, + firstVersion, + lastVersion + ); } return pageAvailableObj diff --git a/website/src/utils/sort-versions.js b/website/src/utils/sort-versions.js new file mode 100644 index 00000000000..9b713462e33 --- /dev/null +++ b/website/src/utils/sort-versions.js @@ -0,0 +1,47 @@ +// Sorts versions from earliest to latest and returns array +// For example: 1.7 will sort ahead of 1.8 +// 1.9 will sort ahead of 1.9.1 & 1.10 + +export const sortVersions = (versionsArr) => { + if (!Array?.isArray(versionsArr) || versionsArr?.length <= 0) return null + + const sortedVersions = versionsArr?.sort(function (a, b) { + // When comparing a - b: + // A negative value indicates that a should come before b. + // A positive value indicates that a should come after b. + // Zero or NaN indicates that a and b are considered equal. 
+ + // Ensure compare items are strings which can be split + if(!a?.length || !b?.length) return null + + // Split versions into arrays by decimal + // split into max 3 length array (major, minor, patch versions) + const aSegments = a?.split(".", 3); + const bSegments = b?.split(".", 3); + + // Store each version part in variable to help readability below + const aMajor = aSegments[0] || "0" + const bMajor = bSegments[0] || "0" + const aMinor = aSegments[1] || "0" + const bMinor = bSegments[1] || "0" + const aPatch = aSegments[2] || "0" + const bPatch = bSegments[2] || "0" + + // Sort by major version + if (aMajor - bMajor < 0) { return -1; } + if (aMajor - bMajor > 0) { return 1; } + + // Sort by minor version + if (aMinor - bMinor < 0) { return -1; } + if (aMinor - bMinor > 0) { return 1; } + + // Sort by patch version + if (aPatch - bPatch < 0) { return -1; } + if (aPatch - bPatch > 0) { return 1; } + + // If reached, a & b are equal + return 0 + }); + + return sortedVersions +} diff --git a/website/static/img/best-practices/materializations/model-timing-diagram.png b/website/static/img/best-practices/materializations/model-timing-diagram.png index 75aaf17123f..6dc85a01f1a 100644 Binary files a/website/static/img/best-practices/materializations/model-timing-diagram.png and b/website/static/img/best-practices/materializations/model-timing-diagram.png differ diff --git a/website/static/img/bigquery/bigquery-optional-config.png b/website/static/img/bigquery/bigquery-optional-config.png new file mode 100644 index 00000000000..ba9dba2afac Binary files /dev/null and b/website/static/img/bigquery/bigquery-optional-config.png differ diff --git a/website/static/img/blog/2024-09-30-hybrid-mesh/hybrid-mesh.png b/website/static/img/blog/2024-09-30-hybrid-mesh/hybrid-mesh.png new file mode 100644 index 00000000000..ce081a11834 Binary files /dev/null and b/website/static/img/blog/2024-09-30-hybrid-mesh/hybrid-mesh.png differ diff --git a/website/static/img/blog/2024-10-04-iceberg-blog/2024-10-03-iceberg-support.png b/website/static/img/blog/2024-10-04-iceberg-blog/2024-10-03-iceberg-support.png new file mode 100644 index 00000000000..2b99378fa84 Binary files /dev/null and b/website/static/img/blog/2024-10-04-iceberg-blog/2024-10-03-iceberg-support.png differ diff --git a/website/static/img/blog/2024-10-04-iceberg-blog/iceberg_materialization.png b/website/static/img/blog/2024-10-04-iceberg-blog/iceberg_materialization.png new file mode 100644 index 00000000000..c20e7855858 Binary files /dev/null and b/website/static/img/blog/2024-10-04-iceberg-blog/iceberg_materialization.png differ diff --git a/website/static/img/blog/authors/luis-leon.png b/website/static/img/blog/authors/luis-leon.png new file mode 100644 index 00000000000..ce3c09784ba Binary files /dev/null and b/website/static/img/blog/authors/luis-leon.png differ diff --git a/website/static/img/blog/authors/randy-pettus.png b/website/static/img/blog/authors/randy-pettus.png new file mode 100644 index 00000000000..e3468d9aca7 Binary files /dev/null and b/website/static/img/blog/authors/randy-pettus.png differ diff --git a/website/static/img/blog/example-features-produced.png b/website/static/img/blog/example-features-produced.png new file mode 100644 index 00000000000..4aaa34cf3e9 Binary files /dev/null and b/website/static/img/blog/example-features-produced.png differ diff --git a/website/static/img/blog/example-snowflake-ui.png b/website/static/img/blog/example-snowflake-ui.png new file mode 100644 index 00000000000..86c3394bcd0 Binary files 
/dev/null and b/website/static/img/blog/example-snowflake-ui.png differ diff --git a/website/static/img/blog/example-training-data-set.png b/website/static/img/blog/example-training-data-set.png new file mode 100644 index 00000000000..085b2785f06 Binary files /dev/null and b/website/static/img/blog/example-training-data-set.png differ diff --git a/website/static/img/cloud-cli-guide/finder-vscode-check.png b/website/static/img/cloud-cli-guide/finder-vscode-check.png new file mode 100644 index 00000000000..ab303c00c3a Binary files /dev/null and b/website/static/img/cloud-cli-guide/finder-vscode-check.png differ diff --git a/website/static/img/cloud-cli-guide/setup-poweruser-01.png b/website/static/img/cloud-cli-guide/setup-poweruser-01.png new file mode 100644 index 00000000000..e750bc34ed7 Binary files /dev/null and b/website/static/img/cloud-cli-guide/setup-poweruser-01.png differ diff --git a/website/static/img/cloud-cli-guide/setup-poweruser-02.png b/website/static/img/cloud-cli-guide/setup-poweruser-02.png new file mode 100644 index 00000000000..3ddb52c8407 Binary files /dev/null and b/website/static/img/cloud-cli-guide/setup-poweruser-02.png differ diff --git a/website/static/img/cloud-cli-guide/setup-poweruser-03.png b/website/static/img/cloud-cli-guide/setup-poweruser-03.png new file mode 100644 index 00000000000..c7baa1b9984 Binary files /dev/null and b/website/static/img/cloud-cli-guide/setup-poweruser-03.png differ diff --git a/website/static/img/cloud-cli-guide/terminal-git-check.png b/website/static/img/cloud-cli-guide/terminal-git-check.png new file mode 100644 index 00000000000..59ab886b47e Binary files /dev/null and b/website/static/img/cloud-cli-guide/terminal-git-check.png differ diff --git a/website/static/img/cloud-cli-guide/using-poweruser-01.png b/website/static/img/cloud-cli-guide/using-poweruser-01.png new file mode 100644 index 00000000000..f24a7ac89d2 Binary files /dev/null and b/website/static/img/cloud-cli-guide/using-poweruser-01.png differ diff --git a/website/static/img/cloud-cli-guide/using-poweruser-02.png b/website/static/img/cloud-cli-guide/using-poweruser-02.png new file mode 100644 index 00000000000..4724540de13 Binary files /dev/null and b/website/static/img/cloud-cli-guide/using-poweruser-02.png differ diff --git a/website/static/img/cloud-cli-guide/using-poweruser-03.png b/website/static/img/cloud-cli-guide/using-poweruser-03.png new file mode 100644 index 00000000000..ab28a8d72b0 Binary files /dev/null and b/website/static/img/cloud-cli-guide/using-poweruser-03.png differ diff --git a/website/static/img/cloud-cli-guide/using-poweruser-04.png b/website/static/img/cloud-cli-guide/using-poweruser-04.png new file mode 100644 index 00000000000..7d72f4a97e7 Binary files /dev/null and b/website/static/img/cloud-cli-guide/using-poweruser-04.png differ diff --git a/website/static/img/community/spotlight/bruno-souza-de-lima-newimage.jpg b/website/static/img/community/spotlight/bruno-souza-de-lima-newimage.jpg new file mode 100644 index 00000000000..4bcee8d5acc Binary files /dev/null and b/website/static/img/community/spotlight/bruno-souza-de-lima-newimage.jpg differ diff --git a/website/static/img/community/spotlight/christophe-oudar.jpg b/website/static/img/community/spotlight/christophe-oudar.jpg new file mode 100644 index 00000000000..11f31a6a4bd Binary files /dev/null and b/website/static/img/community/spotlight/christophe-oudar.jpg differ diff --git a/website/static/img/community/spotlight/dbt-athena-groupheadshot.jpg 
b/website/static/img/community/spotlight/dbt-athena-groupheadshot.jpg new file mode 100644 index 00000000000..2cc543890b8 Binary files /dev/null and b/website/static/img/community/spotlight/dbt-athena-groupheadshot.jpg differ diff --git a/website/static/img/community/spotlight/jenna-jordan.jpg b/website/static/img/community/spotlight/jenna-jordan.jpg new file mode 100644 index 00000000000..527bafb469f Binary files /dev/null and b/website/static/img/community/spotlight/jenna-jordan.jpg differ diff --git a/website/static/img/community/spotlight/mike-stanley.jpg b/website/static/img/community/spotlight/mike-stanley.jpg new file mode 100644 index 00000000000..df1c2e98ddf Binary files /dev/null and b/website/static/img/community/spotlight/mike-stanley.jpg differ diff --git a/website/static/img/community/spotlight/ruth-onyekwe.jpeg b/website/static/img/community/spotlight/ruth-onyekwe.jpeg new file mode 100644 index 00000000000..92c470184b1 Binary files /dev/null and b/website/static/img/community/spotlight/ruth-onyekwe.jpeg differ diff --git a/website/static/img/databricks_tutorial/images/choose_plan.png b/website/static/img/databricks_tutorial/images/choose_plan.png index 055f232fda3..04565ab2d4f 100644 Binary files a/website/static/img/databricks_tutorial/images/choose_plan.png and b/website/static/img/databricks_tutorial/images/choose_plan.png differ diff --git a/website/static/img/databricks_tutorial/images/choose_provider.png b/website/static/img/databricks_tutorial/images/choose_provider.png new file mode 100644 index 00000000000..cf5d94d5fd7 Binary files /dev/null and b/website/static/img/databricks_tutorial/images/choose_provider.png differ diff --git a/website/static/img/databricks_tutorial/images/signup_form.png b/website/static/img/databricks_tutorial/images/signup_form.png index 612d847c8f5..5fa60ce37ef 100644 Binary files a/website/static/img/databricks_tutorial/images/signup_form.png and b/website/static/img/databricks_tutorial/images/signup_form.png differ diff --git a/website/static/img/databricks_tutorial/images/start_quickstart.png b/website/static/img/databricks_tutorial/images/start_quickstart.png new file mode 100644 index 00000000000..033250d4e5d Binary files /dev/null and b/website/static/img/databricks_tutorial/images/start_quickstart.png differ diff --git a/website/static/img/dbt-env.png b/website/static/img/dbt-env.png new file mode 100644 index 00000000000..d4cf58d7824 Binary files /dev/null and b/website/static/img/dbt-env.png differ diff --git a/website/static/img/docs/building-a-dbt-project/microbatch/event_time.png b/website/static/img/docs/building-a-dbt-project/microbatch/event_time.png new file mode 100644 index 00000000000..98fc741e8fc Binary files /dev/null and b/website/static/img/docs/building-a-dbt-project/microbatch/event_time.png differ diff --git a/website/static/img/docs/building-a-dbt-project/microbatch/microbatch_backfill.png b/website/static/img/docs/building-a-dbt-project/microbatch/microbatch_backfill.png new file mode 100644 index 00000000000..44a4b9a7618 Binary files /dev/null and b/website/static/img/docs/building-a-dbt-project/microbatch/microbatch_backfill.png differ diff --git a/website/static/img/docs/building-a-dbt-project/microbatch/microbatch_filters.png b/website/static/img/docs/building-a-dbt-project/microbatch/microbatch_filters.png new file mode 100644 index 00000000000..a63c4d91550 Binary files /dev/null and b/website/static/img/docs/building-a-dbt-project/microbatch/microbatch_filters.png differ diff --git 
a/website/static/img/docs/building-a-dbt-project/microbatch/microbatch_lookback.png b/website/static/img/docs/building-a-dbt-project/microbatch/microbatch_lookback.png new file mode 100644 index 00000000000..50a6fac1527 Binary files /dev/null and b/website/static/img/docs/building-a-dbt-project/microbatch/microbatch_lookback.png differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/cloud-add-integration.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/cloud-add-integration.jpg new file mode 100644 index 00000000000..2b1b4b27e7d Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/cloud-add-integration.jpg differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/cloud-integration-details.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/cloud-integration-details.jpg new file mode 100644 index 00000000000..3f5653c8966 Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/cloud-integration-details.jpg differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/cloud-select-collections.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/cloud-select-collections.jpg new file mode 100644 index 00000000000..3696d679aa9 Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/cloud-select-collections.jpg differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/explorer-lineage.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/explorer-lineage.jpg new file mode 100644 index 00000000000..e11b69a688c Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/explorer-lineage.jpg differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/explorer-lineage2.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/explorer-lineage2.jpg new file mode 100644 index 00000000000..a8159dfda97 Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/explorer-lineage2.jpg differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/explorer-view-file-tree.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/explorer-view-file-tree.jpg new file mode 100644 index 00000000000..71816ed62ce Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/explorer-view-file-tree.jpg differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/explorer-view-resources.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/explorer-view-resources.jpg new file mode 100644 index 00000000000..cc46958d906 Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/explorer-view-resources.jpg differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/tableau-copy-token.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/tableau-copy-token.jpg new file mode 100644 index 00000000000..c9916887404 Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/tableau-copy-token.jpg differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/tableau-create-pat.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/tableau-create-pat.jpg new file mode 100644 index 00000000000..405f1a4b5a8 Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/tableau-create-pat.jpg differ diff --git 
a/website/static/img/docs/cloud-integrations/auto-exposures/tableau-enable-pat.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/tableau-enable-pat.jpg new file mode 100644 index 00000000000..0ad8777bd85 Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/tableau-enable-pat.jpg differ diff --git a/website/static/img/docs/cloud-integrations/auto-exposures/tablueau-serverurl.jpg b/website/static/img/docs/cloud-integrations/auto-exposures/tablueau-serverurl.jpg new file mode 100644 index 00000000000..e2773ebb519 Binary files /dev/null and b/website/static/img/docs/cloud-integrations/auto-exposures/tablueau-serverurl.jpg differ diff --git a/website/static/img/docs/cloud-integrations/example-snowflake-native-app-service-token.png b/website/static/img/docs/cloud-integrations/example-snowflake-native-app-service-token.png index 7e4c7ab99da..930182969c2 100644 Binary files a/website/static/img/docs/cloud-integrations/example-snowflake-native-app-service-token.png and b/website/static/img/docs/cloud-integrations/example-snowflake-native-app-service-token.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/data-tile-example.jpg b/website/static/img/docs/collaborate/dbt-explorer/data-tile-example.jpg new file mode 100644 index 00000000000..52cc354f247 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/data-tile-example.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/data-tile-exposures.jpg b/website/static/img/docs/collaborate/dbt-explorer/data-tile-exposures.jpg new file mode 100644 index 00000000000..a4beb538708 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/data-tile-exposures.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/data-tile-pass.jpg b/website/static/img/docs/collaborate/dbt-explorer/data-tile-pass.jpg new file mode 100644 index 00000000000..36cc26752e4 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/data-tile-pass.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/data-tile-setup.jpg b/website/static/img/docs/collaborate/dbt-explorer/data-tile-setup.jpg new file mode 100644 index 00000000000..1d234c15662 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/data-tile-setup.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/data-tiles.png b/website/static/img/docs/collaborate/dbt-explorer/data-tiles.png new file mode 100644 index 00000000000..7afbf1a714d Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/data-tiles.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/enable-query-history-success.jpg b/website/static/img/docs/collaborate/dbt-explorer/enable-query-history-success.jpg new file mode 100644 index 00000000000..5b25372ab59 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/enable-query-history-success.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/enable-query-history.jpg b/website/static/img/docs/collaborate/dbt-explorer/enable-query-history.jpg new file mode 100644 index 00000000000..80df94bc860 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/enable-query-history.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/example-keyword-search.png b/website/static/img/docs/collaborate/dbt-explorer/example-keyword-search.png index 1e98008f46d..de32348b4b0 100644 
Binary files a/website/static/img/docs/collaborate/dbt-explorer/example-keyword-search.png and b/website/static/img/docs/collaborate/dbt-explorer/example-keyword-search.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/example-model-details.png b/website/static/img/docs/collaborate/dbt-explorer/example-model-details.png index 9ceee1b3a23..a46f4d4ac5e 100644 Binary files a/website/static/img/docs/collaborate/dbt-explorer/example-model-details.png and b/website/static/img/docs/collaborate/dbt-explorer/example-model-details.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/example-recommendations-tab.png b/website/static/img/docs/collaborate/dbt-explorer/example-recommendations-tab.png index 493930c35db..004019bfa54 100644 Binary files a/website/static/img/docs/collaborate/dbt-explorer/example-recommendations-tab.png and b/website/static/img/docs/collaborate/dbt-explorer/example-recommendations-tab.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/explore-staging-env.png b/website/static/img/docs/collaborate/dbt-explorer/explore-staging-env.png index 6cd5d5b379b..61148dab9a9 100644 Binary files a/website/static/img/docs/collaborate/dbt-explorer/explore-staging-env.png and b/website/static/img/docs/collaborate/dbt-explorer/explore-staging-env.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/model-consumption-lenses.jpg b/website/static/img/docs/collaborate/dbt-explorer/model-consumption-lenses.jpg new file mode 100644 index 00000000000..a406bc25a7c Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/model-consumption-lenses.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/model-consumption-list.jpg b/website/static/img/docs/collaborate/dbt-explorer/model-consumption-list.jpg new file mode 100644 index 00000000000..e2583a01c8e Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/model-consumption-list.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/model-consumption-queries.jpg b/website/static/img/docs/collaborate/dbt-explorer/model-consumption-queries.jpg new file mode 100644 index 00000000000..6622fd3993f Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/model-consumption-queries.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/model-query-credentials.jpg b/website/static/img/docs/collaborate/dbt-explorer/model-query-credentials.jpg new file mode 100644 index 00000000000..9dcb9dc5360 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/model-query-credentials.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/most-consumed-models.jpg b/website/static/img/docs/collaborate/dbt-explorer/most-consumed-models.jpg new file mode 100644 index 00000000000..f95461108d7 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/most-consumed-models.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/power-bi-final.png b/website/static/img/docs/collaborate/dbt-explorer/power-bi-final.png new file mode 100644 index 00000000000..b8dc1e070b7 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/power-bi-final.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/power-bi-measure-tools.png b/website/static/img/docs/collaborate/dbt-explorer/power-bi-measure-tools.png new file mode 100644 index 00000000000..312c7bab5dc Binary files /dev/null 
and b/website/static/img/docs/collaborate/dbt-explorer/power-bi-measure-tools.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/power-bi-measure.png b/website/static/img/docs/collaborate/dbt-explorer/power-bi-measure.png new file mode 100644 index 00000000000..d2473d92028 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/power-bi-measure.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/power-bi.png b/website/static/img/docs/collaborate/dbt-explorer/power-bi.png new file mode 100644 index 00000000000..7d5989f5cf7 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/power-bi.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/sigma-example.jpg b/website/static/img/docs/collaborate/dbt-explorer/sigma-example.jpg new file mode 100644 index 00000000000..b1aa4533e08 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/sigma-example.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/tableau-example.png b/website/static/img/docs/collaborate/dbt-explorer/tableau-example.png new file mode 100644 index 00000000000..1d5ddc04388 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/tableau-example.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/trust-signal-caution.png b/website/static/img/docs/collaborate/dbt-explorer/trust-signal-caution.png new file mode 100644 index 00000000000..0842bd25ae2 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/trust-signal-caution.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/trust-signal-health.jpg b/website/static/img/docs/collaborate/dbt-explorer/trust-signal-health.jpg new file mode 100644 index 00000000000..3630a095245 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/trust-signal-health.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/trust-signal-healthy.png b/website/static/img/docs/collaborate/dbt-explorer/trust-signal-healthy.png new file mode 100644 index 00000000000..2de1cf99cf2 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/trust-signal-healthy.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/trust-signal-unknown.png b/website/static/img/docs/collaborate/dbt-explorer/trust-signal-unknown.png new file mode 100644 index 00000000000..9f2636e5087 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/trust-signal-unknown.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/trust-signals-degraded.jpg b/website/static/img/docs/collaborate/dbt-explorer/trust-signals-degraded.jpg new file mode 100644 index 00000000000..30aa51d68ef Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/trust-signals-degraded.jpg differ diff --git a/website/static/img/docs/collaborate/model-consumption-queries.jpg b/website/static/img/docs/collaborate/model-consumption-queries.jpg new file mode 100644 index 00000000000..7fe9b23866c Binary files /dev/null and b/website/static/img/docs/collaborate/model-consumption-queries.jpg differ diff --git a/website/static/img/docs/connect-data-platform/choose-a-connection.png b/website/static/img/docs/connect-data-platform/choose-a-connection.png new file mode 100644 index 00000000000..cf8d106dd59 Binary files /dev/null and b/website/static/img/docs/connect-data-platform/choose-a-connection.png differ diff 
--git a/website/static/img/docs/connect-data-platform/connection-list.png b/website/static/img/docs/connect-data-platform/connection-list.png new file mode 100644 index 00000000000..c499e9baeba Binary files /dev/null and b/website/static/img/docs/connect-data-platform/connection-list.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/choosing-dbt-version/example-environment-settings.png b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/choosing-dbt-version/example-environment-settings.png index 86bb59e9b90..02e5073fd16 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/choosing-dbt-version/example-environment-settings.png and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/choosing-dbt-version/example-environment-settings.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/create-deploy-env.png b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/create-deploy-env.png new file mode 100644 index 00000000000..5f75707090c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/create-deploy-env.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/managed-repo.png b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/managed-repo.png index e2014cf3607..d5850f1bed1 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/managed-repo.png and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/managed-repo.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/prod-settings.png b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/prod-settings.png new file mode 100644 index 00000000000..c36e9b56d7d Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/prod-settings.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/dbt-assist-doc.gif b/website/static/img/docs/dbt-cloud/cloud-ide/dbt-assist-doc.gif deleted file mode 100644 index 0c6edade4d1..00000000000 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/dbt-assist-doc.gif and /dev/null differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/dbt-assist.gif b/website/static/img/docs/dbt-cloud/cloud-ide/dbt-assist.gif new file mode 100644 index 00000000000..be3236a5123 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/dbt-assist.gif differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/dbt-copilot-doc.gif b/website/static/img/docs/dbt-cloud/cloud-ide/dbt-copilot-doc.gif new file mode 100644 index 00000000000..cca8db37a0a Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/dbt-copilot-doc.gif differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/editor-tab-menu-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/editor-tab-menu-with-save.jpg index 73551cbcaa7..deca4bedc43 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/editor-tab-menu-with-save.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/editor-tab-menu-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-basic-layout.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-basic-layout.jpg index 3960c6a4bff..116644b4764 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-basic-layout.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-basic-layout.jpg differ diff --git 
a/website/static/img/docs/dbt-cloud/cloud-ide/ide-command-bar.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-command-bar.jpg index ba6f8fc22c0..b1d0fd3ec7b 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-command-bar.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-command-bar.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-console-overview.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-console-overview.jpg index 8212e9e3311..33780cf76f9 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-console-overview.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-console-overview.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-editing.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-editing.jpg index 897497efc5b..d35caf29768 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-editing.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-editing.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-editor-command-palette-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-editor-command-palette-with-save.jpg index 25e4f2b32a1..2b50f870251 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-editor-command-palette-with-save.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-editor-command-palette-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-file-search-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-file-search-with-save.jpg index 9d8e82b98cb..775e1141330 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-file-search-with-save.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-file-search-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-git-diff-view-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-git-diff-view-with-save.jpg index 777551dc49b..1f92e5a4cb5 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-git-diff-view-with-save.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-git-diff-view-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-global-command-palette-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-global-command-palette-with-save.jpg index 32ce741269c..d2c86345895 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-global-command-palette-with-save.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-global-command-palette-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-minimap.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-minimap.jpg index 8da575c2034..ca465bf2ec8 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-minimap.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-minimap.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-options-menu-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-options-menu-with-save.jpg index bd57ee514ee..8a968f684bd 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/ide-options-menu-with-save.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-options-menu-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-side-menu.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-side-menu.jpg index 060d273e3f5..71c182c302a 100644 Binary files 
a/website/static/img/docs/dbt-cloud/cloud-ide/ide-side-menu.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-side-menu.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/lineage-console-tab.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/lineage-console-tab.jpg index cc0a0ffc41b..7d27314408c 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/lineage-console-tab.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/lineage-console-tab.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/restart-ide.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/restart-ide.jpg index 98d71403cdd..031ec19227f 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/restart-ide.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/restart-ide.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/revert-uncommitted-changes-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/revert-uncommitted-changes-with-save.jpg index bfd5832c001..7f7f520f5bb 100644 Binary files a/website/static/img/docs/dbt-cloud/cloud-ide/revert-uncommitted-changes-with-save.jpg and b/website/static/img/docs/dbt-cloud/cloud-ide/revert-uncommitted-changes-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/connecting-azure-devops/LinktoAzure.png b/website/static/img/docs/dbt-cloud/connecting-azure-devops/LinktoAzure.png index cd233b3f8e7..6cc30d05c6f 100644 Binary files a/website/static/img/docs/dbt-cloud/connecting-azure-devops/LinktoAzure.png and b/website/static/img/docs/dbt-cloud/connecting-azure-devops/LinktoAzure.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/assign-group-permissions.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/assign-group-permissions.png new file mode 100644 index 00000000000..07d9189cf5b Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/assign-group-permissions.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/dbt-cloud-group-config.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/dbt-cloud-group-config.png new file mode 100644 index 00000000000..e474e350f75 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/dbt-cloud-group-config.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/edit-user.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/edit-user.png new file mode 100644 index 00000000000..b271192b191 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/edit-user.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/environment-access-control.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/environment-access-control.png new file mode 100644 index 00000000000..af27278c1f3 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/environment-access-control.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/group-attributes.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/group-attributes.png new file mode 100644 index 00000000000..724ede88b37 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/group-attributes.png differ diff --git 
a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/license-dropdown.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/license-dropdown.png new file mode 100644 index 00000000000..8ac931e510e Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/license-dropdown.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/new-group.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/new-group.png new file mode 100644 index 00000000000..8d2c13beafa Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/new-group.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/okta-app-dashboard.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/okta-app-dashboard.png new file mode 100644 index 00000000000..8b7b8e0512a Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/okta-app-dashboard.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/okta-group-config.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/okta-group-config.png new file mode 100644 index 00000000000..e10937282c5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/okta-group-config.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/post-login-screen.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/post-login-screen.png new file mode 100644 index 00000000000..5d835ac9d2f Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/post-login-screen.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/production-restricted.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/production-restricted.png new file mode 100644 index 00000000000..77c7b4e3e31 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/production-restricted.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/rbac-account-home.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/rbac-account-home.png new file mode 100644 index 00000000000..539278683fd Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/rbac-account-home.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sample-email.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sample-email.png new file mode 100644 index 00000000000..3e9e7ed11c3 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sample-email.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-login-url.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-login-url.png new file mode 100644 index 00000000000..adc815ea86c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-login-url.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-login.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-login.png new file 
mode 100644 index 00000000000..db469429ba9 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-login.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-mapping.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-mapping.png new file mode 100644 index 00000000000..5e9e057d623 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-mapping.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-window-details.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-window-details.png new file mode 100644 index 00000000000..78624a93b82 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/sso-window-details.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/staging-access.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/staging-access.png new file mode 100644 index 00000000000..7bec3b9db9e Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/access-control/staging-access.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/snowflake-oauth-redirect-uri.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/snowflake-oauth-redirect-uri.png new file mode 100644 index 00000000000..e9313ddaa48 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/snowflake-oauth-redirect-uri.png differ diff --git a/website/static/img/docs/dbt-cloud/defer-toggle.jpg b/website/static/img/docs/dbt-cloud/defer-toggle.jpg index fdeb27c4b71..3c3abca0fc2 100644 Binary files a/website/static/img/docs/dbt-cloud/defer-toggle.jpg and b/website/static/img/docs/dbt-cloud/defer-toggle.jpg differ diff --git a/website/static/img/docs/dbt-cloud/deployment/access-logs.png b/website/static/img/docs/dbt-cloud/deployment/access-logs.png new file mode 100644 index 00000000000..ee8dd5f07fc Binary files /dev/null and b/website/static/img/docs/dbt-cloud/deployment/access-logs.png differ diff --git a/website/static/img/docs/dbt-cloud/deployment/run-history.png b/website/static/img/docs/dbt-cloud/deployment/run-history.png index 17196ba57a1..019a961b257 100644 Binary files a/website/static/img/docs/dbt-cloud/deployment/run-history.png and b/website/static/img/docs/dbt-cloud/deployment/run-history.png differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/data-freshness-metadata.jpg b/website/static/img/docs/dbt-cloud/discovery-api/data-freshness-metadata.jpg deleted file mode 100644 index d86ec855f69..00000000000 Binary files a/website/static/img/docs/dbt-cloud/discovery-api/data-freshness-metadata.jpg and /dev/null differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/model-timing.png b/website/static/img/docs/dbt-cloud/discovery-api/model-timing.png new file mode 100644 index 00000000000..3510473a090 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/model-timing.png differ diff --git a/website/static/img/docs/dbt-cloud/example-artifacts-tab.png b/website/static/img/docs/dbt-cloud/example-artifacts-tab.png new file mode 100644 index 00000000000..f039eea2001 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/example-artifacts-tab.png differ diff --git a/website/static/img/docs/dbt-cloud/example-ci-compare-changes-tab.png 
b/website/static/img/docs/dbt-cloud/example-ci-compare-changes-tab.png new file mode 100644 index 00000000000..2736860df3d Binary files /dev/null and b/website/static/img/docs/dbt-cloud/example-ci-compare-changes-tab.png differ diff --git a/website/static/img/docs/dbt-cloud/example-enable-model-notifications.png b/website/static/img/docs/dbt-cloud/example-enable-model-notifications.png new file mode 100644 index 00000000000..16cf5457db5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/example-enable-model-notifications.png differ diff --git a/website/static/img/docs/dbt-cloud/example-git-signed-commits-setting.png b/website/static/img/docs/dbt-cloud/example-git-signed-commits-setting.png new file mode 100644 index 00000000000..bf3f8169359 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/example-git-signed-commits-setting.png differ diff --git a/website/static/img/docs/dbt-cloud/example-sidebar-account-settings.png b/website/static/img/docs/dbt-cloud/example-sidebar-account-settings.png new file mode 100644 index 00000000000..9b2ba860145 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/example-sidebar-account-settings.png differ diff --git a/website/static/img/docs/dbt-cloud/git-sign-verified.jpg b/website/static/img/docs/dbt-cloud/git-sign-verified.jpg new file mode 100644 index 00000000000..86fbdd58dc9 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/git-sign-verified.jpg differ diff --git a/website/static/img/docs/dbt-cloud/refresh-ide/new-environment-fields.png b/website/static/img/docs/dbt-cloud/refresh-ide/new-environment-fields.png index b70d047553b..d94122187c2 100644 Binary files a/website/static/img/docs/dbt-cloud/refresh-ide/new-environment-fields.png and b/website/static/img/docs/dbt-cloud/refresh-ide/new-environment-fields.png differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/gsheets-query-builder.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/gsheets-query-builder.jpg deleted file mode 100644 index ef9c5d63c9f..00000000000 Binary files a/website/static/img/docs/dbt-cloud/semantic-layer/gsheets-query-builder.jpg and /dev/null differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/query-builder.png b/website/static/img/docs/dbt-cloud/semantic-layer/query-builder.png new file mode 100644 index 00000000000..ae98c0a6c63 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/query-builder.png differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-add-credential.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-add-credential.jpg index b2139da47b0..30baa7acf31 100644 Binary files a/website/static/img/docs/dbt-cloud/semantic-layer/sl-add-credential.jpg and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-add-credential.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-concept.png b/website/static/img/docs/dbt-cloud/semantic-layer/sl-concept.png new file mode 100644 index 00000000000..f1b1a252dc6 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-concept.png differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-sl.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-sl.jpg deleted file mode 100644 index fc44f409efe..00000000000 Binary files a/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-sl.jpg and /dev/null differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-create-service-token-page.jpg 
b/website/static/img/docs/dbt-cloud/semantic-layer/sl-create-service-token-page.jpg index 8e288183be2..da7a57a3d99 100644 Binary files a/website/static/img/docs/dbt-cloud/semantic-layer/sl-create-service-token-page.jpg and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-create-service-token-page.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-credential-created.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-credential-created.jpg deleted file mode 100644 index 8c0081129fa..00000000000 Binary files a/website/static/img/docs/dbt-cloud/semantic-layer/sl-credential-created.jpg and /dev/null differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-credentials-service-token.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-credentials-service-token.jpg new file mode 100644 index 00000000000..7d302201e1f Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-credentials-service-token.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-delete-config.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-delete-config.jpg new file mode 100644 index 00000000000..c53c3e9d302 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-delete-config.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-select-env.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-select-env.jpg new file mode 100644 index 00000000000..f19cb22f2cf Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-select-env.jpg differ diff --git a/website/static/img/docs/dbt-cloud/snowflake-link-account-prompt.png b/website/static/img/docs/dbt-cloud/snowflake-link-account-prompt.png new file mode 100644 index 00000000000..888010401e7 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/snowflake-link-account-prompt.png differ diff --git a/website/static/img/docs/dbt-cloud/snowflake-link-dbt-cloud.png b/website/static/img/docs/dbt-cloud/snowflake-link-dbt-cloud.png new file mode 100644 index 00000000000..903748a8dca Binary files /dev/null and b/website/static/img/docs/dbt-cloud/snowflake-link-dbt-cloud.png differ diff --git a/website/static/img/docs/dbt-cloud/teradata-connection.png b/website/static/img/docs/dbt-cloud/teradata-connection.png new file mode 100644 index 00000000000..fd2837c16ec Binary files /dev/null and b/website/static/img/docs/dbt-cloud/teradata-connection.png differ diff --git a/website/static/img/docs/dbt-cloud/teradata-deployment.png b/website/static/img/docs/dbt-cloud/teradata-deployment.png new file mode 100644 index 00000000000..e5f2b6986e0 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/teradata-deployment.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/cd-workflow.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/cd-workflow.png new file mode 100644 index 00000000000..32c87773d4c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/cd-workflow.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png index ba75a855233..e1c94a74539 100644 Binary files a/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/example-github-ci-report.png 
b/website/static/img/docs/dbt-cloud/using-dbt-cloud/example-github-ci-report.png new file mode 100644 index 00000000000..8dbfd76994d Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/example-github-ci-report.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/fail-dbtdeps.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/fail-dbtdeps.png new file mode 100644 index 00000000000..f7375e9f3db Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/fail-dbtdeps.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/prod-settings.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/prod-settings.png new file mode 100644 index 00000000000..5f75707090c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/prod-settings.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/using_ci_dbt_cloud.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/using_ci_dbt_cloud.png index 5e89f81c621..352976cd38d 100644 Binary files a/website/static/img/docs/dbt-cloud/using-dbt-cloud/using_ci_dbt_cloud.png and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/using_ci_dbt_cloud.png differ diff --git a/website/static/img/docs/deploy/compare-changes.png b/website/static/img/docs/deploy/compare-changes.png new file mode 100644 index 00000000000..4cdbf680cda Binary files /dev/null and b/website/static/img/docs/deploy/compare-changes.png differ diff --git a/website/static/img/docs/deploy/compare-credentials.png b/website/static/img/docs/deploy/compare-credentials.png new file mode 100644 index 00000000000..bcf81a52748 Binary files /dev/null and b/website/static/img/docs/deploy/compare-credentials.png differ diff --git a/website/static/img/docs/deploy/compare-expired.png b/website/static/img/docs/deploy/compare-expired.png new file mode 100644 index 00000000000..453a330231a Binary files /dev/null and b/website/static/img/docs/deploy/compare-expired.png differ diff --git a/website/static/img/docs/deploy/example-account-settings.png b/website/static/img/docs/deploy/example-account-settings.png index 12b8d9bc49f..d5e6adc2fa6 100644 Binary files a/website/static/img/docs/deploy/example-account-settings.png and b/website/static/img/docs/deploy/example-account-settings.png differ
diff --git a/website/static/img/icons/dot-ai.svg b/website/static/img/icons/dot-ai.svg new file mode 100644 index 00000000000..d0223968caa --- /dev/null +++ b/website/static/img/icons/dot-ai.svg @@ -0,0 +1,33441 @@ [33,441 added lines of SVG markup not shown] \ No newline at end of file
diff --git a/website/static/img/icons/white/dot-ai.svg b/website/static/img/icons/white/dot-ai.svg new file mode 100644 index 00000000000..d0223968caa --- /dev/null +++ b/website/static/img/icons/white/dot-ai.svg @@ -0,0 +1,33441 @@ [33,441 added lines of SVG markup not shown] \ No newline at end of file
diff --git a/website/static/img/quickstarts/dbt-cloud/dbt_explorer_advanced_search.png b/website/static/img/quickstarts/dbt-cloud/dbt_explorer_advanced_search.png new file mode 100644 index 00000000000..ff47971bae6 Binary files /dev/null and
b/website/static/img/quickstarts/dbt-cloud/dbt_explorer_advanced_search.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/dbt_explorer_dag.png b/website/static/img/quickstarts/dbt-cloud/dbt_explorer_dag.png new file mode 100644 index 00000000000..8eeac3d137a Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/dbt_explorer_dag.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/dbt_explorer_project_recs.png b/website/static/img/quickstarts/dbt-cloud/dbt_explorer_project_recs.png new file mode 100644 index 00000000000..4f31419b76c Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/dbt_explorer_project_recs.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/explorer_models_tab.png b/website/static/img/quickstarts/dbt-cloud/explorer_models_tab.png new file mode 100644 index 00000000000..99b0cd98161 Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/explorer_models_tab.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/explorer_performance_tab.png b/website/static/img/quickstarts/dbt-cloud/explorer_performance_tab.png new file mode 100644 index 00000000000..0c6b5e87c62 Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/explorer_performance_tab.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/go_to_jobs.png b/website/static/img/quickstarts/dbt-cloud/go_to_jobs.png new file mode 100644 index 00000000000..3910a1f666e Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/go_to_jobs.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/run_job.png b/website/static/img/quickstarts/dbt-cloud/run_job.png new file mode 100644 index 00000000000..483ee15236b Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/run_job.png differ diff --git a/website/static/img/teradata/dbt_cloud_teradata_account_settings.png b/website/static/img/teradata/dbt_cloud_teradata_account_settings.png new file mode 100644 index 00000000000..c7de2425023 Binary files /dev/null and b/website/static/img/teradata/dbt_cloud_teradata_account_settings.png differ diff --git a/website/static/img/teradata/dbt_cloud_teradata_development_credentials.png b/website/static/img/teradata/dbt_cloud_teradata_development_credentials.png new file mode 100644 index 00000000000..762fac961ac Binary files /dev/null and b/website/static/img/teradata/dbt_cloud_teradata_development_credentials.png differ diff --git a/website/static/img/teradata/dbt_cloud_teradata_setup_connection_start.png b/website/static/img/teradata/dbt_cloud_teradata_setup_connection_start.png new file mode 100644 index 00000000000..bbf4c6db380 Binary files /dev/null and b/website/static/img/teradata/dbt_cloud_teradata_setup_connection_start.png differ diff --git a/website/static/img/time_spines.png b/website/static/img/time_spines.png new file mode 100644 index 00000000000..ef7477c3a01 Binary files /dev/null and b/website/static/img/time_spines.png differ diff --git a/website/vercel.json b/website/vercel.json index 18d70f9fee0..74f0eeff65b 100644 --- a/website/vercel.json +++ b/website/vercel.json @@ -2,6 +2,61 @@ "cleanUrls": true, "trailingSlash": false, "redirects": [ + { + "source": "/styles", + "destination": "https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/adding-page-components.md", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/sl-manifest", + "destination": "/reference/artifacts/sl-manifest", + "permanent": true + }, + { + "source": 
"/docs/cloud/dbt-assist-data", + "destination": "/docs/cloud/dbt-copilot-data", + "permanent": true + }, + { + "source": "/docs/cloud/use-dbt-assist", + "destination": "/docs/cloud/use-dbt-copilot", + "permanent": true + }, + { + "source": "/docs/cloud/enable-dbt-assist", + "destination": "/docs/cloud/enable-dbt-copilot", + "permanent": true + }, + { + "source": "/docs/cloud/dbt-assist", + "destination": "/docs/cloud/dbt-copilot", + "permanent": true + }, + { + "source": "/faqs/Troubleshooting/access_token_error", + "destination": "/faqs/Troubleshooting/auth-expired-error", + "permanent": true + }, + { + "source": "/faqs/Models/unique-model-names", + "destination": "/faqs/Project/unique-resource-names", + "permanent": true + }, + { + "source": "/docs/deploy/dashboard-status-tiles", + "destination": "/docs/collaborate/data-tile", + "permanent": true + }, + { + "source": "/docs/core/connect-data-platform/fal-setup", + "destination": "https://github.com/fal-ai/dbt-fal", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/fal-setup", + "destination": "https://github.com/fal-ai/dbt-fal", + "permanent": true + }, { "source": "docs/cloud/secure/environment-permissions", "destination": "docs/cloud/manage-access/environment-permissions", @@ -579,6 +634,11 @@ "destination": "/reference/global-configs/command-line-options", "permanent": true }, + { + "source": "/reference/global-configs/legacy-behaviors", + "destination": "/reference/global-configs/behavior-changes", + "permanent": true + }, { "source": "/reference/global-configs/yaml-configurations", "destination": "/reference/global-configs/project-flags", @@ -2956,6 +3016,11 @@ "destination": "/dbt-cloud/api-v2-legacy", "permanent": true }, + { + "source": "/dbt-cloud/api-v2-legacy", + "destination": "/dbt-cloud/api-v2", + "permanent": true + }, { "source": "/dbt-cloud/api-v4", "destination": "/docs/dbt-cloud-apis/admin-cloud-api", @@ -3303,17 +3368,17 @@ }, { "source": "/dbt-cloud/cloud-ide/viewing-docs-in-the-ide", - "destination": "/docs/getting-started/develop-in-the-cloud", + "destination": "/docs/cloud/dbt-cloud-ide/develop-in-the-cloud", "permanent": true }, { "source": "/docs/dbt-cloud/cloud-ide/ide-beta", - "destination": "/docs/getting-started/develop-in-the-cloud", + "destination": "/docs/cloud/dbt-cloud-ide/develop-in-the-cloud", "permanent": true }, { "source": "/docs/running-a-dbt-project/using-the-dbt-ide", - "destination": "/docs/getting-started/develop-in-the-cloud", + "destination": "/docs/running-a-dbt-project/using-the-dbt-ide", "permanent": true }, { @@ -3500,6 +3565,66 @@ "source": "/best-practices/how-we-structure/5-semantic-layer-marts", "destination": "/best-practices/how-we-build-our-metrics/semantic-layer-7-semantic-structure", "permanent": true + }, + { + "source": "/terms/cte", + "destination": "https://www.getdbt.com/blog/guide-to-cte", + "permanent": true + }, + { + "source": "/terms/dag", + "destination": "https://www.getdbt.com/blog/guide-to-dag", + "permanent": true + }, + { + "source": "/terms/data-lineage", + "destination": "https://www.getdbt.com/blog/guide-to-data-lineage", + "permanent": true + }, + { + "source": "/terms/ddl", + "destination": "https://www.getdbt.com/blog/guide-to-ddl", + "permanent": true + }, + { + "source": "/terms/dimensional-modeling", + "destination": "https://www.getdbt.com/blog/guide-to-dimensional-modeling", + "permanent": true + }, + { + "source": "/terms/dml", + "destination": "https://www.getdbt.com/blog/guide-to-dml", + "permanent": true + }, + { + "source": 
"/terms/dry", + "destination": "https://www.getdbt.com/blog/guide-to-dry", + "permanent": true + }, + { + "source": "/terms/grain", + "destination": "https://www.getdbt.com/blog/guide-to-data-grain", + "permanent": true + }, + { + "source": "/terms/subquery", + "destination": "https://www.getdbt.com/blog/guide-to-subquery", + "permanent": true + }, + { + "source": "/terms/surrogate-key", + "destination": "https://www.getdbt.com/blog/guide-to-surrogate-key", + "permanent": true + }, + { + "source": "/glossary", + "destination": "https://www.getdbt.com/blog", + "permanent": true + }, + { + "source": "/terms/:path((?!elt|etl|reverse-etl).*)", + "destination": "https://www.getdbt.com/blog", + "permanent": true } ], "headers": [